Struct ops Qdisc_ops
Qdisc ops is a type of struct_ops which allows the implementation of a custom qdisc in BPF.
Usage
BPF qdisc aims to be a flexible and easy-to-use infrastructure that allows users to quickly experiment with different scheduling algorithms/policies.
Fields and ops
/* Operations table describing one qdisc type. A BPF struct_ops map of
 * this type implements a custom qdisc; the kernel invokes the callbacks
 * below on the matching instances. */
struct Qdisc_ops {
/* Next registered qdisc type in the kernel's global list; kernel-managed,
 * left unset by BPF implementations. */
struct Qdisc_ops *next;
/* Class operations for classful qdiscs; not usable from BPF as of v6.16. */
const struct Qdisc_class_ops *cl_ops;
/* Unique identifier (name) of this qdisc type. */
char id[IFNAMSIZ];
/* Size in bytes of the qdisc->privdata area; not available to BPF. */
int priv_size;
/* Initial value for qdisc->flags of new instances. */
unsigned int static_flags;
/* Queue skb on sch; evicted packets are chained onto *to_free. */
int (*enqueue)(struct sk_buff *skb,
struct Qdisc *sch,
struct sk_buff **to_free);
/* Remove and return the next packet to transmit, or NULL if empty. */
struct sk_buff * (*dequeue)(struct Qdisc *);
/* Return the next packet without removing it, or NULL if empty. */
struct sk_buff * (*peek)(struct Qdisc *);
/* Initialize a new instance from netlink arguments. */
int (*init)(struct Qdisc *sch, struct nlattr *arg,
struct netlink_ext_ack *extack);
/* Return an initialized instance to its initial state. */
void (*reset)(struct Qdisc *);
/* Tear down an instance that is no longer in use. */
void (*destroy)(struct Qdisc *);
/* Apply updated settings to an already-initialized instance. */
int (*change)(struct Qdisc *sch,
struct nlattr *arg,
struct netlink_ext_ack *extack);
/* Called when the instance is attached to a network device. */
void (*attach)(struct Qdisc *sch);
/* React to a change of the device's transmit queue length. */
int (*change_tx_queue_len)(struct Qdisc *, unsigned int);
/* Notified of the device's real number of TX queues. */
void (*change_real_num_tx)(struct Qdisc *sch,
unsigned int new_real_tx);
/* Append this instance's settings as netlink attributes to skb. */
int (*dump)(struct Qdisc *, struct sk_buff *);
/* Fill a gnet_dump with statistics for this instance. */
int (*dump_stats)(struct Qdisc *, struct gnet_dump *);
/* Setters/getters for the ingress/egress TC block index. */
void (*ingress_block_set)(struct Qdisc *sch,
u32 block_index);
void (*egress_block_set)(struct Qdisc *sch,
u32 block_index);
u32 (*ingress_block_get)(struct Qdisc *sch);
u32 (*egress_block_get)(struct Qdisc *sch);
/* Owning module reference; kernel-managed. */
struct module *owner;
};
next
struct Qdisc_ops *next;
All registered Qdisc_ops are linked together in a linked list; this field is the pointer to the next entry in that list. It should always be left unspecified by BPF programs, as it is managed by the kernel.
cl_ops
const struct Qdisc_class_ops *cl_ops;
This field points to operations specific to classful qdiscs, which are not yet supported for BPF qdiscs as of v6.16.
id
char id[IFNAMSIZ];
The unique identifier of this qdisc type.
priv_size
int priv_size;
The number of bytes stored in qdisc->privdata
. Typically used by builtin qdisc types; not available to BPF qdiscs as of v6.16.
static_flags
unsigned int static_flags;
A set of flags which will be the initial value of qdisc->flags
.
enqueue
int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free);
This op is called for every packet that is directed to a network device using the sch
instance of this BPF qdisc. The skb
is the packet that should be added to whatever data structure is implemented by the BPF qdisc.
Recognized return values are:
NET_XMIT_SUCCESS
(0x00) - The packet has been added to the data structure.
NET_XMIT_DROP
(0x01) - The packet has been dropped.
NET_XMIT_CN
(0x02) - Does not guarantee that this packet is lost. It indicates that the device will soon be dropping packets, or already drops some packets of the same priority; prompting us to send less aggressively.
If adding this packet causes other packets (perhaps older or of lower priority) to be removed from the data structure, the bpf_qdisc_skb_drop
kfunc can be used to enqueue these other packets to to_free
so memory associated with these dropped packets is freed.
dequeue
struct sk_buff * (*dequeue)(struct Qdisc *sch);
This op is called periodically when a network device with the sch
instance of this BPF Qdisc is ready to send packets. The op should return a packet to be sent if available, or NULL
if no packet is available. The returned packet should be removed from the data structure.
peek
struct sk_buff * (*peek)(struct Qdisc *sch);
This op can be called by a network device with the sch
instance of this BPF Qdisc to see if any packets are available and which would be first. The op should return a packet to be sent if available, or NULL
if no packet is available. The packet will not actually be sent, and the BPF Qdisc should hold onto the packet in its data structure.
init
int (*init)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack);
This op is called to initialize a qdisc instance sch
which will use the current ops for its implementation. arg
are the netlink arguments used to create this new qdisc instance.
extack
is the extended acknowledgment, used to carry verbose error messages; BPF qdiscs cannot utilize it as of v6.16.
reset
void (*reset)(struct Qdisc *);
This op is called on an already initialized qdisc instance to reset it to its initial state.
destroy
void (*destroy)(struct Qdisc *);
This op is called when a qdisc instance is no longer in use, such as when it's switched out or the device goes away.
change
int (*change)(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack);
This op is called when settings for qdisc instance sch
are updated after it has been initialized. arg
are the netlink arguments containing the new settings.
extack
is the extended acknowledgment, used to carry verbose error messages; BPF qdiscs cannot utilize it as of v6.16.
attach
void (*attach)(struct Qdisc *sch);
This op is called when qdisc instance sch
is attached to a network device.
change_tx_queue_len
int (*change_tx_queue_len)(struct Qdisc *sch, unsigned int new_len);
This op is called when a change of transmission queue length is requested. Returning 0
indicates success, any other value indicates an error.
change_real_num_tx
void (*change_real_num_tx)(struct Qdisc *sch, unsigned int new_real_tx);
This op is called to inform the qdisc of the number of transmission queues used by the network device to which qdisc instance sch
is attached.
dump
int (*dump)(struct Qdisc *sch, struct sk_buff *skb);
This op is called to dump information such as settings of the current qdisc instance sch
. skb
is a socket buffer (network packet) which will be the netlink message sent to userspace. The packet will already have other netlink data in there, this op is expected to append netlink attributes to the end, being pre-defined or custom, as long as netlink "type-length-attribute" format is used.
Returning a non-negative value indicates success; a negative value indicates failure.
dump_stats
int (*dump_stats)(struct Qdisc *sch, struct gnet_dump *d);
This op is called to fill d
with statistics about qdisc instance sch
.
Returning a non-negative value indicates success; a negative value indicates failure.
ingress_block_set
void (*ingress_block_set)(struct Qdisc *sch, u32 block_index);
This op is called when a TC block is specified for ingress via the TCA_INGRESS_BLOCK
attribute.
egress_block_set
void (*egress_block_set)(struct Qdisc *sch, u32 block_index);
This op is called when a TC block is specified for egress via the TCA_EGRESS_BLOCK
attribute.
ingress_block_get
u32 (*ingress_block_get)(struct Qdisc *sch);
This op is used to query the current ingress TC block associated with this qdisc instance sch
.
egress_block_get
u32 (*egress_block_get)(struct Qdisc *sch);
This op is used to query the current egress TC block associated with this qdisc instance sch
.
owner
struct module *owner;
This is a field internally used by the kernel to associate an owner for the ops.
Types
struct Qdisc
/* One qdisc instance. From BPF qdisc programs all fields are read-only
 * except limit, q.qlen and qstats. */
struct Qdisc {
/* Cached copies of ops->enqueue/ops->dequeue for fast dispatch. */
int (*enqueue)(struct sk_buff *skb,
struct Qdisc *sch,
struct sk_buff **to_free);
struct sk_buff * (*dequeue)(struct Qdisc *sch);
/* TCQ_F_* bitmask; seeded from Qdisc_ops.static_flags. */
unsigned int flags;
/* Maximum queue length; writable from BPF. */
u32 limit;
/* The Qdisc_ops implementing this instance. */
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
struct hlist_node hash;
/* Netlink identifiers of this instance and of its parent. */
u32 handle;
u32 parent;
/* Device transmit queue this qdisc feeds. */
struct netdev_queue *dev_queue;
struct net_rate_estimator __rcu *rate_est;
/* Per-CPU statistics, used when TCQ_F_CPUSTATS is set. */
struct gnet_stats_basic_sync __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
int pad;
refcount_t refcnt;
/*
* For performance sake on SMP, we put highly modified fields at the end
*/
struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
/* The packet queue itself; q.qlen is writable from BPF. */
struct qdisc_skb_head q;
struct gnet_stats_basic_sync bstats;
/* Queue statistics (e.g. backlog); writable from BPF. */
struct gnet_stats_queue qstats;
int owner;
unsigned long state;
unsigned long state2; /* must be written under qdisc spinlock */
struct Qdisc *next_sched;
struct sk_buff_head skb_bad_txq;
spinlock_t busylock ____cacheline_aligned_in_smp;
spinlock_t seqlock;
struct rcu_head rcu;
netdevice_tracker dev_tracker;
struct lock_class_key root_lock_key;
/* private data */
long privdata[] ____cacheline_aligned;
};
All fields are read-only except for limit
, q->qlen
, and qstats
.
flags
/* Bit values for qdisc->flags. The first four carry no comment upstream;
 * the short notes here are inferred from their names —
 * NOTE(review): confirm against include/net/sch_generic.h. */
#define TCQ_F_BUILTIN 1 /* built-in qdisc instance */
#define TCQ_F_INGRESS 2 /* ingress qdisc */
#define TCQ_F_CAN_BYPASS 4 /* qdisc may be bypassed */
#define TCQ_F_MQROOT 8 /* root of a multiqueue setup */
#define TCQ_F_ONETXQUEUE 0x10 /* dequeue_skb() can assume all skbs are for
* q->dev_queue : It can test
* netif_xmit_frozen_or_stopped() before
* dequeueing next packet.
* Its true for MQ/MQPRIO slaves, or non
* multiqueue device.
*/
#define TCQ_F_WARN_NONWC (1 << 16)
#define TCQ_F_CPUSTATS 0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT 0x40 /* root of its hierarchy :
* qdisc_tree_decrease_qlen() should stop.
*/
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */
privdata
struct sk_buff
See skbuff.h
for full structure.
All fields are read-only, except for tstamp
and cb
.
Example
FIFO (First In First Out)
A BPF implementation of the most basic qdisc, a simple queue without scheduling or ordering logic.
// SPDX-License-Identifier: GPL-2.0
// Copyright by Amery Hung <amery.hung@bytedance.com>
#include <vmlinux.h>
#include "bpf_experimental.h"
#include "bpf_qdisc_common.h"
char _license[] SEC("license") = "GPL";
/* One queue entry: owns a single skb (held as a kptr) and links into
 * the global q_fifo list through 'node'. */
struct skb_node {
struct sk_buff __kptr * skb;
struct bpf_list_node node;
};
/* Spin lock protecting q_fifo; declared in the same private(A) section
 * as the list it guards, as required for BPF graph data structures. */
private(A) struct bpf_spin_lock q_fifo_lock;
/* Global FIFO of skb_node entries, linked through their 'node' field. */
private(A) struct bpf_list_head q_fifo __contains(skb_node, node);
/* Set by the init op; presumably read from user space to verify the op
 * ran — NOTE(review): confirm against the loading program. */
bool init_called;
/* Enqueue op: append skb to the tail of the global FIFO, or drop it
 * when the queue is at its limit or a list node cannot be allocated.
 * Returns NET_XMIT_SUCCESS or NET_XMIT_DROP. */
SEC("struct_ops/bpf_fifo_enqueue")
int BPF_PROG(bpf_fifo_enqueue, struct sk_buff *skb, struct Qdisc *sch,
struct bpf_sk_buff_ptr *to_free)
{
struct skb_node *skbn;
u32 pkt_len;
/* Tail-drop once the queue has reached its configured limit. */
if (sch->q.qlen == sch->limit)
goto drop;
skbn = bpf_obj_new(typeof(*skbn));
if (!skbn)
goto drop;
/* Record the packet length before ownership of skb moves into skbn. */
pkt_len = qdisc_pkt_len(skb);
sch->q.qlen++;
/* Transfer the skb kptr into the node; the xchg returns any previous
 * owner (NULL for a fresh node) which must still be released. */
skb = bpf_kptr_xchg(&skbn->skb, skb);
if (skb)
bpf_qdisc_skb_drop(skb, to_free);
bpf_spin_lock(&q_fifo_lock);
bpf_list_push_back(&q_fifo, &skbn->node);
bpf_spin_unlock(&q_fifo_lock);
sch->qstats.backlog += pkt_len;
return NET_XMIT_SUCCESS;
drop:
/* Hand the packet to the kernel's deferred-free list. */
bpf_qdisc_skb_drop(skb, to_free);
return NET_XMIT_DROP;
}
/* Dequeue op: pop the oldest entry from the global FIFO, release its
 * list node, update stats, and return the skb (or NULL if empty). */
SEC("struct_ops/bpf_fifo_dequeue")
struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch)
{
struct bpf_list_node *node;
struct sk_buff *skb = NULL;
struct skb_node *skbn;
bpf_spin_lock(&q_fifo_lock);
node = bpf_list_pop_front(&q_fifo);
bpf_spin_unlock(&q_fifo_lock);
if (!node)
return NULL;
skbn = container_of(node, struct skb_node, node);
/* Take ownership of the skb back out of the node before dropping it. */
skb = bpf_kptr_xchg(&skbn->skb, skb);
bpf_obj_drop(skbn);
if (!skb)
return NULL;
sch->qstats.backlog -= qdisc_pkt_len(skb);
/* Account the dequeued bytes/packets in the basic stats. */
bpf_qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
/* Init op: fix the queue limit at 1000 packets; the netlink arguments
 * (opt/extack) are ignored. init_called is presumably observed from
 * user space to confirm the op ran — NOTE(review): verify loader. */
SEC("struct_ops/bpf_fifo_init")
int BPF_PROG(bpf_fifo_init, struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
sch->limit = 1000;
init_called = true;
return 0;
}
/* Reset op: free every queued skb and its list node, then zero qlen.
 * bpf_for gives the verifier a bounded loop; q.qlen counts the entries
 * previously enqueued, and the !node check guards against mismatch. */
SEC("struct_ops/bpf_fifo_reset")
void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch)
{
struct bpf_list_node *node;
struct skb_node *skbn;
int i;
bpf_for(i, 0, sch->q.qlen) {
struct sk_buff *skb = NULL;
bpf_spin_lock(&q_fifo_lock);
node = bpf_list_pop_front(&q_fifo);
bpf_spin_unlock(&q_fifo_lock);
if (!node)
break;
skbn = container_of(node, struct skb_node, node);
/* Reclaim ownership of the skb before releasing the node. */
skb = bpf_kptr_xchg(&skbn->skb, skb);
if (skb)
bpf_kfree_skb(skb);
bpf_obj_drop(skbn);
}
sch->q.qlen = 0;
}
/* Destroy op: intentionally empty — the reset op already releases all
 * queued packets and nodes, leaving nothing per-instance to free. */
SEC("struct_ops")
void BPF_PROG(bpf_fifo_destroy, struct Qdisc *sch)
{
}
/* struct_ops map registering this FIFO implementation as qdisc type
 * "bpf_fifo". Unset ops (peek, change, dump, ...) fall back to kernel
 * defaults. */
SEC(".struct_ops")
struct Qdisc_ops fifo = {
.enqueue = (void *)bpf_fifo_enqueue,
.dequeue = (void *)bpf_fifo_dequeue,
.init = (void *)bpf_fifo_init,
.reset = (void *)bpf_fifo_reset,
.destroy = (void *)bpf_fifo_destroy,
.id = "bpf_fifo",
};