Program type BPF_PROG_TYPE_STRUCT_OPS
This program type allows for the implementation of certain kernel features in BPF.
Usage
The kernel uses the "struct ops" pattern in C to implement interfaces: the kernel defines a struct with function pointers as field types, and an implementation creates an instance of this struct, pointing those fields at its own functions that match the expected signatures.
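As a rough sketch of the pattern in plain C (the struct and function names here are invented for the example):
/* A hypothetical kernel interface defined as a struct of function pointers. */
struct my_ops {
	int  (*init)(void);
	void (*handle_event)(int event);
};

/* An implementation provides its own functions matching the signatures... */
static int my_init(void)
{
	return 0;
}

static void my_handle_event(int event)
{
	/* react to the event */
}

/* ...and exposes them through an instance of the struct. */
static struct my_ops my_impl = {
	.init         = my_init,
	.handle_event = my_handle_event,
};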
This program type allows these struct implementations to be written in BPF, so locations in the kernel that allow it can use BPF to implement the functionality, for example the TCP congestion control algorithm.
Context
The context provided to each function is an array of 64-bit integers, representing the argument list of the function in the struct it is implementing. The BPF_PROG macro from libbpf can be used to write your program in a recognizable way; the macro takes care of unpacking the array into named variables of the provided types.
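As an illustration, the two alternative forms below are roughly equivalent; the first unpacks the context array by hand, the second lets BPF_PROG do the unpacking (a sketch, bodies omitted):
/* Without the macro: the context is an array of 64-bit values and
 * the arguments have to be cast out of it manually.
 */
SEC("struct_ops/dctcp_init")
int dctcp_init(unsigned long long *ctx)
{
	struct sock *sk = (struct sock *)ctx[0];

	/* ... use sk ... */
	return 0;
}

/* With the macro: BPF_PROG unpacks ctx into typed, named arguments. */
SEC("struct_ops/dctcp_init")
void BPF_PROG(dctcp_init, struct sock *sk)
{
	/* ... use sk ... */
}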
Attachment
User perspective
From an eBPF user's perspective, all one has to do is create an instance of the struct to be used as the implementation; any function pointer fields you wish to use should point to eBPF programs. This struct must be placed in the .struct_ops section like so:
/* Copyright (c) 2019 Facebook */
SEC(".struct_ops")
struct tcp_congestion_ops dctcp_nouse = {
.init = (void *)dctcp_init,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
.name = "bpf_dctcp_nouse",
};
The full file is turned into a light skeleton. After loading the light skeleton, the struct will appear as a map in the skel->maps structure. This map is then passed to the bpf_map__attach_struct_ops function to attach it.
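A sketch of the user-space side (the skeleton header, type name, and map name below are derived from the object file name and are assumptions for this example):
#include <bpf/libbpf.h>
#include "bpf_dctcp.skel.h" /* generated skeleton, name assumed */

int main(void)
{
	struct bpf_dctcp *skel;
	struct bpf_link *link;

	skel = bpf_dctcp__open_and_load();
	if (!skel)
		return 1;

	/* The struct_ops struct shows up as a map in skel->maps. */
	link = bpf_map__attach_struct_ops(skel->maps.dctcp);
	if (!link) {
		bpf_dctcp__destroy(skel);
		return 1;
	}

	/* ... the "bpf_dctcp" congestion control algorithm is now registered ... */

	bpf_link__destroy(link);
	bpf_dctcp__destroy(skel);
	return 0;
}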
Loader/kernel perspective
The loader locates the struct(s) in the .struct_ops section and uses BTF to get the name and layout of each struct. The values of the non-function fields are present in the .struct_ops section itself; for the function pointer fields that were set, there will be ELF relocation entries pointing to the respective eBPF programs.
The loader figures out which eBPF programs were referenced and loads them. It then inspects the vmlinux BTF of the host to find the BTF type ID of the struct_ops struct. Once it has that, the loader creates a BPF_MAP_TYPE_STRUCT_OPS map. During map creation, the BTF type ID of the kernel's struct_ops struct is provided via the btf_vmlinux_value_type_id attribute, and btf_value_type_id is set to the BTF ID of the struct as defined in the program's own BTF. The key of the map must be an int (4 bytes) and max_entries must be 1.
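A sketch of this map creation step using libbpf's low-level API (normally the loader performs this automatically; the parameters here restate the requirements above and are otherwise assumptions):
#include <bpf/bpf.h>

int create_struct_ops_map(int btf_fd, __u32 local_type_id,
			  __u32 vmlinux_type_id, __u32 value_size)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts,
		.btf_fd = btf_fd,                             /* BTF of the BPF object */
		.btf_value_type_id = local_type_id,           /* struct in the program's BTF */
		.btf_vmlinux_value_type_id = vmlinux_type_id, /* struct in the vmlinux BTF */
	);

	/* int (4 byte) key and max_entries of 1; value_size must match
	 * the size of the kernel's value type for this struct_ops.
	 */
	return bpf_map_create(BPF_MAP_TYPE_STRUCT_OPS, "dctcp",
			      sizeof(int), value_size, 1, &opts);
}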
The kernel has now allocated memory for one instance of the struct_ops struct and associated it with the call location. The next step for the loader is to write to the only element in the map, element 0. The value of the map is the instance of the struct as C would lay it out in memory, except that the function pointer fields are populated with the file descriptors of the eBPF programs; the kernel transforms these into the actual memory locations of the JITed eBPF programs. As soon as the update happens, the struct_ops programs are attached. The map holds the refcounts to the programs, so they don't have to be pinned. As soon as element 0 of the map is deleted, or the map is cleaned up because it is no longer referenced, the struct_ops programs are detached. For this reason, the map is typically pinned.
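For example, with libbpf the map can be pinned like this (the pin path is just an example):
#include <bpf/libbpf.h>

/* Pin the struct_ops map so the kernel keeps a reference to it (and
 * therefore to the attached programs) after the loading process exits.
 */
int pin_struct_ops(struct bpf_map *map)
{
	return bpf_map__pin(map, "/sys/fs/bpf/dctcp");
}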
Example
DCTCP
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
/* WARNING: This implementation is not necessarily the same
 * as tcp_dctcp.c. The purpose is mainly for testing
* the kernel BPF logic.
*/
#include <stddef.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_tcp_helpers.h"
char _license[] SEC("license") = "GPL";
int stg_result = 0;
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, int);
} sk_stg_map SEC(".maps");
#define DCTCP_MAX_ALPHA 1024U
struct dctcp {
__u32 old_delivered;
__u32 old_delivered_ce;
__u32 prior_rcv_nxt;
__u32 dctcp_alpha;
__u32 next_seq;
__u32 ce_state;
__u32 loss_cwnd;
};
static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
static __always_inline void dctcp_reset(const struct tcp_sock *tp,
struct dctcp *ca)
{
ca->next_seq = tp->snd_nxt;
ca->old_delivered = tp->delivered;
ca->old_delivered_ce = tp->delivered_ce;
}
SEC("struct_ops/dctcp_init")
void BPF_PROG(dctcp_init, struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
int *stg;
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
ca->loss_cwnd = 0;
ca->ce_state = 0;
stg = bpf_sk_storage_get(&sk_stg_map, (void *)tp, NULL, 0);
if (stg) {
stg_result = *stg;
bpf_sk_storage_delete(&sk_stg_map, (void *)tp);
}
dctcp_reset(tp, ca);
}
SEC("struct_ops/dctcp_ssthresh")
__u32 BPF_PROG(dctcp_ssthresh, struct sock *sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
SEC("struct_ops/dctcp_update_alpha")
void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
__u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
__u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
if (delivered_ce) {
__u32 delivered = tp->delivered - ca->old_delivered;
/* If dctcp_shift_g == 1, a 32bit value would overflow
* after 8 M packets.
*/
delivered_ce <<= (10 - dctcp_shift_g);
delivered_ce /= max(1U, delivered);
alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
}
ca->dctcp_alpha = alpha;
dctcp_reset(tp, ca);
}
}
static __always_inline void dctcp_react_to_loss(struct sock *sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
ca->loss_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
}
SEC("struct_ops/dctcp_state")
void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
{
if (new_state == TCP_CA_Recovery &&
new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state))
dctcp_react_to_loss(sk);
/* We handle RTO in dctcp_cwnd_event to ensure that we perform only
* one loss-adjustment per RTT.
*/
}
static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
{
struct tcp_sock *tp = tcp_sk(sk);
if (ce_state == 1)
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
else
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
/* Minimal DCTCP CE state machine:
*
* S: 0 <- last pkt was non-CE
* 1 <- last pkt was CE
*/
static __always_inline
void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
__u32 *prior_rcv_nxt, __u32 *ce_state)
{
__u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
if (*ce_state != new_ce_state) {
/* CE state has changed, force an immediate ACK to
* reflect the new CE state. If an ACK was delayed,
* send that first to reflect the prior CE state.
*/
if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
dctcp_ece_ack_cwr(sk, *ce_state);
bpf_tcp_send_ack(sk, *prior_rcv_nxt);
}
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
*ce_state = new_ce_state;
dctcp_ece_ack_cwr(sk, new_ce_state);
}
SEC("struct_ops/dctcp_cwnd_event")
void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev)
{
struct dctcp *ca = inet_csk_ca(sk);
switch (ev) {
case CA_EVENT_ECN_IS_CE:
case CA_EVENT_ECN_NO_CE:
dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
break;
case CA_EVENT_LOSS:
dctcp_react_to_loss(sk);
break;
default:
/* Don't care for the rest. */
break;
}
}
SEC("struct_ops/dctcp_cwnd_undo")
__u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
{
const struct dctcp *ca = inet_csk_ca(sk);
return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
SEC("struct_ops/dctcp_reno_cong_avoid")
void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
tcp_reno_cong_avoid(sk, ack, acked);
}
SEC(".struct_ops")
struct tcp_congestion_ops dctcp_nouse = {
.init = (void *)dctcp_init,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
.name = "bpf_dctcp_nouse",
};
SEC(".struct_ops")
struct tcp_congestion_ops dctcp = {
.init = (void *)dctcp_init,
.in_ack_event = (void *)dctcp_update_alpha,
.cwnd_event = (void *)dctcp_cwnd_event,
.ssthresh = (void *)dctcp_ssthresh,
.cong_avoid = (void *)dctcp_cong_avoid,
.undo_cwnd = (void *)dctcp_cwnd_undo,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
.name = "bpf_dctcp",
};
Helper functions
Not all helper functions are available in all program types. These are the helper calls available for struct ops programs:
Supported helper functions
bpf_tcp_send_ack
bpf_sk_storage_get
bpf_sk_storage_delete
bpf_map_lookup_elem
bpf_map_update_elem
bpf_map_delete_elem
bpf_map_push_elem
bpf_map_pop_elem
bpf_map_peek_elem
bpf_map_lookup_percpu_elem
bpf_get_prandom_u32
bpf_get_smp_processor_id
bpf_get_numa_node_id
bpf_tail_call
bpf_ktime_get_ns
bpf_ktime_get_boot_ns
bpf_ringbuf_output
bpf_ringbuf_reserve
bpf_ringbuf_submit
bpf_ringbuf_discard
bpf_ringbuf_query
bpf_for_each_map_elem
bpf_loop
bpf_strncmp
bpf_spin_lock
bpf_spin_unlock
bpf_jiffies64
bpf_per_cpu_ptr
bpf_this_cpu_ptr
bpf_timer_init
bpf_timer_set_callback
bpf_timer_start
bpf_timer_cancel
bpf_trace_printk
bpf_get_current_task
bpf_get_current_task_btf
bpf_probe_read_user
bpf_probe_read_kernel
bpf_probe_read_user_str
bpf_probe_read_kernel_str
bpf_snprintf_btf
bpf_snprintf
bpf_task_pt_regs
bpf_trace_vprintk
bpf_cgrp_storage_get
bpf_cgrp_storage_delete
bpf_dynptr_data
bpf_dynptr_from_mem
bpf_dynptr_read
bpf_dynptr_write
bpf_kptr_xchg
bpf_ktime_get_tai_ns
bpf_ringbuf_discard_dynptr
bpf_ringbuf_reserve_dynptr
bpf_ringbuf_submit_dynptr
bpf_user_ringbuf_drain
KFuncs
Supported kfuncs
bbr_cwnd_event
bbr_init
bbr_main
bbr_min_tso_segs
bbr_set_state
bbr_sndbuf_expand
bbr_ssthresh
bbr_undo_cwnd
bpf_arena_alloc_pages
bpf_arena_free_pages
bpf_cast_to_kern_ctx
bpf_cgroup_acquire
bpf_cgroup_ancestor
bpf_cgroup_from_id
bpf_cgroup_release
bpf_cpumask_acquire
bpf_cpumask_and
bpf_cpumask_any_and_distribute
bpf_cpumask_any_distribute
bpf_cpumask_clear
bpf_cpumask_clear_cpu
bpf_cpumask_copy
bpf_cpumask_create
bpf_cpumask_empty
bpf_cpumask_equal
bpf_cpumask_first
bpf_cpumask_first_and
bpf_cpumask_first_zero
bpf_cpumask_full
bpf_cpumask_intersects
bpf_cpumask_or
bpf_cpumask_release
bpf_cpumask_set_cpu
bpf_cpumask_setall
bpf_cpumask_subset
bpf_cpumask_test_and_clear_cpu
bpf_cpumask_test_and_set_cpu
bpf_cpumask_test_cpu
bpf_cpumask_weight
bpf_cpumask_xor
bpf_dynptr_adjust
bpf_dynptr_clone
bpf_dynptr_is_null
bpf_dynptr_is_rdonly
bpf_dynptr_size
bpf_dynptr_slice
bpf_dynptr_slice_rdwr
bpf_iter_bits_destroy
bpf_iter_bits_new
bpf_iter_bits_next
bpf_iter_css_destroy
bpf_iter_css_new
bpf_iter_css_next
bpf_iter_css_task_destroy
bpf_iter_css_task_new
bpf_iter_css_task_next
bpf_iter_num_destroy
bpf_iter_num_new
bpf_iter_num_next
bpf_iter_task_destroy
bpf_iter_task_new
bpf_iter_task_next
bpf_iter_task_vma_destroy
bpf_iter_task_vma_new
bpf_iter_task_vma_next
bpf_list_pop_back
bpf_list_pop_front
bpf_list_push_back_impl
bpf_list_push_front_impl
bpf_map_sum_elem_count
bpf_obj_drop_impl
bpf_obj_new_impl
bpf_percpu_obj_drop_impl
bpf_percpu_obj_new_impl
bpf_preempt_disable
bpf_preempt_enable
bpf_rbtree_add_impl
bpf_rbtree_first
bpf_rbtree_remove
bpf_rcu_read_lock
bpf_rcu_read_unlock
bpf_rdonly_cast
bpf_refcount_acquire_impl
bpf_task_acquire
bpf_task_from_pid
bpf_task_get_cgroup1
bpf_task_release
bpf_task_under_cgroup
bpf_throw
bpf_wq_init
bpf_wq_set_callback_impl
bpf_wq_start
crash_kexec
cubictcp_acked
cubictcp_cong_avoid
cubictcp_cwnd_event
cubictcp_init
cubictcp_recalc_ssthresh
cubictcp_state
dctcp_cwnd_event
dctcp_cwnd_undo
dctcp_init
dctcp_ssthresh
dctcp_state
dctcp_update_alpha
tcp_cong_avoid_ai
tcp_reno_cong_avoid
tcp_reno_ssthresh
tcp_reno_undo_cwnd
tcp_slow_start