Program type BPF_PROG_TYPE_SK_MSG
Socket message programs are called for every sendmsg
or sendfile
syscall. This program type can pass verdict on individual packets or larger L7 messages chunked over multiple syscalls.
Usage
Socket MSG programs are attached to BPF_MAP_TYPE_SOCKMAP
or BPF_MAP_TYPE_SOCKHASH
maps and will be invoked sendmsg
or sendfile
syscalls are executed on sockets which are part of the map the program is attached to.
The program returns a verdict on what to do with the data the process wants to send.
SK_PASS
- The message may pass to the socket or it has been redirected with a helper.SK_DROP
- The message should be dropped.
The bpf_msg_apply_bytes
helper function can be used to indicate for which bytes the verdict applies. This has two cases. First BPF program applies
verdict to fewer bytes than in the current sendmsg
/sendfile
this will apply the verdict to the first N bytes of the message then run the BPF program again with data pointers recalculated to the N+1 byte. The second case is the BPF program applies a verdict to more bytes than the current sendmsg
or sendfile
system call. In this case the infrastructure will cache the verdict and apply it to future sendmsg
/sendfile
calls until the byte limit is reached. This avoids the overhead of running BPF programs on large payloads.
The helper bpf_msg_cork_bytes
handles a different case where a BPF program can not reach a verdict on a message until it receives more bytes AND the program doesn't want to forward the packet until it is known to be "good". The example case being a user (albeit a dumb one probably) sends messages in 1B system calls. The BPF program can call bpf_msg_cork_bytes
with the required byte limit to reach a verdict and then the program will only be called again once N bytes are received.
Context
Socket message programs are invoked with a struct sk_msg_md
context. All field are readable, none are writable.
struct sk_msg_md {
__bpf_md_ptr(void *, data);
__bpf_md_ptr(void *, data_end);
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
__u32 local_ip4; /* Stored in network byte order */
__u32 remote_ip6[4]; /* Stored in network byte order */
__u32 local_ip6[4]; /* Stored in network byte order */
__u32 remote_port; /* Stored in network byte order */
__u32 local_port; /* stored in host byte order */
__u32 size; /* Total size of sk_msg */
__bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
};
Attachment
This program type must always be loaded with the expected_attach_type
of BPF_SK_MSG_VERDICT
.
Socket message programs are attached to BPF_MAP_TYPE_SOCKMAP
or BPF_MAP_TYPE_SOCKHASH
using the BPF_PROG_ATTACH
syscall (bpf_prog_attach
libbpf function).
Example
Example of redirecting a message:
// Copyright (c) 2020 Cloudflare
struct {
__uint(type, BPF_MAP_TYPE_SOCKMAP);
__uint(max_entries, 2);
__type(key, __u32);
__type(value, __u64);
} sock_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_SOCKHASH);
__uint(max_entries, 2);
__type(key, __u32);
__type(value, __u64);
} sock_hash SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 2);
__type(key, int);
__type(value, unsigned int);
} verdict_map SEC(".maps");
SEC("sk_msg")
int prog_msg_verdict(struct sk_msg_md *msg)
{
unsigned int *count;
__u32 zero = 0;
int verdict;
if (test_sockmap)
verdict = bpf_msg_redirect_map(msg, &sock_map, zero, 0);
else
verdict = bpf_msg_redirect_hash(msg, &sock_hash, &zero, 0);
count = bpf_map_lookup_elem(&verdict_map, &verdict);
if (count)
(*count)++;
return verdict;
}
Example of dropping based on PID and TPID:
// Copyright (c) 2020 Isovalent, Inc.
struct {
__uint(type, BPF_MAP_TYPE_SOCKMAP);
__uint(max_entries, 2);
__type(key, __u32);
__type(value, __u64);
} sock_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_SOCKHASH);
__uint(max_entries, 2);
__type(key, __u32);
__type(value, __u64);
} sock_hash SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, __u32);
__type(value, __u64);
} socket_storage SEC(".maps");
SEC("sk_msg")
int prog_msg_verdict(struct sk_msg_md *msg)
{
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
int verdict = SK_PASS;
__u32 pid, tpid;
__u64 *sk_stg;
pid = bpf_get_current_pid_tgid() >> 32;
sk_stg = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE);
if (!sk_stg)
return SK_DROP;
*sk_stg = pid;
bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid);
if (pid != tpid)
verdict = SK_DROP;
bpf_sk_storage_delete(&socket_storage, (void *)msg->sk);
return verdict;
}
Helper functions
Supported helper functions
bpf_cgrp_storage_delete
bpf_cgrp_storage_get
bpf_dynptr_data
bpf_dynptr_from_mem
bpf_dynptr_read
bpf_dynptr_write
bpf_for_each_map_elem
bpf_get_cgroup_classid
bpf_get_current_ancestor_cgroup_id
bpf_get_current_cgroup_id
bpf_get_current_pid_tgid
bpf_get_current_task
bpf_get_current_task_btf
bpf_get_current_uid_gid
bpf_get_netns_cookie
bpf_get_ns_current_pid_tgid
v6.10bpf_get_numa_node_id
bpf_get_prandom_u32
bpf_get_smp_processor_id
bpf_jiffies64
bpf_kptr_xchg
bpf_ktime_get_boot_ns
bpf_ktime_get_coarse_ns
bpf_ktime_get_ns
bpf_ktime_get_tai_ns
bpf_loop
bpf_map_delete_elem
bpf_map_lookup_elem
bpf_map_lookup_percpu_elem
bpf_map_peek_elem
bpf_map_pop_elem
bpf_map_push_elem
bpf_map_update_elem
bpf_msg_apply_bytes
bpf_msg_cork_bytes
bpf_msg_pop_data
bpf_msg_pull_data
bpf_msg_push_data
bpf_msg_redirect_hash
bpf_msg_redirect_map
bpf_per_cpu_ptr
bpf_perf_event_output
bpf_probe_read_kernel
bpf_probe_read_kernel_str
bpf_probe_read_user
bpf_probe_read_user_str
bpf_ringbuf_discard
bpf_ringbuf_discard_dynptr
bpf_ringbuf_output
bpf_ringbuf_query
bpf_ringbuf_reserve
bpf_ringbuf_reserve_dynptr
bpf_ringbuf_submit
bpf_ringbuf_submit_dynptr
bpf_sk_storage_delete
bpf_sk_storage_get
bpf_skc_to_tcp6_sock
bpf_skc_to_tcp_request_sock
bpf_skc_to_tcp_sock
bpf_skc_to_tcp_timewait_sock
bpf_skc_to_udp6_sock
bpf_skc_to_unix_sock
bpf_snprintf
bpf_snprintf_btf
bpf_spin_lock
bpf_spin_unlock
bpf_strncmp
bpf_tail_call
bpf_task_pt_regs
bpf_this_cpu_ptr
bpf_timer_cancel
bpf_timer_init
bpf_timer_set_callback
bpf_timer_start
bpf_trace_printk
bpf_trace_vprintk
bpf_user_ringbuf_drain
KFuncs
There are currently no kfuncs supported for this program type