Program type BPF_PROG_TYPE_RAW_TRACEPOINT
Raw tracepoint programs are similar to tracepoint programs, but the kernel does no pre-processing on the arguments and passes the raw arguments directly to the tracepoint program.
Usage
Raw tracepoint programs are typically put into an ELF section prefixed with raw_tp/
or in a raw_tracepoint
section. When loading as a BPF_PROG_TYPE_TRACING
program, the raw tracepoint is typically located in a section prefixed with tp_btf/
.
Raw tracepoints are attached to the same tracepoints as normal tracepoint programs. The reason why you might want to use raw tracepoints over normal tracepoints is due to the performance improvement. For normal tracepoints, the kernel will cast or transform arguments even if the arguments are never used. By taking the raw arguments, the BPF program can do the casting or transformation only if the arguments are used, thereby making a more efficient tracepoint program.
Context
The context for raw tracepoint programs is a pointer to a struct bpf_raw_tracepoint_args
:
/* Context passed to a raw tracepoint program. */
struct bpf_raw_tracepoint_args {
/* Raw tracepoint arguments; how many there are depends on the tracepoint. */
__u64 args[0];
};
The args
array contains the raw arguments to the tracepoint. The number of arguments is determined by the tracepoint. The verifier will enforce that the number of arguments matches the number of arguments expected by the tracepoint. The BPF program can cast the u64 values to the expected types or use the bpf_probe_read
/bpf_probe_read_kernel
helper function to read the arguments.
Attachment
Raw tracepoints can be attached in two ways: the first is with a dedicated syscall, and the second is with the more generic BPF link syscall.
Syscall
The dedicated syscall BPF_RAW_TRACEPOINT_OPEN
can be used to attach the raw tracepoint. This requires the name
field to be set to a string containing the name of the tracepoint to which the user wishes to attach. The prog_fd
attribute field should be set to the file descriptor of the BPF program to attach.
Docs could be improved
This part of the docs is incomplete; contributions are very welcome.
BPF link
A BPF link can also be used to attach a raw tracepoint program. To do so, the program must be loaded with the BPF_PROG_TYPE_TRACING
program type instead of the BPF_PROG_TYPE_RAW_TRACEPOINT
program type. The expected_attach_type
should be BPF_TRACE_RAW_TP
and the attach_btf_id
attribute set to the BTF ID of the tracepoint the program should be attached to.
After that, a link should be created via the link create syscall command, with the attach type set to BPF_TRACE_RAW_TP
.
Example
raw tracepoint
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
/*
 * Perf event array that on_switch() reads with bpf_perf_event_read_value(),
 * using the current CPU id as the index.
 * NOTE(review): no max_entries is declared here — presumably the loader
 * sizes the map before load; confirm.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(int));
__uint(map_flags, BPF_F_PRESERVE_ELEMS);
} events SEC(".maps");
/*
 * Single-entry per-CPU array holding the most recent perf event reading
 * stored by on_switch(); used as the baseline for the next delta.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
__uint(max_entries, 1);
} prev_readings SEC(".maps");
/*
 * Single-entry per-CPU array into which on_switch() writes the difference
 * (counter, enabled, running) between the new reading and the previous one.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
__uint(max_entries, 1);
} diff_readings SEC(".maps");
/*
 * Raw tracepoint handler for sched_switch.
 *
 * Reads the perf event value for the current CPU from the events map,
 * stores the difference against the previously saved reading in
 * diff_readings, then saves the new reading in prev_readings.
 * Bails out early (still returning 0) if either map lookup or the perf
 * event read fails. Always returns 0.
 */
SEC("raw_tp/sched_switch")
int BPF_PROG(on_switch)
{
	__u32 cpu = bpf_get_smp_processor_id();
	struct bpf_perf_event_value *last, *delta;
	struct bpf_perf_event_value now;
	__u32 slot = 0;

	last = bpf_map_lookup_elem(&prev_readings, &slot);
	if (!last)
		return 0;

	delta = bpf_map_lookup_elem(&diff_readings, &slot);
	if (!delta)
		return 0;

	/* A non-zero result means the read failed; keep the old baseline. */
	if (bpf_perf_event_read_value(&events, cpu, &now, sizeof(now)))
		return 0;

	delta->counter = now.counter - last->counter;
	delta->enabled = now.enabled - last->enabled;
	delta->running = now.running - last->running;

	/* The fresh reading becomes the baseline for the next switch. */
	*last = now;
	return 0;
}
/* License string placed in the "license" ELF section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";
tracing program
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "runqslower.h"
/* Task state value that handle__sched_switch() compares prev->__state against. */
#define TASK_RUNNING 0
/* Flags value passed to bpf_perf_event_output() in handle__sched_switch(). */
#define BPF_F_CURRENT_CPU 0xffffffffULL
/* When non-zero, events whose delta_us is <= min_us are not reported. */
const volatile __u64 min_us = 0;
/* When non-zero, only the task with this pid is traced. */
const volatile pid_t targ_pid = 0;
/*
 * Task-local storage holding one u64 per task: the timestamp written by
 * trace_enqueue() and consumed (then deleted) by handle__sched_switch().
 */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, u64);
} start SEC(".maps");
/*
 * Perf event array through which handle__sched_switch() emits
 * struct runq_event records via bpf_perf_event_output().
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
} events SEC(".maps");
/*
 * Record the enqueue timestamp for task @t.
 *
 * Tasks with pid 0 are skipped, as are tasks that do not match targ_pid
 * when a target pid is configured. Otherwise the task's slot in the
 * start map is fetched (created on demand) and set to the current
 * bpf_ktime_get_ns() value. Always returns 0.
 */
__always_inline
static int trace_enqueue(struct task_struct *t)
{
	u32 tid = t->pid;
	u64 *stamp;

	if (!tid)
		return 0;
	if (targ_pid && targ_pid != tid)
		return 0;

	stamp = bpf_task_storage_get(&start, t, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (stamp)
		*stamp = bpf_ktime_get_ns();

	return 0;
}
/*
 * sched_wakeup handler: record the enqueue timestamp of the woken task.
 * Returns whatever trace_enqueue() returns.
 */
SEC("tp_btf/sched_wakeup")
int handle__sched_wakeup(u64 *ctx)
{
	/* TP_PROTO(struct task_struct *p): the task is the first raw argument. */
	return trace_enqueue((struct task_struct *)ctx[0]);
}
/*
 * sched_wakeup_new handler: record the enqueue timestamp of the newly
 * woken task. Returns whatever trace_enqueue() returns.
 */
SEC("tp_btf/sched_wakeup_new")
int handle__sched_wakeup_new(u64 *ctx)
{
	/* TP_PROTO(struct task_struct *p): the task is the first raw argument. */
	return trace_enqueue((struct task_struct *)ctx[0]);
}
/*
 * sched_switch handler: report how long the incoming task waited on the
 * run queue.
 *
 * If the outgoing task is still TASK_RUNNING it is treated as re-enqueued
 * and gets a fresh timestamp. For the incoming task, the stored enqueue
 * timestamp is turned into a delay in microseconds; if the delay passes
 * the min_us filter, a struct runq_event is emitted through the events
 * perf array and the task's stored timestamp is deleted. Always returns 0.
 */
SEC("tp_btf/sched_switch")
int handle__sched_switch(u64 *ctx)
{
	/* TP_PROTO(bool preempt, struct task_struct *prev,
	 *	    struct task_struct *next)
	 */
	struct task_struct *prev = (struct task_struct *)ctx[1];
	struct task_struct *next = (struct task_struct *)ctx[2];
	struct runq_event event = {};
	u64 *tsp, delta_us;
	u32 pid;

	/* ivcsw: treat like an enqueue event and store timestamp */
	if (prev->__state == TASK_RUNNING)
		trace_enqueue(prev);

	pid = next->pid;

	/* For pid mismatch, save a bpf_task_storage_get */
	if (!pid || (targ_pid && targ_pid != pid))
		return 0;

	/* fetch timestamp and calculate delta */
	tsp = bpf_task_storage_get(&start, next, 0, 0);
	if (!tsp)
		return 0;   /* missed enqueue */

	/* nanoseconds -> microseconds */
	delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
	if (min_us && delta_us <= min_us)
		return 0;

	event.pid = pid;
	event.delta_us = delta_us;
	/*
	 * NOTE(review): bpf_get_current_comm() reports the comm of whichever
	 * task is current when the tracepoint fires, which may not be `next`
	 * — confirm the reported name matches event.pid.
	 */
	bpf_get_current_comm(&event.task, sizeof(event.task));

	/* output */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &event, sizeof(event));

	/* The timestamp has been consumed; drop it until the next enqueue. */
	bpf_task_storage_delete(&start, next);
	return 0;
}
/* License string placed in the "license" ELF section of the BPF object. */
char LICENSE[] SEC("license") = "GPL";
Helper functions
Not all helper functions are available in all program types. These are the helper calls available for raw tracepoint programs:
Supported helper functions
bpf_cgrp_storage_delete
bpf_cgrp_storage_get
bpf_copy_from_user
bpf_copy_from_user_task
bpf_current_task_under_cgroup
bpf_dynptr_data
bpf_dynptr_from_mem
bpf_dynptr_read
bpf_dynptr_write
bpf_find_vma
bpf_for_each_map_elem
bpf_get_attach_cookie
bpf_get_branch_snapshot
bpf_get_current_ancestor_cgroup_id
bpf_get_current_cgroup_id
bpf_get_current_comm
bpf_get_current_pid_tgid
bpf_get_current_task
bpf_get_current_task_btf
bpf_get_current_uid_gid
bpf_get_func_ip
bpf_get_ns_current_pid_tgid
bpf_get_numa_node_id
bpf_get_prandom_u32
bpf_get_smp_processor_id
bpf_get_stack
bpf_get_stackid
bpf_get_task_stack
bpf_jiffies64
bpf_kptr_xchg
bpf_ktime_get_boot_ns
bpf_ktime_get_ns
bpf_ktime_get_tai_ns
bpf_loop
bpf_map_delete_elem
bpf_map_lookup_elem
bpf_map_lookup_percpu_elem
bpf_map_peek_elem
bpf_map_pop_elem
bpf_map_push_elem
bpf_map_update_elem
bpf_per_cpu_ptr
bpf_perf_event_output
bpf_perf_event_read
bpf_perf_event_read_value
bpf_probe_read
bpf_probe_read_kernel
bpf_probe_read_kernel_str
bpf_probe_read_str
bpf_probe_read_user
bpf_probe_read_user_str
bpf_probe_write_user
bpf_ringbuf_discard
bpf_ringbuf_discard_dynptr
bpf_ringbuf_output
bpf_ringbuf_query
bpf_ringbuf_reserve
bpf_ringbuf_reserve_dynptr
bpf_ringbuf_submit
bpf_ringbuf_submit_dynptr
bpf_send_signal
bpf_send_signal_thread
bpf_snprintf
bpf_snprintf_btf
bpf_spin_lock
bpf_spin_unlock
bpf_strncmp
bpf_tail_call
bpf_task_pt_regs
bpf_task_storage_delete
bpf_task_storage_get
bpf_this_cpu_ptr
bpf_timer_cancel
bpf_timer_init
bpf_timer_set_callback
bpf_timer_start
bpf_trace_printk
bpf_trace_vprintk
bpf_user_ringbuf_drain
KFuncs
There are currently no kfuncs supported for this program type