Program type `BPF_PROG_TYPE_PERF_EVENT`

v4.9

Perf event programs that can be attached to hardware and software perf events. Once attached the BPF program is executed each time the perf event is triggered.

Usage

Perf event programs are typically used for profiling and tracing. These programs are called with the CPU register state at the time of the event. This allows the programs to collect information for each event and aggregate it in a customized way.

Perf event programs are typically placed in the perf_event ELF section.

Context

C Structure

/* Context handed to a perf event program each time it runs. */
struct bpf_perf_event_data {
    /* CPU registers at the time of the event; the type differs per architecture. */
    bpf_user_pt_regs_t regs;
    /* NOTE(review): presumably the sampling period of the event — confirm against the kernel UAPI header. */
    __u64 sample_period;
    /* NOTE(review): presumably an address recorded with the sample; 0 is treated as "not set" by the profiling example — confirm. */
    __u64 addr;
};

`regs`

This field contains the CPU registers at the time of the event. The type of the field is different for each architecture since each architecture has different registers. The helpers in tools/lib/bpf/bpf_tracing.h can be used to access the registers in a portable way.

`sample_period`

This field contains the number of times this perf event has been triggered.

`addr`

Docs could be improved

This part of the docs is incomplete, contributions are very welcome

Attachment

There are three methods of attaching perf event programs, listed from oldest and least recommended to newest and most recommended. However, all methods have this first part in common.

That common first step is to open a new perf event using the perf_event_open syscall:

struct perf_event_attr attr = {
    .sample_freq = SAMPLE_FREQ,
    .freq = 1,
    .type = PERF_TYPE_HARDWARE,
    .config = PERF_COUNT_HW_CPU_CYCLES,
};

syscall(SYS_perf_event_open, 
    &attr,  /* struct perf_event_attr * */
    -1,     /* pid_t pid */
    0       /* int cpu */
    -1,     /* int group_fd */
    PERF_FLAG_FD_CLOEXEC /* unsigned long flags */
);

This syscall will return a file descriptor on success. Perf event programs can be attached to any event, as long as it is of type PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE.

ioctl method

This is the oldest and least recommended method. After we have the perf event file descriptor we execute two ioctl syscalls to attach our BPF program to the perf event and to enable it.

ioctl(perf_event_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd); to attach.

ioctl(perf_event_fd, PERF_EVENT_IOC_ENABLE, 0); to enable.

The perf event program can be temporarily disabled with the PERF_EVENT_IOC_DISABLE ioctl option. Otherwise the perf event program stays attached until the perf_event goes away due to the closing of the perf_event FD or the program exiting. The perf event holds a reference to the BPF program so it will stay loaded until no more perf events reference it.

`perf_event_open` PMU

Docs could be improved

This part of the docs is incomplete, contributions are very welcome

BPF link

This is the newest and most recommended method of attaching perf event programs.

After we have gotten the perf event file descriptor we attach the program by making a bpf link via the link create syscall command.

We call the syscall command with the BPF_PERF_EVENT attach_type, target_fd set to the perf event file descriptor, prog_fd set to the file descriptor of the perf event program, and optionally a cookie.

Examples

profiling example

/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/perf_event.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Key type of the `counts` map: one task name plus a kernel and a user stack id. */
struct key_t {
    /* Filled by bpf_get_current_comm() in bpf_prog1. */
    char comm[TASK_COMM_LEN];
    /* Results of bpf_get_stackid() with the kernel and the user flag set, respectively. */
    u32 kernstack;
    u32 userstack;
};

/*
 * Hash map from struct key_t to a u64 counter, limited to 10000 entries.
 * bpf_prog1 increments the entry for each sample it keeps, creating it
 * with the value 1 when it does not exist yet.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, struct key_t);
    __type(value, u64);
    __uint(max_entries, 10000);
} counts SEC(".maps");

/*
 * Stack trace map handed to bpf_get_stackid() by bpf_prog1. Keys are u32
 * sized and each value has room for PERF_MAX_STACK_DEPTH u64 entries.
 */
struct {
    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
    __uint(key_size, sizeof(u32));
    __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
    __uint(max_entries, 10000);
} stackmap SEC(".maps");

/*
 * Flag sets passed to bpf_get_stackid() for the two stack lookups in
 * bpf_prog1; the user variant additionally sets BPF_F_USER_STACK.
 */
#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)

/*
 * Profiling program: for every sample past the warmup threshold, count how
 * often each (task name, kernel stack id, user stack id) combination occurs.
 *
 * Samples for which neither stack id could be obtained are only logged via
 * bpf_trace_printk() and not counted. Always returns 0.
 */
SEC("perf_event")
int bpf_prog1(struct bpf_perf_event_data *ctx)
{
    char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
    char time_fmt2[] = "Get Time Failed, ErrCode: %d";
    char addr_fmt[] = "Address recorded on event: %llx";
    char fmt[] = "CPU-%d period %lld ip %llx";
    u32 cpu_id = bpf_get_smp_processor_id();
    struct bpf_perf_event_value counters;
    struct key_t sample;
    u64 first_hit = 1;
    u64 *hits;
    int err;

    /* ignore warmup */
    if (ctx->sample_period < 10000) {
        return 0;
    }

    bpf_get_current_comm(&sample.comm, sizeof(sample.comm));
    sample.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
    sample.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);

    /* Both lookups failed: log where the sample hit and skip counting. */
    if ((int)sample.kernstack < 0 && (int)sample.userstack < 0) {
        bpf_trace_printk(fmt, sizeof(fmt), cpu_id, ctx->sample_period,
                         PT_REGS_IP(&ctx->regs));
        return 0;
    }

    /* Log the event's enabled/running values, or the error code on failure. */
    err = bpf_perf_prog_read_value(ctx, (void *)&counters, sizeof(counters));
    if (err) {
        bpf_trace_printk(time_fmt2, sizeof(time_fmt2), err);
    } else {
        bpf_trace_printk(time_fmt1, sizeof(time_fmt1), counters.enabled, counters.running);
    }

    if (ctx->addr != 0) {
        bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr);
    }

    /* Bump the existing counter, or create it on the first occurrence. */
    hits = bpf_map_lookup_elem(&counts, &sample);
    if (!hits) {
        bpf_map_update_elem(&counts, &sample, &first_hit, BPF_NOEXIST);
        return 0;
    }

    (*hits)++;
    return 0;
}

/* License string of this program, emitted through the SEC("license") annotation. */
char _license[] SEC("license") = "GPL";

recording instruction pointer

/* Copyright 2016 Netflix, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Maximum number of entries ip_map can hold. */
#define MAX_IPS     8192

/*
 * Hash map from a u64 instruction pointer to a u32 hit count; do_sample
 * updates it on every sample.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, u64);
    __type(value, u32);
    __uint(max_entries, MAX_IPS);
} ip_map SEC(".maps");

/*
 * Count how often each instruction pointer is sampled: read the IP from the
 * register state in the context and bump its entry in ip_map, inserting the
 * entry with a count of 1 the first time the IP is seen. Always returns 0.
 */
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx)
{
    u32 first_hit = 1;
    u64 sampled_ip = PT_REGS_IP(&ctx->regs);
    u32 *hits = bpf_map_lookup_elem(&ip_map, &sampled_ip);

    if (!hits) {
        /* E2BIG not tested for this example only */
        bpf_map_update_elem(&ip_map, &sampled_ip, &first_hit, BPF_NOEXIST);
        return 0;
    }

    *hits += 1;
    return 0;
}
/* License string of this program, emitted through the SEC("license") annotation. */
char _license[] SEC("license") = "GPL";

Helper functions

Supported helper functions

KFuncs

Supported kfuncs

__bpf_trap v6.12 -
bpf_arena_alloc_pages v6.12 -
bpf_arena_free_pages v6.12 -
bpf_arena_reserve_pages v6.12 -
bpf_cast_to_kern_ctx v6.12 -
bpf_cgroup_acquire v6.12 -
bpf_cgroup_ancestor v6.12 -
bpf_cgroup_from_id v6.12 -
bpf_cgroup_read_xattr v6.12 -
bpf_cgroup_release v6.12 -
bpf_copy_from_user_dynptr v6.12 -
bpf_copy_from_user_str v6.12 -
bpf_copy_from_user_str_dynptr v6.12 -
bpf_copy_from_user_task_dynptr v6.12 -
bpf_copy_from_user_task_str v6.12 -
bpf_copy_from_user_task_str_dynptr v6.12 -
bpf_cpumask_acquire v6.12 -
bpf_cpumask_and v6.12 -
bpf_cpumask_any_and_distribute v6.12 -
bpf_cpumask_any_distribute v6.12 -
bpf_cpumask_clear v6.12 -
bpf_cpumask_clear_cpu v6.12 -
bpf_cpumask_copy v6.12 -
bpf_cpumask_create v6.12 -
bpf_cpumask_empty v6.12 -
bpf_cpumask_equal v6.12 -
bpf_cpumask_first v6.12 -
bpf_cpumask_first_and v6.12 -
bpf_cpumask_first_zero v6.12 -
bpf_cpumask_full v6.12 -
bpf_cpumask_intersects v6.12 -
bpf_cpumask_or v6.12 -
bpf_cpumask_populate v6.12 -
bpf_cpumask_release v6.12 -
bpf_cpumask_set_cpu v6.12 -
bpf_cpumask_setall v6.12 -
bpf_cpumask_subset v6.12 -
bpf_cpumask_test_and_clear_cpu v6.12 -
bpf_cpumask_test_and_set_cpu v6.12 -
bpf_cpumask_test_cpu v6.12 -
bpf_cpumask_weight v6.12 -
bpf_cpumask_xor v6.12 -
bpf_dynptr_adjust v6.12 -
bpf_dynptr_clone v6.12 -
bpf_dynptr_copy v6.12 -
bpf_dynptr_from_skb v6.12 -
bpf_dynptr_is_null v6.12 -
bpf_dynptr_is_rdonly v6.12 -
bpf_dynptr_memset v6.12 -
bpf_dynptr_size v6.12 -
bpf_dynptr_slice v6.12 -
bpf_dynptr_slice_rdwr v6.12 -
bpf_get_dentry_xattr v6.12 -
bpf_get_file_xattr v6.12 -
bpf_get_fsverity_digest v6.12 -
bpf_get_kmem_cache v6.12 -
bpf_get_task_exe_file v6.12 -
bpf_iter_bits_destroy v6.12 -
bpf_iter_bits_new v6.12 -
bpf_iter_bits_next v6.12 -
bpf_iter_css_destroy v6.12 -
bpf_iter_css_new v6.12 -
bpf_iter_css_next v6.12 -
bpf_iter_css_task_destroy v6.12 -
bpf_iter_css_task_new v6.12 -
bpf_iter_css_task_next v6.12 -
bpf_iter_dmabuf_destroy v6.12 -
bpf_iter_dmabuf_new v6.12 -
bpf_iter_dmabuf_next v6.12 -
bpf_iter_kmem_cache_destroy v6.12 -
bpf_iter_kmem_cache_new v6.12 -
bpf_iter_kmem_cache_next v6.12 -
bpf_iter_num_destroy v6.12 -
bpf_iter_num_new v6.12 -
bpf_iter_num_next v6.12 -
bpf_iter_scx_dsq_destroy v6.12 -
bpf_iter_scx_dsq_new v6.12 -
bpf_iter_scx_dsq_next v6.12 -
bpf_iter_task_destroy v6.12 -
bpf_iter_task_new v6.12 -
bpf_iter_task_next v6.12 -
bpf_iter_task_vma_destroy v6.12 -
bpf_iter_task_vma_new v6.12 -
bpf_iter_task_vma_next v6.12 -
bpf_key_put v6.12 -
bpf_list_back v6.12 -
bpf_list_front v6.12 -
bpf_list_pop_back v6.12 -
bpf_list_pop_front v6.12 -
bpf_list_push_back_impl v6.12 -
bpf_list_push_front_impl v6.12 -
bpf_local_irq_restore v6.12 -
bpf_local_irq_save v6.12 -
bpf_lookup_system_key v6.12 -
bpf_lookup_user_key v6.12 -
bpf_map_sum_elem_count v6.12 -
bpf_obj_drop_impl v6.12 -
bpf_obj_new_impl v6.12 -
bpf_path_d_path v6.12 -
bpf_percpu_obj_drop_impl v6.12 -
bpf_percpu_obj_new_impl v6.12 -
bpf_preempt_disable v6.12 -
bpf_preempt_enable v6.12 -
bpf_probe_read_kernel_dynptr v6.12 -
bpf_probe_read_kernel_str_dynptr v6.12 -
bpf_probe_read_user_dynptr v6.12 -
bpf_probe_read_user_str_dynptr v6.12 -
bpf_put_file v6.12 -
bpf_rbtree_add_impl v6.12 -
bpf_rbtree_first v6.12 -
bpf_rbtree_left v6.12 -
bpf_rbtree_remove v6.12 -
bpf_rbtree_right v6.12 -
bpf_rbtree_root v6.12 -
bpf_rcu_read_lock v6.12 -
bpf_rcu_read_unlock v6.12 -
bpf_rdonly_cast v6.12 -
bpf_refcount_acquire_impl v6.12 -
bpf_remove_dentry_xattr v6.12 -
bpf_res_spin_lock v6.12 -
bpf_res_spin_lock_irqsave v6.12 -
bpf_res_spin_unlock v6.12 -
bpf_res_spin_unlock_irqrestore v6.12 -
bpf_send_signal_task v6.12 -
bpf_set_dentry_xattr v6.12 -
bpf_sock_destroy v6.12 -
bpf_strchr v6.12 -
bpf_strchrnul v6.12 -
bpf_strcmp v6.12 -
bpf_strcspn v6.12 -
bpf_stream_vprintk v6.12 -
bpf_strlen v6.12 -
bpf_strnchr v6.12 -
bpf_strnlen v6.12 -
bpf_strnstr v6.12 -
bpf_strrchr v6.12 -
bpf_strspn v6.12 -
bpf_strstr v6.12 -
bpf_task_acquire v6.12 -
bpf_task_from_pid v6.12 -
bpf_task_from_vpid v6.12 -
bpf_task_get_cgroup1 v6.12 -
bpf_task_release v6.12 -
bpf_task_under_cgroup v6.12 -
bpf_throw v6.12 -
bpf_verify_pkcs7_signature v6.12 -
bpf_wq_init v6.12 -
bpf_wq_set_callback_impl v6.12 -
bpf_wq_start v6.12 -
crash_kexec v6.12 -
css_rstat_flush v6.12 -
css_rstat_updated v6.12 -
scx_bpf_cpu_node v6.12 -
scx_bpf_cpu_rq v6.12 -
scx_bpf_cpuperf_cap v6.12 -
scx_bpf_cpuperf_cur v6.12 -
scx_bpf_cpuperf_set v6.12 -
scx_bpf_destroy_dsq v6.12 -
scx_bpf_dsq_nr_queued v6.12 -
scx_bpf_dump_bstr v6.12 -
scx_bpf_error_bstr v6.12 -
scx_bpf_events v6.12 -
scx_bpf_exit_bstr v6.12 -
scx_bpf_get_idle_cpumask v6.12 -
scx_bpf_get_idle_cpumask_node v6.12 -
scx_bpf_get_idle_smtmask v6.12 -
scx_bpf_get_idle_smtmask_node v6.12 -
scx_bpf_get_online_cpumask v6.12 -
scx_bpf_get_possible_cpumask v6.12 -
scx_bpf_kick_cpu v6.12 -
scx_bpf_now v6.12 -
scx_bpf_nr_cpu_ids v6.12 -
scx_bpf_nr_node_ids v6.12 -
scx_bpf_pick_any_cpu v6.12 -
scx_bpf_pick_any_cpu_node v6.12 -
scx_bpf_pick_idle_cpu v6.12 -
scx_bpf_pick_idle_cpu_node v6.12 -
scx_bpf_put_cpumask v6.12 -
scx_bpf_put_idle_cpumask v6.12 -
scx_bpf_select_cpu_and v6.12 -
scx_bpf_task_cgroup v6.12 -
scx_bpf_task_cpu v6.12 -
scx_bpf_task_running v6.12 -
scx_bpf_test_and_clear_cpu_idle v6.12 -

Program type BPF_PROG_TYPE_PERF_EVENT

Usage

Context

regs

sample_period

addr

Attachment

ioctl method

perf_event_open PMU

BPF link

Examples

Helper functions

KFuncs

Program type `BPF_PROG_TYPE_PERF_EVENT`

`regs`

`sample_period`

`addr`

`perf_event_open` PMU