Struct ops `sched_ext_ops`

The `sched_ext_ops` (scheduler extension) struct ops can be used to implement a custom scheduler in BPF.

Usage

The Linux kernel provides built-in scheduler implementations like CFS and EEVDF. These schedulers are designed to provide a good balance between fairness and performance for most workloads. However, there are use cases where a custom scheduler is needed to meet specific requirements. The BPF scheduler extension provides a way to implement a custom scheduler in BPF.

Fields and ops

A BPF scheduler can implement an arbitrary scheduling policy by implementing and loading operations in this table. Note that a userland scheduling policy can also be implemented using the BPF scheduler as a shim layer.

Note

The following definition has been modified from the one found in the kernel for the sake of readability. These changes do not affect how the structure is used when implementing a BPF program.

/*
 * Operation table for a BPF scheduler: a handful of scalar settings followed
 * by function pointers that the scheduler implements and loads to provide its
 * scheduling policy.
 *
 * NOTE(review): the accompanying text states this definition was modified
 * from the kernel's for readability, so it may not match the kernel header
 * member-for-member -- confirm against the kernel source before relying on
 * layout.
 */
struct sched_ext_ops {
    /* Fixed-size character buffer of SCX_OPS_NAME_LEN bytes; presumably the scheduler's name. */
    char name[SCX_OPS_NAME_LEN];
    /* NOTE(review): by name, an upper bound on tasks handled per dispatch call -- confirm. */
    u32  dispatch_max_batch;
    /* 64-bit flag word; presumably a mask of SCX_OPS_* values -- TODO confirm. */
    u64  flags;
    /* NOTE(review): name suggests a timeout expressed in milliseconds -- confirm what it bounds. */
    u32  timeout_ms;
    /* NOTE(review): name suggests the length of the dump produced on exit -- confirm units. */
    u32  exit_dump_len;
    /* NOTE(review): name suggests a CPU hotplug sequence number -- confirm semantics. */
    u64  hotplug_seq;

    /*
     * Callbacks that take a task (struct task_struct *) and/or a CPU number
     * (s32). select_cpu returns an s32; yield and core_sched_before return
     * bool; the rest return nothing.
     */
    s32  (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
    void (*enqueue)(struct task_struct *p, u64 enq_flags);
    void (*dequeue)(struct task_struct *p, u64 deq_flags);
    void (*dispatch)(s32 cpu, struct task_struct *prev);
    void (*tick)(struct task_struct *p);
    void (*runnable)(struct task_struct *p, u64 enq_flags);
    void (*running)(struct task_struct *p);
    void (*stopping)(struct task_struct *p, bool runnable);
    void (*quiescent)(struct task_struct *p, u64 deq_flags);
    bool (*yield)(struct task_struct *from, struct task_struct *to);
    bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
    void (*set_weight)(struct task_struct *p, u32 weight);
    void (*set_cpumask)(struct task_struct *p, const struct cpumask *cpumask);
    void (*update_idle)(s32 cpu, bool idle);
    void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
    void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);

    /*
     * Per-task init/exit pair, each receiving its own args structure.
     * init_task returns an s32 (presumably an error code -- TODO confirm);
     * exit_task returns nothing.
     */
    s32  (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
    void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);

    /* Per-task enable/disable pair; neither returns a value. */
    void (*enable)(struct task_struct *p);
    void (*disable)(struct task_struct *p);

    /*
     * Dump callbacks. All receive a struct scx_dump_ctx *; dump_cpu adds a
     * CPU number and an idle flag, dump_task adds a task.
     */
    void (*dump)(struct scx_dump_ctx *ctx);
    void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
    void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);

    /* The cgroup callbacks below exist only when CONFIG_EXT_GROUP_SCHED is defined. */
#ifdef CONFIG_EXT_GROUP_SCHED
    s32  (*cgroup_init)(struct cgroup *cgrp, struct scx_cgroup_init_args *args);
    void (*cgroup_exit)(struct cgroup *cgrp);
    /*
     * The three *_move callbacks share the same (task, from, to) parameters;
     * only cgroup_prep_move returns a value (s32).
     */
    s32  (*cgroup_prep_move)(struct task_struct *p, struct cgroup *from, struct cgroup *to);
    void (*cgroup_move)(struct task_struct *p, struct cgroup *from, struct cgroup *to);
    void (*cgroup_cancel_move)(struct task_struct *p, struct cgroup *from, struct cgroup *to);
    void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
#endif /* CONFIG_EXT_GROUP_SCHED */

    /* CPU online/offline notifications, each given the CPU number. */
    void (*cpu_online)(s32 cpu);
    void (*cpu_offline)(s32 cpu);

    /*
     * Scheduler-wide init/exit. init takes no arguments and returns an s32
     * (presumably an error code -- TODO confirm); exit receives a
     * struct scx_exit_info *.
     */
    s32  (*init)(void);
    void (*exit)(struct scx_exit_info *info);
};

Struct ops sched_ext_ops

Usage

Fields and ops

name

dispatch_max_batch

flags

timeout_ms

exit_dump_len

hotplug_seq

select_cpu

enqueue

dequeue

dispatch

tick

runnable

running

stopping

quiescent

yield

core_sched_before

set_weight

set_cpumask

update_idle

cpu_acquire

cpu_release

init_task

exit_task

enable

disable

dump

dump_cpu

dump_task

cgroup_init

cgroup_exit

cgroup_prep_move

cgroup_move

cgroup_cancel_move

cgroup_set_weight

cpu_online

cpu_offline

init

exit

Types

enum scx_ops_flags

SCX_OPS_KEEP_BUILTIN_IDLE

SCX_OPS_ENQ_LAST

SCX_OPS_ENQ_EXITING

SCX_OPS_SWITCH_PARTIAL

SCX_OPS_ENQ_MIGRATION_DISABLED

SCX_OPS_ALLOW_QUEUED_WAKEUP

SCX_OPS_BUILTIN_IDLE_PER_NODE

SCX_OPS_HAS_CGROUP_WEIGHT

enum scx_enq_flags

SCX_ENQ_WAKEUP

SCX_ENQ_HEAD

SCX_ENQ_CPU_SELECTED

SCX_ENQ_PREEMPT

SCX_ENQ_REENQ

SCX_ENQ_LAST

SCX_ENQ_CLEAR_OPSS

SCX_ENQ_DSQ_PRIQ

enum scx_deq_flags

SCX_DEQ_SLEEP

SCX_DEQ_CORE_SCHED_EXEC

enum scx_dsq_id_flags

SCX_DSQ_FLAG_BUILTIN

SCX_DSQ_FLAG_LOCAL_ON

SCX_DSQ_GLOBAL

SCX_DSQ_LOCAL

SCX_DSQ_LOCAL_ON

enum scx_ent_flags

SCX_TASK_QUEUED

SCX_TASK_RESET_RUNNABLE_AT

SCX_TASK_DEQD_FOR_SLEEP

SCX_TASK_CURSOR

enum scx_task_state

SCX_TASK_NONE

SCX_TASK_INIT

SCX_TASK_READY

SCX_TASK_ENABLED

Struct ops `sched_ext_ops`

`name`

`dispatch_max_batch`

`flags`

`timeout_ms`

`exit_dump_len`

`hotplug_seq`

`select_cpu`

`enqueue`

`dequeue`

`dispatch`

`tick`

`runnable`

`running`

`stopping`

`quiescent`

`yield`

`core_sched_before`

`set_weight`

`set_cpumask`

`update_idle`

`cpu_acquire`

`cpu_release`

`init_task`

`exit_task`

`enable`

`disable`

`dump`

`dump_cpu`

`dump_task`

`cgroup_init`

`cgroup_exit`

`cgroup_prep_move`

`cgroup_move`

`cgroup_cancel_move`

`cgroup_set_weight`

`cpu_online`

`cpu_offline`

`init`

`exit`

`enum scx_ops_flags`

`SCX_OPS_KEEP_BUILTIN_IDLE`

`SCX_OPS_ENQ_LAST`

`SCX_OPS_ENQ_EXITING`

`SCX_OPS_SWITCH_PARTIAL`

`SCX_OPS_ENQ_MIGRATION_DISABLED`

`SCX_OPS_ALLOW_QUEUED_WAKEUP`

`SCX_OPS_BUILTIN_IDLE_PER_NODE`

`SCX_OPS_HAS_CGROUP_WEIGHT`

`enum scx_enq_flags`

`SCX_ENQ_WAKEUP`

`SCX_ENQ_HEAD`

`SCX_ENQ_CPU_SELECTED`

`SCX_ENQ_PREEMPT`

`SCX_ENQ_REENQ`

`SCX_ENQ_LAST`

`SCX_ENQ_CLEAR_OPSS`

`SCX_ENQ_DSQ_PRIQ`

`enum scx_deq_flags`

`SCX_DEQ_SLEEP`

`SCX_DEQ_CORE_SCHED_EXEC`

`enum scx_dsq_id_flags`

`SCX_DSQ_FLAG_BUILTIN`

`SCX_DSQ_FLAG_LOCAL_ON`

`SCX_DSQ_GLOBAL`

`SCX_DSQ_LOCAL`

`SCX_DSQ_LOCAL_ON`

`enum scx_ent_flags`

`SCX_TASK_QUEUED`

`SCX_TASK_RESET_RUNNABLE_AT`

`SCX_TASK_DEQD_FOR_SLEEP`

`SCX_TASK_CURSOR`

`enum scx_task_state`

`SCX_TASK_NONE`

`SCX_TASK_INIT`

`SCX_TASK_READY`

`SCX_TASK_ENABLED`

`enum scx_kf_mask`

`SCX_KF_UNLOCKED`

`SCX_KF_CPU_RELEASE`