| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * A scheduler that validates ops.dequeue() is called correctly: |
| * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF |
| * scheduler entirely: no ops.dequeue() should be called |
| * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody: |
| * ops.dequeue() must be called when they leave custody |
| * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by |
| * exactly one ops.dequeue() (validate 1:1 pairing and state machine) |
| * |
| * Copyright (c) 2026 NVIDIA Corporation. |
| */ |
| |
| #include <scx/common.bpf.h> |
| |
| #define SHARED_DSQ 0 |
| |
| /* |
| * BPF internal queue. |
| * |
| * Tasks are stored here and consumed from ops.dispatch(), validating that |
| * tasks on BPF internal structures still get ops.dequeue() when they |
| * leave. |
| */ |
| struct { |
| __uint(type, BPF_MAP_TYPE_QUEUE); |
| __uint(max_entries, 32768); |
| __type(value, s32); |
| } global_queue SEC(".maps"); |
| |
| char _license[] SEC("license") = "GPL"; |
| |
| UEI_DEFINE(uei); |
| |
| /* |
| * Counters to track the lifecycle of tasks: |
| * - enqueue_cnt: Number of times ops.enqueue() was called |
| * - dequeue_cnt: Number of times ops.dequeue() was called (any type) |
| * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag) |
| * - change_dequeue_cnt: Number of property change dequeues |
| * - bpf_queue_full: Number of times the BPF internal queue was full |
| */ |
| u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full; |
| |
| /* |
| * Test scenarios: |
| * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF |
| * scheduler, no dequeue callbacks) |
| * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF |
| * scheduler, no dequeue callbacks) |
| * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler, |
| * dequeue callbacks expected) |
| * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF |
| * scheduler, no dequeue callbacks) |
| * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF |
| * scheduler, no dequeue callbacks) |
| * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler, |
| * dequeue callbacks expected) |
| * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(), |
| * consume in ops.dispatch() and dispatch to local DSQ (validates dequeue |
| * for tasks stored in internal BPF data structures) |
| */ |
| u32 test_scenario; |
| |
| /* |
| * Per-task state to track lifecycle and validate workflow semantics. |
| * State transitions: |
| * NONE -> ENQUEUED (on enqueue) |
| * NONE -> DISPATCHED (on direct dispatch to terminal DSQ) |
| * ENQUEUED -> DISPATCHED (on dispatch dequeue) |
| * DISPATCHED -> NONE (on property change dequeue or re-enqueue) |
| * ENQUEUED -> NONE (on property change dequeue before dispatch) |
| */ |
| enum task_state { |
| TASK_NONE = 0, |
| TASK_ENQUEUED, |
| TASK_DISPATCHED, |
| }; |
| |
| struct task_ctx { |
| enum task_state state; /* Current state in the workflow */ |
| u64 enqueue_seq; /* Sequence number for debugging */ |
| }; |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_TASK_STORAGE); |
| __uint(map_flags, BPF_F_NO_PREALLOC); |
| __type(key, int); |
| __type(value, struct task_ctx); |
| } task_ctx_stor SEC(".maps"); |
| |
| static struct task_ctx *try_lookup_task_ctx(struct task_struct *p) |
| { |
| return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); |
| } |
| |
| s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p, |
| s32 prev_cpu, u64 wake_flags) |
| { |
| struct task_ctx *tctx; |
| |
| tctx = try_lookup_task_ctx(p); |
| if (!tctx) |
| return prev_cpu; |
| |
| switch (test_scenario) { |
| case 0: |
| /* |
| * Direct dispatch to the local DSQ. |
| * |
| * Task bypasses BPF scheduler entirely: no enqueue |
| * tracking, no ops.dequeue() callbacks. |
| */ |
| scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); |
| tctx->state = TASK_DISPATCHED; |
| break; |
| case 1: |
| /* |
| * Direct dispatch to the global DSQ. |
| * |
| * Task bypasses BPF scheduler entirely: no enqueue |
| * tracking, no ops.dequeue() callbacks. |
| */ |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); |
| tctx->state = TASK_DISPATCHED; |
| break; |
| case 2: |
| /* |
| * Dispatch to a shared user DSQ. |
| * |
| * Task enters BPF scheduler management: track |
| * enqueue/dequeue lifecycle and validate state |
| * transitions. |
| */ |
| if (tctx->state == TASK_ENQUEUED) |
| scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", |
| p->pid, p->comm, tctx->enqueue_seq); |
| |
| scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0); |
| |
| __sync_fetch_and_add(&enqueue_cnt, 1); |
| |
| tctx->state = TASK_ENQUEUED; |
| tctx->enqueue_seq++; |
| break; |
| } |
| |
| return prev_cpu; |
| } |
| |
| void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags) |
| { |
| struct task_ctx *tctx; |
| s32 pid = p->pid; |
| |
| tctx = try_lookup_task_ctx(p); |
| if (!tctx) |
| return; |
| |
| switch (test_scenario) { |
| case 3: |
| /* |
| * Direct dispatch to the local DSQ. |
| * |
| * Task bypasses BPF scheduler entirely: no enqueue |
| * tracking, no ops.dequeue() callbacks. |
| */ |
| scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); |
| tctx->state = TASK_DISPATCHED; |
| break; |
| case 4: |
| /* |
| * Direct dispatch to the global DSQ. |
| * |
| * Task bypasses BPF scheduler entirely: no enqueue |
| * tracking, no ops.dequeue() callbacks. |
| */ |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); |
| tctx->state = TASK_DISPATCHED; |
| break; |
| case 5: |
| /* |
| * Dispatch to shared user DSQ. |
| * |
| * Task enters BPF scheduler management: track |
| * enqueue/dequeue lifecycle and validate state |
| * transitions. |
| */ |
| if (tctx->state == TASK_ENQUEUED) |
| scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", |
| p->pid, p->comm, tctx->enqueue_seq); |
| |
| scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); |
| |
| __sync_fetch_and_add(&enqueue_cnt, 1); |
| |
| tctx->state = TASK_ENQUEUED; |
| tctx->enqueue_seq++; |
| break; |
| case 6: |
| /* |
| * Store task in BPF internal queue. |
| * |
| * Task enters BPF scheduler management: track |
| * enqueue/dequeue lifecycle and validate state |
| * transitions. |
| */ |
| if (tctx->state == TASK_ENQUEUED) |
| scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", |
| p->pid, p->comm, tctx->enqueue_seq); |
| |
| if (bpf_map_push_elem(&global_queue, &pid, 0)) { |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); |
| __sync_fetch_and_add(&bpf_queue_full, 1); |
| |
| tctx->state = TASK_DISPATCHED; |
| } else { |
| __sync_fetch_and_add(&enqueue_cnt, 1); |
| |
| tctx->state = TASK_ENQUEUED; |
| tctx->enqueue_seq++; |
| } |
| break; |
| default: |
| /* For all other scenarios, dispatch to the global DSQ */ |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); |
| tctx->state = TASK_DISPATCHED; |
| break; |
| } |
| |
| scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE); |
| } |
| |
| void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags) |
| { |
| struct task_ctx *tctx; |
| |
| __sync_fetch_and_add(&dequeue_cnt, 1); |
| |
| tctx = try_lookup_task_ctx(p); |
| if (!tctx) |
| return; |
| |
| /* |
| * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global), |
| * ops.dequeue() should never be called because tasks bypass the |
| * BPF scheduler entirely. If we get here, it's a kernel bug. |
| */ |
| if (test_scenario == 0 || test_scenario == 3) { |
| scx_bpf_error("%d (%s): dequeue called for local DSQ scenario", |
| p->pid, p->comm); |
| return; |
| } |
| |
| if (test_scenario == 1 || test_scenario == 4) { |
| scx_bpf_error("%d (%s): dequeue called for global DSQ scenario", |
| p->pid, p->comm); |
| return; |
| } |
| |
| if (deq_flags & SCX_DEQ_SCHED_CHANGE) { |
| /* |
| * Property change interrupting the workflow. Valid from |
| * both ENQUEUED and DISPATCHED states. Transitions task |
| * back to NONE state. |
| */ |
| __sync_fetch_and_add(&change_dequeue_cnt, 1); |
| |
| /* Validate state transition */ |
| if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED) |
| scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu", |
| p->pid, p->comm, tctx->state, tctx->enqueue_seq); |
| |
| /* |
| * Transition back to NONE: task outside scheduler control. |
| * |
| * Scenario 6: dispatch() checks tctx->state after popping a |
| * PID, if the task is in state NONE, it was dequeued by |
| * property change and must not be dispatched (this |
| * prevents "target CPU not allowed"). |
| */ |
| tctx->state = TASK_NONE; |
| } else { |
| /* |
| * Regular dispatch dequeue: kernel is moving the task from |
| * BPF custody to a terminal DSQ. Normally we come from |
| * ENQUEUED state. We can also see TASK_NONE if the task |
| * was dequeued by property change (SCX_DEQ_SCHED_CHANGE) |
| * while it was already on a DSQ (dispatched but not yet |
| * consumed); in that case we just leave state as NONE. |
| */ |
| __sync_fetch_and_add(&dispatch_dequeue_cnt, 1); |
| |
| /* |
| * Must be ENQUEUED (normal path) or NONE (already dequeued |
| * by property change while on a DSQ). |
| */ |
| if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE) |
| scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu", |
| p->pid, p->comm, tctx->state, tctx->enqueue_seq); |
| |
| if (tctx->state == TASK_ENQUEUED) |
| tctx->state = TASK_DISPATCHED; |
| |
| /* NONE: leave as-is, task was already property-change dequeued */ |
| } |
| } |
| |
| void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev) |
| { |
| if (test_scenario == 6) { |
| struct task_ctx *tctx; |
| struct task_struct *p; |
| s32 pid; |
| |
| if (bpf_map_pop_elem(&global_queue, &pid)) |
| return; |
| |
| p = bpf_task_from_pid(pid); |
| if (!p) |
| return; |
| |
| /* |
| * If the task was dequeued by property change |
| * (ops.dequeue() set tctx->state = TASK_NONE), skip |
| * dispatch. |
| */ |
| tctx = try_lookup_task_ctx(p); |
| if (!tctx || tctx->state == TASK_NONE) { |
| bpf_task_release(p); |
| return; |
| } |
| |
| /* |
| * Dispatch to this CPU's local DSQ if allowed, otherwise |
| * fallback to the global DSQ. |
| */ |
| if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) |
| scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); |
| else |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); |
| |
| bpf_task_release(p); |
| } else { |
| scx_bpf_dsq_move_to_local(SHARED_DSQ, 0); |
| } |
| } |
| |
| s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p, |
| struct scx_init_task_args *args) |
| { |
| struct task_ctx *tctx; |
| |
| tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, |
| BPF_LOCAL_STORAGE_GET_F_CREATE); |
| if (!tctx) |
| return -ENOMEM; |
| |
| return 0; |
| } |
| |
| s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init) |
| { |
| s32 ret; |
| |
| ret = scx_bpf_create_dsq(SHARED_DSQ, -1); |
| if (ret) |
| return ret; |
| |
| return 0; |
| } |
| |
| void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei) |
| { |
| UEI_RECORD(uei, ei); |
| } |
| |
| SEC(".struct_ops.link") |
| struct sched_ext_ops dequeue_ops = { |
| .select_cpu = (void *)dequeue_select_cpu, |
| .enqueue = (void *)dequeue_enqueue, |
| .dequeue = (void *)dequeue_dequeue, |
| .dispatch = (void *)dequeue_dispatch, |
| .init_task = (void *)dequeue_init_task, |
| .init = (void *)dequeue_init, |
| .exit = (void *)dequeue_exit, |
| .flags = SCX_OPS_ENQ_LAST, |
| .name = "dequeue_test", |
| }; |