| /* SPDX-License-Identifier: GPL-2.0 */ |
| /* |
| * A demo sched_ext core-scheduler which always makes every sibling CPU pair |
| * execute from the same CPU cgroup. |
| * |
| * This scheduler is a minimal implementation and would need some form of |
| * priority handling both inside each cgroup and across the cgroups to be |
| * practically useful. |
| * |
| * Each CPU in the system is paired with exactly one other CPU, according to a |
| * "stride" value that can be specified when the BPF scheduler program is first |
| * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee |
| * that they will only ever schedule tasks that belong to the same CPU cgroup. |
| * |
| * Scheduler Initialization |
| * ------------------------ |
| * |
| * The scheduler BPF program is first initialized from user space, before it is |
| * enabled. During this initialization process, each CPU on the system is |
| * assigned several values that are constant throughout its runtime: |
| * |
| * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling |
| * decisions. Paired CPUs always schedule tasks from the same |
| * CPU cgroup, and synchronize with each other to guarantee |
| * that this constraint is not violated. |
| * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access |
| * a struct pair_ctx object that is shared between the pair. |
| * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the |
| * pair. Each struct pair_ctx has an active_mask field, |
| * which is a bitmap used to indicate whether each core |
| * in the pair currently has an actively running task. |
| * This index specifies which entry in the bitmap corresponds |
| * to each CPU in the pair. |
| * |
| * During this initialization, the CPUs are paired according to a "stride" that |
| * may be specified when invoking the user space program that initializes and |
| * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. |
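 *
 * For example, on an 8-CPU system with the default stride of 4, and assuming
 * the loader pairs CPU i with CPU (i + stride) % nr_cpus, the pairs would be
 * (0, 4), (1, 5), (2, 6) and (3, 7). Each pair shares one struct pair_ctx,
 * with the two CPUs of a pair taking in-pair-indices 0 and 1.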
| * |
| * Tasks and cgroups |
| * ----------------- |
| * |
| * Every cgroup in the system is registered with the scheduler using the |
| * pair_cgroup_init() callback, and every task in the system is associated with |
| * exactly one cgroup. At a high level, the idea with the pair scheduler is to |
| * always schedule tasks from the same cgroup within a given CPU pair. When a |
| * task is enqueued (i.e. passed to the pair_enqueue() callback function), its |
| * cgroup ID is read from its task struct, and then a corresponding queue map |
| * is used to FIFO-enqueue the task for that cgroup. |
| * |
| * If you look through the implementation of the scheduler, you'll notice that |
| * there is quite a bit of complexity involved with looking up the per-cgroup |
| * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash |
| * BPF hash map that is used to map a cgroup ID to a globally unique ID that's |
 * allocated in the BPF program. This is done because, per cgroup, we use a
 * separate map to store the FIFO queue of tasks and a separate counter to
 * track that queue's length. This
| * complexity is only present because of current deficiencies in BPF that will |
| * soon be addressed. The main point to keep in mind is that newly enqueued |
| * tasks are added to their cgroup's FIFO queue. |
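 *
 * In rough pseudo-code, ignoring error handling, the enqueue path below does:
 *
 *	cgid  = ID of @p's cgroup
 *	q_idx = cgrp_q_idx_hash[cgid]		// cgroup ID -> per-cgroup queue index
 *	push p->pid onto cgrp_q_arr[q_idx]	// FIFO of runnable tasks in the cgroup
 *	if (cgrp_q_len[q_idx]++ == 0)		// queue went 0 -> 1
 *		push cgid onto top_q		// make the cgroup eligible for dispatch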
| * |
| * Dispatching tasks |
| * ----------------- |
| * |
| * This section will describe how enqueued tasks are dispatched and scheduled. |
| * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is |
| * as follows: |
| * |
 * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is
 * the structure that's used to synchronize between the two CPUs of a pair when
 * making scheduling decisions. If any of the following events has occurred:
 *
 * - The cgroup's slice run has expired, or
 * - The cgroup has become empty, or
 * - Either CPU in the pair has been preempted by a higher priority scheduling
 *   class
 *
 * then the pair transitions to the draining state and stops executing new
 * tasks from the cgroup.
| * |
| * 2. If the pair is still executing a task, mark the pair_ctx as draining, and |
| * wait for the pair CPU to be preempted. |
| * |
| * 3. Otherwise, if the pair CPU is not running a task, we can move onto |
| * scheduling new tasks. Pop the next cgroup id from the top_q queue. |
| * |
| * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. |
| * |
 * Note again that while this scheduling behavior is simple, the implementation
 * is complex, mostly because it hits several BPF shortcomings and has to work
 * around them in often awkward ways. Most of the shortcomings are expected to
 * be resolved in the near future, which should allow greatly simplifying this
 * scheduler.
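 *
 * In rough pseudo-code, each invocation of try_dispatch() below does:
 *
 *	pairc = pair_ctx[pair_id[cpu]]		// shared with the pair CPU
 *	clear this CPU's bit in pairc->active_mask
 *	if (cgroup slice expired || pairc->draining)
 *		mark pairc as draining; once the pair CPU is neither active nor
 *		preempted, pop the next cgroup ID from top_q and start on it
 *	pop a pid from the current cgroup's FIFO queue
 *	insert the task into SCX_DSQ_GLOBAL so it gets picked up for execution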
| * |
| * Dealing with preemption |
| * ----------------------- |
| * |
 * SCX is the lowest priority sched_class, and a CPU running an SCX task can be
 * preempted by a higher priority sched_class at any time. To address this, the
 * scheduler implements the pair_cpu_release() and pair_cpu_acquire() callbacks,
 * which are invoked by the core scheduler when the BPF scheduler loses and
 * regains control of the CPU, respectively.
| * |
| * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and |
| * then invoke: |
| * |
| * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); |
| * |
| * This preempts the pair CPU, and waits until it has re-entered the scheduler |
| * before returning. This is necessary to ensure that the higher priority |
| * sched_class that preempted our scheduler does not schedule a task |
| * concurrently with our pair CPU. |
| * |
| * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption |
| * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable |
| * pair scheduling. |
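 *
 * In rough pseudo-code:
 *
 *	pair_cpu_release(cpu):		// a higher priority class took the CPU
 *		set this CPU's bit in pairc->preempted_mask, clear its active bit
 *		mark pairc as draining
 *		kick the pair CPU with SCX_KICK_PREEMPT | SCX_KICK_WAIT if active
 *
 *	pair_cpu_acquire(cpu):		// the CPU is back under SCX control
 *		clear this CPU's bit in pairc->preempted_mask
 *		kick the pair CPU with SCX_KICK_PREEMPT unless it's also preempted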
| * |
| * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. |
| * Copyright (c) 2022 Tejun Heo <tj@kernel.org> |
| * Copyright (c) 2022 David Vernet <dvernet@meta.com> |
| */ |
| #include <scx/common.bpf.h> |
| #include "scx_pair.h" |
| |
| char _license[] SEC("license") = "GPL"; |
| |
| /* !0 for veristat, set during init */ |
| const volatile u32 nr_cpu_ids = 1; |
| |
| /* a pair of CPUs stay on a cgroup for this duration */ |
| const volatile u32 pair_batch_dur_ns; |
| |
| /* cpu ID -> pair cpu ID */ |
| const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); |
| |
| /* cpu ID -> pair_id */ |
| const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); |
| |
| /* CPU ID -> CPU # in the pair (0 or 1) */ |
| const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); |
| |
| struct pair_ctx { |
| struct bpf_spin_lock lock; |
| |
| /* the cgroup the pair is currently executing */ |
| u64 cgid; |
| |
| /* the pair started executing the current cgroup at */ |
| u64 started_at; |
| |
| /* whether the current cgroup is draining */ |
| bool draining; |
| |
| /* the CPUs that are currently active on the cgroup */ |
| u32 active_mask; |
| |
| /* |
| * the CPUs that are currently preempted and running tasks in a |
| * different scheduler. |
| */ |
| u32 preempted_mask; |
| }; |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_ARRAY); |
| __type(key, u32); |
| __type(value, struct pair_ctx); |
| } pair_ctx SEC(".maps"); |
| |
| /* queue of cgrp_q's possibly with tasks on them */ |
| struct { |
| __uint(type, BPF_MAP_TYPE_QUEUE); |
| /* |
| * Because it's difficult to build strong synchronization encompassing |
| * multiple non-trivial operations in BPF, this queue is managed in an |
	 * opportunistic way so that we guarantee that a cgroup w/ active tasks
	 * is always on it, but possibly multiple times. Once we have more robust
	 * synchronization constructs and e.g. linked lists, we should be able to
	 * do this in a prettier way, but for now just size it big enough.
| */ |
| __uint(max_entries, 4 * MAX_CGRPS); |
| __type(value, u64); |
| } top_q SEC(".maps"); |
| |
| /* per-cgroup q which FIFOs the tasks from the cgroup */ |
| struct cgrp_q { |
| __uint(type, BPF_MAP_TYPE_QUEUE); |
| __uint(max_entries, MAX_QUEUED); |
| __type(value, u32); |
| }; |
| |
| /* |
 * Ideally, we want to allocate cgrp_q and cgrp_q_len in cgroup local storage;
 * however, cgroup local storage can only be accessed from the BPF
| * progs attached to the cgroup. For now, work around by allocating array of |
| * cgrp_q's and then allocating per-cgroup indices. |
| * |
| * Another caveat: It's difficult to populate a large array of maps statically |
| * or from BPF. Initialize it from userland. |
| */ |
| struct { |
| __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); |
| __uint(max_entries, MAX_CGRPS); |
| __type(key, s32); |
| __array(values, struct cgrp_q); |
| } cgrp_q_arr SEC(".maps"); |
| |
| static u64 cgrp_q_len[MAX_CGRPS]; |
| |
| /* |
| * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be |
| * useful to have as a map type. |
| */ |
| static u32 cgrp_q_idx_cursor; |
| static u64 cgrp_q_idx_busy[MAX_CGRPS]; |
| |
| /* |
| * All added up, the following is what we do: |
| * |
 * 1. When a cgroup is enabled, round-robin through the cgrp_q_idx_busy array
 * doing cmpxchg looking for a free ID. If none is found, fail cgroup creation
 * with -EBUSY.
| * |
| * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following |
| * cgrp_q_idx_hash. |
| * |
| * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from |
| * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. |
| * |
| * This is sadly complicated for something pretty simple. Hopefully, we should |
| * be able to simplify in the future. |
| */ |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(max_entries, MAX_CGRPS); |
| __uint(key_size, sizeof(u64)); /* cgrp ID */ |
| __uint(value_size, sizeof(s32)); /* cgrp_q idx */ |
| } cgrp_q_idx_hash SEC(".maps"); |
| |
| /* statistics */ |
| u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; |
| u64 nr_exps, nr_exp_waits, nr_exp_empty; |
| u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; |
| |
| UEI_DEFINE(uei); |
| |
| void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) |
| { |
| struct cgroup *cgrp; |
| struct cgrp_q *cgq; |
| s32 pid = p->pid; |
| u64 cgid; |
| u32 *q_idx; |
| u64 *cgq_len; |
| |
| __sync_fetch_and_add(&nr_total, 1); |
| |
| cgrp = scx_bpf_task_cgroup(p); |
| cgid = cgrp->kn->id; |
| bpf_cgroup_release(cgrp); |
| |
| /* find the cgroup's q and push @p into it */ |
| q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); |
| if (!q_idx) { |
| scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); |
| return; |
| } |
| |
| cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); |
| if (!cgq) { |
| scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", |
| cgid, *q_idx); |
| return; |
| } |
| |
| if (bpf_map_push_elem(cgq, &pid, 0)) { |
| scx_bpf_error("cgroup[%llu] queue overflow", cgid); |
| return; |
| } |
| |
| /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ |
| cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); |
| if (!cgq_len) { |
		scx_bpf_error("MEMBER_VPTR malfunction");
| return; |
| } |
| |
| if (!__sync_fetch_and_add(cgq_len, 1) && |
| bpf_map_push_elem(&top_q, &cgid, 0)) { |
| scx_bpf_error("top_q overflow"); |
| return; |
| } |
| } |
| |
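/*
 * Map @cpu to the struct pair_ctx it shares with its pair CPU and to the bit
 * (*@mask) representing @cpu in the pair_ctx's active/preempted bitmasks.
 */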
| static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) |
| { |
| u32 *vptr; |
| |
| vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); |
| if (!vptr) |
| return -EINVAL; |
| |
| *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); |
| if (!(*pairc)) |
| return -EINVAL; |
| |
| vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); |
| if (!vptr) |
| return -EINVAL; |
| |
| *mask = 1U << *vptr; |
| |
| return 0; |
| } |
| |
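/*
 * Try to dispatch a task on @cpu. Returns 0 when a task was dispatched or
 * there's nothing to do for now, -EAGAIN when the caller should retry (the
 * picked cgroup turned out to be empty or a popped task has disappeared), and
 * other -errno values on lookup failures.
 */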
| __attribute__((noinline)) |
| static int try_dispatch(s32 cpu) |
| { |
| struct pair_ctx *pairc; |
| struct bpf_map *cgq_map; |
| struct task_struct *p; |
| u64 now = scx_bpf_now(); |
| bool kick_pair = false; |
| bool expired, pair_preempted; |
| u32 *vptr, in_pair_mask; |
| s32 pid, q_idx; |
| u64 cgid; |
| int ret; |
| |
| ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); |
| if (ret) { |
| scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", |
| cpu); |
| return -ENOENT; |
| } |
| |
| bpf_spin_lock(&pairc->lock); |
| pairc->active_mask &= ~in_pair_mask; |
| |
| expired = time_before(pairc->started_at + pair_batch_dur_ns, now); |
| if (expired || pairc->draining) { |
| u64 new_cgid = 0; |
| |
| __sync_fetch_and_add(&nr_exps, 1); |
| |
| /* |
| * We're done with the current cgid. An obvious optimization |
| * would be not draining if the next cgroup is the current one. |
| * For now, be dumb and always expire. |
| */ |
| pairc->draining = true; |
| |
| pair_preempted = pairc->preempted_mask; |
| if (pairc->active_mask || pair_preempted) { |
| /* |
| * The other CPU is still active, or is no longer under |
| * our control due to e.g. being preempted by a higher |
| * priority sched_class. We want to wait until this |
| * cgroup expires, or until control of our pair CPU has |
| * been returned to us. |
| * |
| * If the pair controls its CPU, and the time already |
| * expired, kick. When the other CPU arrives at |
| * dispatch and clears its active mask, it'll push the |
| * pair to the next cgroup and kick this CPU. |
| */ |
| __sync_fetch_and_add(&nr_exp_waits, 1); |
| bpf_spin_unlock(&pairc->lock); |
| if (expired && !pair_preempted) |
| kick_pair = true; |
| goto out_maybe_kick; |
| } |
| |
| bpf_spin_unlock(&pairc->lock); |
| |
| /* |
| * Pick the next cgroup. It'd be easier / cleaner to not drop |
| * pairc->lock and use stronger synchronization here especially |
| * given that we'll be switching cgroups significantly less |
| * frequently than tasks. Unfortunately, bpf_spin_lock can't |
| * really protect anything non-trivial. Let's do opportunistic |
| * operations instead. |
| */ |
| bpf_repeat(BPF_MAX_LOOPS) { |
| u32 *q_idx; |
| u64 *cgq_len; |
| |
| if (bpf_map_pop_elem(&top_q, &new_cgid)) { |
| /* no active cgroup, go idle */ |
| __sync_fetch_and_add(&nr_exp_empty, 1); |
| return 0; |
| } |
| |
| q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); |
| if (!q_idx) |
| continue; |
| |
| /* |
| * This is the only place where empty cgroups are taken |
| * off the top_q. |
| */ |
| cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); |
| if (!cgq_len || !*cgq_len) |
| continue; |
| |
| /* |
| * If it has any tasks, requeue as we may race and not |
| * execute it. |
| */ |
| bpf_map_push_elem(&top_q, &new_cgid, 0); |
| break; |
| } |
| |
| bpf_spin_lock(&pairc->lock); |
| |
| /* |
| * The other CPU may already have started on a new cgroup while |
| * we dropped the lock. Make sure that we're still draining and |
| * start on the new cgroup. |
| */ |
| if (pairc->draining && !pairc->active_mask) { |
| __sync_fetch_and_add(&nr_cgrp_next, 1); |
| pairc->cgid = new_cgid; |
| pairc->started_at = now; |
| pairc->draining = false; |
| kick_pair = true; |
| } else { |
| __sync_fetch_and_add(&nr_cgrp_coll, 1); |
| } |
| } |
| |
| cgid = pairc->cgid; |
| pairc->active_mask |= in_pair_mask; |
| bpf_spin_unlock(&pairc->lock); |
| |
| /* again, it'd be better to do all these with the lock held, oh well */ |
| vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); |
| if (!vptr) { |
| scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); |
| return -ENOENT; |
| } |
| q_idx = *vptr; |
| |
| /* claim one task from cgrp_q w/ q_idx */ |
| bpf_repeat(BPF_MAX_LOOPS) { |
| u64 *cgq_len, len; |
| |
| cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); |
| if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { |
| /* the cgroup must be empty, expire and repeat */ |
| __sync_fetch_and_add(&nr_cgrp_empty, 1); |
| bpf_spin_lock(&pairc->lock); |
| pairc->draining = true; |
| pairc->active_mask &= ~in_pair_mask; |
| bpf_spin_unlock(&pairc->lock); |
| return -EAGAIN; |
| } |
| |
| if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) |
| continue; |
| |
| break; |
| } |
| |
| cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); |
| if (!cgq_map) { |
| scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", |
| cgid, q_idx); |
| return -ENOENT; |
| } |
| |
| if (bpf_map_pop_elem(cgq_map, &pid)) { |
| scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", |
| cgid, q_idx); |
| return -ENOENT; |
| } |
| |
| p = bpf_task_from_pid(pid); |
| if (p) { |
| __sync_fetch_and_add(&nr_dispatched, 1); |
| scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); |
| bpf_task_release(p); |
| } else { |
| /* we don't handle dequeues, retry on lost tasks */ |
| __sync_fetch_and_add(&nr_missing, 1); |
| return -EAGAIN; |
| } |
| |
| out_maybe_kick: |
| if (kick_pair) { |
| s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); |
| if (pair) { |
| __sync_fetch_and_add(&nr_kicks, 1); |
| scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); |
| } |
| } |
| return 0; |
| } |
| |
| void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) |
| { |
| bpf_repeat(BPF_MAX_LOOPS) { |
| if (try_dispatch(cpu) != -EAGAIN) |
| break; |
| } |
| } |
| |
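/*
 * A previously preempted CPU is back under sched_ext control. Clear its bit in
 * the pair's preempted_mask and kick the pair CPU so that pair scheduling can
 * resume.
 */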
| void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) |
| { |
| int ret; |
| u32 in_pair_mask; |
| struct pair_ctx *pairc; |
| bool kick_pair; |
| |
| ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); |
| if (ret) |
| return; |
| |
| bpf_spin_lock(&pairc->lock); |
| pairc->preempted_mask &= ~in_pair_mask; |
| /* Kick the pair CPU, unless it was also preempted. */ |
| kick_pair = !pairc->preempted_mask; |
| bpf_spin_unlock(&pairc->lock); |
| |
| if (kick_pair) { |
| s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); |
| |
| if (pair) { |
| __sync_fetch_and_add(&nr_kicks, 1); |
| scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); |
| } |
| } |
| } |
| |
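/*
 * A higher priority sched_class is taking over @cpu. Mark it as preempted in
 * the pair_ctx, start draining the current cgroup, and preempt the pair CPU so
 * that it doesn't continue running the cgroup's tasks unpaired.
 */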
| void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) |
| { |
| int ret; |
| u32 in_pair_mask; |
| struct pair_ctx *pairc; |
| bool kick_pair; |
| |
| ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); |
| if (ret) |
| return; |
| |
| bpf_spin_lock(&pairc->lock); |
| pairc->preempted_mask |= in_pair_mask; |
| pairc->active_mask &= ~in_pair_mask; |
| /* Kick the pair CPU if it's still running. */ |
| kick_pair = pairc->active_mask; |
| pairc->draining = true; |
| bpf_spin_unlock(&pairc->lock); |
| |
| if (kick_pair) { |
| s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); |
| |
| if (pair) { |
| __sync_fetch_and_add(&nr_kicks, 1); |
| scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); |
| } |
| } |
| __sync_fetch_and_add(&nr_preemptions, 1); |
| } |
| |
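/*
 * Allocate a per-cgroup queue index for @cgrp by round-robining through
 * cgrp_q_idx_busy with cmpxchg, then record the cgroup ID -> q_idx mapping in
 * cgrp_q_idx_hash. Fails with -EBUSY if all MAX_CGRPS indices are in use.
 */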
| s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) |
| { |
| u64 cgid = cgrp->kn->id; |
| s32 i, q_idx; |
| |
| bpf_for(i, 0, MAX_CGRPS) { |
| q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; |
| if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) |
| break; |
| } |
| if (i == MAX_CGRPS) |
| return -EBUSY; |
| |
| if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { |
| u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); |
| if (busy) |
| *busy = 0; |
| return -EBUSY; |
| } |
| |
| return 0; |
| } |
| |
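/*
 * @cgrp is going away. Release its queue index and drop its entry from
 * cgrp_q_idx_hash so that the index can be reused by a future cgroup.
 */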
| void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) |
| { |
| u64 cgid = cgrp->kn->id; |
| s32 *q_idx; |
| |
| q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); |
| if (q_idx) { |
| u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); |
| if (busy) |
| *busy = 0; |
| bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); |
| } |
| } |
| |
| void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) |
| { |
| UEI_RECORD(uei, ei); |
| } |
| |
| SCX_OPS_DEFINE(pair_ops, |
| .enqueue = (void *)pair_enqueue, |
| .dispatch = (void *)pair_dispatch, |
| .cpu_acquire = (void *)pair_cpu_acquire, |
| .cpu_release = (void *)pair_cpu_release, |
| .cgroup_init = (void *)pair_cgroup_init, |
| .cgroup_exit = (void *)pair_cgroup_exit, |
| .exit = (void *)pair_exit, |
| .name = "pair"); |