| // SPDX-License-Identifier: GPL-2.0 |
| /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ |
| #ifndef BPF_ARENA_SPIN_LOCK_H |
| #define BPF_ARENA_SPIN_LOCK_H |
| |
| #include <vmlinux.h> |
| #include <bpf/bpf_helpers.h> |
| #include "bpf_atomic.h" |
| |
| #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) |
| #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) |
| |
| #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) |
| |
| #define EBUSY 16 |
| #define EOPNOTSUPP 95 |
| #define ETIMEDOUT 110 |
| |
| #ifndef __arena |
| #define __arena __attribute__((address_space(1))) |
| #endif |
| |
| extern unsigned long CONFIG_NR_CPUS __kconfig; |
| |
| /* |
| * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but |
| * PowerPC overrides the definition to define lock->val as u32 instead of |
| * atomic_t, leading to compilation errors. Import a local definition below so |
| * that we don't depend on the vmlinux.h version. |
| */ |
| |
| struct __qspinlock { |
| union { |
| atomic_t val; |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
| struct { |
| u8 locked; |
| u8 pending; |
| }; |
| struct { |
| u16 locked_pending; |
| u16 tail; |
| }; |
| #else |
| struct { |
| u16 tail; |
| u16 locked_pending; |
| }; |
| struct { |
| u8 reserved[2]; |
| u8 pending; |
| u8 locked; |
| }; |
| #endif |
| }; |
| }; |
| |
| #define arena_spinlock_t struct __qspinlock |
| /* FIXME: Using typedef causes CO-RE relocation error */ |
| /* typedef struct qspinlock arena_spinlock_t; */ |
| |
| struct arena_mcs_spinlock { |
| struct arena_mcs_spinlock __arena *next; |
| int locked; |
| int count; |
| }; |
| |
| struct arena_qnode { |
| struct arena_mcs_spinlock mcs; |
| }; |
| |
| #define _Q_MAX_NODES 4 |
| #define _Q_PENDING_LOOPS 1 |
| |
| /* |
| * Bitfields in the atomic value: |
| * |
| * 0- 7: locked byte |
| * 8: pending |
| * 9-15: not used |
| * 16-17: tail index |
| * 18-31: tail cpu (+1) |
| */ |
| #define _Q_MAX_CPUS 1024 |
| |
| #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ |
| << _Q_ ## type ## _OFFSET) |
| #define _Q_LOCKED_OFFSET 0 |
| #define _Q_LOCKED_BITS 8 |
| #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) |
| |
| #define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) |
| #define _Q_PENDING_BITS 8 |
| #define _Q_PENDING_MASK _Q_SET_MASK(PENDING) |
| |
| #define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) |
| #define _Q_TAIL_IDX_BITS 2 |
| #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) |
| |
| #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) |
| #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) |
| #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) |
| |
| #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET |
| #define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) |
| |
| #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) |
| #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) |
| |
| struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; |
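
/*
 * Note: the qnodes array above statically reserves _Q_MAX_CPUS * _Q_MAX_NODES
 * MCS nodes in the arena; at 16 bytes per node (assuming 8-byte arena
 * pointers), that is on the order of 64 KiB of arena memory.
 */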
| |
| static inline u32 encode_tail(int cpu, int idx) |
| { |
| u32 tail; |
| |
| tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; |
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume idx < _Q_MAX_NODES */
| |
| return tail; |
| } |
| |
| static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) |
| { |
| u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; |
| u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; |
| |
| return &qnodes[cpu][idx].mcs; |
| } |
| |
| static inline |
| struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) |
| { |
| return &((struct arena_qnode __arena *)base + idx)->mcs; |
| } |
| |
| #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) |
| |
| /** |
| * xchg_tail - Put in the new queue tail code word & retrieve previous one |
| * @lock : Pointer to queued spinlock structure |
| * @tail : The new queue tail code word |
| * Return: The previous queue tail code word |
| * |
| * xchg(lock, tail) |
| * |
| * p,*,* -> n,*,* ; prev = xchg(lock, node) |
| */ |
| static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) |
| { |
| u32 old, new; |
| |
| old = atomic_read(&lock->val); |
| do { |
| new = (old & _Q_LOCKED_PENDING_MASK) | tail; |
| /* |
| * We can use relaxed semantics since the caller ensures that |
| * the MCS node is properly initialized before updating the |
| * tail. |
| */ |
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
| cond_break_label(out); |
| } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); |
| |
| return old; |
| out: |
| bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); |
| return old; |
| } |
| |
| /** |
| * clear_pending - clear the pending bit. |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,1,* -> *,0,* |
| */ |
| static __always_inline void clear_pending(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->pending, 0); |
| } |
| |
| /** |
| * clear_pending_set_locked - take ownership and clear the pending bit. |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,1,0 -> *,0,1 |
| * |
| * Lock stealing is not allowed if this function is used. |
| */ |
| static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); |
| } |
| |
| /** |
| * set_locked - Set the lock bit and own the lock |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,*,0 -> *,0,1 |
| */ |
| static __always_inline void set_locked(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); |
| } |
| |
| static __always_inline |
| u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) |
| { |
| u32 old, new; |
| |
| old = atomic_read(&lock->val); |
| do { |
| new = old | _Q_PENDING_VAL; |
| /* |
| * These loops are not expected to stall, but we still need to |
| * prove to the verifier they will terminate eventually. |
| */ |
| cond_break_label(out); |
| } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); |
| |
| return old; |
| out: |
| bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); |
| return old; |
| } |
| |
| /** |
| * arena_spin_trylock - try to acquire the queued spinlock |
| * @lock : Pointer to queued spinlock structure |
| * Return: 1 if lock acquired, 0 if failed |
| */ |
| static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) |
| { |
| int val = atomic_read(&lock->val); |
| |
| if (unlikely(val)) |
| return 0; |
| |
| return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); |
| } |
| |
| __noinline |
| int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) |
| { |
| struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; |
| int ret = -ETIMEDOUT; |
| u32 old, tail; |
| int idx; |
| |
| /* |
| * Wait for in-progress pending->locked hand-overs with a bounded |
| * number of spins so that we guarantee forward progress. |
| * |
| * 0,1,0 -> 0,0,1 |
| */ |
| if (val == _Q_PENDING_VAL) { |
| int cnt = _Q_PENDING_LOOPS; |
| val = atomic_cond_read_relaxed_label(&lock->val, |
| (VAL != _Q_PENDING_VAL) || !cnt--, |
| release_err); |
| } |
| |
| /* |
| * If we observe any contention; queue. |
| */ |
| if (val & ~_Q_LOCKED_MASK) |
| goto queue; |
| |
| /* |
| * trylock || pending |
| * |
| * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock |
| */ |
| val = arena_fetch_set_pending_acquire(lock); |
| |
| /* |
| * If we observe contention, there is a concurrent locker. |
| * |
| * Undo and queue; our setting of PENDING might have made the |
| * n,0,0 -> 0,0,0 transition fail and it will now be waiting |
| * on @next to become !NULL. |
| */ |
| if (unlikely(val & ~_Q_LOCKED_MASK)) { |
| |
| /* Undo PENDING if we set it. */ |
| if (!(val & _Q_PENDING_MASK)) |
| clear_pending(lock); |
| |
| goto queue; |
| } |
| |
| /* |
| * We're pending, wait for the owner to go away. |
| * |
| * 0,1,1 -> *,1,0 |
| * |
| * this wait loop must be a load-acquire such that we match the |
| * store-release that clears the locked bit and create lock |
| * sequentiality; this is because not all |
| * clear_pending_set_locked() implementations imply full |
| * barriers. |
| */ |
| if (val & _Q_LOCKED_MASK) |
| smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); |
| |
| /* |
| * take ownership and clear the pending bit. |
| * |
| * 0,1,0 -> 0,0,1 |
| */ |
| clear_pending_set_locked(lock); |
| return 0; |
| |
| /* |
| * End of pending bit optimistic spinning and beginning of MCS |
| * queuing. |
| */ |
| queue: |
| node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; |
| idx = node0->count++; |
| tail = encode_tail(bpf_get_smp_processor_id(), idx); |
| |
| /* |
| * 4 nodes are allocated based on the assumption that there will not be |
| * nested NMIs taking spinlocks. That may not be true in some |
| * architectures even though the chance of needing more than 4 nodes |
| * will still be extremely unlikely. When that happens, we simply return |
| * an error. Original qspinlock has a trylock fallback in this case. |
| */ |
| if (unlikely(idx >= _Q_MAX_NODES)) { |
| ret = -EBUSY; |
| goto release_node_err; |
| } |
| |
| node = grab_mcs_node(node0, idx); |
| |
| /* |
| * Ensure that we increment the head node->count before initialising |
| * the actual node. If the compiler is kind enough to reorder these |
| * stores, then an IRQ could overwrite our assignments. |
| */ |
| barrier(); |
| |
| node->locked = 0; |
| node->next = NULL; |
| |
| /* |
| * We touched a (possibly) cold cacheline in the per-cpu queue node; |
| * attempt the trylock once more in the hope someone let go while we |
| * weren't watching. |
| */ |
| if (arena_spin_trylock(lock)) |
| goto release; |
| |
| /* |
| * Ensure that the initialisation of @node is complete before we |
| * publish the updated tail via xchg_tail() and potentially link |
| * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. |
| */ |
| smp_wmb(); |
| |
| /* |
| * Publish the updated tail. |
| * We have already touched the queueing cacheline; don't bother with |
| * pending stuff. |
| * |
| * p,*,* -> n,*,* |
| */ |
| old = xchg_tail(lock, tail); |
| next = NULL; |
| |
| /* |
| * if there was a previous node; link it and wait until reaching the |
| * head of the waitqueue. |
| */ |
| if (old & _Q_TAIL_MASK) { |
| prev = decode_tail(old); |
| |
| /* Link @node into the waitqueue. */ |
| WRITE_ONCE(prev->next, node); |
| |
| arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); |
| |
| /* |
| * While waiting for the MCS lock, the next pointer may have |
| * been set by another lock waiter. We cannot prefetch here |
| * due to lack of equivalent instruction in BPF ISA. |
| */ |
| next = READ_ONCE(node->next); |
| } |
| |
| /* |
| * we're at the head of the waitqueue, wait for the owner & pending to |
| * go away. |
| * |
| * *,x,y -> *,0,0 |
| * |
| * this wait loop must use a load-acquire such that we match the |
| * store-release that clears the locked bit and create lock |
| * sequentiality; this is because the set_locked() function below |
| * does not imply a full barrier. |
| */ |
| val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), |
| release_node_err); |
| |
| /* |
| * claim the lock: |
| * |
| * n,0,0 -> 0,0,1 : lock, uncontended |
| * *,*,0 -> *,*,1 : lock, contended |
| * |
| * If the queue head is the only one in the queue (lock value == tail) |
| * and nobody is pending, clear the tail code and grab the lock. |
| * Otherwise, we only need to grab the lock. |
| */ |
| |
| /* |
| * In the PV case we might already have _Q_LOCKED_VAL set, because |
| * of lock stealing; therefore we must also allow: |
| * |
| * n,0,1 -> 0,0,1 |
| * |
| * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the |
| * above wait condition, therefore any concurrent setting of |
| * PENDING will make the uncontended transition fail. |
| */ |
| if ((val & _Q_TAIL_MASK) == tail) { |
| if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) |
| goto release; /* No contention */ |
| } |
| |
| /* |
| * Either somebody is queued behind us or _Q_PENDING_VAL got set |
| * which will then detect the remaining tail and queue behind us |
| * ensuring we'll see a @next. |
| */ |
| set_locked(lock); |
| |
| /* |
| * contended path; wait for next if not observed yet, release. |
| */ |
| if (!next) |
| next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); |
| |
| arch_mcs_spin_unlock_contended(&next->locked); |
| |
| release:; |
| /* |
| * release the node |
| * |
| * Doing a normal dec vs this_cpu_dec is fine. An upper context always |
| * decrements count it incremented before returning, thus we're fine. |
| * For contexts interrupting us, they either observe our dec or not. |
| * Just ensure the compiler doesn't reorder this statement, as a |
| * this_cpu_dec implicitly implied that. |
| */ |
| barrier(); |
| node0->count--; |
| return 0; |
| release_node_err: |
| barrier(); |
| node0->count--; |
| goto release_err; |
| release_err: |
| return ret; |
| } |
| |
| /** |
| * arena_spin_lock - acquire a queued spinlock |
| * @lock: Pointer to queued spinlock structure |
| * |
| * On error, returned value will be negative. |
| * On success, zero is returned. |
| * |
 * The return value _must_ be tested against zero for success,
 * instead of being compared against a negative value, so that the
 * program passes the BPF verifier.
| * |
| * The user should do: |
| * if (arena_spin_lock(...) != 0) // failure |
| * or |
| * if (arena_spin_lock(...) == 0) // success |
| * or |
| * if (arena_spin_lock(...)) // failure |
| * or |
| * if (!arena_spin_lock(...)) // success |
| * instead of: |
| * if (arena_spin_lock(...) < 0) // failure |
| * |
| * The return value can still be inspected later. |
| */ |
| static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) |
| { |
| int val = 0; |
| |
	if (CONFIG_NR_CPUS > _Q_MAX_CPUS)
| return -EOPNOTSUPP; |
| |
| bpf_preempt_disable(); |
| if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) |
| return 0; |
| |
| val = arena_spin_lock_slowpath(lock, val); |
| /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ |
| if (val) |
| bpf_preempt_enable(); |
| return val; |
| } |
| |
| /** |
| * arena_spin_unlock - release a queued spinlock |
| * @lock : Pointer to queued spinlock structure |
| */ |
| static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) |
| { |
| /* |
| * unlock() needs release semantics: |
| */ |
| smp_store_release(&lock->locked, 0); |
| bpf_preempt_enable(); |
| } |
| |
| #define arena_spin_lock_irqsave(lock, flags) \ |
| ({ \ |
| int __ret; \ |
| bpf_local_irq_save(&(flags)); \ |
| __ret = arena_spin_lock((lock)); \ |
| if (__ret) \ |
| bpf_local_irq_restore(&(flags)); \ |
| (__ret); \ |
| }) |
| |
| #define arena_spin_unlock_irqrestore(lock, flags) \ |
| ({ \ |
| arena_spin_unlock((lock)); \ |
| bpf_local_irq_restore(&(flags)); \ |
| }) |
| |
#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */
| |
| #endif /* BPF_ARENA_SPIN_LOCK_H */ |