| // SPDX-License-Identifier: GPL-2.0 |
| /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ |
| #ifndef BPF_ARENA_SPIN_LOCK_H |
| #define BPF_ARENA_SPIN_LOCK_H |
| |
| #include <vmlinux.h> |
| #include <bpf/bpf_helpers.h> |
| #include "bpf_atomic.h" |
| |
| #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) |
| #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) |
| |
| #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) |
| |
| #define EBUSY 16 |
| #define EOPNOTSUPP 95 |
| #define ETIMEDOUT 110 |
| |
| #ifndef __arena |
| #define __arena __attribute__((address_space(1))) |
| #endif |
| |
| extern unsigned long CONFIG_NR_CPUS __kconfig; |
| |
| /* |
| * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but |
| * PowerPC overrides the definition to define lock->val as u32 instead of |
| * atomic_t, leading to compilation errors. Import a local definition below so |
| * that we don't depend on the vmlinux.h version. |
| */ |
| |
| struct __qspinlock { |
| union { |
| atomic_t val; |
| #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
| struct { |
| u8 locked; |
| u8 pending; |
| }; |
| struct { |
| u16 locked_pending; |
| u16 tail; |
| }; |
| #else |
| struct { |
| u16 tail; |
| u16 locked_pending; |
| }; |
| struct { |
| u8 reserved[2]; |
| u8 pending; |
| u8 locked; |
| }; |
| #endif |
| }; |
| }; |
| |
| #define arena_spinlock_t struct __qspinlock |
| /* FIXME: Using typedef causes CO-RE relocation error */ |
| /* typedef struct qspinlock arena_spinlock_t; */ |
| |
| struct arena_mcs_spinlock { |
| struct arena_mcs_spinlock __arena *next; |
| int locked; |
| int count; |
| }; |
| |
| struct arena_qnode { |
| struct arena_mcs_spinlock mcs; |
| }; |
| |
| #define _Q_MAX_NODES 4 |
| #define _Q_PENDING_LOOPS 1 |
| |
| /* |
| * Bitfields in the atomic value: |
| * |
| * 0- 7: locked byte |
| * 8: pending |
| * 9-15: not used |
| * 16-17: tail index |
| * 18-31: tail cpu (+1) |
| */ |
| #define _Q_MAX_CPUS 1024 |
| |
| #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ |
| << _Q_ ## type ## _OFFSET) |
| #define _Q_LOCKED_OFFSET 0 |
| #define _Q_LOCKED_BITS 8 |
| #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) |
| |
| #define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) |
| #define _Q_PENDING_BITS 8 |
| #define _Q_PENDING_MASK _Q_SET_MASK(PENDING) |
| |
| #define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) |
| #define _Q_TAIL_IDX_BITS 2 |
| #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) |
| |
| #define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) |
| #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) |
| #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) |
| |
| #define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET |
| #define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) |
| |
| #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) |
| #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) |
| |
| struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; |
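
/*
 * Note: the qnodes array above statically reserves _Q_MAX_CPUS * _Q_MAX_NODES
 * MCS nodes in the arena; at 16 bytes per node (assuming 8-byte arena
 * pointers), that is on the order of 64 KiB of arena memory.
 */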
| |
| static inline u32 encode_tail(int cpu, int idx) |
| { |
| u32 tail; |
| |
| tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; |
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume idx < _Q_MAX_NODES */
| |
| return tail; |
| } |
| |
| static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) |
| { |
| u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; |
| u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; |
| |
| return &qnodes[cpu][idx].mcs; |
| } |
| |
| static inline |
| struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) |
| { |
| return &((struct arena_qnode __arena *)base + idx)->mcs; |
| } |
| |
| #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) |
| |
| /** |
| * xchg_tail - Put in the new queue tail code word & retrieve previous one |
| * @lock : Pointer to queued spinlock structure |
| * @tail : The new queue tail code word |
| * Return: The previous queue tail code word |
| * |
| * xchg(lock, tail) |
| * |
| * p,*,* -> n,*,* ; prev = xchg(lock, node) |
| */ |
| static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) |
| { |
| u32 old, new; |
| |
| old = atomic_read(&lock->val); |
| do { |
| new = (old & _Q_LOCKED_PENDING_MASK) | tail; |
| /* |
| * We can use relaxed semantics since the caller ensures that |
| * the MCS node is properly initialized before updating the |
| * tail. |
| */ |
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
| cond_break_label(out); |
| } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); |
| |
| return old; |
| out: |
| bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); |
| return old; |
| } |
| |
| /** |
| * clear_pending - clear the pending bit. |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,1,* -> *,0,* |
| */ |
| static __always_inline void clear_pending(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->pending, 0); |
| } |
| |
| /** |
| * clear_pending_set_locked - take ownership and clear the pending bit. |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,1,0 -> *,0,1 |
| * |
| * Lock stealing is not allowed if this function is used. |
| */ |
| static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); |
| } |
| |
| /** |
| * set_locked - Set the lock bit and own the lock |
| * @lock: Pointer to queued spinlock structure |
| * |
| * *,*,0 -> *,0,1 |
| */ |
| static __always_inline void set_locked(arena_spinlock_t __arena *lock) |
| { |
| WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); |
| } |
| |
| static __always_inline |
| u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) |
| { |
| u32 old, new; |
| |
| old = atomic_read(&lock->val); |
| do { |
| new = old | _Q_PENDING_VAL; |
| /* |
| * These loops are not expected to stall, but we still need to |
| * prove to the verifier they will terminate eventually. |
| */ |
| cond_break_label(out); |
| } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); |
| |
| return old; |
| out: |
| bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); |
| return old; |
| } |
| |
| /** |
| * arena_spin_trylock - try to acquire the queued spinlock |
| * @lock : Pointer to queued spinlock structure |
| * Return: 1 if lock acquired, 0 if failed |
| */ |
| static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) |
| { |
| int val = atomic_read(&lock->val); |
| |
| if (unlikely(val)) |
| return 0; |
| |
| return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); |
| } |
| |
| __noinline |
| int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) |
| { |
| struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; |
| int ret = -ETIMEDOUT; |
| u32 old, tail; |
| int idx; |
| |
| /* |
| * Wait for in-progress pending->locked hand-overs with a bounded |
| * number of spins so that we guarantee forward progress. |
| * |
| * 0,1,0 -> 0,0,1 |
| */ |
| if (val == _Q_PENDING_VAL) { |
| int cnt = _Q_PENDING_LOOPS; |
| val = atomic_cond_read_relaxed_label(&lock->val, |
| (VAL != _Q_PENDING_VAL) || !cnt--, |
| release_err); |
| } |
| |
| /* |
| * If we observe any contention; queue. |
| */ |
| if (val & ~_Q_LOCKED_MASK) |
| goto queue; |
| |
| /* |
| * trylock || pending |
| * |
| * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock |
| */ |
| val = arena_fetch_set_pending_acquire(lock); |
| |
| /* |
| * If we observe contention, there is a concurrent locker. |
| * |
| * Undo and queue; our setting of PENDING might have made the |
| * n,0,0 -> 0,0,0 transition fail and it will now be waiting |
| * on @next to become !NULL. |
| */ |
| if (unlikely(val & ~_Q_LOCKED_MASK)) { |
| |
| /* Undo PENDING if we set it. */ |
| if (!(val & _Q_PENDING_MASK)) |
| clear_pending(lock); |
| |
| goto queue; |
| } |
| |
| /* |
| * We're pending, wait for the owner to go away. |
| * |
| * 0,1,1 -> *,1,0 |
| * |
| * this wait loop must be a load-acquire such that we match the |
| * store-release that clears the locked bit and create lock |
| * sequentiality; this is because not all |
| * clear_pending_set_locked() implementations imply full |
| * barriers. |
| */ |
| if (val & _Q_LOCKED_MASK) |
| smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); |
| |
| /* |
| * take ownership and clear the pending bit. |
| * |
| * 0,1,0 -> 0,0,1 |
| */ |
| clear_pending_set_locked(lock); |
| return 0; |
| |
| /* |
| * End of pending bit optimistic spinning and beginning of MCS |
| * queuing. |
| */ |
| queue: |
| node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; |
| idx = node0->count++; |
| tail = encode_tail(bpf_get_smp_processor_id(), idx); |
| |
| /* |
| * 4 nodes are allocated based on the assumption that there will not be |
| * nested NMIs taking spinlocks. That may not be true in some |
| * architectures even though the chance of needing more than 4 nodes |
| * will still be extremely unlikely. When that happens, we simply return |
| * an error. Original qspinlock has a trylock fallback in this case. |
| */ |
| if (unlikely(idx >= _Q_MAX_NODES)) { |
| ret = -EBUSY; |
| goto release_node_err; |
| } |
| |
| node = grab_mcs_node(node0, idx); |
| |
| /* |
| * Ensure that we increment the head node->count before initialising |
| * the actual node. If the compiler is kind enough to reorder these |
| * stores, then an IRQ could overwrite our assignments. |
| */ |
| barrier(); |
| |
| node->locked = 0; |
| node->next = NULL; |
| |
| /* |
| * We touched a (possibly) cold cacheline in the per-cpu queue node; |
| * attempt the trylock once more in the hope someone let go while we |
| * weren't watching. |
| */ |
| if (arena_spin_trylock(lock)) |
| goto release; |
| |
| /* |
| * Ensure that the initialisation of @node is complete before we |
| * publish the updated tail via xchg_tail() and potentially link |
| * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. |
| */ |
| smp_wmb(); |
| |
| /* |
| * Publish the updated tail. |
| * We have already touched the queueing cacheline; don't bother with |
| * pending stuff. |
| * |
| * p,*,* -> n,*,* |
| */ |
| old = xchg_tail(lock, tail); |
| next = NULL; |
| |
| /* |
| * if there was a previous node; link it and wait until reaching the |
| * head of the waitqueue. |
| */ |
| if (old & _Q_TAIL_MASK) { |
| prev = decode_tail(old); |
| |
| /* Link @node into the waitqueue. */ |
| WRITE_ONCE(prev->next, node); |
| |
| arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); |
| |
| /* |
| * While waiting for the MCS lock, the next pointer may have |
| * been set by another lock waiter. We cannot prefetch here |
| * due to lack of equivalent instruction in BPF ISA. |
| */ |
| next = READ_ONCE(node->next); |
| } |
| |
| /* |
| * we're at the head of the waitqueue, wait for the owner & pending to |
| * go away. |
| * |
| * *,x,y -> *,0,0 |
| * |
| * this wait loop must use a load-acquire such that we match the |
| * store-release that clears the locked bit and create lock |
| * sequentiality; this is because the set_locked() function below |
| * does not imply a full barrier. |
| */ |
| val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), |
| release_node_err); |
| |
| /* |
| * claim the lock: |
| * |
| * n,0,0 -> 0,0,1 : lock, uncontended |
| * *,*,0 -> *,*,1 : lock, contended |
| * |
| * If the queue head is the only one in the queue (lock value == tail) |
| * and nobody is pending, clear the tail code and grab the lock. |
| * Otherwise, we only need to grab the lock. |
| */ |
| |
| /* |
| * In the PV case we might already have _Q_LOCKED_VAL set, because |
| * of lock stealing; therefore we must also allow: |
| * |
| * n,0,1 -> 0,0,1 |
| * |
| * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the |
| * above wait condition, therefore any concurrent setting of |
| * PENDING will make the uncontended transition fail. |
| */ |
| if ((val & _Q_TAIL_MASK) == tail) { |
| if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) |
| goto release; /* No contention */ |
| } |
| |
| /* |
| * Either somebody is queued behind us or _Q_PENDING_VAL got set |
| * which will then detect the remaining tail and queue behind us |
| * ensuring we'll see a @next. |
| */ |
| set_locked(lock); |
| |
| /* |
| * contended path; wait for next if not observed yet, release. |
| */ |
| if (!next) |
| next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); |
| |
| arch_mcs_spin_unlock_contended(&next->locked); |
| |
| release:; |
| /* |
| * release the node |
| * |
| * Doing a normal dec vs this_cpu_dec is fine. An upper context always |
| * decrements count it incremented before returning, thus we're fine. |
| * For contexts interrupting us, they either observe our dec or not. |
| * Just ensure the compiler doesn't reorder this statement, as a |
| * this_cpu_dec implicitly implied that. |
| */ |
| barrier(); |
| node0->count--; |
| return 0; |
| release_node_err: |
| barrier(); |
| node0->count--; |
| goto release_err; |
| release_err: |
| return ret; |
| } |
| |
| /** |
| * arena_spin_lock - acquire a queued spinlock |
| * @lock: Pointer to queued spinlock structure |
| * |
| * On error, returned value will be negative. |
| * On success, zero is returned. |
| * |
 * The return value _must_ be tested against zero for success,
 * instead of being compared against a negative value, so that the
 * program passes the BPF verifier.
| * |
| * The user should do: |
| * if (arena_spin_lock(...) != 0) // failure |
| * or |
| * if (arena_spin_lock(...) == 0) // success |
| * or |
| * if (arena_spin_lock(...)) // failure |
| * or |
| * if (!arena_spin_lock(...)) // success |
| * instead of: |
| * if (arena_spin_lock(...) < 0) // failure |
| * |
| * The return value can still be inspected later. |
| */ |
| static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) |
| { |
| int val = 0; |
| |
	if (CONFIG_NR_CPUS > _Q_MAX_CPUS)
| return -EOPNOTSUPP; |
| |
| bpf_preempt_disable(); |
| if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) |
| return 0; |
| |
| val = arena_spin_lock_slowpath(lock, val); |
| /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ |
| if (val) |
| bpf_preempt_enable(); |
| return val; |
| } |
| |
| /** |
| * arena_spin_unlock - release a queued spinlock |
| * @lock : Pointer to queued spinlock structure |
| */ |
| static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) |
| { |
| /* |
| * unlock() needs release semantics: |
| */ |
| smp_store_release(&lock->locked, 0); |
| bpf_preempt_enable(); |
| } |
| |
| #define arena_spin_lock_irqsave(lock, flags) \ |
| ({ \ |
| int __ret; \ |
| bpf_local_irq_save(&(flags)); \ |
| __ret = arena_spin_lock((lock)); \ |
| if (__ret) \ |
| bpf_local_irq_restore(&(flags)); \ |
| (__ret); \ |
| }) |
| |
| #define arena_spin_unlock_irqrestore(lock, flags) \ |
| ({ \ |
| arena_spin_unlock((lock)); \ |
| bpf_local_irq_restore(&(flags)); \ |
| }) |
| |
#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */
| |
| #endif /* BPF_ARENA_SPIN_LOCK_H */ |