Merge branch 'locking-introduce-nested-bh-locking'
Sebastian Andrzej Siewior says:
====================
locking: Introduce nested-BH locking.
Disabling bottom halves acts as a per-CPU BKL. On PREEMPT_RT, code within
a local_bh_disable() section remains preemptible. As a result, high-priority
tasks (or threaded interrupts) will be blocked by long-running lower-priority
tasks (or threaded interrupts), which includes softirq sections.
The proposed way out is to introduce explicit per-CPU locks for
resources which are protected by local_bh_disable() and use those only
on PREEMPT_RT so there is no additional overhead for !PREEMPT_RT builds.
The series introduces the infrastructure and converts large parts of
networking, which is the largest stakeholder here. Once this is done, the
per-CPU lock from local_bh_disable() on PREEMPT_RT can be lifted.
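The pattern used throughout the conversion is to add a local_lock_t to the
per-CPU data structure and to acquire it with local_lock_nested_bh() inside
the already BH-disabled section. A minimal sketch of that pattern (the struct
and function names below are illustrative only, not taken from this series):

  struct foo_pcpu_scratch {
          local_lock_t bh_lock;   /* protects counter on PREEMPT_RT */
          int counter;
  };

  static DEFINE_PER_CPU(struct foo_pcpu_scratch, foo_pcpu_scratch) = {
          .bh_lock = INIT_LOCAL_LOCK(bh_lock),
  };

  /* Caller runs in softirq context, i.e. BH is already disabled. */
  static void foo_update(void)
  {
          struct foo_pcpu_scratch *s;

          /* On !PREEMPT_RT this is only a lockdep annotation; on
           * PREEMPT_RT it takes the per-CPU spinlock backing the
           * local_lock_t.
           */
          local_lock_nested_bh(&foo_pcpu_scratch.bh_lock);
          s = this_cpu_ptr(&foo_pcpu_scratch);
          s->counter++;
          local_unlock_nested_bh(&foo_pcpu_scratch.bh_lock);
  }

On !PREEMPT_RT builds this compiles down to the lockdep check, so there is no
runtime overhead; on PREEMPT_RT it serializes access to the per-CPU data even
though the BH-disabled section is preemptible.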
Performance testing: baseline is net-next as of commit 93bda33046e7a
("Merge branch 'net-constify-ctl_table-arguments-of-utility-functions'")
plus v6.10-rc1. A 10 Gbit link is used between two hosts. The command
xdp-bench redirect-cpu --cpu 3 --remote-action drop eth1 -e
was invoked on the receiving side with an ixgbe NIC. The sending side uses
pktgen_sample03_burst_single_flow.sh on an i40e NIC.
Baseline:
| eth1->? 9,018,604 rx/s 0 err,drop/s
| receive total 9,018,604 pkt/s 0 drop/s 0 error/s
| cpu:7 9,018,604 pkt/s 0 drop/s 0 error/s
| enqueue to cpu 3 9,018,602 pkt/s 0 drop/s 7.00 bulk-avg
| cpu:7->3 9,018,602 pkt/s 0 drop/s 7.00 bulk-avg
| kthread total 9,018,606 pkt/s 0 drop/s 214,698 sched
| cpu:3 9,018,606 pkt/s 0 drop/s 214,698 sched
| xdp_stats 0 pass/s 9,018,606 drop/s 0 redir/s
| cpu:3 0 pass/s 9,018,606 drop/s 0 redir/s
| redirect_err 0 error/s
| xdp_exception 0 hit/s
perf top --sort cpu,symbol --no-children:
| 18.14% 007 [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
| 13.29% 007 [k] ixgbe_poll
| 12.66% 003 [k] cpu_map_kthread_run
| 7.23% 003 [k] page_frag_free
| 6.76% 007 [k] xdp_do_redirect
| 3.76% 007 [k] cpu_map_redirect
| 3.13% 007 [k] bq_flush_to_queue
| 2.51% 003 [k] xdp_return_frame
| 1.93% 007 [k] try_to_wake_up
| 1.78% 007 [k] _raw_spin_lock
| 1.74% 007 [k] cpu_map_enqueue
| 1.56% 003 [k] bpf_prog_57cd311f2e27366b_cpumap_drop
With this series applied:
| eth1->? 10,329,340 rx/s 0 err,drop/s
| receive total 10,329,340 pkt/s 0 drop/s 0 error/s
| cpu:6 10,329,340 pkt/s 0 drop/s 0 error/s
| enqueue to cpu 3 10,329,338 pkt/s 0 drop/s 8.00 bulk-avg
| cpu:6->3 10,329,338 pkt/s 0 drop/s 8.00 bulk-avg
| kthread total 10,329,321 pkt/s 0 drop/s 96,297 sched
| cpu:3 10,329,321 pkt/s 0 drop/s 96,297 sched
| xdp_stats 0 pass/s 10,329,321 drop/s 0 redir/s
| cpu:3 0 pass/s 10,329,321 drop/s 0 redir/s
| redirect_err 0 error/s
| xdp_exception 0 hit/s
perf top --sort cpu,symbol --no-children:
| 20.90% 006 [k] bpf_prog_4f0ffbb35139c187_cpumap_l4_hash
| 12.62% 006 [k] ixgbe_poll
| 9.82% 003 [k] page_frag_free
| 8.73% 003 [k] cpu_map_bpf_prog_run_xdp
| 6.63% 006 [k] xdp_do_redirect
| 4.94% 003 [k] cpu_map_kthread_run
| 4.28% 006 [k] cpu_map_redirect
| 4.03% 006 [k] bq_flush_to_queue
| 3.01% 003 [k] xdp_return_frame
| 1.95% 006 [k] _raw_spin_lock
| 1.94% 003 [k] bpf_prog_57cd311f2e27366b_cpumap_drop
The difference in the perf profile appears to be noise.
v8: https://lore.kernel.org/all/20240619072253.504963-1-bigeasy@linutronix.de
v7: https://lore.kernel.org/all/20240618072526.379909-1-bigeasy@linutronix.de
v6: https://lore.kernel.org/all/20240612170303.3896084-1-bigeasy@linutronix.de
v5: https://lore.kernel.org/all/20240607070427.1379327-1-bigeasy@linutronix.de
v4: https://lore.kernel.org/all/20240604154425.878636-1-bigeasy@linutronix.de
v3: https://lore.kernel.org/all/20240529162927.403425-1-bigeasy@linutronix.de
v2: https://lore.kernel.org/all/20240503182957.1042122-1-bigeasy@linutronix.de
v1: https://lore.kernel.org/all/20231215171020.687342-1-bigeasy@linutronix.de
====================
Link: https://patch.msgid.link/20240620132727.660738-1-bigeasy@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b02aea2..c034952 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -733,21 +733,101 @@ struct bpf_nh_params {
};
};
+/* flags for bpf_redirect_info kern_flags */
+#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */
+#define BPF_RI_F_RI_INIT BIT(1)
+#define BPF_RI_F_CPU_MAP_INIT BIT(2)
+#define BPF_RI_F_DEV_MAP_INIT BIT(3)
+#define BPF_RI_F_XSK_MAP_INIT BIT(4)
+
struct bpf_redirect_info {
u64 tgt_index;
void *tgt_value;
struct bpf_map *map;
u32 flags;
- u32 kern_flags;
u32 map_id;
enum bpf_map_type map_type;
struct bpf_nh_params nh;
+ u32 kern_flags;
};
-DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+struct bpf_net_context {
+ struct bpf_redirect_info ri;
+ struct list_head cpu_map_flush_list;
+ struct list_head dev_map_flush_list;
+ struct list_head xskmap_map_flush_list;
+};
-/* flags for bpf_redirect_info kern_flags */
-#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */
+static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
+{
+ struct task_struct *tsk = current;
+
+ if (tsk->bpf_net_context != NULL)
+ return NULL;
+ bpf_net_ctx->ri.kern_flags = 0;
+
+ tsk->bpf_net_context = bpf_net_ctx;
+ return bpf_net_ctx;
+}
+
+static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
+{
+ if (bpf_net_ctx)
+ current->bpf_net_context = NULL;
+}
+
+static inline struct bpf_net_context *bpf_net_ctx_get(void)
+{
+ return current->bpf_net_context;
+}
+
+static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
+{
+ struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+
+ if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
+ memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
+ bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
+ }
+
+ return &bpf_net_ctx->ri;
+}
+
+static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
+{
+ struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+
+ if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) {
+ INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list);
+ bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;
+ }
+
+ return &bpf_net_ctx->cpu_map_flush_list;
+}
+
+static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
+{
+ struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+
+ if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) {
+ INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list);
+ bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;
+ }
+
+ return &bpf_net_ctx->dev_map_flush_list;
+}
+
+static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
+{
+ struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+
+ if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) {
+ INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list);
+ bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;
+ }
+
+ return &bpf_net_ctx->xskmap_map_flush_list;
+}
/* Compute the linear packet data range [data, data_end) which
* will be accessed by various program types (cls_bpf, act_bpf,
@@ -1018,25 +1098,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
-void bpf_clear_redirect_map(struct bpf_map *map);
-
static inline bool xdp_return_frame_no_direct(void)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}
static inline void xdp_set_return_frame_no_direct(void)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}
static inline void xdp_clear_return_frame_no_direct(void)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}
@@ -1592,7 +1670,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde
u64 flags, const u64 flag_mask,
void *lookup_elem(struct bpf_map *map, u32 key))
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
/* Lower bits of the flags are used as return code on lookup failure */
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index e55010f..091dc0b 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -51,4 +51,25 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)
+DEFINE_GUARD(local_lock, local_lock_t __percpu*,
+ local_lock(_T),
+ local_unlock(_T))
+DEFINE_GUARD(local_lock_irq, local_lock_t __percpu*,
+ local_lock_irq(_T),
+ local_unlock_irq(_T))
+DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu,
+ local_lock_irqsave(_T->lock, _T->flags),
+ local_unlock_irqrestore(_T->lock, _T->flags),
+ unsigned long flags)
+
+#define local_lock_nested_bh(_lock) \
+ __local_lock_nested_bh(_lock)
+
+#define local_unlock_nested_bh(_lock) \
+ __local_unlock_nested_bh(_lock)
+
+DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*,
+ local_lock_nested_bh(_T),
+ local_unlock_nested_bh(_T))
+
#endif
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 975e33b..8dd71fb 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -62,6 +62,17 @@ do { \
local_lock_debug_init(lock); \
} while (0)
+#define __spinlock_nested_bh_init(lock) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
+ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \
+ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \
+ LD_LOCK_NORMAL); \
+ local_lock_debug_init(lock); \
+} while (0)
+
#define __local_lock(lock) \
do { \
preempt_disable(); \
@@ -98,6 +109,15 @@ do { \
local_irq_restore(flags); \
} while (0)
+#define __local_lock_nested_bh(lock) \
+ do { \
+ lockdep_assert_in_softirq(); \
+ local_lock_acquire(this_cpu_ptr(lock)); \
+ } while (0)
+
+#define __local_unlock_nested_bh(lock) \
+ local_lock_release(this_cpu_ptr(lock))
+
#else /* !CONFIG_PREEMPT_RT */
/*
@@ -138,4 +158,15 @@ typedef spinlock_t local_lock_t;
#define __local_unlock_irqrestore(lock, flags) __local_unlock(lock)
+#define __local_lock_nested_bh(lock) \
+do { \
+ lockdep_assert_in_softirq_func(); \
+ spin_lock(this_cpu_ptr(lock)); \
+} while (0)
+
+#define __local_unlock_nested_bh(lock) \
+do { \
+ spin_unlock(this_cpu_ptr((lock))); \
+} while (0)
+
#endif /* CONFIG_PREEMPT_RT */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 08b0d1d..3f5a551 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -600,6 +600,8 @@ do { \
(!in_softirq() || in_irq() || in_nmi())); \
} while (0)
+extern void lockdep_assert_in_softirq_func(void);
+
#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
@@ -613,6 +615,7 @@ do { \
# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
+# define lockdep_assert_in_softirq_func() do { } while (0)
#endif
#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c83b390..4e81660 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -43,6 +43,7 @@
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
+#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
@@ -3201,6 +3202,7 @@ static inline bool dev_has_header(const struct net_device *dev)
struct softnet_data {
struct list_head poll_list;
struct sk_buff_head process_queue;
+ local_lock_t process_queue_bh_lock;
/* stats */
unsigned int processed;
@@ -3223,13 +3225,7 @@ struct softnet_data {
struct sk_buff_head xfrm_backlog;
#endif
/* written and read only by owning cpu: */
- struct {
- u16 recursion;
- u8 more;
-#ifdef CONFIG_NET_EGRESS
- u8 skip_txqueue;
-#endif
- } xmit;
+ struct netdev_xmit xmit;
#ifdef CONFIG_RPS
/* input_queue_head should be written by cpu owning this struct,
* and only read by other cpus. Worth using a cache line.
@@ -3257,10 +3253,18 @@ struct softnet_data {
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
{
return this_cpu_read(softnet_data.xmit.recursion);
}
+#else
+static inline int dev_recursion_level(void)
+{
+ return current->net_xmit.recursion;
+}
+
+#endif
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);
@@ -4872,18 +4876,35 @@ static inline ktime_t netdev_get_tstamp(struct net_device *dev,
return hwtstamps->hwtstamp;
}
-static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
- struct sk_buff *skb, struct net_device *dev,
- bool more)
+#ifndef CONFIG_PREEMPT_RT
+static inline void netdev_xmit_set_more(bool more)
{
__this_cpu_write(softnet_data.xmit.more, more);
- return ops->ndo_start_xmit(skb, dev);
}
static inline bool netdev_xmit_more(void)
{
return __this_cpu_read(softnet_data.xmit.more);
}
+#else
+static inline void netdev_xmit_set_more(bool more)
+{
+ current->net_xmit.more = more;
+}
+
+static inline bool netdev_xmit_more(void)
+{
+ return current->net_xmit.more;
+}
+#endif
+
+static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
+ struct sk_buff *skb, struct net_device *dev,
+ bool more)
+{
+ netdev_xmit_set_more(more);
+ return ops->ndo_start_xmit(skb, dev);
+}
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
new file mode 100644
index 0000000..38325e0
--- /dev/null
+++ b/include/linux/netdevice_xmit.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_NETDEVICE_XMIT_H
+#define _LINUX_NETDEVICE_XMIT_H
+
+struct netdev_xmit {
+ u16 recursion;
+ u8 more;
+#ifdef CONFIG_NET_EGRESS
+ u8 skip_txqueue;
+#endif
+};
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61591ac..5ff5e65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
+#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
@@ -53,6 +54,7 @@ struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
+struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
@@ -975,7 +977,9 @@ struct task_struct {
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
-
+#ifdef CONFIG_PREEMPT_RT
+ struct netdev_xmit net_xmit;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
@@ -1506,6 +1510,8 @@ struct task_struct {
/* Used for BPF run context */
struct bpf_run_ctx *bpf_ctx;
#endif
+ /* Used by BPF for per-TASK xdp storage */
+ struct bpf_net_context *bpf_net_context;
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;
diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
index 3fab9de..888c1ce 100644
--- a/include/net/seg6_local.h
+++ b/include/net/seg6_local.h
@@ -19,6 +19,7 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb);
struct seg6_bpf_srh_state {
+ local_lock_t bh_lock;
struct ipv6_sr_hdr *srh;
u16 hdrlen;
bool valid;
diff --git a/include/net/sock.h b/include/net/sock.h
index b30ea0c..cce23ac 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -544,6 +544,11 @@ struct sock {
netns_tracker ns_tracker;
};
+struct sock_bh_locked {
+ struct sock *sock;
+ local_lock_t bh_lock;
+};
+
enum sk_pacing {
SK_PACING_NONE = 0,
SK_PACING_NEEDED = 1,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e3441..068e994 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -79,8 +79,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
@@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
@@ -706,7 +707,7 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -758,7 +759,7 @@ int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
void __cpu_map_flush(void)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -772,20 +773,9 @@ void __cpu_map_flush(void)
#ifdef CONFIG_DEBUG_NET
bool cpu_map_check_flush(void)
{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+ if (list_empty(bpf_net_ctx_get_cpu_map_flush_list()))
return false;
__cpu_map_flush();
return true;
}
#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 7f3b344..317ac2d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -196,7 +195,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+	/* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+	 * during the NAPI callback and cleared after the XDP redirect. There is
+	 * no explicit RCU read section which protects bpf_redirect_info->map,
+	 * but local_bh_disable() also marks the beginning of an RCU section.
+	 * This makes the complete softirq callback RCU protected. Thus after
+	 * the following synchronize_rcu() there is no bpf_redirect_info->map ==
+	 * map assignment.
+	 */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -408,7 +414,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
*/
void __dev_flush(void)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -422,7 +428,7 @@ void __dev_flush(void)
#ifdef CONFIG_DEBUG_NET
bool dev_check_flush(void)
{
- if (list_empty(this_cpu_ptr(&dev_flush_list)))
+ if (list_empty(bpf_net_ctx_get_dev_flush_list()))
return false;
__dev_flush();
return true;
@@ -453,7 +459,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -1153,15 +1159,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 99076db..f314bdd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2355,6 +2355,7 @@ __latent_entropy struct task_struct *copy_process(
RCU_INIT_POINTER(p->bpf_storage, NULL);
p->bpf_ctx = NULL;
#endif
+ p->bpf_net_context = NULL;
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 8475a07..438c608 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
+
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
+void notrace lockdep_assert_in_softirq_func(void)
+{
+ lockdep_assert_in_softirq();
+}
+EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
+#endif
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 36ae54f..a6d7f79 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -283,9 +283,10 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
u32 repeat)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int err = 0, act, ret, i, nframes = 0, batch_sz;
struct xdp_frame **frames = xdp->frames;
+ struct bpf_redirect_info *ri;
struct xdp_page_head *head;
struct xdp_frame *frm;
bool redirect = false;
@@ -295,6 +296,8 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
batch_sz = min_t(u32, repeat, xdp->batch_size);
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ ri = bpf_net_ctx_get_ri();
xdp_set_return_frame_no_direct();
for (i = 0; i < batch_sz; i++) {
@@ -359,6 +362,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
}
xdp_clear_return_frame_no_direct();
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
return err;
}
@@ -394,6 +398,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx,
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
u32 *retval, u32 *time, bool xdp)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct bpf_prog_array_item item = {.prog = prog};
struct bpf_run_ctx *old_ctx;
struct bpf_cg_run_ctx run_ctx;
@@ -419,10 +424,14 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
do {
run_ctx.prog_item = &item;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
if (xdp)
*retval = bpf_prog_run_xdp(prog, ctx);
else
*retval = bpf_prog_run(prog, ctx);
+
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
} while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
bpf_reset_run_ctx(old_ctx);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index bf30c50b..3c9f653 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -137,6 +137,7 @@ static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
struct brnf_frag_data {
+ local_lock_t bh_lock;
char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
u8 encap_size;
u8 size;
@@ -144,7 +145,9 @@ struct brnf_frag_data {
__be16 vlan_proto;
};
-static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
+static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static void nf_bridge_info_free(struct sk_buff *skb)
{
@@ -850,6 +853,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
unsigned int mtu, mtu_reserved;
+ int ret;
mtu_reserved = nf_bridge_mtu_reduction(skb);
mtu = skb->dev->mtu;
@@ -882,6 +886,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
if (skb_vlan_tag_present(skb)) {
@@ -897,7 +902,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
+ ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
}
if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6)) {
@@ -909,6 +916,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
@@ -916,8 +924,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- if (v6ops)
- return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+ if (v6ops) {
+ ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
+ }
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
kfree_skb(skb);
return -EMSGSIZE;
diff --git a/net/core/dev.c b/net/core/dev.c
index 093d82b..b94fb4e6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -229,7 +229,7 @@ static inline void backlog_lock_irq_save(struct softnet_data *sd,
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_save(*flags);
}
@@ -237,7 +237,7 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_disable();
}
@@ -246,7 +246,7 @@ static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_restore(*flags);
}
@@ -254,7 +254,7 @@ static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_enable();
}
@@ -449,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+ .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
EXPORT_PER_CPU_SYMBOL(softnet_data);
/* Page_pool has a lockless array/stack to alloc/recycle pages.
@@ -3940,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
}
+#ifndef CONFIG_PREEMPT_RT
static bool netdev_xmit_txqueue_skipped(void)
{
return __this_cpu_read(softnet_data.xmit.skip_txqueue);
@@ -3950,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip)
__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return current->net_xmit.skip_txqueue;
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
@@ -4029,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
@@ -4061,10 +4080,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
break;
}
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4074,8 +4095,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4085,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
* already set by the caller.
*/
@@ -4105,10 +4131,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4118,8 +4146,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -5935,6 +5965,7 @@ static void flush_backlog(struct work_struct *work)
}
backlog_unlock_irq_enable(sd);
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
@@ -5942,6 +5973,7 @@ static void flush_backlog(struct work_struct *work)
rps_input_queue_head_incr(sd);
}
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
}
@@ -6063,7 +6095,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
while (again) {
struct sk_buff *skb;
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
@@ -6072,7 +6106,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
return work;
}
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
@@ -6087,8 +6123,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
}
backlog_unlock_irq_enable(sd);
}
@@ -6301,6 +6339,7 @@ enum {
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
bool skip_schedule = false;
unsigned long timeout;
int rc;
@@ -6318,6 +6357,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
@@ -6340,6 +6380,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
@@ -6349,6 +6390,7 @@ static void __napi_busy_loop(unsigned int napi_id,
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
void *have_poll_lock = NULL;
struct napi_struct *napi;
@@ -6367,6 +6409,7 @@ static void __napi_busy_loop(unsigned int napi_id,
int work = 0;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
@@ -6397,6 +6440,7 @@ static void __napi_busy_loop(unsigned int napi_id,
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
@@ -6824,6 +6868,7 @@ static int napi_thread_wait(struct napi_struct *napi)
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
unsigned long last_qs = jiffies;
@@ -6832,6 +6877,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
void *have;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
sd = this_cpu_ptr(&softnet_data);
sd->in_napi_threaded_poll = true;
@@ -6847,6 +6894,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
net_rps_action_and_irq_enable(sd);
}
skb_defer_free_flush(sd);
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!repoll)
@@ -6872,10 +6920,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
sd->in_net_rx_action = true;
local_irq_disable();
@@ -6928,7 +6978,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
-end:;
+end:
+ bpf_net_ctx_clear(bpf_net_ctx);
}
struct netdev_adjacent {
diff --git a/net/core/dev.h b/net/core/dev.h
index 58f88d2..5654325 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -150,6 +150,8 @@ struct napi_struct *napi_by_id(unsigned int napi_id);
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
+
+#ifndef CONFIG_PREEMPT_RT
static inline bool dev_xmit_recursion(void)
{
return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
@@ -165,6 +167,22 @@ static inline void dev_xmit_recursion_dec(void)
{
__this_cpu_dec(softnet_data.xmit.recursion);
}
+#else
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
+#endif
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
diff --git a/net/core/filter.c b/net/core/filter.c
index b077e74..eb1c442 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1658,9 +1658,12 @@ struct bpf_scratchpad {
__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
u8 buff[MAX_BPF_STACK];
};
+ local_lock_t bh_lock;
};
-static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static inline int __bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
@@ -2021,6 +2024,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
u32 diff_size = from_size + to_size;
int i, j = 0;
+ __wsum ret;
/* This is quite flexible, some examples:
*
@@ -2034,12 +2038,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
diff_size > sizeof(sp->diff)))
return -EINVAL;
+ local_lock_nested_bh(&bpf_sp.bh_lock);
for (i = 0; i < from_size / sizeof(__be32); i++, j++)
sp->diff[j] = ~from[i];
for (i = 0; i < to_size / sizeof(__be32); i++, j++)
sp->diff[j] = to[i];
- return csum_partial(sp->diff, diff_size, seed);
+ ret = csum_partial(sp->diff, diff_size, seed);
+ local_unlock_nested_bh(&bpf_sp.bh_lock);
+ return ret;
}
static const struct bpf_func_proto bpf_csum_diff_proto = {
@@ -2476,9 +2483,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
-DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
-EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
-
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2491,7 +2495,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)
int skb_do_redirect(struct sk_buff *skb)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
@@ -2524,7 +2528,7 @@ int skb_do_redirect(struct sk_buff *skb)
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
@@ -2545,7 +2549,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return TC_ACT_SHOT;
@@ -2567,7 +2571,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT;
@@ -4293,30 +4297,13 @@ void xdp_do_check_flushed(struct napi_struct *napi)
}
#endif
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net_device *master, *slave;
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
@@ -4388,7 +4375,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
map = READ_ONCE(ri->map);
/* The map pointer is cleared when the map is being torn
- * down by bpf_clear_redirect_map()
+ * down by dev_map_free()
*/
if (unlikely(!map)) {
err = -ENOENT;
@@ -4433,7 +4420,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4447,7 +4434,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4464,7 +4451,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id,
u32 flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
int err;
@@ -4476,7 +4463,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
map = READ_ONCE(ri->map);
/* The map pointer is cleared when the map is being torn
- * down by bpf_clear_redirect_map()
+ * down by dev_map_free()
*/
if (unlikely(!map)) {
err = -ENOENT;
@@ -4518,7 +4505,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
@@ -4554,7 +4541,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return XDP_ABORTED;
@@ -6455,6 +6442,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
void *srh_tlvs, *srh_end, *ptr;
int srhoff = 0;
+ lockdep_assert_held(&srh_state->bh_lock);
if (srh == NULL)
return -EINVAL;
@@ -6511,6 +6499,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
int hdroff = 0;
int err;
+ lockdep_assert_held(&srh_state->bh_lock);
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
if (!seg6_bpf_has_valid_srh(skb))
@@ -6587,6 +6576,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
int srhoff = 0;
int ret;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return -EINVAL;
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 4a0797f..afb05f5 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -38,13 +38,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
struct dst_entry *dst, bool can_redirect)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int ret;
- /* Migration disable and BH disable are needed to protect per-cpu
- * redirect_info between BPF prog and skb_do_redirect().
+ /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
+ * BPF prog and skb_do_redirect().
*/
- migrate_disable();
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -77,8 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
- migrate_enable();
return ret;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2315c08..eb9a7e65 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -277,6 +277,7 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
#endif
struct napi_alloc_cache {
+ local_lock_t bh_lock;
struct page_frag_cache page;
struct page_frag_1k page_small;
unsigned int skb_count;
@@ -284,7 +285,9 @@ struct napi_alloc_cache {
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/* Double check that napi_get_frags() allocates skbs with
* skb->head being backed by slab, not a page fragment.
@@ -306,11 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi)
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
- return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
+
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
@@ -318,19 +326,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
void *data;
- fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
+ fragsz = SKB_DATA_ALIGN(fragsz);
data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
align_mask);
} else {
- struct napi_alloc_cache *nc;
-
local_bh_disable();
- nc = this_cpu_ptr(&napi_alloc_cache);
- data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ data = __napi_alloc_frag_align(fragsz, align_mask);
local_bh_enable();
}
return data;
@@ -342,16 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
- if (unlikely(!nc->skb_count))
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
+ }
}
skb = nc->skb_cache[--nc->skb_count];
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
return skb;
@@ -744,9 +752,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
pfmemalloc = nc->pfmemalloc;
} else {
local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
+
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
}
@@ -810,11 +822,11 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
goto skb_success;
}
- nc = this_cpu_ptr(&napi_alloc_cache);
-
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
/* we are artificially inflating the allocation size, but
* that is not as bad as it may look like, as:
@@ -836,6 +848,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
data = page_frag_alloc(&nc->page, len, gfp_mask);
pfmemalloc = nc->page.pfmemalloc;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
return NULL;
@@ -1435,6 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
if (!kasan_mempool_poison_object(skb))
return;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
@@ -1446,6 +1460,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8e49d69..fd17f25f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -93,7 +93,9 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
@@ -882,7 +884,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
+
sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -907,6 +911,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
@@ -1002,7 +1007,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
@@ -1017,6 +1023,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
}
@@ -3615,7 +3622,7 @@ void __init tcp_v4_init(void)
sk->sk_clockid = CLOCK_MONOTONIC;
- per_cpu(ipv4_tcp_sk, cpu) = sk;
+ per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
}
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c
index 8512cb0..d8a4f19 100644
--- a/net/ipv4/tcp_sigpool.c
+++ b/net/ipv4/tcp_sigpool.c
@@ -10,7 +10,14 @@
#include <net/tcp.h>
static size_t __scratch_size;
-static DEFINE_PER_CPU(void __rcu *, sigpool_scratch);
+struct sigpool_scratch {
+ local_lock_t bh_lock;
+ void __rcu *pad;
+};
+
+static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
struct sigpool_entry {
struct crypto_ahash *hash;
@@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size)
break;
}
- old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+ old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
scratch, lockdep_is_held(&cpool_mutex));
if (!cpu_online(cpu) || !old_scratch) {
kfree(old_scratch);
@@ -93,7 +100,7 @@ static void sigpool_scratch_free(void)
int cpu;
for_each_possible_cpu(cpu)
- kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+ kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
NULL, lockdep_is_held(&cpool_mutex)));
__scratch_size = 0;
}
@@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC
/* Pairs with tcp_sigpool_reserve_scratch(), scratch area is
* valid (allocated) until tcp_sigpool_end().
*/
- c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch));
+ local_lock_nested_bh(&sigpool_scratch.bh_lock);
+ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad));
return 0;
}
EXPORT_SYMBOL_GPL(tcp_sigpool_start);
@@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
{
struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
+ local_unlock_nested_bh(&sigpool_scratch.bh_lock);
rcu_read_unlock_bh();
ahash_request_free(c->req);
crypto_free_ahash(hash);
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index c434940..c74705e 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1380,7 +1380,9 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
return err;
}
-DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
{
@@ -1388,6 +1390,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
this_cpu_ptr(&seg6_bpf_srh_states);
struct ipv6_sr_hdr *srh = srh_state->srh;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return false;
@@ -1408,8 +1411,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
static int input_action_end_bpf(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
- struct seg6_bpf_srh_state *srh_state =
- this_cpu_ptr(&seg6_bpf_srh_states);
+ struct seg6_bpf_srh_state *srh_state;
struct ipv6_sr_hdr *srh;
int ret;
@@ -1420,10 +1422,14 @@ static int input_action_end_bpf(struct sk_buff *skb,
}
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- /* preempt_disable is needed to protect the per-CPU buffer srh_state,
- * which is also accessed by the bpf_lwt_seg6_* helpers
+ /* The access to the per-CPU buffer srh_state is protected by running
+ * always in softirq context (with disabled BH). On PREEMPT_RT the
+ * required locking is provided by the following local_lock_nested_bh()
+ * statement. It is also accessed by the bpf_lwt_seg6_* helpers via
+ * bpf_prog_run_save_cb().
*/
- preempt_disable();
+ local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock);
+ srh_state = this_cpu_ptr(&seg6_bpf_srh_states);
srh_state->srh = srh;
srh_state->hdrlen = srh->hdrlen << 3;
srh_state->valid = true;
@@ -1446,15 +1452,15 @@ static int input_action_end_bpf(struct sk_buff *skb,
if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
goto drop;
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
- preempt_enable();
if (ret != BPF_REDIRECT)
seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
drop:
- preempt_enable();
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
kfree_skb(skb);
return -EINVAL;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7d1c098..ed062e0 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -35,8 +35,6 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
-static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
-
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -372,7 +370,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
- struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
int err;
err = xsk_rcv(xs, xdp);
@@ -387,7 +385,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
void __xsk_map_flush(void)
{
- struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
struct xdp_sock *xs, *tmp;
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -399,7 +397,7 @@ void __xsk_map_flush(void)
#ifdef CONFIG_DEBUG_NET
bool xsk_map_check_flush(void)
{
- if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
+ if (list_empty(bpf_net_ctx_get_xskmap_flush_list()))
return false;
__xsk_map_flush();
return true;
@@ -1772,7 +1770,7 @@ static struct pernet_operations xsk_net_ops = {
static int __init xsk_init(void)
{
- int err, cpu;
+ int err;
err = proto_register(&xsk_proto, 0 /* no slab */);
if (err)
@@ -1790,8 +1788,6 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
return 0;
out_pernet: