| // SPDX-License-Identifier: GPL-2.0 |
| |
| /* bpf_fq is intended for testing the bpf qdisc infrastructure and not a direct |
| * copy of sch_fq. bpf_fq implements the scheduling algorithm of sch_fq before |
| * 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") was |
| * introduced. It gives each flow a fair chance to transmit packets in a |
| * round-robin fashion. Note that for flow pacing, bpf_fq currently only |
| * respects skb->tstamp but not skb->sk->sk_pacing_rate. In addition, if there |
| * are multiple bpf_fq instances, they will have a shared view of flows and |
| * configuration since some key data structures, such as fq_prio_flows, |
| * fq_nonprio_flows, and fq_bpf_data, are global. |
| * |
| * To use bpf_fq alone without running selftests, use the following commands. |
| * |
| * 1. Register bpf_fq to the kernel |
| * bpftool struct_ops register bpf_qdisc_fq.bpf.o /sys/fs/bpf |
| * 2. Add bpf_fq to an interface |
| * tc qdisc add dev <interface name> root handle <handle> bpf_fq |
| * 3. Delete bpf_fq attached to the interface |
| * tc qdisc delete dev <interface name> root |
| * 4. Unregister bpf_fq |
| * bpftool struct_ops unregister name fq |
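| * 5. (Optional) Dump the registered struct_ops to verify, assuming your |
| * bpftool supports "struct_ops dump" |
| * bpftool struct_ops dump name fq |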
| * |
| * The qdisc name, bpf_fq, used in tc commands is defined by Qdisc_ops.id. |
| * The struct_ops map name, fq, used in the bpftool commands is the variable |
| * name of the struct Qdisc_ops defined below. |
| * |
| * SEC(".struct_ops") |
| * struct Qdisc_ops fq = { |
| * ... |
| * .id = "bpf_fq", |
| * }; |
| */ |
| |
| #include <vmlinux.h> |
| #include <errno.h> |
| #include <bpf/bpf_helpers.h> |
| #include "bpf_experimental.h" |
| #include "bpf_qdisc_common.h" |
| |
| char _license[] SEC("license") = "GPL"; |
| |
| #define NSEC_PER_USEC 1000L |
| #define NSEC_PER_SEC 1000000000L |
| |
| #define NUM_QUEUE (1 << 20) |
| |
| struct fq_bpf_data { |
| u32 quantum; |
| u32 initial_quantum; |
| u32 flow_refill_delay; |
| u32 flow_plimit; |
| u64 horizon; |
| u32 orphan_mask; |
| u32 timer_slack; |
| u64 time_next_delayed_flow; |
| u64 unthrottle_latency_ns; |
| u8 horizon_drop; |
| u32 new_flow_cnt; |
| u32 old_flow_cnt; |
| u64 ktime_cache; |
| }; |
| |
| enum { |
| CLS_RET_PRIO = 0, |
| CLS_RET_NONPRIO = 1, |
| CLS_RET_ERR = 2, |
| }; |
| |
| struct skb_node { |
| u64 tstamp; |
| struct sk_buff __kptr * skb; |
| struct bpf_rb_node node; |
| }; |
| |
| struct fq_flow_node { |
| int credit; |
| u32 qlen; |
| u64 age; |
| u64 time_next_packet; |
| struct bpf_list_node list_node; |
| struct bpf_rb_node rb_node; |
| struct bpf_rb_root queue __contains(skb_node, node); |
| struct bpf_spin_lock lock; |
| struct bpf_refcount refcount; |
| }; |
| |
| struct dequeue_nonprio_ctx { |
| bool stop_iter; |
| u64 expire; |
| u64 now; |
| }; |
| |
| struct remove_flows_ctx { |
| bool gc_only; |
| u32 reset_cnt; |
| u32 reset_max; |
| }; |
| |
| struct unset_throttled_flows_ctx { |
| bool unset_all; |
| u64 now; |
| }; |
| |
| struct fq_stashed_flow { |
| struct fq_flow_node __kptr * flow; |
| }; |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __type(key, __u64); |
| __type(value, struct fq_stashed_flow); |
| __uint(max_entries, NUM_QUEUE); |
| } fq_nonprio_flows SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __type(key, __u64); |
| __type(value, struct fq_stashed_flow); |
| __uint(max_entries, 1); |
| } fq_prio_flows SEC(".maps"); |
| |
| private(A) struct bpf_spin_lock fq_delayed_lock; |
| private(A) struct bpf_rb_root fq_delayed __contains(fq_flow_node, rb_node); |
| |
| private(B) struct bpf_spin_lock fq_new_flows_lock; |
| private(B) struct bpf_list_head fq_new_flows __contains(fq_flow_node, list_node); |
| |
| private(C) struct bpf_spin_lock fq_old_flows_lock; |
| private(C) struct bpf_list_head fq_old_flows __contains(fq_flow_node, list_node); |
| |
| private(D) struct fq_bpf_data q; |
| |
| /* Wrapper for bpf_kptr_xchg that expects NULL dst */ |
| static void bpf_kptr_xchg_back(void *map_val, void *ptr) |
| { |
| void *ret; |
| |
| ret = bpf_kptr_xchg(map_val, ptr); |
| if (ret) |
| bpf_obj_drop(ret); |
| } |
| |
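| /* rbtree less() callback: order queued skbs by ascending time_to_send. */ |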
| static bool skbn_tstamp_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) |
| { |
| struct skb_node *skbn_a; |
| struct skb_node *skbn_b; |
| |
| skbn_a = container_of(a, struct skb_node, node); |
| skbn_b = container_of(b, struct skb_node, node); |
| |
| return skbn_a->tstamp < skbn_b->tstamp; |
| } |
| |
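| /* rbtree less() callback: order throttled flows by ascending time_next_packet. */ |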
| static bool fn_time_next_packet_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) |
| { |
| struct fq_flow_node *flow_a; |
| struct fq_flow_node *flow_b; |
| |
| flow_a = container_of(a, struct fq_flow_node, rb_node); |
| flow_b = container_of(b, struct fq_flow_node, rb_node); |
| |
| return flow_a->time_next_packet < flow_b->time_next_packet; |
| } |
| |
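| /* Helpers that add/remove flows on the new/old flow lists under the |
| * corresponding spin lock and keep the flow counters in sync. |
| */ |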
| static void |
| fq_flows_add_head(struct bpf_list_head *head, struct bpf_spin_lock *lock, |
| struct fq_flow_node *flow, u32 *flow_cnt) |
| { |
| bpf_spin_lock(lock); |
| bpf_list_push_front(head, &flow->list_node); |
| bpf_spin_unlock(lock); |
| *flow_cnt += 1; |
| } |
| |
| static void |
| fq_flows_add_tail(struct bpf_list_head *head, struct bpf_spin_lock *lock, |
| struct fq_flow_node *flow, u32 *flow_cnt) |
| { |
| bpf_spin_lock(lock); |
| bpf_list_push_back(head, &flow->list_node); |
| bpf_spin_unlock(lock); |
| *flow_cnt += 1; |
| } |
| |
| static void |
| fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock, |
| struct bpf_list_node **node, u32 *flow_cnt) |
| { |
| bpf_spin_lock(lock); |
| *node = bpf_list_pop_front(head); |
| bpf_spin_unlock(lock); |
| *flow_cnt -= 1; |
| } |
| |
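| /* Peek emptiness by popping the front node and pushing it back under the lock. */ |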
| static bool |
| fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) |
| { |
| struct bpf_list_node *node; |
| |
| bpf_spin_lock(lock); |
| node = bpf_list_pop_front(head); |
| if (node) { |
| bpf_list_push_front(head, node); |
| bpf_spin_unlock(lock); |
| return false; |
| } |
| bpf_spin_unlock(lock); |
| |
| return true; |
| } |
| |
| /* flow->age is used to denote the state of the flow (not-detached, detached, throttled) |
| * as well as the timestamp when the flow is detached. |
| * |
| * 0: not-detached |
| * 1 - (~0ULL-1): detached |
| * ~0ULL: throttled |
| */ |
| static void fq_flow_set_detached(struct fq_flow_node *flow) |
| { |
| flow->age = bpf_jiffies64(); |
| } |
| |
| static bool fq_flow_is_detached(struct fq_flow_node *flow) |
| { |
| return flow->age != 0 && flow->age != ~0ULL; |
| } |
| |
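| /* Listener and request sockets are shared by many flows, so their packets |
| * are hashed like orphans below. |
| */ |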
| static bool sk_listener(struct sock *sk) |
| { |
| return (1 << sk->__sk_common.skc_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); |
| } |
| |
| static void fq_gc(void); |
| |
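| /* Allocate a flow node and stash it in flow_map at the given hash. If the |
| * map is full, garbage-collect stale detached flows and retry once. |
| */ |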
| static int fq_new_flow(void *flow_map, struct fq_stashed_flow **sflow, u64 hash) |
| { |
| struct fq_stashed_flow tmp = {}; |
| struct fq_flow_node *flow; |
| int ret; |
| |
| flow = bpf_obj_new(typeof(*flow)); |
| if (!flow) |
| return -ENOMEM; |
| |
| flow->credit = q.initial_quantum; |
| flow->qlen = 0; |
| flow->age = 1; |
| flow->time_next_packet = 0; |
| |
| ret = bpf_map_update_elem(flow_map, &hash, &tmp, 0); |
| if (ret == -ENOMEM || ret == -E2BIG) { |
| /* Map full: reclaim stale detached flows and retry once. */ |
| fq_gc(); |
| bpf_map_update_elem(flow_map, &hash, &tmp, 0); |
| } |
| |
| *sflow = bpf_map_lookup_elem(flow_map, &hash); |
| if (!*sflow) { |
| bpf_obj_drop(flow); |
| return -ENOMEM; |
| } |
| |
| bpf_kptr_xchg_back(&(*sflow)->flow, flow); |
| return 0; |
| } |
| |
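| /* Map an skb to its flow: TC_PRIO_CONTROL packets share the single priority |
| * flow; everything else is hashed into fq_nonprio_flows by socket hash, or |
| * by skb hash for orphaned, listener, or closed sockets. |
| */ |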
| static int |
| fq_classify(struct sk_buff *skb, struct fq_stashed_flow **sflow) |
| { |
| struct sock *sk = skb->sk; |
| int ret = CLS_RET_NONPRIO; |
| u64 hash = 0; |
| |
| if ((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL) { |
| *sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); |
| ret = CLS_RET_PRIO; |
| } else { |
| if (!sk || sk_listener(sk)) { |
| hash = bpf_skb_get_hash(skb) & q.orphan_mask; |
| /* Avoid collision with an existing flow hash, which |
| * only uses the lower 32 bits of hash, by setting the |
| * upper half of hash to 1. |
| */ |
| hash |= (1ULL << 32); |
| } else if (sk->__sk_common.skc_state == TCP_CLOSE) { |
| hash = bpf_skb_get_hash(skb) & q.orphan_mask; |
| hash |= (1ULL << 32); |
| } else { |
| hash = sk->__sk_common.skc_hash; |
| } |
| *sflow = bpf_map_lookup_elem(&fq_nonprio_flows, &hash); |
| } |
| |
| if (!*sflow) |
| ret = fq_new_flow(&fq_nonprio_flows, sflow, hash) < 0 ? |
| CLS_RET_ERR : CLS_RET_NONPRIO; |
| |
| return ret; |
| } |
| |
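| /* True if the packet's delivery time is beyond the pacing horizon, relative |
| * to the cached clock. |
| */ |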
| static bool fq_packet_beyond_horizon(struct sk_buff *skb) |
| { |
| return (s64)skb->tstamp > (s64)(q.ktime_cache + q.horizon); |
| } |
| |
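| /* Enqueue: resolve the flow, enforce the pacing horizon, and insert the skb |
| * into the flow's rbtree ordered by time_to_send. A detached flow is |
| * re-attached to the tail of the new-flow list, with its credit refilled to |
| * one quantum if it stayed idle longer than flow_refill_delay jiffies. |
| */ |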
| SEC("struct_ops/bpf_fq_enqueue") |
| int BPF_PROG(bpf_fq_enqueue, struct sk_buff *skb, struct Qdisc *sch, |
| struct bpf_sk_buff_ptr *to_free) |
| { |
| struct fq_flow_node *flow = NULL, *flow_copy; |
| struct fq_stashed_flow *sflow; |
| u64 time_to_send, jiffies; |
| struct skb_node *skbn; |
| int ret; |
| |
| if (sch->q.qlen >= sch->limit) |
| goto drop; |
| |
| if (!skb->tstamp) { |
| time_to_send = q.ktime_cache = bpf_ktime_get_ns(); |
| } else { |
| if (fq_packet_beyond_horizon(skb)) { |
| q.ktime_cache = bpf_ktime_get_ns(); |
| if (fq_packet_beyond_horizon(skb)) { |
| if (q.horizon_drop) |
| goto drop; |
| |
| skb->tstamp = q.ktime_cache + q.horizon; |
| } |
| } |
| time_to_send = skb->tstamp; |
| } |
| |
| ret = fq_classify(skb, &sflow); |
| if (ret == CLS_RET_ERR) |
| goto drop; |
| |
| flow = bpf_kptr_xchg(&sflow->flow, flow); |
| if (!flow) |
| goto drop; |
| |
| if (ret == CLS_RET_NONPRIO) { |
| if (flow->qlen >= q.flow_plimit) { |
| bpf_kptr_xchg_back(&sflow->flow, flow); |
| goto drop; |
| } |
| |
| if (fq_flow_is_detached(flow)) { |
| flow_copy = bpf_refcount_acquire(flow); |
| |
| jiffies = bpf_jiffies64(); |
| if ((s64)(jiffies - (flow_copy->age + q.flow_refill_delay)) > 0) { |
| if (flow_copy->credit < q.quantum) |
| flow_copy->credit = q.quantum; |
| } |
| flow_copy->age = 0; |
| fq_flows_add_tail(&fq_new_flows, &fq_new_flows_lock, flow_copy, |
| &q.new_flow_cnt); |
| } |
| } |
| |
| skbn = bpf_obj_new(typeof(*skbn)); |
| if (!skbn) { |
| bpf_kptr_xchg_back(&sflow->flow, flow); |
| goto drop; |
| } |
| |
| skbn->tstamp = skb->tstamp = time_to_send; |
| |
| sch->qstats.backlog += qdisc_pkt_len(skb); |
| |
| skb = bpf_kptr_xchg(&skbn->skb, skb); |
| if (skb) |
| bpf_qdisc_skb_drop(skb, to_free); |
| |
| bpf_spin_lock(&flow->lock); |
| bpf_rbtree_add(&flow->queue, &skbn->node, skbn_tstamp_less); |
| bpf_spin_unlock(&flow->lock); |
| |
| flow->qlen++; |
| bpf_kptr_xchg_back(&sflow->flow, flow); |
| |
| sch->q.qlen++; |
| return NET_XMIT_SUCCESS; |
| |
| drop: |
| bpf_qdisc_skb_drop(skb, to_free); |
| sch->qstats.drops++; |
| return NET_XMIT_DROP; |
| } |
| |
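| /* bpf_loop callback: move throttled flows whose time_next_packet has passed |
| * (or all of them, if ctx->unset_all) from fq_delayed back to the old-flow |
| * list. Returns 1 to stop the iteration. |
| */ |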
| static int fq_unset_throttled_flows(u32 index, struct unset_throttled_flows_ctx *ctx) |
| { |
| struct bpf_rb_node *node = NULL; |
| struct fq_flow_node *flow; |
| |
| bpf_spin_lock(&fq_delayed_lock); |
| |
| node = bpf_rbtree_first(&fq_delayed); |
| if (!node) { |
| bpf_spin_unlock(&fq_delayed_lock); |
| return 1; |
| } |
| |
| flow = container_of(node, struct fq_flow_node, rb_node); |
| if (!ctx->unset_all && flow->time_next_packet > ctx->now) { |
| q.time_next_delayed_flow = flow->time_next_packet; |
| bpf_spin_unlock(&fq_delayed_lock); |
| return 1; |
| } |
| |
| node = bpf_rbtree_remove(&fq_delayed, &flow->rb_node); |
| |
| bpf_spin_unlock(&fq_delayed_lock); |
| |
| if (!node) |
| return 1; |
| |
| flow = container_of(node, struct fq_flow_node, rb_node); |
| flow->age = 0; |
| fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); |
| |
| return 0; |
| } |
| |
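| /* Mark the flow throttled and park it in the fq_delayed rbtree, keyed by |
| * time_next_packet. |
| */ |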
| static void fq_flow_set_throttled(struct fq_flow_node *flow) |
| { |
| flow->age = ~0ULL; |
| |
| if (q.time_next_delayed_flow > flow->time_next_packet) |
| q.time_next_delayed_flow = flow->time_next_packet; |
| |
| bpf_spin_lock(&fq_delayed_lock); |
| bpf_rbtree_add(&fq_delayed, &flow->rb_node, fn_time_next_packet_less); |
| bpf_spin_unlock(&fq_delayed_lock); |
| } |
| |
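| /* Unthrottle flows that are due. The unthrottle latency is tracked as an |
| * EWMA with 1/8 weight: avg += (sample - avg) / 8. |
| */ |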
| static void fq_check_throttled(u64 now) |
| { |
| struct unset_throttled_flows_ctx ctx = { |
| .unset_all = false, |
| .now = now, |
| }; |
| unsigned long sample; |
| |
| if (q.time_next_delayed_flow > now) |
| return; |
| |
| sample = (unsigned long)(now - q.time_next_delayed_flow); |
| q.unthrottle_latency_ns -= q.unthrottle_latency_ns >> 3; |
| q.unthrottle_latency_ns += sample >> 3; |
| |
| q.time_next_delayed_flow = ~0ULL; |
| bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &ctx, 0); |
| } |
| |
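| /* One round of the DRR scheduler, called repeatedly from the dequeue loop: |
| * service new flows before old ones, move a flow whose credit ran out to |
| * the old-flow list after topping it up by one quantum, throttle a flow |
| * whose next packet is still in the future, and otherwise dequeue the |
| * earliest skb and charge its length against the flow's credit. |
| */ |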
| static struct sk_buff * |
| fq_dequeue_nonprio_flows(u32 index, struct dequeue_nonprio_ctx *ctx) |
| { |
| u64 time_next_packet, time_to_send; |
| struct bpf_rb_node *rb_node; |
| struct sk_buff *skb = NULL; |
| struct bpf_list_head *head; |
| struct bpf_list_node *node; |
| struct bpf_spin_lock *lock; |
| struct fq_flow_node *flow; |
| struct skb_node *skbn; |
| bool is_empty; |
| u32 *cnt; |
| |
| if (q.new_flow_cnt) { |
| head = &fq_new_flows; |
| lock = &fq_new_flows_lock; |
| cnt = &q.new_flow_cnt; |
| } else if (q.old_flow_cnt) { |
| head = &fq_old_flows; |
| lock = &fq_old_flows_lock; |
| cnt = &q.old_flow_cnt; |
| } else { |
| if (q.time_next_delayed_flow != ~0ULL) |
| ctx->expire = q.time_next_delayed_flow; |
| goto break_loop; |
| } |
| |
| fq_flows_remove_front(head, lock, &node, cnt); |
| if (!node) |
| goto break_loop; |
| |
| flow = container_of(node, struct fq_flow_node, list_node); |
| if (flow->credit <= 0) { |
| flow->credit += q.quantum; |
| fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); |
| return NULL; |
| } |
| |
| bpf_spin_lock(&flow->lock); |
| rb_node = bpf_rbtree_first(&flow->queue); |
| if (!rb_node) { |
| bpf_spin_unlock(&flow->lock); |
| is_empty = fq_flows_is_empty(&fq_old_flows, &fq_old_flows_lock); |
| if (head == &fq_new_flows && !is_empty) { |
| fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); |
| } else { |
| fq_flow_set_detached(flow); |
| bpf_obj_drop(flow); |
| } |
| return NULL; |
| } |
| |
| skbn = container_of(rb_node, struct skb_node, node); |
| time_to_send = skbn->tstamp; |
| |
| time_next_packet = (time_to_send > flow->time_next_packet) ? |
| time_to_send : flow->time_next_packet; |
| if (ctx->now < time_next_packet) { |
| bpf_spin_unlock(&flow->lock); |
| flow->time_next_packet = time_next_packet; |
| fq_flow_set_throttled(flow); |
| return NULL; |
| } |
| |
| rb_node = bpf_rbtree_remove(&flow->queue, rb_node); |
| bpf_spin_unlock(&flow->lock); |
| |
| if (!rb_node) |
| goto add_flow_and_break; |
| |
| skbn = container_of(rb_node, struct skb_node, node); |
| skb = bpf_kptr_xchg(&skbn->skb, skb); |
| bpf_obj_drop(skbn); |
| |
| if (!skb) |
| goto add_flow_and_break; |
| |
| flow->credit -= qdisc_skb_cb(skb)->pkt_len; |
| flow->qlen--; |
| |
| add_flow_and_break: |
| fq_flows_add_head(head, lock, flow, cnt); |
| |
| break_loop: |
| ctx->stop_iter = true; |
| return skb; |
| } |
| |
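| /* Dequeue the earliest skb from the single priority flow, if any. */ |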
| static struct sk_buff *fq_dequeue_prio(void) |
| { |
| struct fq_flow_node *flow = NULL; |
| struct fq_stashed_flow *sflow; |
| struct bpf_rb_node *rb_node; |
| struct sk_buff *skb = NULL; |
| struct skb_node *skbn; |
| u64 hash = 0; |
| |
| sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); |
| if (!sflow) |
| return NULL; |
| |
| flow = bpf_kptr_xchg(&sflow->flow, flow); |
| if (!flow) |
| return NULL; |
| |
| bpf_spin_lock(&flow->lock); |
| rb_node = bpf_rbtree_first(&flow->queue); |
| if (!rb_node) { |
| bpf_spin_unlock(&flow->lock); |
| goto out; |
| } |
| |
| skbn = container_of(rb_node, struct skb_node, node); |
| rb_node = bpf_rbtree_remove(&flow->queue, &skbn->node); |
| bpf_spin_unlock(&flow->lock); |
| |
| if (!rb_node) |
| goto out; |
| |
| skbn = container_of(rb_node, struct skb_node, node); |
| skb = bpf_kptr_xchg(&skbn->skb, skb); |
| bpf_obj_drop(skbn); |
| |
| out: |
| bpf_kptr_xchg_back(&sflow->flow, flow); |
| |
| return skb; |
| } |
| |
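| /* Dequeue: try the priority flow first, then run up to sch->limit DRR |
| * rounds over the non-priority flows. If nothing is eligible yet, arm the |
| * watchdog for the next delayed flow. |
| */ |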
| SEC("struct_ops/bpf_fq_dequeue") |
| struct sk_buff *BPF_PROG(bpf_fq_dequeue, struct Qdisc *sch) |
| { |
| struct dequeue_nonprio_ctx cb_ctx = {}; |
| struct sk_buff *skb = NULL; |
| int i; |
| |
| if (!sch->q.qlen) |
| goto out; |
| |
| skb = fq_dequeue_prio(); |
| if (skb) |
| goto dequeue; |
| |
| q.ktime_cache = cb_ctx.now = bpf_ktime_get_ns(); |
| fq_check_throttled(q.ktime_cache); |
| bpf_for(i, 0, sch->limit) { |
| skb = fq_dequeue_nonprio_flows(i, &cb_ctx); |
| if (cb_ctx.stop_iter) |
| break; |
| } |
| |
| if (skb) { |
| dequeue: |
| sch->q.qlen--; |
| sch->qstats.backlog -= qdisc_pkt_len(skb); |
| bpf_qdisc_bstats_update(sch, skb); |
| return skb; |
| } |
| |
| if (cb_ctx.expire) |
| bpf_qdisc_watchdog_schedule(sch, cb_ctx.expire, q.timer_slack); |
| out: |
| return NULL; |
| } |
| |
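| /* bpf_loop callback: drop one flow from the new-flow list, falling back to |
| * the old-flow list. Returns 1 once both lists are empty. |
| */ |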
| static int fq_remove_flows_in_list(u32 index, void *ctx) |
| { |
| struct bpf_list_node *node; |
| struct fq_flow_node *flow; |
| |
| bpf_spin_lock(&fq_new_flows_lock); |
| node = bpf_list_pop_front(&fq_new_flows); |
| bpf_spin_unlock(&fq_new_flows_lock); |
| if (!node) { |
| bpf_spin_lock(&fq_old_flows_lock); |
| node = bpf_list_pop_front(&fq_old_flows); |
| bpf_spin_unlock(&fq_old_flows_lock); |
| if (!node) |
| return 1; |
| } |
| |
| flow = container_of(node, struct fq_flow_node, list_node); |
| bpf_obj_drop(flow); |
| |
| return 0; |
| } |
| |
| extern unsigned CONFIG_HZ __kconfig; |
| |
| /* limit number of collected flows per round */ |
| #define FQ_GC_MAX 8 |
| #define FQ_GC_AGE (3*CONFIG_HZ) |
| |
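| /* A flow is collectable once it has been detached for more than FQ_GC_AGE |
| * jiffies. |
| */ |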
| static bool fq_gc_candidate(struct fq_flow_node *flow) |
| { |
| u64 jiffies = bpf_jiffies64(); |
| |
| return fq_flow_is_detached(flow) && |
| ((s64)(jiffies - (flow->age + FQ_GC_AGE)) > 0); |
| } |
| |
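| /* bpf_for_each_map_elem callback: delete stashed flows, either all of them |
| * or only GC candidates, up to ctx->reset_max per invocation. |
| */ |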
| static int |
| fq_remove_flows(struct bpf_map *flow_map, u64 *hash, |
| struct fq_stashed_flow *sflow, struct remove_flows_ctx *ctx) |
| { |
| if (sflow->flow && |
| (!ctx->gc_only || fq_gc_candidate(sflow->flow))) { |
| bpf_map_delete_elem(flow_map, hash); |
| ctx->reset_cnt++; |
| } |
| |
| return ctx->reset_cnt < ctx->reset_max ? 0 : 1; |
| } |
| |
| static void fq_gc(void) |
| { |
| struct remove_flows_ctx cb_ctx = { |
| .gc_only = true, |
| .reset_cnt = 0, |
| .reset_max = FQ_GC_MAX, |
| }; |
| |
| bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &cb_ctx, 0); |
| } |
| |
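| /* Reset: flush all stashed flows, recreate the priority flow, empty the |
| * round-robin lists, and unthrottle everything in fq_delayed. |
| */ |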
| SEC("struct_ops/bpf_fq_reset") |
| void BPF_PROG(bpf_fq_reset, struct Qdisc *sch) |
| { |
| struct unset_throttled_flows_ctx utf_ctx = { |
| .unset_all = true, |
| }; |
| struct remove_flows_ctx rf_ctx = { |
| .gc_only = false, |
| .reset_cnt = 0, |
| .reset_max = NUM_QUEUE, |
| }; |
| struct fq_stashed_flow *sflow; |
| u64 hash = 0; |
| |
| sch->q.qlen = 0; |
| sch->qstats.backlog = 0; |
| |
| bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &rf_ctx, 0); |
| |
| rf_ctx.reset_cnt = 0; |
| bpf_for_each_map_elem(&fq_prio_flows, fq_remove_flows, &rf_ctx, 0); |
| fq_new_flow(&fq_prio_flows, &sflow, hash); |
| |
| bpf_loop(NUM_QUEUE, fq_remove_flows_in_list, NULL, 0); |
| q.new_flow_cnt = 0; |
| q.old_flow_cnt = 0; |
| |
| bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &utf_ctx, 0); |
| } |
| |
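| /* Initialize the priority flow and sch_fq-like defaults: quantum is two |
| * MTU-sized packets and initial_quantum is ten, matching sch_fq. |
| */ |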
| SEC("struct_ops/bpf_fq_init") |
| int BPF_PROG(bpf_fq_init, struct Qdisc *sch, struct nlattr *opt, |
| struct netlink_ext_ack *extack) |
| { |
| struct net_device *dev = sch->dev_queue->dev; |
| u32 psched_mtu = dev->mtu + dev->hard_header_len; |
| struct fq_stashed_flow *sflow; |
| u64 hash = 0; |
| |
| if (fq_new_flow(&fq_prio_flows, &sflow, hash) < 0) |
| return -ENOMEM; |
| |
| sch->limit = 10000; |
| q.initial_quantum = 10 * psched_mtu; |
| q.quantum = 2 * psched_mtu; |
| q.flow_refill_delay = 40; |
| q.flow_plimit = 100; |
| q.horizon = 10ULL * NSEC_PER_SEC; |
| q.horizon_drop = 1; |
| q.orphan_mask = 1024 - 1; |
| q.timer_slack = 10 * NSEC_PER_USEC; |
| q.time_next_delayed_flow = ~0ULL; |
| q.unthrottle_latency_ns = 0ULL; |
| q.new_flow_cnt = 0; |
| q.old_flow_cnt = 0; |
| |
| return 0; |
| } |
| |
| SEC("struct_ops") |
| void BPF_PROG(bpf_fq_destroy, struct Qdisc *sch) |
| { |
| } |
| |
| SEC(".struct_ops") |
| struct Qdisc_ops fq = { |
| .enqueue = (void *)bpf_fq_enqueue, |
| .dequeue = (void *)bpf_fq_dequeue, |
| .reset = (void *)bpf_fq_reset, |
| .init = (void *)bpf_fq_init, |
| .destroy = (void *)bpf_fq_destroy, |
| .id = "bpf_fq", |
| }; |