| // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) |
| // Copyright (c) 2022 Google |
| #include "vmlinux.h" |
| #include <bpf/bpf_helpers.h> |
| #include <bpf/bpf_tracing.h> |
| #include <bpf/bpf_core_read.h> |
| |
| /* task->flags for off-cpu analysis */ |
| #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
| |
| /* task->state for off-cpu analysis */ |
| #define TASK_INTERRUPTIBLE 0x0001 |
| #define TASK_UNINTERRUPTIBLE 0x0002 |
| |
| /* create a new thread */ |
| #define CLONE_THREAD 0x10000 |
| |
| #define MAX_STACKS 32 |
| #define MAX_ENTRIES 102400 |
| |
| #define MAX_CPUS 4096 |
| #define MAX_OFFCPU_LEN 37 |
| |
// vmlinux.h already defines a 'struct stack' when building with GEN_VMLINUX_H=1,
// so use a different name for this local type
| struct __stack { |
| u64 array[MAX_STACKS]; |
| }; |
| |
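/* per-task data saved when the task is scheduled out, kept in the 'tstamp' task storage map */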
| struct tstamp_data { |
| __u32 stack_id; |
| __u32 state; |
| __u64 timestamp; |
| struct __stack stack; |
| }; |
| |
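/* key used to aggregate off-cpu time in the 'off_cpu' hash map */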
| struct offcpu_key { |
| __u32 pid; |
| __u32 tgid; |
| __u32 stack_id; |
| __u32 state; |
| __u64 cgroup_id; |
| }; |
| |
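/* user stack traces of tasks being scheduled out, indexed by stack id */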
| struct { |
| __uint(type, BPF_MAP_TYPE_STACK_TRACE); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, MAX_STACKS * sizeof(__u64)); |
| __uint(max_entries, MAX_ENTRIES); |
| } stacks SEC(".maps"); |
| |
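/*
 * Layout of a directly dumped off-cpu sample: tid/pid, off-cpu time,
 * callchain (nr, PERF_CONTEXT_USER, up to MAX_STACKS entries) and cgroup id,
 * hence MAX_OFFCPU_LEN == 2 + 2 + MAX_STACKS + 1 == 37.
 */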
| struct offcpu_data { |
| u64 array[MAX_OFFCPU_LEN]; |
| }; |
| |
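/* perf event array used to emit direct off-cpu samples to the ring buffer */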
| struct { |
| __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); |
| __uint(key_size, sizeof(int)); |
| __uint(value_size, sizeof(int)); |
| __uint(max_entries, MAX_CPUS); |
| } offcpu_output SEC(".maps"); |
| |
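/* per-cpu scratch buffer for building a direct off-cpu sample */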
| struct { |
| __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(struct offcpu_data)); |
| __uint(max_entries, 1); |
| } offcpu_payload SEC(".maps"); |
| |
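/* task storage holding the tstamp_data saved at sched-out time */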
| struct { |
| __uint(type, BPF_MAP_TYPE_TASK_STORAGE); |
| __uint(map_flags, BPF_F_NO_PREALLOC); |
| __type(key, int); |
| __type(value, struct tstamp_data); |
| } tstamp SEC(".maps"); |
| |
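/* aggregated off-cpu time in nanoseconds, keyed by struct offcpu_key */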
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(key_size, sizeof(struct offcpu_key)); |
| __uint(value_size, sizeof(__u64)); |
| __uint(max_entries, MAX_ENTRIES); |
| } off_cpu SEC(".maps"); |
| |
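/*
 * The filter maps below are filled by user space: presence of a key means
 * the corresponding CPU, task or cgroup should be recorded.
 */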
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(__u8)); |
| __uint(max_entries, 1); |
| } cpu_filter SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(key_size, sizeof(__u32)); |
| __uint(value_size, sizeof(__u8)); |
| __uint(max_entries, 1); |
| } task_filter SEC(".maps"); |
| |
| struct { |
| __uint(type, BPF_MAP_TYPE_HASH); |
| __uint(key_size, sizeof(__u64)); |
| __uint(value_size, sizeof(__u8)); |
| __uint(max_entries, 1); |
| } cgroup_filter SEC(".maps"); |
| |
| /* new kernel task_struct definition */ |
| struct task_struct___new { |
| long __state; |
| } __attribute__((preserve_access_index)); |
| |
| /* old kernel task_struct definition */ |
| struct task_struct___old { |
| long state; |
| } __attribute__((preserve_access_index)); |
| |
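/*
 * Control knobs set from user space: 'enabled' starts and stops recording,
 * the 'has_*' flags say which filter maps are populated, 'uses_tgid' selects
 * tgid vs. pid keys for task_filter, and 'has_prev_state' tells whether the
 * sched_switch tracepoint passes the previous task state as an argument.
 */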
| int enabled = 0; |
| |
| const volatile int has_cpu = 0; |
| const volatile int has_task = 0; |
| const volatile int has_cgroup = 0; |
| const volatile int uses_tgid = 0; |
| |
| const volatile bool has_prev_state = false; |
| const volatile bool needs_cgroup = false; |
| const volatile bool uses_cgroup_v1 = false; |
| |
| int perf_subsys_id = -1; |
| |
/* off-cpu time (in ns) above which a sample is dumped directly to the ring buffer */
__u64 offcpu_thresh_ns;
| |
| /* |
| * Old kernel used to call it task_struct->state and now it's '__state'. |
| * Use BPF CO-RE "ignored suffix rule" to deal with it like below: |
| * |
| * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes |
| */ |
| static inline int get_task_state(struct task_struct *t) |
| { |
| /* recast pointer to capture new type for compiler */ |
| struct task_struct___new *t_new = (void *)t; |
| |
| if (bpf_core_field_exists(t_new->__state)) { |
| return BPF_CORE_READ(t_new, __state); |
| } else { |
| /* recast pointer to capture old type for compiler */ |
| struct task_struct___old *t_old = (void *)t; |
| |
| return BPF_CORE_READ(t_old, state); |
| } |
| } |
| |
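/*
 * Return the cgroup id of the task: the id in the default (v2) hierarchy, or
 * the id in the perf_event v1 hierarchy when cgroup v1 is in use.
 */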
| static inline __u64 get_cgroup_id(struct task_struct *t) |
| { |
| struct cgroup *cgrp; |
| |
| if (!uses_cgroup_v1) |
| return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); |
| |
| if (perf_subsys_id == -1) { |
| #if __has_builtin(__builtin_preserve_enum_value) |
| perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, |
| perf_event_cgrp_id); |
| #else |
| perf_subsys_id = perf_event_cgrp_id; |
| #endif |
| } |
| |
| cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); |
| return BPF_CORE_READ(cgrp, kn, id); |
| } |
| |
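/*
 * Decide whether the task being scheduled out should be recorded: only user
 * tasks sleeping in (UN)INTERRUPTIBLE state that pass the optional CPU, task
 * and cgroup filters are considered.
 */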
| static inline int can_record(struct task_struct *t, int state) |
| { |
	/* kernel threads don't have a user stack */
| if (t->flags & PF_KTHREAD) |
| return 0; |
| |
| if (state != TASK_INTERRUPTIBLE && |
| state != TASK_UNINTERRUPTIBLE) |
| return 0; |
| |
| if (has_cpu) { |
| __u32 cpu = bpf_get_smp_processor_id(); |
| __u8 *ok; |
| |
| ok = bpf_map_lookup_elem(&cpu_filter, &cpu); |
| if (!ok) |
| return 0; |
| } |
| |
| if (has_task) { |
| __u8 *ok; |
| __u32 pid; |
| |
| if (uses_tgid) |
| pid = t->tgid; |
| else |
| pid = t->pid; |
| |
| ok = bpf_map_lookup_elem(&task_filter, &pid); |
| if (!ok) |
| return 0; |
| } |
| |
| if (has_cgroup) { |
| __u8 *ok; |
| __u64 cgrp_id = get_cgroup_id(t); |
| |
| ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); |
| if (!ok) |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
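/*
 * Copy the saved user stack into the sample payload right after the callchain
 * 'nr' field and the PERF_CONTEXT_USER marker (hence the 'n + 2' offset).
 * Returns the number of entries copied.
 */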
| static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n) |
| { |
| int len = 0; |
| |
| for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len) |
| to->array[n + 2 + i] = from->array[i]; |
| |
| return len; |
| } |
| |
| /** |
| * off_cpu_dump - dump off-cpu samples to ring buffer |
| * @data: payload for dumping off-cpu samples |
| * @key: off-cpu data |
| * @stack: stack trace of the task before being scheduled out |
| * |
| * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id |
| * information of the task, and dump it as a raw sample to perf ring buffer |
| */ |
| static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key, |
| struct __stack *stack, __u64 delta) |
| { |
| int n = 0, len = 0; |
| |
| data->array[n++] = (u64)key->tgid << 32 | key->pid; |
| data->array[n++] = delta; |
| |
	/* data->array[n] is callchain->nr (updated later) */
	data->array[n + 1] = PERF_CONTEXT_USER;
	data->array[n + 2] = 0;
	len = copy_stack(stack, data, n);

	/* update length of callchain: PERF_CONTEXT_USER plus the copied entries */
	data->array[n] = len + 1;
	/* skip the nr field and the len + 1 callchain entries */
	n += len + 2;
| |
| data->array[n++] = key->cgroup_id; |
| |
| return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64)); |
| } |
| |
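/*
 * Called on every context switch: save the timestamp, state and user stack of
 * the task being scheduled out (if it passes the filters), then account the
 * off-cpu time of the task being scheduled in, either as a direct sample when
 * it exceeds offcpu_thresh_ns or as an aggregated entry in the off_cpu map.
 */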
| static int off_cpu_stat(u64 *ctx, struct task_struct *prev, |
| struct task_struct *next, int state) |
| { |
| __u64 ts; |
| __u32 stack_id; |
| struct tstamp_data *pelem; |
| |
| ts = bpf_ktime_get_ns(); |
| |
| if (!can_record(prev, state)) |
| goto next; |
| |
| stack_id = bpf_get_stackid(ctx, &stacks, |
| BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); |
| |
| pelem = bpf_task_storage_get(&tstamp, prev, NULL, |
| BPF_LOCAL_STORAGE_GET_F_CREATE); |
| if (!pelem) |
| goto next; |
| |
| pelem->timestamp = ts; |
| pelem->state = state; |
| pelem->stack_id = stack_id; |
| |
| /* |
| * If stacks are successfully collected by bpf_get_stackid(), collect them once more |
| * in task_storage for direct off-cpu sample dumping |
| */ |
| if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) { |
| /* |
| * This empty if block is used to avoid 'result unused warning' from bpf_get_stack(). |
| * If the collection fails, continue with the logic for the next task. |
| */ |
| } |
| next: |
| pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); |
| |
| if (pelem && pelem->timestamp) { |
| struct offcpu_key key = { |
| .pid = next->pid, |
| .tgid = next->tgid, |
| .stack_id = pelem->stack_id, |
| .state = pelem->state, |
| .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, |
| }; |
| __u64 delta = ts - pelem->timestamp; |
| __u64 *total; |
| |
| if (delta >= offcpu_thresh_ns) { |
| int zero = 0; |
| struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero); |
| |
| if (data) |
| off_cpu_dump(ctx, data, &key, &pelem->stack, delta); |
| } else { |
| total = bpf_map_lookup_elem(&off_cpu, &key); |
| if (total) |
| *total += delta; |
| else |
| bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); |
| } |
| |
		/* prevent the timestamp from being reused later */
| pelem->timestamp = 0; |
| } |
| |
| return 0; |
| } |
| |
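/*
 * When a task whose tgid is already in task_filter forks a new process
 * (clone without CLONE_THREAD), add the child's tgid to the filter so its
 * off-cpu time is recorded as well.
 */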
| SEC("tp_btf/task_newtask") |
| int on_newtask(u64 *ctx) |
| { |
| struct task_struct *task; |
| u64 clone_flags; |
| u32 pid; |
| u8 val = 1; |
| |
| if (!uses_tgid) |
| return 0; |
| |
| task = (struct task_struct *)bpf_get_current_task(); |
| |
| pid = BPF_CORE_READ(task, tgid); |
| if (!bpf_map_lookup_elem(&task_filter, &pid)) |
| return 0; |
| |
| task = (struct task_struct *)ctx[0]; |
| clone_flags = ctx[1]; |
| |
| pid = task->tgid; |
| if (!(clone_flags & CLONE_THREAD)) |
| bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); |
| |
| return 0; |
| } |
| |
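/*
 * sched_switch hook: get the state of the task being scheduled out, either
 * from the tracepoint argument (newer kernels) or from task_struct itself,
 * and account its off-cpu time in off_cpu_stat().
 */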
| SEC("tp_btf/sched_switch") |
| int on_switch(u64 *ctx) |
| { |
| struct task_struct *prev, *next; |
| int prev_state; |
| |
| if (!enabled) |
| return 0; |
| |
| prev = (struct task_struct *)ctx[1]; |
| next = (struct task_struct *)ctx[2]; |
| |
| if (has_prev_state) |
| prev_state = (int)ctx[3]; |
| else |
| prev_state = get_task_state(prev); |
| |
| return off_cpu_stat(ctx, prev, next, prev_state & 0xff); |
| } |
| |
| char LICENSE[] SEC("license") = "Dual BSD/GPL"; |