// SPDX-License-Identifier: GPL-2.0
/*
 * Deferred user space unwinding
 */
#include <linux/sched/task_stack.h>
#include <linux/unwind_deferred.h>
#include <linux/sched/clock.h>
#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/mm.h>

/*
 * To request a deferred user space stack trace from NMI context, the
 * architecture must support a safe cmpxchg in NMI context.
 * Architectures without that support must not request a deferred user
 * space stack trace from NMI context; if one does, the request fails
 * with -EINVAL.
 */
#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
# define CAN_USE_IN_NMI 1
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	u32 old = 0;

	return try_cmpxchg(&info->id.cnt, &old, cnt);
}
#else
# define CAN_USE_IN_NMI 0
/* When NMIs are not allowed, this always succeeds */
static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
{
	info->id.cnt = cnt;
	return true;
}
#endif

/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES \
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))

/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);

#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED)

/* Zero'd bits are available for assigning callback users */
static unsigned long unwind_mask = RESERVED_BITS;
DEFINE_STATIC_SRCU(unwind_srcu);

static inline bool unwind_pending(struct unwind_task_info *info)
{
	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
}

/*
 * This is a unique percpu identifier for a given task entry context.
 * Conceptually, it's incremented every time the CPU enters the kernel from
 * user space, so that each "entry context" on the CPU gets a unique ID. In
 * reality, as an optimization, it's only incremented on demand for the first
 * deferred unwind request after a given entry-from-user.
 *
 * It's combined with the CPU id to make a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

/*
 * The context cookie is a unique identifier that is assigned to a user
 * space stacktrace. As the user space stacktrace remains the same while
 * the task is in the kernel, the cookie is an identifier for the stacktrace.
 * It is possible, however, for the stacktrace to get another cookie if
 * another request is made after the cookie was cleared and before
 * reentering user space.
 */
static u64 get_cookie(struct unwind_task_info *info)
{
	u32 cnt = 1;

	if (info->id.cpu)
		return info->id.id;

	/* LSB is always set to ensure 0 is an invalid value */
	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
	if (try_assign_cnt(info, cnt)) {
		/* Update the per cpu counter */
		__this_cpu_write(unwind_ctx_ctr, cnt);
	}
	/* Interrupts are disabled, the CPU will always be the same */
	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

	return info->id.id;
}
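
/*
 * Illustration of the counter arithmetic in get_cookie() above (not part of
 * the algorithm itself): starting from an unwind_ctx_ctr of 0, the first
 * request computes (0 + 2) | 1 == 3, the next entry context gets
 * (3 + 2) | 1 == 5, then 7, and so on. The value stays odd, so a cnt of 0
 * is never handed out and remains an invalid cookie.
 */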

/**
 * unwind_user_faultable - Produce a user stacktrace in faultable context
 * @trace: The descriptor that will store the user stacktrace
 *
 * This must be called in a known faultable context (usually when entering
 * or exiting user space). Depending on the available implementation,
 * @trace will be loaded with the addresses of the user space stacktrace,
 * if one can be found.
 *
 * Return: 0 on success and negative on error
 *         On success @trace will contain the user space stacktrace
 */
int unwind_user_faultable(struct unwind_stacktrace *trace)
{
	struct unwind_task_info *info = &current->unwind_info;
	struct unwind_cache *cache;

	/* Should always be called from faultable context */
	might_fault();

	if (!current->mm)
		return -EINVAL;

	if (!info->cache) {
		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
				      GFP_KERNEL);
		if (!info->cache)
			return -ENOMEM;
	}

	cache = info->cache;
	trace->entries = cache->entries;

	if (cache->nr_entries) {
		/*
		 * The user stack has already been previously unwound in this
		 * entry context. Skip the unwind and use the cache.
		 */
		trace->nr = cache->nr_entries;
		return 0;
	}

	trace->nr = 0;
	unwind_user(trace, UNWIND_MAX_ENTRIES);

	cache->nr_entries = trace->nr;

	/* Clear nr_entries on the way back to user space */
	set_bit(UNWIND_USED_BIT, &info->unwind_mask);

	return 0;
}

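/*
 * Perform the deferred user space unwind for @task and run every callback
 * whose bit is still pending. This executes in a faultable context: either
 * from task work on the way back to user space or from the task exit path.
 */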
static void process_unwind_deferred(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;
	struct unwind_stacktrace trace;
	struct unwind_work *work;
	unsigned long bits;
	u64 cookie;

	if (WARN_ON_ONCE(!unwind_pending(info)))
		return;

	/* Clear pending bit but make sure to have the current bits */
	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
					(atomic_long_t *)&info->unwind_mask);
	/*
	 * From here on out, the callback must always be called, even if it's
	 * just an empty trace.
	 */
	trace.nr = 0;
	trace.entries = NULL;

	unwind_user_faultable(&trace);

	if (info->cache)
		bits &= ~(info->cache->unwind_completed);

	cookie = info->id.id;

	guard(srcu)(&unwind_srcu);
	list_for_each_entry_srcu(work, &callbacks, list,
				 srcu_read_lock_held(&unwind_srcu)) {
		if (test_bit(work->bit, &bits)) {
			work->func(work, &trace, cookie);
			if (info->cache)
				info->cache->unwind_completed |= BIT(work->bit);
		}
	}
}

static void unwind_deferred_task_work(struct callback_head *head)
{
	process_unwind_deferred(current);
}

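/**
 * unwind_deferred_task_exit - Process any pending deferred unwind at task exit
 * @task: The exiting task (the code below assumes this is the current task)
 *
 * If a deferred unwind request is still pending when the task exits, run the
 * callbacks now, as the queued task work will not get another chance to run,
 * and then cancel that task work.
 */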
void unwind_deferred_task_exit(struct task_struct *task)
{
	struct unwind_task_info *info = &current->unwind_info;

	if (!unwind_pending(info))
		return;

	process_unwind_deferred(task);

	task_work_cancel(task, &info->work);
}

/**
 * unwind_deferred_request - Request a user stacktrace on task kernel exit
 * @work: Unwind descriptor requesting the trace
 * @cookie: The cookie of the first request made for this task
 *
 * Schedule a user space unwind to be done in task work before exiting the
 * kernel.
 *
 * The returned @cookie output is the generated cookie of the very first
 * request for a user space stacktrace for this task since it entered the
 * kernel. It can be from a request by any caller of this infrastructure.
 * Its value will also be passed to the callback function. It can be
 * used to stitch kernel and user stack traces together in post-processing.
 *
 * It's valid to call this function multiple times for the same @work within
 * the same task entry context. Each call will return the same cookie
 * while the task hasn't left the kernel. If the callback is not pending
 * because it has already been previously called for the same entry context,
 * it will not be called again.
 *
 * Return: 0 if the callback successfully was queued.
 *         1 if the callback is pending or was already executed.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
 */
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
	struct unwind_task_info *info = &current->unwind_info;
	unsigned long old, bits;
	unsigned long bit;
	int ret;

	*cookie = 0;

	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
	    !user_mode(task_pt_regs(current)))
		return -EINVAL;

	/*
	 * NMI requires having safe cmpxchg operations.
	 * Trigger a warning to make it obvious that an architecture
	 * is using this in NMI when it should not be.
	 */
	if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
		return -EINVAL;

	/* Do not allow cancelled works to request again */
	bit = READ_ONCE(work->bit);
	if (WARN_ON_ONCE((long)bit < 0))
		return -EINVAL;

	/* Only need the mask now */
	bit = BIT(bit);

	guard(irqsave)();

	*cookie = get_cookie(info);

	old = READ_ONCE(info->unwind_mask);

	/* Is this already queued or executed? */
	if (old & bit)
		return 1;

	/*
	 * This work's bit hasn't been set yet. Now set it with the PENDING
	 * bit and fetch the current value of unwind_mask. If either the
	 * work's bit or PENDING was already set, then this is already queued
	 * to have a callback.
	 */
	bits = UNWIND_PENDING | bit;
	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
	if (old & bits) {
		/*
		 * If the work's bit was set, whatever set it had better
		 * have also set pending and queued a callback.
		 */
		WARN_ON_ONCE(!(old & UNWIND_PENDING));
		return !!(old & bit);
	}

	/* The work has been claimed, now schedule it. */
	ret = task_work_add(current, &info->work, TWA_RESUME);

	if (WARN_ON_ONCE(ret))
		WRITE_ONCE(info->unwind_mask, 0);

	return ret;
}

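/**
 * unwind_deferred_cancel - Remove a registered unwind work descriptor
 * @work: The unwind descriptor to remove
 *
 * Removes @work from the list of callbacks, marks its bit as invalid so no
 * further requests can be made with it, waits for any callbacks currently
 * executing under SRCU to finish, and then clears the work's bit from every
 * task's pending and completed masks.
 */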
void unwind_deferred_cancel(struct unwind_work *work)
{
	struct task_struct *g, *t;
	int bit;

	if (!work)
		return;

	bit = work->bit;

	/* No work should be using a reserved bit */
	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
		return;

	guard(mutex)(&callback_mutex);
	list_del_rcu(&work->list);

	/* Do not allow any more requests and prevent callbacks */
	work->bit = -1;

	__clear_bit(bit, &unwind_mask);

	synchronize_srcu(&unwind_srcu);

	guard(rcu)();
	/* Clear this bit from all threads */
	for_each_process_thread(g, t) {
		clear_bit(bit, &t->unwind_info.unwind_mask);
		if (t->unwind_info.cache)
			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
	}
}

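/**
 * unwind_deferred_init - Register an unwind work descriptor and its callback
 * @work: The unwind descriptor to register
 * @func: The callback to call when a requested stacktrace is ready
 *
 * Assigns @work a free bit from the global unwind_mask and adds it to the
 * list of callbacks.
 *
 * Return: 0 on success, or -EBUSY if all available bits are already in use.
 */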
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
{
	memset(work, 0, sizeof(*work));

	guard(mutex)(&callback_mutex);

	/* See if there's a bit in the mask available */
	if (unwind_mask == ~0UL)
		return -EBUSY;

	work->bit = ffz(unwind_mask);
	__set_bit(work->bit, &unwind_mask);

	list_add_rcu(&work->list, &callbacks);
	work->func = func;
	return 0;
}
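
/*
 * Example usage of the deferred unwind API above. This is an illustrative
 * sketch only: my_work, my_unwind_cb and my_event_handler are hypothetical
 * names, not part of this infrastructure.
 *
 *	static struct unwind_work my_work;
 *
 *	static void my_unwind_cb(struct unwind_work *work,
 *				 struct unwind_stacktrace *trace, u64 cookie)
 *	{
 *		// Consume trace->entries[0 .. trace->nr) keyed by cookie.
 *	}
 *
 *	// At setup:
 *	//	unwind_deferred_init(&my_work, my_unwind_cb);
 *
 *	// From an event that fires in task context (NMI is allowed when
 *	// CAN_USE_IN_NMI is set):
 *	static void my_event_handler(void)
 *	{
 *		u64 cookie;
 *
 *		unwind_deferred_request(&my_work, &cookie);
 *	}
 *
 *	// At teardown:
 *	//	unwind_deferred_cancel(&my_work);
 */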

void unwind_task_init(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	memset(info, 0, sizeof(*info));
	init_task_work(&info->work, unwind_deferred_task_work);
	info->unwind_mask = 0;
}

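/*
 * Release a task's unwind state: free the stacktrace cache and cancel any
 * unwind task work that is still queued.
 */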
void unwind_task_free(struct task_struct *task)
{
	struct unwind_task_info *info = &task->unwind_info;

	kfree(info->cache);
	task_work_cancel(task, &info->work);
}