| // SPDX-License-Identifier: GPL-2.0+ |
| /* |
| * Restartable sequences system call |
| * |
| * Copyright (C) 2015, Google, Inc., |
| * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> |
| * Copyright (C) 2015-2018, EfficiOS Inc., |
| * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> |
| */ |
| |
| /* |
| * Restartable sequences are a lightweight interface that allows |
| * user-level code to be executed atomically relative to scheduler |
| * preemption and signal delivery. Typically used for implementing |
| * per-cpu operations. |
| * |
| * It allows user-space to perform update operations on per-cpu data |
| * without requiring heavy-weight atomic operations. |
| * |
| * Detailed algorithm of rseq user-space assembly sequences: |
| * |
| * init(rseq_cs) |
| * cpu = TLS->rseq::cpu_id_start |
| * [1] TLS->rseq::rseq_cs = rseq_cs |
| * [start_ip] ---------------------------- |
| * [2] if (cpu != TLS->rseq::cpu_id) |
| * goto abort_ip; |
| * [3] <last_instruction_in_cs> |
| * [post_commit_ip] ---------------------------- |
| * |
| * The address of jump target abort_ip must be outside the critical |
| * region, i.e.: |
| * |
| * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] |
| * |
| * Steps [2]-[3] (inclusive) need to be a sequence of instructions in |
| * userspace that can handle being interrupted between any of those |
| * instructions, and then resumed to the abort_ip. |
| * |
| * 1. Userspace stores the address of the struct rseq_cs assembly |
| * block descriptor into the rseq_cs field of the registered |
| * struct rseq TLS area. This update is performed through a single |
| * store within the inline assembly instruction sequence. |
| * [start_ip] |
| * |
| * 2. Userspace tests to check whether the current cpu_id field matches |
| * the cpu number loaded before start_ip, branching to abort_ip |
| * in case of a mismatch. |
| * |
| * If the sequence is preempted or interrupted by a signal |
| * at or after start_ip and before post_commit_ip, then the kernel |
| * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return |
| * ip to abort_ip before returning to user-space, so the preempted |
| * execution resumes at abort_ip. |
| * |
| * 3. Userspace critical section final instruction before |
| * post_commit_ip is the commit. The critical section is |
| * self-terminating. |
| * [post_commit_ip] |
| * |
| * 4. <success> |
| * |
| * On failure at [2], or if interrupted by preempt or signal delivery |
| * between [1] and [3]: |
| * |
| * [abort_ip] |
| * F1. <failure> |
| */ |
| |
| /* Required to select the proper per_cpu ops for rseq_stats_inc() */ |
| #define RSEQ_BUILD_SLOW_PATH |
| |
| #include <linux/debugfs.h> |
| #include <linux/hrtimer.h> |
| #include <linux/percpu.h> |
| #include <linux/prctl.h> |
| #include <linux/ratelimit.h> |
| #include <linux/rseq_entry.h> |
| #include <linux/sched.h> |
| #include <linux/syscalls.h> |
| #include <linux/uaccess.h> |
| #include <linux/types.h> |
| #include <linux/rseq.h> |
| #include <asm/ptrace.h> |
| |
| #define CREATE_TRACE_POINTS |
| #include <trace/events/rseq.h> |
| |
| DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); |
| |
| static inline void rseq_control_debug(bool on) |
| { |
| if (on) |
| static_branch_enable(&rseq_debug_enabled); |
| else |
| static_branch_disable(&rseq_debug_enabled); |
| } |
| |
| static int __init rseq_setup_debug(char *str) |
| { |
| bool on; |
| |
| if (kstrtobool(str, &on)) |
| return -EINVAL; |
| rseq_control_debug(on); |
| return 1; |
| } |
| __setup("rseq_debug=", rseq_setup_debug); |
| |
| #ifdef CONFIG_TRACEPOINTS |
/*
 * The tracepoint emitters are kept out of line in this file, which allows
 * the actual update functions to live in a header and be inlined into the
 * exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	/* Emit the rseq_update tracepoint for @t */
	trace_rseq_update(t);
}
| |
/* Out of line emitter for the rseq_ip_fixup tracepoint; arguments are passed through as is. */
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
| #endif /* CONFIG_TRACEPOINTS */ |
| |
| #ifdef CONFIG_RSEQ_STATS |
| DEFINE_PER_CPU(struct rseq_stats, rseq_stats); |
| |
| static int rseq_stats_show(struct seq_file *m, void *p) |
| { |
| struct rseq_stats stats = { }; |
| unsigned int cpu; |
| |
| for_each_possible_cpu(cpu) { |
| stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); |
| stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); |
| stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); |
| stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); |
| stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); |
| stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); |
| stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); |
| stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); |
| if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { |
| stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); |
| stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); |
| stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); |
| stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); |
| stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); |
| } |
| } |
| |
| seq_printf(m, "exit: %16lu\n", stats.exit); |
| seq_printf(m, "signal: %16lu\n", stats.signal); |
| seq_printf(m, "slowp: %16lu\n", stats.slowpath); |
| seq_printf(m, "fastp: %16lu\n", stats.fastpath); |
| seq_printf(m, "ids: %16lu\n", stats.ids); |
| seq_printf(m, "cs: %16lu\n", stats.cs); |
| seq_printf(m, "clear: %16lu\n", stats.clear); |
| seq_printf(m, "fixup: %16lu\n", stats.fixup); |
| if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { |
| seq_printf(m, "sgrant: %16lu\n", stats.s_granted); |
| seq_printf(m, "sexpir: %16lu\n", stats.s_expired); |
| seq_printf(m, "srevok: %16lu\n", stats.s_revoked); |
| seq_printf(m, "syield: %16lu\n", stats.s_yielded); |
| seq_printf(m, "sabort: %16lu\n", stats.s_aborted); |
| } |
| return 0; |
| } |
| |
| static int rseq_stats_open(struct inode *inode, struct file *file) |
| { |
| return single_open(file, rseq_stats_show, inode->i_private); |
| } |
| |
| static const struct file_operations stat_ops = { |
| .open = rseq_stats_open, |
| .read = seq_read, |
| .llseek = seq_lseek, |
| .release = single_release, |
| }; |
| |
| static int __init rseq_stats_init(struct dentry *root_dir) |
| { |
| debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); |
| return 0; |
| } |
| #else |
/* CONFIG_RSEQ_STATS=n: there is no statistics file to create. */
static inline void rseq_stats_init(struct dentry *root_dir)
{
}
| #endif /* CONFIG_RSEQ_STATS */ |
| |
| static int rseq_debug_show(struct seq_file *m, void *p) |
| { |
| bool on = static_branch_unlikely(&rseq_debug_enabled); |
| |
| seq_printf(m, "%d\n", on); |
| return 0; |
| } |
| |
| static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, |
| size_t count, loff_t *ppos) |
| { |
| bool on; |
| |
| if (kstrtobool_from_user(ubuf, count, &on)) |
| return -EINVAL; |
| |
| rseq_control_debug(on); |
| return count; |
| } |
| |
| static int rseq_debug_open(struct inode *inode, struct file *file) |
| { |
| return single_open(file, rseq_debug_show, inode->i_private); |
| } |
| |
| static const struct file_operations debug_ops = { |
| .open = rseq_debug_open, |
| .read = seq_read, |
| .write = rseq_debug_write, |
| .llseek = seq_lseek, |
| .release = single_release, |
| }; |
| |
| static void rseq_slice_ext_init(struct dentry *root_dir); |
| |
| static int __init rseq_debugfs_init(void) |
| { |
| struct dentry *root_dir = debugfs_create_dir("rseq", NULL); |
| |
| debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); |
| rseq_stats_init(root_dir); |
| if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) |
| rseq_slice_ext_init(root_dir); |
| return 0; |
| } |
| __initcall(rseq_debugfs_init); |
| |
| static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) |
| { |
| return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); |
| } |
| |
| static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) |
| { |
| struct rseq __user *urseq = t->rseq.usrptr; |
| u64 csaddr; |
| |
| scoped_user_read_access(urseq, efault) |
| unsafe_get_user(csaddr, &urseq->rseq_cs, efault); |
| if (likely(!csaddr)) |
| return true; |
| return rseq_update_user_cs(t, regs, csaddr); |
| efault: |
| return false; |
| } |
| |
| static void rseq_slowpath_update_usr(struct pt_regs *regs) |
| { |
| /* |
| * Preserve rseq state and user_irq state. The generic entry code |
| * clears user_irq on the way out, the non-generic entry |
| * architectures are not having user_irq. |
| */ |
| const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; |
| struct task_struct *t = current; |
| struct rseq_ids ids; |
| u32 node_id; |
| bool event; |
| |
| if (unlikely(t->flags & PF_EXITING)) |
| return; |
| |
| rseq_stat_inc(rseq_stats.slowpath); |
| |
| /* |
| * Read and clear the event pending bit first. If the task |
| * was not preempted or migrated or a signal is on the way, |
| * there is no point in doing any of the heavy lifting here |
| * on production kernels. In that case TIF_NOTIFY_RESUME |
| * was raised by some other functionality. |
| * |
| * This is correct because the read/clear operation is |
| * guarded against scheduler preemption, which makes it CPU |
| * local atomic. If the task is preempted right after |
| * re-enabling preemption then TIF_NOTIFY_RESUME is set |
| * again and this function is invoked another time _before_ |
| * the task is able to return to user mode. |
| * |
| * On a debug kernel, invoke the fixup code unconditionally |
| * with the result handed in to allow the detection of |
| * inconsistencies. |
| */ |
| scoped_guard(irq) { |
| event = t->rseq.event.sched_switch; |
| t->rseq.event.all &= evt_mask.all; |
| ids.cpu_id = task_cpu(t); |
| ids.mm_cid = task_mm_cid(t); |
| } |
| |
| if (!event) |
| return; |
| |
| node_id = cpu_to_node(ids.cpu_id); |
| |
| if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { |
| /* |
| * Clear the errors just in case this might survive magically, but |
| * leave the rest intact. |
| */ |
| t->rseq.event.error = 0; |
| force_sig(SIGSEGV); |
| } |
| } |
| |
/* Slow path entry point: update the user space rseq area unless @regs is NULL. */
void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * @regs is a NULL pointer when this is invoked from hypervisors
	 * before entering the guest via resume_user_mode_work().
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So the NULL case can be safely ignored here instead of
	 * pointlessly updating user space in the vcpu_run() loop.
	 */
	if (regs)
		rseq_slowpath_update_usr(regs);
}
| |
| void __rseq_signal_deliver(int sig, struct pt_regs *regs) |
| { |
| rseq_stat_inc(rseq_stats.signal); |
| /* |
| * Don't update IDs, they are handled on exit to user if |
| * necessary. The important thing is to abort a critical section of |
| * the interrupted context as after this point the instruction |
| * pointer in @regs points to the signal handler. |
| */ |
| if (unlikely(!rseq_handle_cs(current, regs))) { |
| /* |
| * Clear the errors just in case this might survive |
| * magically, but leave the rest intact. |
| */ |
| current->rseq.event.error = 0; |
| force_sigsegv(sig); |
| } |
| } |
| |
| /* |
| * Terminate the process if a syscall is issued within a restartable |
| * sequence. |
| */ |
| void __rseq_debug_syscall_return(struct pt_regs *regs) |
| { |
| struct task_struct *t = current; |
| u64 csaddr; |
| |
| if (!t->rseq.event.has_rseq) |
| return; |
| if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) |
| goto fail; |
| if (likely(!csaddr)) |
| return; |
| if (unlikely(csaddr >= TASK_SIZE)) |
| goto fail; |
| if (rseq_debug_update_user_cs(t, regs, csaddr)) |
| return; |
| fail: |
| force_sig(SIGSEGV); |
| } |
| |
| #ifdef CONFIG_DEBUG_RSEQ |
/*
 * Thin wrapper around __rseq_debug_syscall_return(), kept around to keep
 * GENERIC_ENTRY=n architectures supported.
 */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
| #endif |
| |
| static bool rseq_reset_ids(void) |
| { |
| struct rseq_ids ids = { |
| .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, |
| .mm_cid = 0, |
| }; |
| |
| /* |
| * If this fails, terminate it because this leaves the kernel in |
| * stupid state as exit to user space will try to fixup the ids |
| * again. |
| */ |
| if (rseq_set_ids(current, &ids, 0)) |
| return true; |
| |
| force_sig(SIGSEGV); |
| return false; |
| } |
| |
| /* The original rseq structure size (including padding) is 32 bytes. */ |
| #define ORIG_RSEQ_SIZE 32 |
| |
| /* |
| * sys_rseq - setup restartable sequences for caller thread. |
| */ |
| SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) |
| { |
| u32 rseqfl = 0; |
| |
| if (flags & RSEQ_FLAG_UNREGISTER) { |
| if (flags & ~RSEQ_FLAG_UNREGISTER) |
| return -EINVAL; |
| /* Unregister rseq for current thread. */ |
| if (current->rseq.usrptr != rseq || !current->rseq.usrptr) |
| return -EINVAL; |
| if (rseq_len != current->rseq.len) |
| return -EINVAL; |
| if (current->rseq.sig != sig) |
| return -EPERM; |
| if (!rseq_reset_ids()) |
| return -EFAULT; |
| rseq_reset(current); |
| return 0; |
| } |
| |
| if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) |
| return -EINVAL; |
| |
| if (current->rseq.usrptr) { |
| /* |
| * If rseq is already registered, check whether |
| * the provided address differs from the prior |
| * one. |
| */ |
| if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) |
| return -EINVAL; |
| if (current->rseq.sig != sig) |
| return -EPERM; |
| /* Already registered. */ |
| return -EBUSY; |
| } |
| |
| /* |
| * If there was no rseq previously registered, ensure the provided rseq |
| * is properly aligned, as communcated to user-space through the ELF |
| * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq |
| * size, the required alignment is the original struct rseq alignment. |
| * |
| * The rseq_len is required to be greater or equal to the original rseq |
| * size. In order to be valid, rseq_len is either the original rseq size, |
| * or large enough to contain all supported fields, as communicated to |
| * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. |
| */ |
| if (rseq_len < ORIG_RSEQ_SIZE || |
| (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || |
| (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || |
| rseq_len < offsetof(struct rseq, end)))) |
| return -EINVAL; |
| if (!access_ok(rseq, rseq_len)) |
| return -EFAULT; |
| |
| if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { |
| rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; |
| if (rseq_slice_extension_enabled() && |
| (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) |
| rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; |
| } |
| |
| scoped_user_write_access(rseq, efault) { |
| /* |
| * If the rseq_cs pointer is non-NULL on registration, clear it to |
| * avoid a potential segfault on return to user-space. The proper thing |
| * to do would have been to fail the registration but this would break |
| * older libcs that reuse the rseq area for new threads without |
| * clearing the fields. Don't bother reading it, just reset it. |
| */ |
| unsafe_put_user(0UL, &rseq->rseq_cs, efault); |
| unsafe_put_user(rseqfl, &rseq->flags, efault); |
| /* Initialize IDs in user space */ |
| unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); |
| unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); |
| unsafe_put_user(0U, &rseq->node_id, efault); |
| unsafe_put_user(0U, &rseq->mm_cid, efault); |
| unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); |
| } |
| |
| /* |
| * Activate the registration by setting the rseq area address, length |
| * and signature in the task struct. |
| */ |
| current->rseq.usrptr = rseq; |
| current->rseq.len = rseq_len; |
| current->rseq.sig = sig; |
| |
| #ifdef CONFIG_RSEQ_SLICE_EXTENSION |
| current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); |
| #endif |
| |
| /* |
| * If rseq was previously inactive, and has just been |
| * registered, ensure the cpu_id_start and cpu_id fields |
| * are updated before returning to user-space. |
| */ |
| current->rseq.event.has_rseq = true; |
| rseq_force_update(); |
| return 0; |
| |
| efault: |
| return -EFAULT; |
| } |
| |
| #ifdef CONFIG_RSEQ_SLICE_EXTENSION |
| struct slice_timer { |
| struct hrtimer timer; |
| void *cookie; |
| }; |
| |
| static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; |
| static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; |
| unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; |
| static DEFINE_PER_CPU(struct slice_timer, slice_timer); |
| DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); |
| |
| /* |
| * When the timer expires and the task is still in user space, the return |
| * from interrupt will revoke the grant and schedule. If the task already |
| * entered the kernel via a syscall and the timer fires before the syscall |
| * work was able to cancel it, then depending on the preemption model this |
| * will either reschedule on return from interrupt or in the syscall work |
| * below. |
| */ |
| static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) |
| { |
| struct slice_timer *st = container_of(tmr, struct slice_timer, timer); |
| |
| /* |
| * Validate that the task which armed the timer is still on the |
| * CPU. It could have been scheduled out without canceling the |
| * timer. |
| */ |
| if (st->cookie == current && current->rseq.slice.state.granted) { |
| rseq_stat_inc(rseq_stats.s_expired); |
| set_need_resched_current(); |
| } |
| return HRTIMER_NORESTART; |
| } |
| |
| bool __rseq_arm_slice_extension_timer(void) |
| { |
| struct slice_timer *st = this_cpu_ptr(&slice_timer); |
| struct task_struct *curr = current; |
| |
| lockdep_assert_irqs_disabled(); |
| |
| /* |
| * This check prevents a task, which got a time slice extension |
| * granted, from exceeding the maximum scheduling latency when the |
| * grant expired before going out to user space. Don't bother to |
| * clear the grant here, it will be cleaned up automatically before |
| * going out to user space after being scheduled back in. |
| */ |
| if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { |
| set_need_resched_current(); |
| return true; |
| } |
| |
| /* |
| * Store the task pointer as a cookie for comparison in the timer |
| * function. This is safe as the timer is CPU local and cannot be |
| * in the expiry function at this point. |
| */ |
| st->cookie = curr; |
| hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); |
| /* Arm the syscall entry work */ |
| set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); |
| return false; |
| } |
| |
| static void rseq_cancel_slice_extension_timer(void) |
| { |
| struct slice_timer *st = this_cpu_ptr(&slice_timer); |
| |
| /* |
| * st->cookie can be safely read as preemption is disabled and the |
| * timer is CPU local. |
| * |
| * As this is most probably the first expiring timer, the cancel is |
| * expensive as it has to reprogram the hardware, but that's less |
| * expensive than going through a full hrtimer_interrupt() cycle |
| * for nothing. |
| * |
| * hrtimer_try_to_cancel() is sufficient here as the timer is CPU |
| * local and once the hrtimer code disabled interrupts the timer |
| * callback cannot be running. |
| */ |
| if (st->cookie == current) |
| hrtimer_try_to_cancel(&st->timer); |
| } |
| |
| static inline void rseq_slice_set_need_resched(struct task_struct *curr) |
| { |
| /* |
| * The interrupt guard is required to prevent inconsistent state in |
| * this case: |
| * |
| * set_tsk_need_resched() |
| * --> Interrupt |
| * wakeup() |
| * set_tsk_need_resched() |
| * set_preempt_need_resched() |
| * schedule_on_return() |
| * clear_tsk_need_resched() |
| * clear_preempt_need_resched() |
| * set_preempt_need_resched() <- Inconsistent state |
| * |
| * This is safe vs. a remote set of TIF_NEED_RESCHED because that |
| * only sets the already set bit and does not create inconsistent |
| * state. |
| */ |
| scoped_guard(irq) |
| set_need_resched_current(); |
| } |
| |
| static void rseq_slice_validate_ctrl(u32 expected) |
| { |
| u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; |
| u32 uval; |
| |
| if (get_user(uval, sctrl) || uval != expected) |
| force_sig(SIGSEGV); |
| } |
| |
| /* |
| * Invoked from syscall entry if a time slice extension was granted and the |
| * kernel did not clear it before user space left the critical section. |
| * |
| * While the recommended way to relinquish the CPU side effect free is |
| * rseq_slice_yield(2), any syscall within a granted slice terminates the |
| * grant and immediately reschedules if required. This supports onion layer |
| * applications, where the code requesting the grant cannot control the |
| * code within the critical section. |
| */ |
| void rseq_syscall_enter_work(long syscall) |
| { |
| struct task_struct *curr = current; |
| struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; |
| |
| clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); |
| |
| if (static_branch_unlikely(&rseq_debug_enabled)) |
| rseq_slice_validate_ctrl(ctrl.all); |
| |
| /* |
| * The kernel might have raced, revoked the grant and updated |
| * userspace, but kept the SLICE work set. |
| */ |
| if (!ctrl.granted) |
| return; |
| |
| /* |
| * Required to stabilize the per CPU timer pointer and to make |
| * set_tsk_need_resched() correct on PREEMPT[RT] kernels. |
| * |
| * Leaving the scope will reschedule on preemption models FULL, |
| * LAZY and RT if necessary. |
| */ |
| scoped_guard(preempt) { |
| rseq_cancel_slice_extension_timer(); |
| /* |
| * Now that preemption is disabled, quickly check whether |
| * the task was already rescheduled before arriving here. |
| */ |
| if (!curr->rseq.event.sched_switch) { |
| rseq_slice_set_need_resched(curr); |
| |
| if (syscall == __NR_rseq_slice_yield) { |
| rseq_stat_inc(rseq_stats.s_yielded); |
| /* Update the yielded state for syscall return */ |
| curr->rseq.slice.yielded = 1; |
| } else { |
| rseq_stat_inc(rseq_stats.s_aborted); |
| } |
| } |
| } |
| /* Reschedule on NONE/VOLUNTARY preemption models */ |
| cond_resched(); |
| |
| /* Clear the grant in kernel state and user space */ |
| curr->rseq.slice.state.granted = false; |
| if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) |
| force_sig(SIGSEGV); |
| } |
| |
| int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) |
| { |
| switch (arg2) { |
| case PR_RSEQ_SLICE_EXTENSION_GET: |
| if (arg3) |
| return -EINVAL; |
| return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; |
| |
| case PR_RSEQ_SLICE_EXTENSION_SET: { |
| u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; |
| bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); |
| |
| if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) |
| return -EINVAL; |
| if (!rseq_slice_extension_enabled()) |
| return -ENOTSUPP; |
| if (!current->rseq.usrptr) |
| return -ENXIO; |
| |
| /* No change? */ |
| if (enable == !!current->rseq.slice.state.enabled) |
| return 0; |
| |
| if (get_user(rflags, ¤t->rseq.usrptr->flags)) |
| goto die; |
| |
| if (current->rseq.slice.state.enabled) |
| valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; |
| |
| if ((rflags & valid) != valid) |
| goto die; |
| |
| rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; |
| rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; |
| if (enable) |
| rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; |
| |
| if (put_user(rflags, ¤t->rseq.usrptr->flags)) |
| goto die; |
| |
| current->rseq.slice.state.enabled = enable; |
| return 0; |
| } |
| default: |
| return -EINVAL; |
| } |
| die: |
| force_sig(SIGSEGV); |
| return -EFAULT; |
| } |
| |
| /** |
| * sys_rseq_slice_yield - yield the current processor side effect free if a |
| * task granted with a time slice extension is done with |
| * the critical work before being forced out. |
| * |
| * Return: 1 if the task successfully yielded the CPU within the granted slice. |
| * 0 if the slice extension was either never granted or was revoked by |
| * going over the granted extension, using a syscall other than this one |
| * or being scheduled out earlier due to a subsequent interrupt. |
| * |
| * The syscall does not schedule because the syscall entry work immediately |
| * relinquishes the CPU and schedules if required. |
| */ |
| SYSCALL_DEFINE0(rseq_slice_yield) |
| { |
| int yielded = !!current->rseq.slice.yielded; |
| |
| current->rseq.slice.yielded = 0; |
| return yielded; |
| } |
| |
| static int rseq_slice_ext_show(struct seq_file *m, void *p) |
| { |
| seq_printf(m, "%d\n", rseq_slice_ext_nsecs); |
| return 0; |
| } |
| |
| static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, |
| size_t count, loff_t *ppos) |
| { |
| unsigned int nsecs; |
| |
| if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) |
| return -EINVAL; |
| |
| if (nsecs < rseq_slice_ext_nsecs_min) |
| return -ERANGE; |
| |
| if (nsecs > rseq_slice_ext_nsecs_max) |
| return -ERANGE; |
| |
| rseq_slice_ext_nsecs = nsecs; |
| |
| return count; |
| } |
| |
| static int rseq_slice_ext_open(struct inode *inode, struct file *file) |
| { |
| return single_open(file, rseq_slice_ext_show, inode->i_private); |
| } |
| |
| static const struct file_operations slice_ext_ops = { |
| .open = rseq_slice_ext_open, |
| .read = seq_read, |
| .write = rseq_slice_ext_write, |
| .llseek = seq_lseek, |
| .release = single_release, |
| }; |
| |
| static void rseq_slice_ext_init(struct dentry *root_dir) |
| { |
| debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); |
| } |
| |
| static int __init rseq_slice_cmdline(char *str) |
| { |
| bool on; |
| |
| if (kstrtobool(str, &on)) |
| return 0; |
| |
| if (!on) |
| static_branch_disable(&rseq_slice_extension_key); |
| return 1; |
| } |
| __setup("rseq_slice_ext=", rseq_slice_cmdline); |
| |
| static int __init rseq_slice_init(void) |
| { |
| unsigned int cpu; |
| |
| for_each_possible_cpu(cpu) { |
| hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, |
| CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); |
| } |
| return 0; |
| } |
| device_initcall(rseq_slice_init); |
| #else |
/* CONFIG_RSEQ_SLICE_EXTENSION=n: there is no slice extension file to create. */
static void rseq_slice_ext_init(struct dentry *root_dir)
{
}
| #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ |