| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Copyright (c) 2024, Microsoft Corporation. |
| * |
| * The main part of the mshv_root module, providing APIs to create |
| * and manage guest partitions. |
| * |
| * Authors: Microsoft Linux virtualization team |
| */ |
| |
| #include <linux/kernel.h> |
| #include <linux/module.h> |
| #include <linux/fs.h> |
| #include <linux/miscdevice.h> |
| #include <linux/slab.h> |
| #include <linux/file.h> |
| #include <linux/anon_inodes.h> |
| #include <linux/mm.h> |
| #include <linux/io.h> |
| #include <linux/cpuhotplug.h> |
| #include <linux/random.h> |
| #include <asm/mshyperv.h> |
| #include <linux/hyperv.h> |
| #include <linux/notifier.h> |
| #include <linux/reboot.h> |
| #include <linux/kexec.h> |
| #include <linux/page-flags.h> |
| #include <linux/crash_dump.h> |
| #include <linux/panic_notifier.h> |
| #include <linux/vmalloc.h> |
| |
| #include "mshv_eventfd.h" |
| #include "mshv.h" |
| #include "mshv_root.h" |
| |
| MODULE_AUTHOR("Microsoft"); |
| MODULE_LICENSE("GPL"); |
| MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); |
| |
| /* TODO move this to mshyperv.h when needed outside this driver */ |
| static inline bool hv_parent_partition(void) |
| { |
| return hv_root_partition(); |
| } |
| |
| /* TODO move this to another file when debugfs code is added */ |
| enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ |
| #if defined(CONFIG_X86) |
| VpRootDispatchThreadBlocked = 201, |
| #elif defined(CONFIG_ARM64) |
| VpRootDispatchThreadBlocked = 94, |
| #endif |
| VpStatsMaxCounter |
| }; |
| |
| struct hv_stats_page { |
| union { |
| u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ |
| u8 data[HV_HYP_PAGE_SIZE]; |
| }; |
| } __packed; |
| |
| struct mshv_root mshv_root; |
| |
| enum hv_scheduler_type hv_scheduler_type; |
| |
| /* Once the fast extended hypercall ABI is implemented, these can go away. */ |
| static void * __percpu *root_scheduler_input; |
| static void * __percpu *root_scheduler_output; |
| |
| static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); |
| static int mshv_dev_open(struct inode *inode, struct file *filp); |
| static int mshv_dev_release(struct inode *inode, struct file *filp); |
| static int mshv_vp_release(struct inode *inode, struct file *filp); |
| static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); |
| static int mshv_partition_release(struct inode *inode, struct file *filp); |
| static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); |
| static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); |
| static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); |
| static int mshv_init_async_handler(struct mshv_partition *partition); |
| static void mshv_async_hvcall_handler(void *data, u64 *status); |
| |
| static const union hv_input_vtl input_vtl_zero; |
| static const union hv_input_vtl input_vtl_normal = { |
| .target_vtl = HV_NORMAL_VTL, |
| .use_target_vtl = 1, |
| }; |
| |
| static const struct vm_operations_struct mshv_vp_vm_ops = { |
| .fault = mshv_vp_fault, |
| }; |
| |
| static const struct file_operations mshv_vp_fops = { |
| .owner = THIS_MODULE, |
| .release = mshv_vp_release, |
| .unlocked_ioctl = mshv_vp_ioctl, |
| .llseek = noop_llseek, |
| .mmap = mshv_vp_mmap, |
| }; |
| |
| static const struct file_operations mshv_partition_fops = { |
| .owner = THIS_MODULE, |
| .release = mshv_partition_release, |
| .unlocked_ioctl = mshv_partition_ioctl, |
| .llseek = noop_llseek, |
| }; |
| |
| static const struct file_operations mshv_dev_fops = { |
| .owner = THIS_MODULE, |
| .open = mshv_dev_open, |
| .release = mshv_dev_release, |
| .unlocked_ioctl = mshv_dev_ioctl, |
| .llseek = noop_llseek, |
| }; |
| |
| static struct miscdevice mshv_dev = { |
| .minor = MISC_DYNAMIC_MINOR, |
| .name = "mshv", |
| .fops = &mshv_dev_fops, |
| .mode = 0600, |
| }; |
| |
| /* |
| * Only allow hypercalls that have a u64 partition id as the first member of |
| * the input structure. |
| * These are sorted by value. |
| */ |
| static u16 mshv_passthru_hvcalls[] = { |
| HVCALL_GET_PARTITION_PROPERTY, |
| HVCALL_SET_PARTITION_PROPERTY, |
| HVCALL_INSTALL_INTERCEPT, |
| HVCALL_GET_VP_REGISTERS, |
| HVCALL_SET_VP_REGISTERS, |
| HVCALL_TRANSLATE_VIRTUAL_ADDRESS, |
| HVCALL_CLEAR_VIRTUAL_INTERRUPT, |
| HVCALL_REGISTER_INTERCEPT_RESULT, |
| HVCALL_ASSERT_VIRTUAL_INTERRUPT, |
| HVCALL_GET_GPA_PAGES_ACCESS_STATES, |
| HVCALL_SIGNAL_EVENT_DIRECT, |
| HVCALL_POST_MESSAGE_DIRECT, |
| HVCALL_GET_VP_CPUID_VALUES, |
| }; |
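| /* |
| * Illustrative usage sketch (not kernel code): a VMM could drive one of the |
| * allowed hypercalls above through the MSHV_ROOT_HVCALL ioctl. The driver |
| * overwrites the leading u64 of the input with the partition id, so the |
| * value userspace puts there does not matter. Field names follow struct |
| * mshv_root_hvcall as used below; buffer sizes are arbitrary and error |
| * handling is elided. |
| * |
| *   __u8 in[64] = { 0 }, out[64];   // hypercall-specific layouts |
| *   struct mshv_root_hvcall args = { |
| *       .code    = HVCALL_GET_PARTITION_PROPERTY, |
| *       .in_sz   = sizeof(in), |
| *       .in_ptr  = (__u64)(uintptr_t)in, |
| *       .out_sz  = sizeof(out), |
| *       .out_ptr = (__u64)(uintptr_t)out, |
| *   }; |
| *   if (!ioctl(partition_fd, MSHV_ROOT_HVCALL, &args)) |
| *       ;   // args.status holds the raw Hyper-V status |
| */ |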
| |
| static bool mshv_hvcall_is_async(u16 code) |
| { |
| switch (code) { |
| case HVCALL_SET_PARTITION_PROPERTY: |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, |
| bool partition_locked, |
| void __user *user_args) |
| { |
| u64 status; |
| int ret = 0, i; |
| bool is_async; |
| struct mshv_root_hvcall args; |
| struct page *page; |
| unsigned int pages_order; |
| void *input_pg = NULL; |
| void *output_pg = NULL; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || |
| mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) |
| return -EINVAL; |
| |
| if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) |
| return -EINVAL; |
| |
| for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) |
| if (args.code == mshv_passthru_hvcalls[i]) |
| break; |
| |
| if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) |
| return -EINVAL; |
| |
| is_async = mshv_hvcall_is_async(args.code); |
| if (is_async) { |
| /* Async hypercalls can only be issued via the partition fd */ |
| if (!partition_locked) |
| return -EINVAL; |
| ret = mshv_init_async_handler(partition); |
| if (ret) |
| return ret; |
| } |
| |
| pages_order = args.out_ptr ? 1 : 0; |
| page = alloc_pages(GFP_KERNEL, pages_order); |
| if (!page) |
| return -ENOMEM; |
| input_pg = page_address(page); |
| |
| if (args.out_ptr) |
| output_pg = (char *)input_pg + PAGE_SIZE; |
| |
| if (copy_from_user(input_pg, (void __user *)args.in_ptr, |
| args.in_sz)) { |
| ret = -EFAULT; |
| goto free_pages_out; |
| } |
| |
| /* |
| * NOTE: This only works because all the allowed hypercalls' input |
| * structs begin with a u64 partition_id field. |
| */ |
| *(u64 *)input_pg = partition->pt_id; |
| |
| if (args.reps) |
| status = hv_do_rep_hypercall(args.code, args.reps, 0, |
| input_pg, output_pg); |
| else |
| status = hv_do_hypercall(args.code, input_pg, output_pg); |
| |
| if (hv_result(status) == HV_STATUS_CALL_PENDING) { |
| if (is_async) { |
| mshv_async_hvcall_handler(partition, &status); |
| } else { /* Paranoia check. This shouldn't happen! */ |
| ret = -EBADFD; |
| goto free_pages_out; |
| } |
| } |
| |
| if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { |
| ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); |
| if (!ret) |
| ret = -EAGAIN; |
| } else if (!hv_result_success(status)) { |
| ret = hv_result_to_errno(status); |
| } |
| |
| /* |
| * Always return the status and output data regardless of result. |
| * The VMM may need it to determine how to proceed. E.g. the status may |
| * contain the number of reps completed if a rep hypercall partially |
| * succeeded. |
| */ |
| args.status = hv_result(status); |
| args.reps = args.reps ? hv_repcomp(status) : 0; |
| if (copy_to_user(user_args, &args, sizeof(args))) |
| ret = -EFAULT; |
| |
| if (output_pg && |
| copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) |
| ret = -EFAULT; |
| |
| free_pages_out: |
| free_pages((unsigned long)input_pg, pages_order); |
| |
| return ret; |
| } |
| |
| static inline bool is_ghcb_mapping_available(void) |
| { |
| #if IS_ENABLED(CONFIG_X86_64) |
| return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; |
| #else |
| return false; |
| #endif |
| } |
| |
| static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, |
| struct hv_register_assoc *registers) |
| { |
| return hv_call_get_vp_registers(vp_index, partition_id, |
| count, input_vtl_zero, registers); |
| } |
| |
| static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, |
| struct hv_register_assoc *registers) |
| { |
| return hv_call_set_vp_registers(vp_index, partition_id, |
| count, input_vtl_zero, registers); |
| } |
| |
| /* |
| * Explicit guest vCPU suspend is asynchronous by nature (as it is requested |
| * by a dom0 vCPU on behalf of a guest vCPU) and thus it can race with |
| * "intercept" suspend, done by the hypervisor. |
| * "Intercept" suspend leads to an asynchronous message delivery to dom0, |
| * which should be awaited to keep the VP loop consistent (i.e. no message |
| * pending upon VP resume). |
| * VP intercept suspend can't happen when the VP is explicitly suspended |
| * already, so there are only two possible race scenarios: |
| * 1. implicit suspend bit set -> explicit suspend bit set -> message sent |
| * 2. implicit suspend bit set -> message sent -> explicit suspend bit set |
| * Checking for the implicit suspend bit after the explicit suspend request |
| * has succeeded covers either case, and reliably tells whether there is a |
| * message to receive and deliver to the VMM. |
| */ |
| static int |
| mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) |
| { |
| struct hv_register_assoc explicit_suspend = { |
| .name = HV_REGISTER_EXPLICIT_SUSPEND |
| }; |
| struct hv_register_assoc intercept_suspend = { |
| .name = HV_REGISTER_INTERCEPT_SUSPEND |
| }; |
| union hv_explicit_suspend_register *es = |
| &explicit_suspend.value.explicit_suspend; |
| union hv_intercept_suspend_register *is = |
| &intercept_suspend.value.intercept_suspend; |
| int ret; |
| |
| es->suspended = 1; |
| |
| ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| 1, &explicit_suspend); |
| if (ret) { |
| vp_err(vp, "Failed to explicitly suspend vCPU\n"); |
| return ret; |
| } |
| |
| ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| 1, &intercept_suspend); |
| if (ret) { |
| vp_err(vp, "Failed to get intercept suspend state\n"); |
| return ret; |
| } |
| |
| *message_in_flight = is->suspended; |
| |
| return 0; |
| } |
| |
| /* |
| * This function is used when VPs are scheduled by the hypervisor's |
| * scheduler. |
| * |
| * Caller has to make sure the registers contain a cleared |
| * HV_REGISTER_INTERCEPT_SUSPEND followed by a cleared |
| * HV_REGISTER_EXPLICIT_SUSPEND, exactly in this order (the hypervisor |
| * clears them sequentially). With the opposite order, a newly arrived |
| * HV_REGISTER_INTERCEPT_SUSPEND could be invalidly cleared after the VP |
| * is released from HV_REGISTER_EXPLICIT_SUSPEND. |
| */ |
| static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) |
| { |
| long ret; |
| struct hv_register_assoc suspend_regs[2] = { |
| { .name = HV_REGISTER_INTERCEPT_SUSPEND }, |
| { .name = HV_REGISTER_EXPLICIT_SUSPEND } |
| }; |
| size_t count = ARRAY_SIZE(suspend_regs); |
| |
| /* Resume VP execution */ |
| ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| count, suspend_regs); |
| if (ret) { |
| vp_err(vp, "Failed to resume vp execution. %lx\n", ret); |
| return ret; |
| } |
| |
| ret = wait_event_interruptible(vp->run.vp_suspend_queue, |
| vp->run.kicked_by_hv == 1); |
| if (ret) { |
| bool message_in_flight; |
| |
| /* |
| * The wait was interrupted by a signal: suspend the vCPU |
| * explicitly and pick up the message in flight (if any). |
| */ |
| ret = mshv_suspend_vp(vp, &message_in_flight); |
| if (ret) |
| return ret; |
| |
| /* Return if no message in flight */ |
| if (!message_in_flight) |
| return -EINTR; |
| |
| /* Wait for the message in flight. */ |
| wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); |
| } |
| |
| /* |
| * Reset the flag to make the wait_event call above work |
| * next time. |
| */ |
| vp->run.kicked_by_hv = 0; |
| |
| return 0; |
| } |
| |
| static int |
| mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, |
| struct hv_output_dispatch_vp *res) |
| { |
| struct hv_input_dispatch_vp *input; |
| struct hv_output_dispatch_vp *output; |
| u64 status; |
| |
| preempt_disable(); |
| input = *this_cpu_ptr(root_scheduler_input); |
| output = *this_cpu_ptr(root_scheduler_output); |
| |
| memset(input, 0, sizeof(*input)); |
| memset(output, 0, sizeof(*output)); |
| |
| input->partition_id = vp->vp_partition->pt_id; |
| input->vp_index = vp->vp_index; |
| input->time_slice = 0; /* Run forever until something happens */ |
| input->spec_ctrl = 0; /* TODO: set sensible flags */ |
| input->flags = flags; |
| |
| vp->run.flags.root_sched_dispatched = 1; |
| status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); |
| vp->run.flags.root_sched_dispatched = 0; |
| |
| *res = *output; |
| preempt_enable(); |
| |
| if (!hv_result_success(status)) |
| vp_err(vp, "%s: status %s\n", __func__, |
| hv_result_to_string(status)); |
| |
| return hv_result_to_errno(status); |
| } |
| |
| static int |
| mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) |
| { |
| struct hv_register_assoc explicit_suspend = { |
| .name = HV_REGISTER_EXPLICIT_SUSPEND, |
| .value.explicit_suspend.suspended = 0, |
| }; |
| int ret; |
| |
| ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| 1, &explicit_suspend); |
| |
| if (ret) |
| vp_err(vp, "Failed to unsuspend\n"); |
| |
| return ret; |
| } |
| |
| #if IS_ENABLED(CONFIG_X86_64) |
| static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) |
| { |
| if (!vp->vp_register_page) |
| return 0; |
| return vp->vp_register_page->interrupt_vectors.as_uint64; |
| } |
| #else |
| static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) |
| { |
| return 0; |
| } |
| #endif |
| |
| static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) |
| { |
| struct hv_stats_page **stats = vp->vp_stats_pages; |
| u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; |
| u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; |
| |
| return self_vp_cntrs[VpRootDispatchThreadBlocked] || |
| parent_vp_cntrs[VpRootDispatchThreadBlocked]; |
| } |
| |
| static int |
| mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) |
| { |
| int ret; |
| |
| ret = wait_event_interruptible(vp->run.vp_suspend_queue, |
| (vp->run.kicked_by_hv == 1 && |
| !mshv_vp_dispatch_thread_blocked(vp)) || |
| mshv_vp_interrupt_pending(vp)); |
| if (ret) |
| return -EINTR; |
| |
| vp->run.flags.root_sched_blocked = 0; |
| vp->run.kicked_by_hv = 0; |
| |
| return 0; |
| } |
| |
| static int mshv_pre_guest_mode_work(struct mshv_vp *vp) |
| { |
| const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING | |
| _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME; |
| ulong th_flags; |
| |
| th_flags = read_thread_flags(); |
| while (th_flags & work_flags) { |
| int ret; |
| |
| /* NB: the following may call schedule() */ |
| ret = mshv_do_pre_guest_mode_work(th_flags); |
| |
| if (ret) |
| return ret; |
| |
| th_flags = read_thread_flags(); |
| } |
| |
| return 0; |
| } |
| |
| /* Must be called with interrupts enabled */ |
| static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) |
| { |
| long ret; |
| |
| if (vp->run.flags.root_sched_blocked) { |
| /* |
| * Dispatch state of this VP is blocked. Need to wait |
| * for the hypervisor to clear the blocked state before |
| * dispatching it. |
| */ |
| ret = mshv_vp_wait_for_hv_kick(vp); |
| if (ret) |
| return ret; |
| } |
| |
| do { |
| u32 flags = 0; |
| struct hv_output_dispatch_vp output; |
| |
| ret = mshv_pre_guest_mode_work(vp); |
| if (ret) |
| break; |
| |
| if (vp->run.flags.intercept_suspend) |
| flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; |
| |
| if (mshv_vp_interrupt_pending(vp)) |
| flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; |
| |
| ret = mshv_vp_dispatch(vp, flags, &output); |
| if (ret) |
| break; |
| |
| vp->run.flags.intercept_suspend = 0; |
| |
| if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { |
| if (output.dispatch_event == |
| HV_VP_DISPATCH_EVENT_SUSPEND) { |
| /* |
| * TODO: remove the warning once VP canceling |
| * is supported |
| */ |
| WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), |
| "%s: vp#%d: unexpected explicit suspend\n", |
| __func__, vp->vp_index); |
| /* |
| * Need to clear explicit suspend before |
| * dispatching. |
| * Explicit suspend is either: |
| * - set right after the first VP dispatch or |
| * - set explicitly via hypercall |
| * Since the latter case is not yet supported, |
| * simply clear it here. |
| */ |
| ret = mshv_vp_clear_explicit_suspend(vp); |
| if (ret) |
| break; |
| |
| ret = mshv_vp_wait_for_hv_kick(vp); |
| if (ret) |
| break; |
| } else { |
| vp->run.flags.root_sched_blocked = 1; |
| ret = mshv_vp_wait_for_hv_kick(vp); |
| if (ret) |
| break; |
| } |
| } else { |
| /* HV_VP_DISPATCH_STATE_READY */ |
| if (output.dispatch_event == |
| HV_VP_DISPATCH_EVENT_INTERCEPT) |
| vp->run.flags.intercept_suspend = 1; |
| } |
| } while (!vp->run.flags.intercept_suspend); |
| |
| return ret; |
| } |
| |
| static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, |
| "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); |
| |
| static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) |
| { |
| long rc; |
| |
| if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) |
| rc = mshv_run_vp_with_root_scheduler(vp); |
| else |
| rc = mshv_run_vp_with_hyp_scheduler(vp); |
| |
| if (rc) |
| return rc; |
| |
| if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, |
| sizeof(struct hv_message))) |
| rc = -EFAULT; |
| |
| return rc; |
| } |
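| /* |
| * Illustrative usage sketch (not kernel code): a VMM run loop typically |
| * re-issues MSHV_RUN_VP and decodes each returned intercept message |
| * (struct hv_message, copied from vp_intercept_msg_page above). Error |
| * handling and message decoding are elided; handle_intercept() is a |
| * hypothetical VMM helper. |
| * |
| *   struct hv_message msg; |
| * |
| *   for (;;) { |
| *       if (ioctl(vp_fd, MSHV_RUN_VP, &msg) < 0) |
| *           break;   // e.g. interrupted by a signal |
| *       handle_intercept(&msg); |
| *   } |
| */ |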
| |
| static int |
| mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, |
| struct hv_vp_state_data state_data, |
| unsigned long user_pfn, size_t page_count, |
| bool is_set) |
| { |
| int completed, ret = 0; |
| unsigned long check; |
| struct page **pages; |
| |
| if (page_count > INT_MAX) |
| return -EINVAL; |
| /* |
| * Check the arithmetic for wraparound/overflow. |
| * The last page address in the buffer is: |
| * (user_pfn + (page_count - 1)) * PAGE_SIZE |
| */ |
| if (check_add_overflow(user_pfn, (page_count - 1), &check)) |
| return -EOVERFLOW; |
| if (check_mul_overflow(check, PAGE_SIZE, &check)) |
| return -EOVERFLOW; |
| |
| /* Pin user pages so hypervisor can copy directly to them */ |
| pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); |
| if (!pages) |
| return -ENOMEM; |
| |
| for (completed = 0; completed < page_count; completed += ret) { |
| unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; |
| int remaining = page_count - completed; |
| |
| ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, |
| &pages[completed]); |
| if (ret < 0) { |
| vp_err(vp, "%s: Failed to pin user pages error %i\n", |
| __func__, ret); |
| goto unpin_pages; |
| } |
| } |
| |
| if (is_set) |
| ret = hv_call_set_vp_state(vp->vp_index, |
| vp->vp_partition->pt_id, |
| state_data, page_count, pages, |
| 0, NULL); |
| else |
| ret = hv_call_get_vp_state(vp->vp_index, |
| vp->vp_partition->pt_id, |
| state_data, page_count, pages, |
| NULL); |
| |
| unpin_pages: |
| unpin_user_pages(pages, completed); |
| kfree(pages); |
| return ret; |
| } |
| |
| static long |
| mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, |
| struct mshv_get_set_vp_state __user *user_args, |
| bool is_set) |
| { |
| struct mshv_get_set_vp_state args; |
| long ret = 0; |
| union hv_output_get_vp_state vp_state; |
| u32 data_sz; |
| struct hv_vp_state_data state_data = {}; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || |
| !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || |
| !PAGE_ALIGNED(args.buf_ptr)) |
| return -EINVAL; |
| |
| if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) |
| return -EFAULT; |
| |
| switch (args.type) { |
| case MSHV_VP_STATE_LAPIC: |
| state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; |
| data_sz = HV_HYP_PAGE_SIZE; |
| break; |
| case MSHV_VP_STATE_XSAVE: |
| { |
| u64 data_sz_64; |
| |
| ret = hv_call_get_partition_property(vp->vp_partition->pt_id, |
| HV_PARTITION_PROPERTY_XSAVE_STATES, |
| &state_data.xsave.states.as_uint64); |
| if (ret) |
| return ret; |
| |
| ret = hv_call_get_partition_property(vp->vp_partition->pt_id, |
| HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, |
| &data_sz_64); |
| if (ret) |
| return ret; |
| |
| data_sz = (u32)data_sz_64; |
| state_data.xsave.flags = 0; |
| /* Always request legacy states */ |
| state_data.xsave.states.legacy_x87 = 1; |
| state_data.xsave.states.legacy_sse = 1; |
| state_data.type = HV_GET_SET_VP_STATE_XSAVE; |
| break; |
| } |
| case MSHV_VP_STATE_SIMP: |
| state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; |
| data_sz = HV_HYP_PAGE_SIZE; |
| break; |
| case MSHV_VP_STATE_SIEFP: |
| state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; |
| data_sz = HV_HYP_PAGE_SIZE; |
| break; |
| case MSHV_VP_STATE_SYNTHETIC_TIMERS: |
| state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; |
| data_sz = sizeof(vp_state.synthetic_timers_state); |
| break; |
| default: |
| return -EINVAL; |
| } |
| |
| if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) |
| return -EFAULT; |
| |
| if (data_sz > args.buf_sz) |
| return -EINVAL; |
| |
| /* If the data is transmitted via pfns, delegate to helper */ |
| if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { |
| unsigned long user_pfn = PFN_DOWN(args.buf_ptr); |
| size_t page_count = PFN_DOWN(args.buf_sz); |
| |
| return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, |
| page_count, is_set); |
| } |
| |
| /* Paranoia check - this shouldn't happen! */ |
| if (data_sz > sizeof(vp_state)) { |
| vp_err(vp, "Invalid vp state data size!\n"); |
| return -EINVAL; |
| } |
| |
| if (is_set) { |
| if (copy_from_user(&vp_state, (void __user *)args.buf_ptr, data_sz)) |
| return -EFAULT; |
| |
| return hv_call_set_vp_state(vp->vp_index, |
| vp->vp_partition->pt_id, |
| state_data, 0, NULL, |
| sizeof(vp_state), (u8 *)&vp_state); |
| } |
| |
| ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, |
| state_data, 0, NULL, &vp_state); |
| if (ret) |
| return ret; |
| |
| if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) |
| return -EFAULT; |
| |
| return 0; |
| } |
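| /* |
| * Illustrative usage sketch (not kernel code): reading the local APIC state |
| * of a VP. buf_ptr and buf_sz must be page aligned; the driver writes the |
| * actual data size back to buf_sz. Field names follow struct |
| * mshv_get_set_vp_state as used above; error handling is elided. |
| * |
| *   void *buf = aligned_alloc(4096, 4096); |
| *   struct mshv_get_set_vp_state st = { |
| *       .type    = MSHV_VP_STATE_LAPIC, |
| *       .buf_sz  = 4096, |
| *       .buf_ptr = (__u64)(uintptr_t)buf, |
| *   }; |
| *   ioctl(vp_fd, MSHV_GET_VP_STATE, &st); |
| */ |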
| |
| static long |
| mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) |
| { |
| struct mshv_vp *vp = filp->private_data; |
| long r = -ENOTTY; |
| |
| if (mutex_lock_killable(&vp->vp_mutex)) |
| return -EINTR; |
| |
| switch (ioctl) { |
| case MSHV_RUN_VP: |
| r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); |
| break; |
| case MSHV_GET_VP_STATE: |
| r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); |
| break; |
| case MSHV_SET_VP_STATE: |
| r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); |
| break; |
| case MSHV_ROOT_HVCALL: |
| r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, |
| (void __user *)arg); |
| break; |
| default: |
| vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); |
| break; |
| } |
| mutex_unlock(&vp->vp_mutex); |
| |
| return r; |
| } |
| |
| static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) |
| { |
| struct mshv_vp *vp = vmf->vma->vm_file->private_data; |
| |
| switch (vmf->vma->vm_pgoff) { |
| case MSHV_VP_MMAP_OFFSET_REGISTERS: |
| vmf->page = virt_to_page(vp->vp_register_page); |
| break; |
| case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: |
| vmf->page = virt_to_page(vp->vp_intercept_msg_page); |
| break; |
| case MSHV_VP_MMAP_OFFSET_GHCB: |
| vmf->page = virt_to_page(vp->vp_ghcb_page); |
| break; |
| default: |
| return VM_FAULT_SIGBUS; |
| } |
| |
| get_page(vmf->page); |
| |
| return 0; |
| } |
| |
| static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) |
| { |
| struct mshv_vp *vp = file->private_data; |
| |
| switch (vma->vm_pgoff) { |
| case MSHV_VP_MMAP_OFFSET_REGISTERS: |
| if (!vp->vp_register_page) |
| return -ENODEV; |
| break; |
| case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: |
| if (!vp->vp_intercept_msg_page) |
| return -ENODEV; |
| break; |
| case MSHV_VP_MMAP_OFFSET_GHCB: |
| if (!vp->vp_ghcb_page) |
| return -ENODEV; |
| break; |
| default: |
| return -EINVAL; |
| } |
| |
| vma->vm_ops = &mshv_vp_vm_ops; |
| return 0; |
| } |
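| /* |
| * Illustrative usage sketch (not kernel code): userspace maps a shared VP |
| * page by passing one of the MSHV_VP_MMAP_OFFSET_* constants, scaled by the |
| * page size, as the mmap offset. vm_pgoff is that byte offset divided by |
| * PAGE_SIZE, which is why the handlers above compare it against the raw |
| * constants. |
| * |
| *   void *regs = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, |
| *                     MAP_SHARED, vp_fd, |
| *                     MSHV_VP_MMAP_OFFSET_REGISTERS * getpagesize()); |
| */ |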
| |
| static int |
| mshv_vp_release(struct inode *inode, struct file *filp) |
| { |
| struct mshv_vp *vp = filp->private_data; |
| |
| /* Rest of VP cleanup happens in destroy_partition() */ |
| mshv_partition_put(vp->vp_partition); |
| return 0; |
| } |
| |
| static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) |
| { |
| union hv_stats_object_identity identity = { |
| .vp.partition_id = partition_id, |
| .vp.vp_index = vp_index, |
| }; |
| |
| identity.vp.stats_area_type = HV_STATS_AREA_SELF; |
| hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); |
| |
| identity.vp.stats_area_type = HV_STATS_AREA_PARENT; |
| hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); |
| } |
| |
| static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, |
| void *stats_pages[]) |
| { |
| union hv_stats_object_identity identity = { |
| .vp.partition_id = partition_id, |
| .vp.vp_index = vp_index, |
| }; |
| int err; |
| |
| identity.vp.stats_area_type = HV_STATS_AREA_SELF; |
| err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, |
| &stats_pages[HV_STATS_AREA_SELF]); |
| if (err) |
| return err; |
| |
| identity.vp.stats_area_type = HV_STATS_AREA_PARENT; |
| err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, |
| &stats_pages[HV_STATS_AREA_PARENT]); |
| if (err) |
| goto unmap_self; |
| |
| return 0; |
| |
| unmap_self: |
| identity.vp.stats_area_type = HV_STATS_AREA_SELF; |
| hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); |
| return err; |
| } |
| |
| static long |
| mshv_partition_ioctl_create_vp(struct mshv_partition *partition, |
| void __user *arg) |
| { |
| struct mshv_create_vp args; |
| struct mshv_vp *vp; |
| struct page *intercept_message_page, *register_page, *ghcb_page; |
| void *stats_pages[2]; |
| long ret; |
| |
| if (copy_from_user(&args, arg, sizeof(args))) |
| return -EFAULT; |
| |
| if (args.vp_index >= MSHV_MAX_VPS) |
| return -EINVAL; |
| |
| if (partition->pt_vp_array[args.vp_index]) |
| return -EEXIST; |
| |
| ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, |
| 0 /* Only valid for root partition VPs */); |
| if (ret) |
| return ret; |
| |
| ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, |
| input_vtl_zero, |
| &intercept_message_page); |
| if (ret) |
| goto destroy_vp; |
| |
| if (!mshv_partition_encrypted(partition)) { |
| ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_REGISTERS, |
| input_vtl_zero, |
| ®ister_page); |
| if (ret) |
| goto unmap_intercept_message_page; |
| } |
| |
| if (mshv_partition_encrypted(partition) && |
| is_ghcb_mapping_available()) { |
| ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_GHCB, |
| input_vtl_normal, |
| &ghcb_page); |
| if (ret) |
| goto unmap_register_page; |
| } |
| |
| if (hv_parent_partition()) { |
| ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, |
| stats_pages); |
| if (ret) |
| goto unmap_ghcb_page; |
| } |
| |
| vp = kzalloc(sizeof(*vp), GFP_KERNEL); |
| if (!vp) { |
| ret = -ENOMEM; |
| goto unmap_stats_pages; |
| } |
| |
| vp->vp_partition = mshv_partition_get(partition); |
| if (!vp->vp_partition) { |
| ret = -EBADF; |
| goto free_vp; |
| } |
| |
| mutex_init(&vp->vp_mutex); |
| init_waitqueue_head(&vp->run.vp_suspend_queue); |
| atomic64_set(&vp->run.vp_signaled_count, 0); |
| |
| vp->vp_index = args.vp_index; |
| vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); |
| if (!mshv_partition_encrypted(partition)) |
| vp->vp_register_page = page_to_virt(register_page); |
| |
| if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) |
| vp->vp_ghcb_page = page_to_virt(ghcb_page); |
| |
| if (hv_parent_partition()) |
| memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); |
| |
| /* |
| * Keep anon_inode_getfd last: it installs the fd in the fd table and |
| * thus makes the state accessible from user space. |
| */ |
| ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, |
| O_RDWR | O_CLOEXEC); |
| if (ret < 0) |
| goto put_partition; |
| |
| /* No extra locking needed: partition ioctls are serialized by pt_mutex */ |
| partition->pt_vp_count++; |
| partition->pt_vp_array[args.vp_index] = vp; |
| |
| return ret; |
| |
| put_partition: |
| mshv_partition_put(partition); |
| free_vp: |
| kfree(vp); |
| unmap_stats_pages: |
| if (hv_parent_partition()) |
| mshv_vp_stats_unmap(partition->pt_id, args.vp_index); |
| unmap_ghcb_page: |
| if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { |
| hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_GHCB, |
| input_vtl_normal); |
| } |
| unmap_register_page: |
| if (!mshv_partition_encrypted(partition)) { |
| hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_REGISTERS, |
| input_vtl_zero); |
| } |
| unmap_intercept_message_page: |
| hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, |
| HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, |
| input_vtl_zero); |
| destroy_vp: |
| hv_call_delete_vp(partition->pt_id, args.vp_index); |
| return ret; |
| } |
| |
| static int mshv_init_async_handler(struct mshv_partition *partition) |
| { |
| if (completion_done(&partition->async_hypercall)) { |
| pt_err(partition, |
| "Cannot issue async hypercall while another one in progress!\n"); |
| return -EPERM; |
| } |
| |
| reinit_completion(&partition->async_hypercall); |
| return 0; |
| } |
| |
| static void mshv_async_hvcall_handler(void *data, u64 *status) |
| { |
| struct mshv_partition *partition = data; |
| |
| wait_for_completion(&partition->async_hypercall); |
| pt_dbg(partition, "Async hypercall completed!\n"); |
| |
| *status = partition->async_hypercall_status; |
| } |
| |
| static int |
| mshv_partition_region_share(struct mshv_mem_region *region) |
| { |
| u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; |
| |
| if (region->flags.large_pages) |
| flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; |
| |
| return hv_call_modify_spa_host_access(region->partition->pt_id, |
| region->pages, region->nr_pages, |
| HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, |
| flags, true); |
| } |
| |
| static int |
| mshv_partition_region_unshare(struct mshv_mem_region *region) |
| { |
| u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; |
| |
| if (region->flags.large_pages) |
| flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; |
| |
| return hv_call_modify_spa_host_access(region->partition->pt_id, |
| region->pages, region->nr_pages, |
| 0, |
| flags, false); |
| } |
| |
| static int |
| mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, |
| u64 page_offset, u64 page_count) |
| { |
| if (page_offset + page_count > region->nr_pages) |
| return -EINVAL; |
| |
| if (region->flags.large_pages) |
| map_flags |= HV_MAP_GPA_LARGE_PAGE; |
| |
| /* ask the hypervisor to map guest ram */ |
| return hv_call_map_gpa_pages(region->partition->pt_id, |
| region->start_gfn + page_offset, |
| page_count, map_flags, |
| region->pages + page_offset); |
| } |
| |
| static int |
| mshv_region_map(struct mshv_mem_region *region) |
| { |
| u32 map_flags = region->hv_map_flags; |
| |
| return mshv_region_remap_pages(region, map_flags, |
| 0, region->nr_pages); |
| } |
| |
| static void |
| mshv_region_evict_pages(struct mshv_mem_region *region, |
| u64 page_offset, u64 page_count) |
| { |
| if (region->flags.range_pinned) |
| unpin_user_pages(region->pages + page_offset, page_count); |
| |
| memset(region->pages + page_offset, 0, |
| page_count * sizeof(struct page *)); |
| } |
| |
| static void |
| mshv_region_evict(struct mshv_mem_region *region) |
| { |
| mshv_region_evict_pages(region, 0, region->nr_pages); |
| } |
| |
| static int |
| mshv_region_populate_pages(struct mshv_mem_region *region, |
| u64 page_offset, u64 page_count) |
| { |
| u64 done_count, nr_pages; |
| struct page **pages; |
| __u64 userspace_addr; |
| int ret; |
| |
| if (page_offset + page_count > region->nr_pages) |
| return -EINVAL; |
| |
| for (done_count = 0; done_count < page_count; done_count += ret) { |
| pages = region->pages + page_offset + done_count; |
| userspace_addr = region->start_uaddr + |
| (page_offset + done_count) * |
| HV_HYP_PAGE_SIZE; |
| nr_pages = min(page_count - done_count, |
| MSHV_PIN_PAGES_BATCH_SIZE); |
| |
| /* |
| * Pinning at 4K page granularity works for large pages too: |
| * all page structs within the large page are returned. |
| * |
| * Pin requests are batched because pin_user_pages_fast |
| * with the FOLL_LONGTERM flag does a large temporary |
| * allocation of contiguous memory. |
| */ |
| if (region->flags.range_pinned) |
| ret = pin_user_pages_fast(userspace_addr, |
| nr_pages, |
| FOLL_WRITE | FOLL_LONGTERM, |
| pages); |
| else |
| ret = -EOPNOTSUPP; |
| |
| if (ret < 0) |
| goto release_pages; |
| } |
| |
| if (PageHuge(region->pages[page_offset])) |
| region->flags.large_pages = true; |
| |
| return 0; |
| |
| release_pages: |
| mshv_region_evict_pages(region, page_offset, done_count); |
| return ret; |
| } |
| |
| static int |
| mshv_region_populate(struct mshv_mem_region *region) |
| { |
| return mshv_region_populate_pages(region, 0, region->nr_pages); |
| } |
| |
| static struct mshv_mem_region * |
| mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) |
| { |
| struct mshv_mem_region *region; |
| |
| hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { |
| if (gfn >= region->start_gfn && |
| gfn < region->start_gfn + region->nr_pages) |
| return region; |
| } |
| |
| return NULL; |
| } |
| |
| static struct mshv_mem_region * |
| mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) |
| { |
| struct mshv_mem_region *region; |
| |
| hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { |
| if (uaddr >= region->start_uaddr && |
| uaddr < region->start_uaddr + |
| (region->nr_pages << HV_HYP_PAGE_SHIFT)) |
| return region; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * NB: caller checks and makes sure mem->size is page aligned |
| * Returns: 0 with regionpp updated on success, or -errno |
| */ |
| static int mshv_partition_create_region(struct mshv_partition *partition, |
| struct mshv_user_mem_region *mem, |
| struct mshv_mem_region **regionpp, |
| bool is_mmio) |
| { |
| struct mshv_mem_region *region; |
| u64 nr_pages = HVPFN_DOWN(mem->size); |
| |
| /* Reject overlapping regions */ |
| if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || |
| mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || |
| mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || |
| mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) |
| return -EEXIST; |
| |
| region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); |
| if (!region) |
| return -ENOMEM; |
| |
| region->nr_pages = nr_pages; |
| region->start_gfn = mem->guest_pfn; |
| region->start_uaddr = mem->userspace_addr; |
| region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; |
| if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) |
| region->hv_map_flags |= HV_MAP_GPA_WRITABLE; |
| if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) |
| region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; |
| |
| /* Note: large_pages flag populated when we pin the pages */ |
| if (!is_mmio) |
| region->flags.range_pinned = true; |
| |
| region->partition = partition; |
| |
| *regionpp = region; |
| |
| return 0; |
| } |
| |
| /* |
| * Map guest RAM. For an SNP partition, make sure host access to the |
| * memory is released first. |
| * Side effects: in case of failure, pages are unpinned when feasible. |
| */ |
| static int |
| mshv_partition_mem_region_map(struct mshv_mem_region *region) |
| { |
| struct mshv_partition *partition = region->partition; |
| int ret; |
| |
| ret = mshv_region_populate(region); |
| if (ret) { |
| pt_err(partition, "Failed to populate memory region: %d\n", |
| ret); |
| goto err_out; |
| } |
| |
| /* |
| * For an SNP partition it is a requirement that for every memory region |
| * that we are going to map for this partition we should make sure that |
| * host access to that region is released. This is ensured by doing an |
| * additional hypercall which will update the SLAT to release host |
| * access to guest memory regions. |
| */ |
| if (mshv_partition_encrypted(partition)) { |
| ret = mshv_partition_region_unshare(region); |
| if (ret) { |
| pt_err(partition, |
| "Failed to unshare memory region (guest_pfn: %llu): %d\n", |
| region->start_gfn, ret); |
| goto evict_region; |
| } |
| } |
| |
| ret = mshv_region_map(region); |
| if (ret && mshv_partition_encrypted(partition)) { |
| int shrc; |
| |
| shrc = mshv_partition_region_share(region); |
| if (!shrc) |
| goto evict_region; |
| |
| pt_err(partition, |
| "Failed to share memory region (guest_pfn: %llu): %d\n", |
| region->start_gfn, shrc); |
| /* |
| * Don't unpin if marking shared failed: the pages are no longer |
| * mapped in the host, i.e. the root partition. |
| */ |
| goto err_out; |
| } |
| |
| return 0; |
| |
| evict_region: |
| mshv_region_evict(region); |
| err_out: |
| return ret; |
| } |
| |
| /* |
| * This maps two things: guest RAM and, for PCI passthru, MMIO space. |
| * |
| * mmio: |
| * - vfio overloads vm_pgoff to store the mmio start pfn/spa. |
| * - Two things need to happen for mapping an mmio range: |
| * 1. mapped in the uaddr so the VMM can access it. |
| * 2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it. |
| * |
| * This function takes care of the second. The first one is managed by vfio |
| * and hence is handled via vfio_pci_mmap_fault(). |
| */ |
| static long |
| mshv_map_user_memory(struct mshv_partition *partition, |
| struct mshv_user_mem_region mem) |
| { |
| struct mshv_mem_region *region; |
| struct vm_area_struct *vma; |
| bool is_mmio; |
| ulong mmio_pfn; |
| long ret; |
| |
| if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || |
| !access_ok((const void __user *)mem.userspace_addr, mem.size)) |
| return -EINVAL; |
| |
| mmap_read_lock(current->mm); |
| vma = vma_lookup(current->mm, mem.userspace_addr); |
| is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : false; |
| mmio_pfn = is_mmio ? vma->vm_pgoff : 0; |
| mmap_read_unlock(current->mm); |
| |
| if (!vma) |
| return -EINVAL; |
| |
| ret = mshv_partition_create_region(partition, &mem, ®ion, |
| is_mmio); |
| if (ret) |
| return ret; |
| |
| if (is_mmio) |
| ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, |
| mmio_pfn, HVPFN_DOWN(mem.size)); |
| else |
| ret = mshv_partition_mem_region_map(region); |
| |
| if (ret) |
| goto errout; |
| |
| /* Install the new region */ |
| hlist_add_head(®ion->hnode, &partition->pt_mem_regions); |
| |
| return 0; |
| |
| errout: |
| vfree(region); |
| return ret; |
| } |
| |
| /* Called for unmapping both the guest ram and the mmio space */ |
| static long |
| mshv_unmap_user_memory(struct mshv_partition *partition, |
| struct mshv_user_mem_region mem) |
| { |
| struct mshv_mem_region *region; |
| u32 unmap_flags = 0; |
| |
| if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) |
| return -EINVAL; |
| |
| region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); |
| if (!region) |
| return -EINVAL; |
| |
| /* Paranoia check */ |
| if (region->start_uaddr != mem.userspace_addr || |
| region->start_gfn != mem.guest_pfn || |
| region->nr_pages != HVPFN_DOWN(mem.size)) |
| return -EINVAL; |
| |
| hlist_del(®ion->hnode); |
| |
| if (region->flags.large_pages) |
| unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; |
| |
| /* Ignore unmap failures and continue, as the process may be exiting */ |
| hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, |
| region->nr_pages, unmap_flags); |
| |
| mshv_region_evict(region); |
| |
| vfree(region); |
| return 0; |
| } |
| |
| static long |
| mshv_partition_ioctl_set_memory(struct mshv_partition *partition, |
| struct mshv_user_mem_region __user *user_mem) |
| { |
| struct mshv_user_mem_region mem; |
| |
| if (copy_from_user(&mem, user_mem, sizeof(mem))) |
| return -EFAULT; |
| |
| if (!mem.size || |
| !PAGE_ALIGNED(mem.size) || |
| !PAGE_ALIGNED(mem.userspace_addr) || |
| (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || |
| mshv_field_nonzero(mem, rsvd)) |
| return -EINVAL; |
| |
| if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) |
| return mshv_unmap_user_memory(partition, mem); |
| |
| return mshv_map_user_memory(partition, mem); |
| } |
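| /* |
| * Illustrative usage sketch (not kernel code): mapping VMM-allocated memory |
| * as guest RAM. Field names follow struct mshv_user_mem_region as used |
| * above; the 4K shift matches HV_HYP_PAGE_SHIFT and error handling is |
| * elided. Re-issuing the ioctl with MSHV_SET_MEM_BIT_UNMAP set tears the |
| * mapping down again. |
| * |
| *   struct mshv_user_mem_region mem = { |
| *       .size           = ram_size,   // page aligned |
| *       .guest_pfn      = gpa >> 12, |
| *       .userspace_addr = (__u64)(uintptr_t)ram_va, |
| *       .flags          = (1ULL << MSHV_SET_MEM_BIT_WRITABLE) | |
| *                         (1ULL << MSHV_SET_MEM_BIT_EXECUTABLE), |
| *   }; |
| *   ioctl(partition_fd, MSHV_SET_GUEST_MEMORY, &mem); |
| */ |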
| |
| static long |
| mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, |
| void __user *user_args) |
| { |
| struct mshv_user_ioeventfd args; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| return mshv_set_unset_ioeventfd(partition, &args); |
| } |
| |
| static long |
| mshv_partition_ioctl_irqfd(struct mshv_partition *partition, |
| void __user *user_args) |
| { |
| struct mshv_user_irqfd args; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| return mshv_set_unset_irqfd(partition, &args); |
| } |
| |
| static long |
| mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, |
| void __user *user_args) |
| { |
| struct mshv_gpap_access_bitmap args; |
| union hv_gpa_page_access_state *states; |
| long ret, i; |
| union hv_gpa_page_access_state_flags hv_flags = {}; |
| u8 hv_type_mask; |
| ulong bitmap_buf_sz, states_buf_sz; |
| int written = 0; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || |
| args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || |
| mshv_field_nonzero(args, rsvd) || !args.page_count || |
| !args.bitmap_ptr) |
| return -EINVAL; |
| |
| if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) |
| return -E2BIG; |
| |
| /* Num bytes needed to store bitmap; one bit per page rounded up */ |
| bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); |
| |
| /* Sanity check */ |
| if (bitmap_buf_sz > states_buf_sz) |
| return -EBADFD; |
| |
| switch (args.access_type) { |
| case MSHV_GPAP_ACCESS_TYPE_ACCESSED: |
| hv_type_mask = 1; |
| if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { |
| hv_flags.clear_accessed = 1; |
| /* not accessed implies not dirty */ |
| hv_flags.clear_dirty = 1; |
| } else { /* MSHV_GPAP_ACCESS_OP_SET */ |
| hv_flags.set_accessed = 1; |
| } |
| break; |
| case MSHV_GPAP_ACCESS_TYPE_DIRTY: |
| hv_type_mask = 2; |
| if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { |
| hv_flags.clear_dirty = 1; |
| } else { /* MSHV_GPAP_ACCESS_OP_SET */ |
| hv_flags.set_dirty = 1; |
| /* dirty implies accessed */ |
| hv_flags.set_accessed = 1; |
| } |
| break; |
| } |
| |
| states = vzalloc(states_buf_sz); |
| if (!states) |
| return -ENOMEM; |
| |
| ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, |
| args.gpap_base, hv_flags, &written, |
| states); |
| if (ret) |
| goto free_return; |
| |
| /* |
| * Overwrite states buffer with bitmap - the bits in hv_type_mask |
| * correspond to bitfields in hv_gpa_page_access_state |
| */ |
| for (i = 0; i < written; ++i) |
| __assign_bit(i, (ulong *)states, |
| states[i].as_uint8 & hv_type_mask); |
| |
| /* zero the unused bits in the last byte(s) of the returned bitmap */ |
| for (i = written; i < bitmap_buf_sz * 8; ++i) |
| __clear_bit(i, (ulong *)states); |
| |
| if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) |
| ret = -EFAULT; |
| |
| free_return: |
| vfree(states); |
| return ret; |
| } |
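| /* |
| * Illustrative usage sketch (not kernel code): fetching and clearing the |
| * dirty state of a range of guest pages, e.g. for live migration, one bit |
| * per page. Field names follow struct mshv_gpap_access_bitmap as used |
| * above; npages is assumed to be a multiple of 64 and error handling is |
| * elided. |
| * |
| *   __u64 bitmap[npages / 64]; |
| *   struct mshv_gpap_access_bitmap args = { |
| *       .access_type = MSHV_GPAP_ACCESS_TYPE_DIRTY, |
| *       .access_op   = MSHV_GPAP_ACCESS_OP_CLEAR, |
| *       .gpap_base   = first_gfn, |
| *       .page_count  = npages, |
| *       .bitmap_ptr  = (__u64)(uintptr_t)bitmap, |
| *   }; |
| *   ioctl(partition_fd, MSHV_GET_GPAP_ACCESS_BITMAP, &args); |
| */ |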
| |
| static long |
| mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, |
| void __user *user_args) |
| { |
| struct mshv_user_irq_entry *entries = NULL; |
| struct mshv_user_irq_table args; |
| long ret; |
| |
| if (copy_from_user(&args, user_args, sizeof(args))) |
| return -EFAULT; |
| |
| if (args.nr > MSHV_MAX_GUEST_IRQS || |
| mshv_field_nonzero(args, rsvd)) |
| return -EINVAL; |
| |
| if (args.nr) { |
| struct mshv_user_irq_table __user *urouting = user_args; |
| |
| entries = vmemdup_user(urouting->entries, |
| array_size(sizeof(*entries), |
| args.nr)); |
| if (IS_ERR(entries)) |
| return PTR_ERR(entries); |
| } |
| ret = mshv_update_routing_table(partition, entries, args.nr); |
| kvfree(entries); |
| |
| return ret; |
| } |
| |
| static long |
| mshv_partition_ioctl_initialize(struct mshv_partition *partition) |
| { |
| long ret; |
| |
| if (partition->pt_initialized) |
| return 0; |
| |
| ret = hv_call_initialize_partition(partition->pt_id); |
| if (ret) |
| goto withdraw_mem; |
| |
| partition->pt_initialized = true; |
| |
| return 0; |
| |
| withdraw_mem: |
| hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); |
| |
| return ret; |
| } |
| |
| static long |
| mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) |
| { |
| struct mshv_partition *partition = filp->private_data; |
| long ret; |
| void __user *uarg = (void __user *)arg; |
| |
| if (mutex_lock_killable(&partition->pt_mutex)) |
| return -EINTR; |
| |
| switch (ioctl) { |
| case MSHV_INITIALIZE_PARTITION: |
| ret = mshv_partition_ioctl_initialize(partition); |
| break; |
| case MSHV_SET_GUEST_MEMORY: |
| ret = mshv_partition_ioctl_set_memory(partition, uarg); |
| break; |
| case MSHV_CREATE_VP: |
| ret = mshv_partition_ioctl_create_vp(partition, uarg); |
| break; |
| case MSHV_IRQFD: |
| ret = mshv_partition_ioctl_irqfd(partition, uarg); |
| break; |
| case MSHV_IOEVENTFD: |
| ret = mshv_partition_ioctl_ioeventfd(partition, uarg); |
| break; |
| case MSHV_SET_MSI_ROUTING: |
| ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); |
| break; |
| case MSHV_GET_GPAP_ACCESS_BITMAP: |
| ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, |
| uarg); |
| break; |
| case MSHV_ROOT_HVCALL: |
| ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); |
| break; |
| default: |
| ret = -ENOTTY; |
| } |
| |
| mutex_unlock(&partition->pt_mutex); |
| return ret; |
| } |
| |
| static int |
| disable_vp_dispatch(struct mshv_vp *vp) |
| { |
| int ret; |
| struct hv_register_assoc dispatch_suspend = { |
| .name = HV_REGISTER_DISPATCH_SUSPEND, |
| .value.dispatch_suspend.suspended = 1, |
| }; |
| |
| ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| 1, &dispatch_suspend); |
| if (ret) |
| vp_err(vp, "failed to suspend\n"); |
| |
| return ret; |
| } |
| |
| static int |
| get_vp_signaled_count(struct mshv_vp *vp, u64 *count) |
| { |
| int ret; |
| struct hv_register_assoc root_signal_count = { |
| .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, |
| }; |
| |
| ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, |
| 1, &root_signal_count); |
| |
| if (ret) { |
| vp_err(vp, "Failed to get root signal count"); |
| *count = 0; |
| return ret; |
| } |
| |
| *count = root_signal_count.value.reg64; |
| |
| return ret; |
| } |
| |
| static void |
| drain_vp_signals(struct mshv_vp *vp) |
| { |
| u64 hv_signal_count; |
| u64 vp_signal_count; |
| |
| get_vp_signaled_count(vp, &hv_signal_count); |
| |
| vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); |
| |
| /* |
| * There should be at most 1 outstanding notification, but be extra |
| * careful anyway. |
| */ |
| while (hv_signal_count != vp_signal_count) { |
| WARN_ON(hv_signal_count - vp_signal_count != 1); |
| |
| if (wait_event_interruptible(vp->run.vp_suspend_queue, |
| vp->run.kicked_by_hv == 1)) |
| break; |
| vp->run.kicked_by_hv = 0; |
| vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); |
| } |
| } |
| |
| static void drain_all_vps(const struct mshv_partition *partition) |
| { |
| int i; |
| struct mshv_vp *vp; |
| |
| /* |
| * VPs are reachable from ISR. It is safe to not take the partition |
| * lock because nobody else can enter this function and drop the |
| * partition from the list. |
| */ |
| for (i = 0; i < MSHV_MAX_VPS; i++) { |
| vp = partition->pt_vp_array[i]; |
| if (!vp) |
| continue; |
| /* |
| * Disable dispatching of the VP in the hypervisor. After this |
| * the hypervisor guarantees it won't generate any signals for |
| * the VP and the hypervisor's VP signal count won't change. |
| */ |
| disable_vp_dispatch(vp); |
| drain_vp_signals(vp); |
| } |
| } |
| |
| static void |
| remove_partition(struct mshv_partition *partition) |
| { |
| spin_lock(&mshv_root.pt_ht_lock); |
| hlist_del_rcu(&partition->pt_hnode); |
| spin_unlock(&mshv_root.pt_ht_lock); |
| |
| synchronize_rcu(); |
| } |
| |
| /* |
| * Tear down a partition and remove it from the list. |
| * Partition's refcount must be 0 |
| */ |
| static void destroy_partition(struct mshv_partition *partition) |
| { |
| struct mshv_vp *vp; |
| struct mshv_mem_region *region; |
| int i, ret; |
| struct hlist_node *n; |
| |
| if (refcount_read(&partition->pt_ref_count)) { |
| pt_err(partition, |
| "Attempt to destroy partition but refcount > 0\n"); |
| return; |
| } |
| |
| if (partition->pt_initialized) { |
| /* |
| * We only need to drain signals for root scheduler. This should be |
| * done before removing the partition from the partition list. |
| */ |
| if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) |
| drain_all_vps(partition); |
| |
| /* Remove vps */ |
| for (i = 0; i < MSHV_MAX_VPS; ++i) { |
| vp = partition->pt_vp_array[i]; |
| if (!vp) |
| continue; |
| |
| if (hv_parent_partition()) |
| mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); |
| |
| if (vp->vp_register_page) { |
| (void)hv_call_unmap_vp_state_page(partition->pt_id, |
| vp->vp_index, |
| HV_VP_STATE_PAGE_REGISTERS, |
| input_vtl_zero); |
| vp->vp_register_page = NULL; |
| } |
| |
| (void)hv_call_unmap_vp_state_page(partition->pt_id, |
| vp->vp_index, |
| HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, |
| input_vtl_zero); |
| vp->vp_intercept_msg_page = NULL; |
| |
| if (vp->vp_ghcb_page) { |
| (void)hv_call_unmap_vp_state_page(partition->pt_id, |
| vp->vp_index, |
| HV_VP_STATE_PAGE_GHCB, |
| input_vtl_normal); |
| vp->vp_ghcb_page = NULL; |
| } |
| |
| kfree(vp); |
| |
| partition->pt_vp_array[i] = NULL; |
| } |
| |
| /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ |
| hv_call_finalize_partition(partition->pt_id); |
| |
| partition->pt_initialized = false; |
| } |
| |
| remove_partition(partition); |
| |
| /* Remove regions, regain access to the memory and unpin the pages */ |
| hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, |
| hnode) { |
| hlist_del(®ion->hnode); |
| |
| if (mshv_partition_encrypted(partition)) { |
| ret = mshv_partition_region_share(region); |
| if (ret) { |
| pt_err(partition, |
| "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", |
| ret); |
| return; |
| } |
| } |
| |
| mshv_region_evict(region); |
| |
| vfree(region); |
| } |
| |
| /* Withdraw and free all pages we deposited */ |
| hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); |
| hv_call_delete_partition(partition->pt_id); |
| |
| mshv_free_routing_table(partition); |
| kfree(partition); |
| } |
| |
| struct |
| mshv_partition *mshv_partition_get(struct mshv_partition *partition) |
| { |
| if (refcount_inc_not_zero(&partition->pt_ref_count)) |
| return partition; |
| return NULL; |
| } |
| |
| struct |
| mshv_partition *mshv_partition_find(u64 partition_id) |
| __must_hold(RCU) |
| { |
| struct mshv_partition *p; |
| |
| hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, |
| partition_id) |
| if (p->pt_id == partition_id) |
| return p; |
| |
| return NULL; |
| } |
| |
| void |
| mshv_partition_put(struct mshv_partition *partition) |
| { |
| if (refcount_dec_and_test(&partition->pt_ref_count)) |
| destroy_partition(partition); |
| } |
| |
| static int |
| mshv_partition_release(struct inode *inode, struct file *filp) |
| { |
| struct mshv_partition *partition = filp->private_data; |
| |
| mshv_eventfd_release(partition); |
| |
| cleanup_srcu_struct(&partition->pt_irq_srcu); |
| |
| mshv_partition_put(partition); |
| |
| return 0; |
| } |
| |
| static int |
| add_partition(struct mshv_partition *partition) |
| { |
| spin_lock(&mshv_root.pt_ht_lock); |
| |
| hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, |
| partition->pt_id); |
| |
| spin_unlock(&mshv_root.pt_ht_lock); |
| |
| return 0; |
| } |
| |
| static long |
| mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) |
| { |
| struct mshv_create_partition args; |
| u64 creation_flags; |
| struct hv_partition_creation_properties creation_properties = {}; |
| union hv_partition_isolation_properties isolation_properties = {}; |
| struct mshv_partition *partition; |
| struct file *file; |
| int fd; |
| long ret; |
| |
| if (copy_from_user(&args, user_arg, sizeof(args))) |
| return -EFAULT; |
| |
| if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || |
| args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) |
| return -EINVAL; |
| |
| /* Only support EXO partitions */ |
| creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | |
| HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; |
| |
| if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) |
| creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; |
| if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) |
| creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; |
| if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) |
| creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; |
| |
| switch (args.pt_isolation) { |
| case MSHV_PT_ISOLATION_NONE: |
| isolation_properties.isolation_type = |
| HV_PARTITION_ISOLATION_TYPE_NONE; |
| break; |
| } |
| |
| partition = kzalloc(sizeof(*partition), GFP_KERNEL); |
| if (!partition) |
| return -ENOMEM; |
| |
| partition->pt_module_dev = module_dev; |
| partition->isolation_type = isolation_properties.isolation_type; |
| |
| refcount_set(&partition->pt_ref_count, 1); |
| |
| mutex_init(&partition->pt_mutex); |
| |
| mutex_init(&partition->pt_irq_lock); |
| |
| init_completion(&partition->async_hypercall); |
| |
| INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); |
| |
| INIT_HLIST_HEAD(&partition->pt_devices); |
| |
| INIT_HLIST_HEAD(&partition->pt_mem_regions); |
| |
| mshv_eventfd_init(partition); |
| |
| ret = init_srcu_struct(&partition->pt_irq_srcu); |
| if (ret) |
| goto free_partition; |
| |
| ret = hv_call_create_partition(creation_flags, |
| creation_properties, |
| isolation_properties, |
| &partition->pt_id); |
| if (ret) |
| goto cleanup_irq_srcu; |
| |
| ret = add_partition(partition); |
| if (ret) |
| goto delete_partition; |
| |
| ret = mshv_init_async_handler(partition); |
| if (ret) |
| goto remove_partition; |
| |
| fd = get_unused_fd_flags(O_CLOEXEC); |
| if (fd < 0) { |
| ret = fd; |
| goto remove_partition; |
| } |
| |
| file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, |
| partition, O_RDWR); |
| if (IS_ERR(file)) { |
| ret = PTR_ERR(file); |
| goto put_fd; |
| } |
| |
| fd_install(fd, file); |
| |
| return fd; |
| |
| put_fd: |
| put_unused_fd(fd); |
| remove_partition: |
| remove_partition(partition); |
| delete_partition: |
| hv_call_delete_partition(partition->pt_id); |
| cleanup_irq_srcu: |
| cleanup_srcu_struct(&partition->pt_irq_srcu); |
| free_partition: |
| kfree(partition); |
| |
| return ret; |
| } |
| |
| static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, |
| unsigned long arg) |
| { |
| struct miscdevice *misc = filp->private_data; |
| |
| switch (ioctl) { |
| case MSHV_CREATE_PARTITION: |
| return mshv_ioctl_create_partition((void __user *)arg, |
| misc->this_device); |
| } |
| |
| return -ENOTTY; |
| } |
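| /* |
| * Illustrative usage sketch (not kernel code): the fd hierarchy rooted at |
| * /dev/mshv. One plausible ordering is shown; field names follow struct |
| * mshv_create_partition and struct mshv_create_vp as used above, and error |
| * handling is elided. |
| * |
| *   int mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC); |
| * |
| *   struct mshv_create_partition cp = { |
| *       .pt_isolation = MSHV_PT_ISOLATION_NONE, |
| *   }; |
| *   int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &cp); |
| * |
| *   ioctl(pt_fd, MSHV_INITIALIZE_PARTITION); |
| *   // ... MSHV_SET_GUEST_MEMORY, MSHV_SET_MSI_ROUTING, etc. ... |
| * |
| *   struct mshv_create_vp cv = { .vp_index = 0 }; |
| *   int vp_fd = ioctl(pt_fd, MSHV_CREATE_VP, &cv); |
| */ |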
| |
| static int |
| mshv_dev_open(struct inode *inode, struct file *filp) |
| { |
| return 0; |
| } |
| |
| static int |
| mshv_dev_release(struct inode *inode, struct file *filp) |
| { |
| return 0; |
| } |
| |
| static int mshv_cpuhp_online; |
| static int mshv_root_sched_online; |
| |
| static const char *scheduler_type_to_string(enum hv_scheduler_type type) |
| { |
| switch (type) { |
| case HV_SCHEDULER_TYPE_LP: |
| return "classic scheduler without SMT"; |
| case HV_SCHEDULER_TYPE_LP_SMT: |
| return "classic scheduler with SMT"; |
| case HV_SCHEDULER_TYPE_CORE_SMT: |
| return "core scheduler"; |
| case HV_SCHEDULER_TYPE_ROOT: |
| return "root scheduler"; |
| default: |
| return "unknown scheduler"; |
| } |
| } |
| |
| /* TODO move this to hv_common.c when needed outside this module */ |
| static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) |
| { |
| struct hv_input_get_system_property *input; |
| struct hv_output_get_system_property *output; |
| unsigned long flags; |
| u64 status; |
| |
| local_irq_save(flags); |
| input = *this_cpu_ptr(hyperv_pcpu_input_arg); |
| output = *this_cpu_ptr(hyperv_pcpu_output_arg); |
| |
| memset(input, 0, sizeof(*input)); |
| memset(output, 0, sizeof(*output)); |
| input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; |
| |
| status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); |
| if (!hv_result_success(status)) { |
| local_irq_restore(flags); |
| pr_err("%s: %s\n", __func__, hv_result_to_string(status)); |
| return hv_result_to_errno(status); |
| } |
| |
| *out = output->scheduler_type; |
| local_irq_restore(flags); |
| |
| return 0; |
| } |
| |
| /* Retrieve and stash the supported scheduler type */ |
| static int __init mshv_retrieve_scheduler_type(struct device *dev) |
| { |
| int ret; |
| |
| ret = hv_retrieve_scheduler_type(&hv_scheduler_type); |
| if (ret) |
| return ret; |
| |
| dev_info(dev, "Hypervisor using %s\n", |
| scheduler_type_to_string(hv_scheduler_type)); |
| |
| switch (hv_scheduler_type) { |
| case HV_SCHEDULER_TYPE_CORE_SMT: |
| case HV_SCHEDULER_TYPE_LP_SMT: |
| case HV_SCHEDULER_TYPE_ROOT: |
| case HV_SCHEDULER_TYPE_LP: |
| /* Supported scheduler, nothing to do */ |
| break; |
| default: |
| dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", |
| hv_scheduler_type); |
| return -EOPNOTSUPP; |
| } |
| |
| return 0; |
| } |
| |
| static int mshv_root_scheduler_init(unsigned int cpu) |
| { |
| void **inputarg, **outputarg, *p; |
| |
| inputarg = (void **)this_cpu_ptr(root_scheduler_input); |
| outputarg = (void **)this_cpu_ptr(root_scheduler_output); |
| |
| /* Allocate two consecutive pages. One for input, one for output. */ |
| p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); |
| if (!p) |
| return -ENOMEM; |
| |
| *inputarg = p; |
| *outputarg = (char *)p + HV_HYP_PAGE_SIZE; |
| |
| return 0; |
| } |
| |
| static int mshv_root_scheduler_cleanup(unsigned int cpu) |
| { |
| void *p, **inputarg, **outputarg; |
| |
| inputarg = (void **)this_cpu_ptr(root_scheduler_input); |
| outputarg = (void **)this_cpu_ptr(root_scheduler_output); |
| |
| p = *inputarg; |
| |
| *inputarg = NULL; |
| *outputarg = NULL; |
| |
| kfree(p); |
| |
| return 0; |
| } |
| |
| /* Must be called after retrieving the scheduler type */ |
| static int |
| root_scheduler_init(struct device *dev) |
| { |
| int ret; |
| |
| if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) |
| return 0; |
| |
| root_scheduler_input = alloc_percpu(void *); |
| root_scheduler_output = alloc_percpu(void *); |
| |
| if (!root_scheduler_input || !root_scheduler_output) { |
| dev_err(dev, "Failed to allocate root scheduler buffers\n"); |
| ret = -ENOMEM; |
| goto out; |
| } |
| |
| ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", |
| mshv_root_scheduler_init, |
| mshv_root_scheduler_cleanup); |
| |
| if (ret < 0) { |
| dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); |
| goto out; |
| } |
| |
| mshv_root_sched_online = ret; |
| |
| return 0; |
| |
| out: |
| free_percpu(root_scheduler_input); |
| free_percpu(root_scheduler_output); |
| return ret; |
| } |
| |
| static void |
| root_scheduler_deinit(void) |
| { |
| if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) |
| return; |
| |
| cpuhp_remove_state(mshv_root_sched_online); |
| free_percpu(root_scheduler_input); |
| free_percpu(root_scheduler_output); |
| } |
| |
| static int mshv_reboot_notify(struct notifier_block *nb, |
| unsigned long code, void *unused) |
| { |
| cpuhp_remove_state(mshv_cpuhp_online); |
| return 0; |
| } |
| |
| struct notifier_block mshv_reboot_nb = { |
| .notifier_call = mshv_reboot_notify, |
| }; |
| |
| static void mshv_root_partition_exit(void) |
| { |
| unregister_reboot_notifier(&mshv_reboot_nb); |
| root_scheduler_deinit(); |
| } |
| |
| static int __init mshv_root_partition_init(struct device *dev) |
| { |
| int err; |
| |
| if (mshv_retrieve_scheduler_type(dev)) |
| return -ENODEV; |
| |
| err = root_scheduler_init(dev); |
| if (err) |
| return err; |
| |
| err = register_reboot_notifier(&mshv_reboot_nb); |
| if (err) |
| goto root_sched_deinit; |
| |
| return 0; |
| |
| root_sched_deinit: |
| root_scheduler_deinit(); |
| return err; |
| } |
| |
| static int __init mshv_parent_partition_init(void) |
| { |
| int ret; |
| struct device *dev; |
| union hv_hypervisor_version_info version_info; |
| |
| if (!hv_root_partition() || is_kdump_kernel()) |
| return -ENODEV; |
| |
| if (hv_get_hypervisor_version(&version_info)) |
| return -ENODEV; |
| |
| ret = misc_register(&mshv_dev); |
| if (ret) |
| return ret; |
| |
| dev = mshv_dev.this_device; |
| |
| if (version_info.build_number < MSHV_HV_MIN_VERSION || |
| version_info.build_number > MSHV_HV_MAX_VERSION) { |
| dev_err(dev, "Running on unvalidated Hyper-V version\n"); |
| dev_err(dev, "Versions: current: %u min: %u max: %u\n", |
| version_info.build_number, MSHV_HV_MIN_VERSION, |
| MSHV_HV_MAX_VERSION); |
| } |
| |
| mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); |
| if (!mshv_root.synic_pages) { |
| dev_err(dev, "Failed to allocate percpu synic page\n"); |
| ret = -ENOMEM; |
| goto device_deregister; |
| } |
| |
| ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", |
| mshv_synic_init, |
| mshv_synic_cleanup); |
| if (ret < 0) { |
| dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); |
| goto free_synic_pages; |
| } |
| |
| mshv_cpuhp_online = ret; |
| |
| ret = mshv_root_partition_init(dev); |
| if (ret) |
| goto remove_cpu_state; |
| |
| ret = mshv_irqfd_wq_init(); |
| if (ret) |
| goto exit_partition; |
| |
| spin_lock_init(&mshv_root.pt_ht_lock); |
| hash_init(mshv_root.pt_htable); |
| |
| hv_setup_mshv_handler(mshv_isr); |
| |
| return 0; |
| |
| exit_partition: |
| if (hv_root_partition()) |
| mshv_root_partition_exit(); |
| remove_cpu_state: |
| cpuhp_remove_state(mshv_cpuhp_online); |
| free_synic_pages: |
| free_percpu(mshv_root.synic_pages); |
| device_deregister: |
| misc_deregister(&mshv_dev); |
| return ret; |
| } |
| |
| static void __exit mshv_parent_partition_exit(void) |
| { |
| hv_setup_mshv_handler(NULL); |
| mshv_port_table_fini(); |
| misc_deregister(&mshv_dev); |
| mshv_irqfd_wq_cleanup(); |
| if (hv_root_partition()) |
| mshv_root_partition_exit(); |
| cpuhp_remove_state(mshv_cpuhp_online); |
| free_percpu(mshv_root.synic_pages); |
| } |
| |
| module_init(mshv_parent_partition_init); |
| module_exit(mshv_parent_partition_exit); |