// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
 *
 * Copyright (C) 2018 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Register an on-GPU RAM region for cacheable access.
 *
 * Derived from original vfio_pci_igd.c:
 * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 */
#include <linux/io.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/mmu_context.h>
#include <asm/kvm_ppc.h>
#include "vfio_pci_private.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);

struct vfio_pci_nvgpu_data {
	unsigned long gpu_hpa; /* GPU RAM physical address */
	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
	unsigned long useraddr; /* GPU RAM userspace address */
	unsigned long size; /* Size of the GPU RAM window (usually 128GB) */
	struct mm_struct *mm;
	struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */
	struct pci_dev *gpdev;
	struct notifier_block group_notifier;
};

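/*
 * Emulated read/write for the GPU RAM region. The accessed page is
 * temporarily mapped with ioremap_cache() and unmapped right away;
 * see the comment in the body for why a permanent mapping is avoided.
 */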
static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
	size_t sizealigned;
	void __iomem *ptr;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	/*
	 * We map only a small chunk of GPU RAM for a short time instead
	 * of mapping it for the guest lifetime as:
	 *
	 * 1) we do not know the GPU RAM size, only the aperture which is
	 *    4-8 times bigger than the actual RAM size (16/32GB RAM vs.
	 *    a 128GB aperture);
	 * 2) mapping GPU RAM allows the CPU to prefetch and if this
	 *    happens before the NVLink bridge is reset (which fences
	 *    GPU RAM), a hypervisor maintenance interrupt (HMI) might
	 *    happen which will freeze the NVLink bridge.
	 *
	 * This is not a fast path anyway.
	 */
	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
	if (!ptr)
		return -EFAULT;

	if (iswrite) {
		if (copy_from_user(ptr + posoff, buf, count))
			count = -EFAULT;
		else
			*ppos += count;
	} else {
		if (copy_to_user(buf, ptr + posoff, count))
			count = -EFAULT;
		else
			*ppos += count;
	}

	iounmap(ptr);

	return count;
}

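/*
 * Region release callback: unregisters the preregistered memory (if
 * userspace ever mmap'ed the region), drops the mm reference, removes
 * the group notifier and unmaps the GPU RAM from the LPAR.
 */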
static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	long ret;

	/* If there were any mappings at all... */
	if (data->mm) {
		if (data->mem) {
			ret = mm_iommu_put(data->mm, data->mem);
			WARN_ON(ret);
		}

		mmdrop(data->mm);
	}

	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	pnv_npu2_unmap_lpar_dev(data->gpdev);

	kfree(data);
}

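/*
 * Page fault handler for the mmap'ed GPU RAM region: translates the
 * fault address into an offset within the region and inserts the
 * matching GPU RAM page frame into the VMA.
 */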
static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_region *region = vma->vm_private_data;
	struct vfio_pci_nvgpu_data *data = region->data;
	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
	unsigned long vm_pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;

	ret = vmf_insert_pfn(vma, vmf->address, pfn);
	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
			vmf->address, ret);

	return ret;
}

static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
	.fault = vfio_pci_nvgpu_mmap_fault,
};

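/*
 * mmap callback for the GPU RAM region. Only one mapping per region is
 * allowed; pages are inserted lazily by the fault handler above, and
 * the whole window is preregistered with the IOMMU subsystem so the
 * GPU RAM can be DMA-mapped later.
 */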
static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_nvgpu_data *data = region->data;

	if (data->useraddr)
		return -EPERM;

	if (vma->vm_end - vma->vm_start > data->size)
		return -EINVAL;

	vma->vm_private_data = region;
	vma->vm_flags |= VM_PFNMAP;
	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;

	/*
	 * Call mm_iommu_newdev() only here: the region is not registered
	 * anywhere yet so the actual initialization happens now. Later
	 * callers use mm_iommu_find(), which returns the already
	 * registered @mem and does not call gup() again.
	 */
	data->useraddr = vma->vm_start;
	data->mm = current->mm;

	mmgrab(data->mm);
	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
			vma_pages(vma), data->gpu_hpa, &data->mem);

	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}

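/*
 * Adds a region capability advertising the GPU RAM "TGT" address so
 * userspace can pass it on to the guest.
 */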
static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt cap = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}

static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
	.rw = vfio_pci_nvgpu_rw,
	.release = vfio_pci_nvgpu_release,
	.mmap = vfio_pci_nvgpu_mmap,
	.add_capability = vfio_pci_nvgpu_add_capability,
};

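/*
 * Group notifier: once KVM is associated with the IOMMU group, maps
 * the GPU into the LPAR so the guest can access its RAM.
 */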
static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
		unsigned long action, void *opaque)
{
	struct kvm *kvm = opaque;
	struct vfio_pci_nvgpu_data *data = container_of(nb,
			struct vfio_pci_nvgpu_data,
			group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
			pnv_npu2_map_lpar_dev(data->gpdev,
				kvm->arch.lpid, MSR_DR | MSR_PR))
		return NOTIFY_BAD;

	return NOTIFY_OK;
}

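/*
 * Probes the platform for an NVLink2-connected GPU and, if found,
 * registers a vendor-specific VFIO region exposing the GPU RAM
 * aperture described by the device tree.
 */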
int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
{
	int ret;
	u64 reg[2];
	u64 tgt = 0;
	struct device_node *npu_node, *mem_node;
	struct pci_dev *npu_dev;
	struct vfio_pci_nvgpu_data *data;
	uint32_t mem_phandle = 0;
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

	/*
	 * PCI config space does not tell us about NVLink presence but
	 * the platform does, so use that.
	 */
	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
	if (!npu_dev)
		return -ENODEV;

	npu_node = pci_device_to_OF_node(npu_dev);
	if (!npu_node)
		return -EINVAL;

	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
		return -ENODEV;

	mem_node = of_find_node_by_phandle(mem_phandle);
	if (!mem_node)
		return -EINVAL;

	ret = of_property_read_variable_u64_array(mem_node, "reg", reg,
			ARRAY_SIZE(reg), ARRAY_SIZE(reg));
	of_node_put(mem_node); /* Drop the of_find_node_by_phandle() reference */
	if (ret != ARRAY_SIZE(reg))
		return -EINVAL;

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->gpu_hpa = reg[0];
	data->gpu_tgt = tgt;
	data->size = reg[1];

	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
			data->gpu_hpa + data->size - 1);

	data->gpdev = vdev->pdev;
	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;

	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&events, &data->group_notifier);
	if (ret)
		goto free_exit;

	/*
	 * Registering the notifier replays the current KVM association,
	 * so at this point KVM is already set and we do not need the
	 * listener anymore. Also, keeping it registered means that if
	 * more than one GPU is assigned, we would get several similar
	 * notifications about the same device, which does not help with
	 * anything.
	 */
	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
			&vfio_pci_nvgpu_regops,
			data->size,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;
free_exit:
	kfree(data);

	return ret;
}
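
/*
 * A minimal userspace sketch (not part of this driver) of how the RAM
 * region registered above might be mapped. It assumes "fd" is a VFIO
 * device fd obtained via VFIO_GROUP_GET_DEVICE_FD and "index" is the
 * device-specific region index discovered by walking the region info
 * type/subtype capabilities; error handling is omitted for brevity.
 *
 *	struct vfio_region_info info = { .argsz = sizeof(info) };
 *	void *ram;
 *
 *	info.index = index;
 *	ioctl(fd, VFIO_DEVICE_GET_REGION_INFO, &info);
 *	if (info.flags & VFIO_REGION_INFO_FLAG_MMAP)
 *		ram = mmap(NULL, info.size, PROT_READ | PROT_WRITE,
 *				MAP_SHARED, fd, info.offset);
 */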

/*
 * IBM NPU2 bridge
 */
struct vfio_pci_npu2_data {
	void *base; /* ATSD register virtual address, for emulated access */
	unsigned long mmio_atsd; /* ATSD physical address */
	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
};

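/*
 * Emulated read/write for the ATSD register region; the registers are
 * permanently mapped at probe time so this simply copies to/from that
 * mapping.
 */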
static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_npu2_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	if (iswrite) {
		if (copy_from_user(data->base + pos, buf, count))
			return -EFAULT;
	} else {
		if (copy_to_user(buf, data->base + pos, count))
			return -EFAULT;
	}
	*ppos += count;

	return count;
}

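/*
 * Maps the link's ATSD register page into userspace, non-cached, so
 * the guest can issue address translation shootdowns directly.
 */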
static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_npu2_data *data = region->data;
	unsigned long req_len = vma->vm_end - vma->vm_start;

	if (req_len != PAGE_SIZE)
		return -EINVAL;

	vma->vm_flags |= VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
			req_len, vma->vm_page_prot);
	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}

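/* Region release callback: undoes the memremap() done at probe time. */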
static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_npu2_data *data = region->data;

	memunmap(data->base);
	kfree(data);
}

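/*
 * Adds two region capabilities for userspace to consume: the GPU RAM
 * "TGT" address and the NVLink speed from the device tree.
 */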
static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_npu2_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};
	struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
		.header.version = 1,
		.link_speed = data->link_speed
	};
	int ret;

	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
	if (ret)
		return ret;

	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
}

static const struct vfio_pci_regops vfio_pci_npu2_regops = {
	.rw = vfio_pci_npu2_rw,
	.mmap = vfio_pci_npu2_mmap,
	.release = vfio_pci_npu2_release,
	.add_capability = vfio_pci_npu2_add_capability,
};

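/*
 * Probes an IBM NPU2 (emulated NVLink bridge) device and registers a
 * VFIO region for its ATSD registers, if any were assigned to this
 * link.
 */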
int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
{
	int ret;
	struct vfio_pci_npu2_data *data;
	struct device_node *nvlink_dn;
	u32 nvlink_index = 0, mem_phandle = 0;
	struct pci_dev *npdev = vdev->pdev;
	struct device_node *npu_node = pci_device_to_OF_node(npdev);
	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
	u64 mmio_atsd = 0;
	u64 tgt = 0;
	u32 link_speed = 0xff;

	/*
	 * PCI config space does not tell us about NVLink presence but
	 * the platform does, so use that.
	 */
	if (!pnv_pci_get_gpu_dev(vdev->pdev))
		return -ENODEV;

	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
		return -ENODEV;

	/*
	 * An NPU2 normally has 8 ATSD registers (for concurrency) and 6
	 * links, so we can allocate one register per link, using the
	 * nvlink index as a key.
	 * There is always at least one ATSD register so as long as at
	 * least NVLink bridge #0 is passed to the guest, ATSD will be
	 * available.
	 */
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	ret = of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
			&nvlink_index);
	of_node_put(nvlink_dn); /* Drop the of_parse_phandle() reference */
	if (WARN_ON(ret))
		return -ENODEV;

	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
			&mmio_atsd)) {
		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
		mmio_atsd = 0;
	}

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->mmio_atsd = mmio_atsd;
	data->gpu_tgt = tgt;
	data->link_speed = link_speed;
	if (data->mmio_atsd) {
		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
		if (!data->base) {
			ret = -ENOMEM;
			goto free_exit;
		}
	}

	/*
	 * We want to expose the capability even if this specific NVLink
	 * did not get its own ATSD register because capabilities belong
	 * to VFIO regions and normally there will be an ATSD register
	 * assigned to the NVLink bridge.
	 */
	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_IBM |
			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
			&vfio_pci_npu2_regops,
			data->mmio_atsd ? PAGE_SIZE : 0,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	if (data->base)
		memunmap(data->base);
	kfree(data);

	return ret;
}