path: root/drivers/vfio
Diffstat (limited to 'drivers/vfio')
-rw-r--r--  drivers/vfio/pci/Kconfig             |   6
-rw-r--r--  drivers/vfio/pci/Makefile            |   1
-rw-r--r--  drivers/vfio/pci/trace.h             | 102
-rw-r--r--  drivers/vfio/pci/vfio_pci.c          |  42
-rw-r--r--  drivers/vfio/pci/vfio_pci_nvlink2.c  | 482
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h  |  20
-rw-r--r--  drivers/vfio/vfio_iommu_spapr_tce.c  |  74
7 files changed, 696 insertions, 31 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 42dc1d3d71cf..d0f8e4f5a039 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -38,3 +38,9 @@ config VFIO_PCI_IGD
 	  and LPC bridge config space.
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
+
+config VFIO_PCI_NVLINK2
+	def_bool y
+	depends on VFIO_PCI && PPC_POWERNV
+	help
+	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 76d8ec058edd..9662c063a6b1 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,6 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
new file mode 100644
index 000000000000..228ccdb8d1c8
--- /dev/null
+++ b/drivers/vfio/pci/trace.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * VFIO PCI mmap/mmap_fault tracepoints
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vfio_pci
+
+#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VFIO_PCI_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap_fault,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			vm_fault_t ret),
+	TP_ARGS(pdev, hpa, ua, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_npu2_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+#endif /* _TRACE_VFIO_PCI_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
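
A note on how this header is consumed: it follows the standard pattern for a driver-local tracepoint header, where exactly one compilation unit defines CREATE_TRACE_POINTS before including it (vfio_pci_nvlink2.c below does this) so that define_trace.h emits the tracepoint bodies, while every other user includes it plainly and only gets the declarations. A minimal sketch of that pattern; the calling function here is purely illustrative and not part of this patch:

    /* In exactly one .c file of the module: instantiate the tracepoints. */
    #define CREATE_TRACE_POINTS
    #include "trace.h"

    /* In any other .c file: declarations only, no second instantiation. */
    #include "trace.h"

    static void example_fault_path(struct pci_dev *pdev, unsigned long hpa,
                                   unsigned long ua, vm_fault_t ret)
    {
            /* Firing the tracepoint is a cheap call and a no-op when disabled. */
            trace_vfio_pci_nvgpu_mmap_fault(pdev, hpa, ua, ret);
    }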
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 50cdedfca9fe..a89fa5d4e877 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -289,14 +289,37 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		if (ret) {
 			dev_warn(&vdev->pdev->dev,
 				 "Failed to setup Intel IGD regions\n");
-			vfio_pci_disable(vdev);
-			return ret;
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+				 "Failed to setup NVIDIA NV2 RAM region\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+				 "Failed to setup NVIDIA NV2 ATSD region\n");
+			goto disable_exit;
 		}
 	}
 
 	vfio_pci_probe_mmaps(vdev);
 
 	return 0;
+
+disable_exit:
+	vfio_pci_disable(vdev);
+	return ret;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
@@ -750,6 +773,12 @@ static long vfio_pci_ioctl(void *device_data,
 			if (ret)
 				return ret;
 
+			if (vdev->region[i].ops->add_capability) {
+				ret = vdev->region[i].ops->add_capability(vdev,
+						&vdev->region[i], &caps);
+				if (ret)
+					return ret;
+			}
 		}
 	}
 
@@ -1117,6 +1146,15 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 		return -EINVAL;
 	if ((vma->vm_flags & VM_SHARED) == 0)
 		return -EINVAL;
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		int regnum = index - VFIO_PCI_NUM_REGIONS;
+		struct vfio_pci_region *region = vdev->region + regnum;
+
+		if (region && region->ops && region->ops->mmap &&
+		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+			return region->ops->mmap(vdev, region, vma);
+		return -EINVAL;
+	}
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
 	if (!vdev->bar_mmap_supported[index])
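
The vfio_pci.c changes above route mmap() and the extra region-info capabilities for device-specific regions (index >= VFIO_PCI_NUM_REGIONS) through the new vfio_pci_regops hooks. From userspace this is only visible through the usual region-info ioctl. Below is a rough sketch of how a consumer might probe and map such a region; it is an illustration only, error handling is trimmed, and device_fd is assumed to be a VFIO device fd already obtained via VFIO_GROUP_GET_DEVICE_FD:

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    /* Query one region and mmap it if the kernel advertises MMAP support. */
    static void *map_region(int device_fd, unsigned int index,
                            unsigned long long *size)
    {
            struct vfio_region_info info = { .argsz = sizeof(info), .index = index };

            if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
                    return MAP_FAILED;

            *size = info.size;
            if (!(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
                    return MAP_FAILED;

            /* info.offset is the mmap offset the driver chose for this region. */
            return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        device_fd, info.offset);
    }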
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
new file mode 100644
index 000000000000..054a2cf9dd8e
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Register an on-GPU RAM region for cacheable access.
+ *
+ * Derived from original vfio_pci_igd.c:
+ * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
+ *	Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
+#include <asm/kvm_ppc.h>
+#include "vfio_pci_private.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
+
+struct vfio_pci_nvgpu_data {
+	unsigned long gpu_hpa;	/* GPU RAM physical address */
+	unsigned long gpu_tgt;	/* TGT address of corresponding GPU RAM */
+	unsigned long useraddr;	/* GPU RAM userspace address */
+	unsigned long size;	/* Size of the GPU RAM window (usually 128GB) */
+	struct mm_struct *mm;
+	struct mm_iommu_table_group_mem_t *mem;	/* Pre-registered RAM descr. */
+	struct pci_dev *gpdev;
+	struct notifier_block group_notifier;
+};
+
+static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
+	size_t sizealigned;
+	void __iomem *ptr;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	/*
+	 * We map only a bit of GPU RAM for a short time instead of mapping it
+	 * for the guest lifetime as:
+	 *
+	 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
+	 *    bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
+	 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
+	 *    before NVLink bridge is reset (which fences GPU RAM),
+	 *    hardware management interrupts (HMI) might happen, this
+	 *    will freeze NVLink bridge.
+	 *
+	 * This is not a fast path anyway.
+	 */
+	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
+	if (!ptr)
+		return -EFAULT;
+
+	if (iswrite) {
+		if (copy_from_user(ptr + posoff, buf, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	} else {
+		if (copy_to_user(buf, ptr + posoff, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	}
+
+	iounmap(ptr);
+
+	return count;
+}
+
+static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	long ret;
+
+	/* If there were any mappings at all... */
+	if (data->mm) {
+		ret = mm_iommu_put(data->mm, data->mem);
+		WARN_ON(ret);
+
+		mmdrop(data->mm);
+	}
+
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	pnv_npu2_unmap_lpar_dev(data->gpdev);
+
+	kfree(data);
+}
+
+static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
+{
+	vm_fault_t ret;
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_region *region = vma->vm_private_data;
+	struct vfio_pci_nvgpu_data *data = region->data;
+	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
+	unsigned long vm_pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;
+
+	ret = vmf_insert_pfn(vma, vmf->address, pfn);
+	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
+			vmf->address, ret);
+
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
+	.fault = vfio_pci_nvgpu_mmap_fault,
+};
+
+static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_nvgpu_data *data = region->data;
+
+	if (data->useraddr)
+		return -EPERM;
+
+	if (vma->vm_end - vma->vm_start > data->size)
+		return -EINVAL;
+
+	vma->vm_private_data = region;
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
+
+	/*
+	 * Call mm_iommu_newdev() here once as the region is not
+	 * registered yet and therefore the proper initialization happens now.
+	 * Other callers will use mm_iommu_find() which returns the
+	 * registered @mem and does not go through gup() again.
+	 */
+	data->useraddr = vma->vm_start;
+	data->mm = current->mm;
+
+	atomic_inc(&data->mm->mm_count);
+	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
+			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+			data->gpu_hpa, &data->mem);
+
+	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt cap = { 0 };
+
+	cap.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	cap.header.version = 1;
+	cap.tgt = data->gpu_tgt;
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
+	.rw = vfio_pci_nvgpu_rw,
+	.release = vfio_pci_nvgpu_release,
+	.mmap = vfio_pci_nvgpu_mmap,
+	.add_capability = vfio_pci_nvgpu_add_capability,
+};
+
+static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
+		unsigned long action, void *opaque)
+{
+	struct kvm *kvm = opaque;
+	struct vfio_pci_nvgpu_data *data = container_of(nb,
+			struct vfio_pci_nvgpu_data,
+			group_notifier);
+
+	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
+			pnv_npu2_map_lpar_dev(data->gpdev,
+				kvm->arch.lpid, MSR_DR | MSR_PR))
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	u64 reg[2];
+	u64 tgt = 0;
+	struct device_node *npu_node, *mem_node;
+	struct pci_dev *npu_dev;
+	struct vfio_pci_nvgpu_data *data;
+	uint32_t mem_phandle = 0;
+	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+	/*
+	 * PCI config space does not tell us about NVLink presence, but
+	 * the platform device tree does, so use that.
+	 */
+	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
+	if (!npu_dev)
+		return -ENODEV;
+
+	npu_node = pci_device_to_OF_node(npu_dev);
+	if (!npu_node)
+		return -EINVAL;
+
+	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
+		return -EINVAL;
+
+	mem_node = of_find_node_by_phandle(mem_phandle);
+	if (!mem_node)
+		return -EINVAL;
+
+	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
+				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
+			ARRAY_SIZE(reg))
+		return -EINVAL;
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->gpu_hpa = reg[0];
+	data->gpu_tgt = tgt;
+	data->size = reg[1];
+
+	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
+			data->gpu_hpa + data->size - 1);
+
+	data->gpdev = vdev->pdev;
+	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;
+
+	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&events, &data->group_notifier);
+	if (ret)
+		goto free_exit;
+
+	/*
+	 * We have just set KVM so we do not need the listener anymore.
+	 * Also, keeping it registered means that if more than one GPU is
+	 * assigned, we will get several similar notifiers notifying about
+	 * the same device again, which does not help with anything.
+	 */
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
+			&vfio_pci_nvgpu_regops,
+			data->size,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+free_exit:
+	kfree(data);
+
+	return ret;
+}
+
+/*
+ * IBM NPU2 bridge
+ */
+struct vfio_pci_npu2_data {
+	void *base; /* ATSD register virtual address, for emulated access */
+	unsigned long mmio_atsd; /* ATSD physical address */
+	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
+};
+
+static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_npu2_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if (iswrite) {
+		if (copy_from_user(data->base + pos, buf, count))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(buf, data->base + pos, count))
+			return -EFAULT;
+	}
+	*ppos += count;
+
+	return count;
+}
+
+static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data = region->data;
+	unsigned long req_len = vma->vm_end - vma->vm_start;
+
+	if (req_len != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
+			req_len, vma->vm_page_prot);
+	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+
+	memunmap(data->base);
+	kfree(data);
+}
+
+static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 0 };
+	struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 0 };
+	int ret;
+
+	captgt.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	captgt.header.version = 1;
+	captgt.tgt = data->gpu_tgt;
+
+	capspd.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD;
+	capspd.header.version = 1;
+	capspd.link_speed = data->link_speed;
+
+	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
+	if (ret)
+		return ret;
+
+	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
+}
+
+static const struct vfio_pci_regops vfio_pci_npu2_regops = {
+	.rw = vfio_pci_npu2_rw,
+	.mmap = vfio_pci_npu2_mmap,
+	.release = vfio_pci_npu2_release,
+	.add_capability = vfio_pci_npu2_add_capability,
+};
+
+int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data;
+	struct device_node *nvlink_dn;
+	u32 nvlink_index = 0;
+	struct pci_dev *npdev = vdev->pdev;
+	struct device_node *npu_node = pci_device_to_OF_node(npdev);
+	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
+	u64 mmio_atsd = 0;
+	u64 tgt = 0;
+	u32 link_speed = 0xff;
+
+	/*
+	 * PCI config space does not tell us about NVLink presence, but
+	 * the platform device tree does, so use that.
+	 */
+	if (!pnv_pci_get_gpu_dev(vdev->pdev))
+		return -ENODEV;
+
+	/*
+	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
+	 * so we can allocate one register per link, using the nvlink index as
+	 * a key.
+	 * There is always at least one ATSD register so as long as at least
+	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
+	 */
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+			&nvlink_index)))
+		return -ENODEV;
+
+	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
+			&mmio_atsd)) {
+		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+		mmio_atsd = 0;
+	}
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->mmio_atsd = mmio_atsd;
+	data->gpu_tgt = tgt;
+	data->link_speed = link_speed;
+	if (data->mmio_atsd) {
+		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
+		if (!data->base) {
+			ret = -ENOMEM;
+			goto free_exit;
+		}
+	}
+
+	/*
+	 * We want to expose the capability even if this specific NVLink
+	 * did not get its own ATSD register because capabilities
+	 * belong to VFIO regions and normally there will be an ATSD register
+	 * assigned to the NVLink bridge.
+	 */
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_IBM |
+			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
+			&vfio_pci_npu2_regops,
+			data->mmio_atsd ? PAGE_SIZE : 0,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+
+free_exit:
+	kfree(data);
+
+	return ret;
+}
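
The SSATGT and LNKSPD capabilities attached by vfio_pci_nvgpu_add_capability() and vfio_pci_npu2_add_capability() above are delivered to userspace through the capability chain of VFIO_DEVICE_GET_REGION_INFO. A hedged sketch of walking that chain follows, assuming the caller has already re-issued the ioctl with a buffer of info->argsz bytes (the usual two-call pattern) and that the NVLink2 capability definitions from this series are present in linux/vfio.h:

    #include <stdint.h>
    #include <linux/vfio.h>

    /* Walk the capability chain appended after struct vfio_region_info. */
    static uint64_t nvlink2_region_tgt(struct vfio_region_info *info)
    {
            uint32_t off;

            if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS))
                    return 0;

            for (off = info->cap_offset; off; ) {
                    struct vfio_info_cap_header *hdr =
                            (struct vfio_info_cap_header *)((char *)info + off);

                    if (hdr->id == VFIO_REGION_INFO_CAP_NVLINK2_SSATGT)
                            return ((struct vfio_region_info_cap_nvlink2_ssatgt *)hdr)->tgt;
                    off = hdr->next;
            }
            return 0;
    }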
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d3441a..127071b84dd7 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -59,6 +59,12 @@ struct vfio_pci_regops {
 		      size_t count, loff_t *ppos, bool iswrite);
 	void	(*release)(struct vfio_pci_device *vdev,
 			   struct vfio_pci_region *region);
+	int	(*mmap)(struct vfio_pci_device *vdev,
+			struct vfio_pci_region *region,
+			struct vm_area_struct *vma);
+	int	(*add_capability)(struct vfio_pci_device *vdev,
+				  struct vfio_pci_region *region,
+				  struct vfio_info_cap *caps);
 };
 
 struct vfio_pci_region {
@@ -157,4 +163,18 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev)
 	return -ENODEV;
 }
 #endif
+#ifdef CONFIG_VFIO_PCI_NVLINK2
+extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+
+static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+#endif
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index b30926e11d87..c424913324e3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -152,11 +152,12 @@ static long tce_iommu_unregister_pages(struct tce_container *container,
 	struct mm_iommu_table_group_mem_t *mem;
 	struct tce_iommu_prereg *tcemem;
 	bool found = false;
+	long ret;
 
 	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 		return -EINVAL;
 
-	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
+	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 	if (!mem)
 		return -ENOENT;
 
@@ -168,9 +169,13 @@ static long tce_iommu_unregister_pages(struct tce_container *container,
 	}
 
 	if (!found)
-		return -ENOENT;
+		ret = -ENOENT;
+	else
+		ret = tce_iommu_prereg_free(container, tcemem);
 
-	return tce_iommu_prereg_free(container, tcemem);
+	mm_iommu_put(container->mm, mem);
+
+	return ret;
 }
 
 static long tce_iommu_register_pages(struct tce_container *container,
@@ -185,22 +190,24 @@ static long tce_iommu_register_pages(struct tce_container *container,
 	    ((vaddr + size) < vaddr))
 		return -EINVAL;
 
-	mem = mm_iommu_find(container->mm, vaddr, entries);
+	mem = mm_iommu_get(container->mm, vaddr, entries);
 	if (mem) {
 		list_for_each_entry(tcemem, &container->prereg_list, next) {
-			if (tcemem->mem == mem)
-				return -EBUSY;
+			if (tcemem->mem == mem) {
+				ret = -EBUSY;
+				goto put_exit;
+			}
 		}
+	} else {
+		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
+		if (ret)
+			return ret;
 	}
 
-	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
-	if (ret)
-		return ret;
-
 	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 	if (!tcemem) {
-		mm_iommu_put(container->mm, mem);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto put_exit;
 	}
 
 	tcemem->mem = mem;
@@ -209,10 +216,22 @@ static long tce_iommu_register_pages(struct tce_container *container,
 	container->enabled = true;
 
 	return 0;
+
+put_exit:
+	mm_iommu_put(container->mm, mem);
+	return ret;
 }
 
-static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
+		unsigned int page_shift)
 {
+	struct page *page;
+	unsigned long size = 0;
+
+	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
+		return size == (1UL << page_shift);
+
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
 	/*
 	 * Check that the TCE table granularity is not bigger than the size of
 	 * a page we just found. Otherwise the hardware can get access to
@@ -371,6 +390,7 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 	struct tce_iommu_group *tcegrp;
+	struct tce_iommu_prereg *tcemem, *tmtmp;
 	long i;
 
 	while (tce_groups_attached(container)) {
@@ -393,13 +413,8 @@ static void tce_iommu_release(void *iommu_data)
 		tce_iommu_free_table(container, tbl);
 	}
 
-	while (!list_empty(&container->prereg_list)) {
-		struct tce_iommu_prereg *tcemem;
-
-		tcemem = list_first_entry(&container->prereg_list,
-				struct tce_iommu_prereg, next);
-		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
-	}
+	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
+		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 
 	tce_iommu_disable(container);
 	if (container->mm)
@@ -492,7 +507,8 @@ static int tce_iommu_clear(struct tce_container *container,
 
 		direction = DMA_NONE;
 		oldhpa = 0;
-		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
+		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
+				&direction);
 		if (ret)
 			continue;
 
@@ -530,7 +546,6 @@ static long tce_iommu_build(struct tce_container *container,
 		enum dma_data_direction direction)
 {
 	long i, ret = 0;
-	struct page *page;
 	unsigned long hpa;
 	enum dma_data_direction dirtmp;
 
@@ -541,15 +556,16 @@ static long tce_iommu_build(struct tce_container *container,
 		if (ret)
 			break;
 
-		page = pfn_to_page(hpa >> PAGE_SHIFT);
-		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
 
 		hpa |= offset;
 		dirtmp = direction;
-		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
+				&dirtmp);
 		if (ret) {
 			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -576,7 +592,6 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		enum dma_data_direction direction)
 {
 	long i, ret = 0;
-	struct page *page;
 	unsigned long hpa;
 	enum dma_data_direction dirtmp;
 
@@ -589,8 +604,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		if (ret)
 			break;
 
-		page = pfn_to_page(hpa >> PAGE_SHIFT);
-		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
@@ -603,7 +618,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		if (mm_iommu_mapped_inc(mem))
 			break;
 
-		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
+				&dirtmp);
 		if (ret) {
 			/* dirtmp cannot be DMA_NONE here */
 			tce_iommu_unuse_page_v2(container, tbl, entry + i);
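
The vfio_iommu_spapr_tce.c changes above switch the preregistration paths to mm_iommu_get()/mm_iommu_new() and teach tce_page_is_contained() about device memory, but the userspace contract is unchanged: memory is still preregistered per container before v2 DMA mappings are built. A rough sketch of that call, assuming container_fd is a VFIO container fd already set to the VFIO_SPAPR_TCE_v2_IOMMU type:

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Preregister a page-aligned buffer so tce_iommu_build_v2() can map it. */
    static int spapr_prereg(int container_fd, void *buf, unsigned long long size)
    {
            struct vfio_iommu_spapr_register_memory reg = {
                    .argsz = sizeof(reg),
                    .flags = 0,
                    .vaddr = (unsigned long long)(unsigned long)buf,
                    .size  = size,
            };

            return ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    }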