diff options
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/Makefile | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 31 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device.c | 46 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 255 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 355 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_module.c | 41 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 97 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 147 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 383 |
9 files changed, 1349 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 978654f56b4f..e829a3fa7d8e 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile | |||
@@ -4,6 +4,8 @@ | |||
4 | 4 | ||
5 | ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/ | 5 | ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/ |
6 | 6 | ||
7 | amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o | 7 | amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ |
8 | kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ | ||
9 | kfd_process.o | ||
8 | 10 | ||
9 | obj-$(CONFIG_HSA_AMD) += amdkfd.o | 11 | obj-$(CONFIG_HSA_AMD) += amdkfd.o |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index d7c32eb7d16a..58441cd1b1d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | |||
@@ -38,6 +38,7 @@ | |||
38 | 38 | ||
39 | static long kfd_ioctl(struct file *, unsigned int, unsigned long); | 39 | static long kfd_ioctl(struct file *, unsigned int, unsigned long); |
40 | static int kfd_open(struct inode *, struct file *); | 40 | static int kfd_open(struct inode *, struct file *); |
41 | static int kfd_mmap(struct file *, struct vm_area_struct *); | ||
41 | 42 | ||
42 | static const char kfd_dev_name[] = "kfd"; | 43 | static const char kfd_dev_name[] = "kfd"; |
43 | 44 | ||
@@ -46,6 +47,7 @@ static const struct file_operations kfd_fops = { | |||
46 | .unlocked_ioctl = kfd_ioctl, | 47 | .unlocked_ioctl = kfd_ioctl, |
47 | .compat_ioctl = kfd_ioctl, | 48 | .compat_ioctl = kfd_ioctl, |
48 | .open = kfd_open, | 49 | .open = kfd_open, |
50 | .mmap = kfd_mmap, | ||
49 | }; | 51 | }; |
50 | 52 | ||
51 | static int kfd_char_dev_major = -1; | 53 | static int kfd_char_dev_major = -1; |
@@ -98,9 +100,22 @@ struct device *kfd_chardev(void) | |||
98 | 100 | ||
99 | static int kfd_open(struct inode *inode, struct file *filep) | 101 | static int kfd_open(struct inode *inode, struct file *filep) |
100 | { | 102 | { |
103 | struct kfd_process *process; | ||
104 | |||
101 | if (iminor(inode) != 0) | 105 | if (iminor(inode) != 0) |
102 | return -ENODEV; | 106 | return -ENODEV; |
103 | 107 | ||
108 | process = kfd_create_process(current); | ||
109 | if (IS_ERR(process)) | ||
110 | return PTR_ERR(process); | ||
111 | |||
112 | process->is_32bit_user_mode = is_compat_task(); | ||
113 | |||
114 | dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", | ||
115 | process->pasid, process->is_32bit_user_mode); | ||
116 | |||
117 | kfd_init_apertures(process); | ||
118 | |||
104 | return 0; | 119 | return 0; |
105 | } | 120 | } |
106 | 121 | ||
@@ -156,8 +171,9 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) | |||
156 | "ioctl cmd 0x%x (#%d), arg 0x%lx\n", | 171 | "ioctl cmd 0x%x (#%d), arg 0x%lx\n", |
157 | cmd, _IOC_NR(cmd), arg); | 172 | cmd, _IOC_NR(cmd), arg); |
158 | 173 | ||
159 | /* TODO: add function that retrieves process */ | 174 | process = kfd_get_process(current); |
160 | process = NULL; | 175 | if (IS_ERR(process)) |
176 | return PTR_ERR(process); | ||
161 | 177 | ||
162 | switch (cmd) { | 178 | switch (cmd) { |
163 | case KFD_IOC_GET_VERSION: | 179 | case KFD_IOC_GET_VERSION: |
@@ -208,3 +224,14 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) | |||
208 | 224 | ||
209 | return err; | 225 | return err; |
210 | } | 226 | } |
227 | |||
228 | static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) | ||
229 | { | ||
230 | struct kfd_process *process; | ||
231 | |||
232 | process = kfd_get_process(current); | ||
233 | if (IS_ERR(process)) | ||
234 | return PTR_ERR(process); | ||
235 | |||
236 | return kfd_doorbell_mmap(process, vma); | ||
237 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 53b2e19cabe0..71a03f7b0049 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c | |||
@@ -26,8 +26,11 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include "kfd_priv.h" | 27 | #include "kfd_priv.h" |
28 | 28 | ||
29 | #define MQD_SIZE_ALIGNED 768 | ||
30 | |||
29 | static const struct kfd_device_info kaveri_device_info = { | 31 | static const struct kfd_device_info kaveri_device_info = { |
30 | .max_pasid_bits = 16, | 32 | .max_pasid_bits = 16, |
33 | .mqd_size_aligned = MQD_SIZE_ALIGNED | ||
31 | }; | 34 | }; |
32 | 35 | ||
33 | struct kfd_deviceid { | 36 | struct kfd_deviceid { |
@@ -92,6 +95,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev) | |||
92 | kfd->kgd = kgd; | 95 | kfd->kgd = kgd; |
93 | kfd->device_info = device_info; | 96 | kfd->device_info = device_info; |
94 | kfd->pdev = pdev; | 97 | kfd->pdev = pdev; |
98 | kfd->init_complete = false; | ||
95 | 99 | ||
96 | return kfd; | 100 | return kfd; |
97 | } | 101 | } |
@@ -99,23 +103,53 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev) | |||
99 | bool kgd2kfd_device_init(struct kfd_dev *kfd, | 103 | bool kgd2kfd_device_init(struct kfd_dev *kfd, |
100 | const struct kgd2kfd_shared_resources *gpu_resources) | 104 | const struct kgd2kfd_shared_resources *gpu_resources) |
101 | { | 105 | { |
106 | unsigned int size; | ||
107 | |||
102 | kfd->shared_resources = *gpu_resources; | 108 | kfd->shared_resources = *gpu_resources; |
103 | 109 | ||
104 | if (kfd_topology_add_device(kfd) != 0) | 110 | /* calculate max size of mqds needed for queues */ |
105 | return false; | 111 | size = max_num_of_processes * |
112 | max_num_of_queues_per_process * | ||
113 | kfd->device_info->mqd_size_aligned; | ||
114 | |||
115 | /* add another 512KB for all other allocations on gart */ | ||
116 | size += 512 * 1024; | ||
117 | |||
118 | if (kfd2kgd->init_sa_manager(kfd->kgd, size)) { | ||
119 | dev_err(kfd_device, | ||
120 | "Error initializing sa manager for device (%x:%x)\n", | ||
121 | kfd->pdev->vendor, kfd->pdev->device); | ||
122 | goto out; | ||
123 | } | ||
124 | |||
125 | kfd_doorbell_init(kfd); | ||
126 | |||
127 | if (kfd_topology_add_device(kfd) != 0) { | ||
128 | dev_err(kfd_device, | ||
129 | "Error adding device (%x:%x) to topology\n", | ||
130 | kfd->pdev->vendor, kfd->pdev->device); | ||
131 | goto kfd_topology_add_device_error; | ||
132 | } | ||
133 | |||
106 | 134 | ||
107 | kfd->init_complete = true; | 135 | kfd->init_complete = true; |
108 | dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, | 136 | dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, |
109 | kfd->pdev->device); | 137 | kfd->pdev->device); |
110 | 138 | ||
111 | return true; | 139 | goto out; |
140 | |||
141 | kfd_topology_add_device_error: | ||
142 | kfd2kgd->fini_sa_manager(kfd->kgd); | ||
143 | dev_err(kfd_device, | ||
144 | "device (%x:%x) NOT added due to errors\n", | ||
145 | kfd->pdev->vendor, kfd->pdev->device); | ||
146 | out: | ||
147 | return kfd->init_complete; | ||
112 | } | 148 | } |
113 | 149 | ||
114 | void kgd2kfd_device_exit(struct kfd_dev *kfd) | 150 | void kgd2kfd_device_exit(struct kfd_dev *kfd) |
115 | { | 151 | { |
116 | int err = kfd_topology_remove_device(kfd); | 152 | kfd_topology_remove_device(kfd); |
117 | |||
118 | BUG_ON(err != 0); | ||
119 | 153 | ||
120 | kfree(kfd); | 154 | kfree(kfd); |
121 | } | 155 | } |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c new file mode 100644 index 000000000000..0dcb78755686 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * Copyright 2014 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | #include "kfd_priv.h" | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/mman.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
27 | /* | ||
28 | * This extension supports a kernel level doorbells management for | ||
29 | * the kernel queues. | ||
30 | * Basically the last doorbells page is devoted to kernel queues | ||
31 | * and that's assures that any user process won't get access to the | ||
32 | * kernel doorbells page | ||
33 | */ | ||
34 | static DEFINE_MUTEX(doorbell_mutex); | ||
35 | static unsigned long doorbell_available_index[ | ||
36 | DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)] = { 0 }; | ||
37 | |||
38 | #define KERNEL_DOORBELL_PASID 1 | ||
39 | #define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 | ||
40 | |||
41 | /* | ||
42 | * Each device exposes a doorbell aperture, a PCI MMIO aperture that | ||
43 | * receives 32-bit writes that are passed to queues as wptr values. | ||
44 | * The doorbells are intended to be written by applications as part | ||
45 | * of queueing work on user-mode queues. | ||
46 | * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks. | ||
47 | * We map the doorbell address space into user-mode when a process creates | ||
48 | * its first queue on each device. | ||
49 | * Although the mapping is done by KFD, it is equivalent to an mmap of | ||
50 | * the /dev/kfd with the particular device encoded in the mmap offset. | ||
51 | * There will be other uses for mmap of /dev/kfd, so only a range of | ||
52 | * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells. | ||
53 | */ | ||
54 | |||
55 | /* # of doorbell bytes allocated for each process. */ | ||
56 | static inline size_t doorbell_process_allocation(void) | ||
57 | { | ||
58 | return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * | ||
59 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, | ||
60 | PAGE_SIZE); | ||
61 | } | ||
62 | |||
63 | /* Doorbell calculations for device init. */ | ||
64 | void kfd_doorbell_init(struct kfd_dev *kfd) | ||
65 | { | ||
66 | size_t doorbell_start_offset; | ||
67 | size_t doorbell_aperture_size; | ||
68 | size_t doorbell_process_limit; | ||
69 | |||
70 | /* | ||
71 | * We start with calculations in bytes because the input data might | ||
72 | * only be byte-aligned. | ||
73 | * Only after we have done the rounding can we assume any alignment. | ||
74 | */ | ||
75 | |||
76 | doorbell_start_offset = | ||
77 | roundup(kfd->shared_resources.doorbell_start_offset, | ||
78 | doorbell_process_allocation()); | ||
79 | |||
80 | doorbell_aperture_size = | ||
81 | rounddown(kfd->shared_resources.doorbell_aperture_size, | ||
82 | doorbell_process_allocation()); | ||
83 | |||
84 | if (doorbell_aperture_size > doorbell_start_offset) | ||
85 | doorbell_process_limit = | ||
86 | (doorbell_aperture_size - doorbell_start_offset) / | ||
87 | doorbell_process_allocation(); | ||
88 | else | ||
89 | doorbell_process_limit = 0; | ||
90 | |||
91 | kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + | ||
92 | doorbell_start_offset; | ||
93 | |||
94 | kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); | ||
95 | kfd->doorbell_process_limit = doorbell_process_limit - 1; | ||
96 | |||
97 | kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, | ||
98 | doorbell_process_allocation()); | ||
99 | |||
100 | BUG_ON(!kfd->doorbell_kernel_ptr); | ||
101 | |||
102 | pr_debug("kfd: doorbell initialization:\n"); | ||
103 | pr_debug("kfd: doorbell base == 0x%08lX\n", | ||
104 | (uintptr_t)kfd->doorbell_base); | ||
105 | |||
106 | pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", | ||
107 | kfd->doorbell_id_offset); | ||
108 | |||
109 | pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", | ||
110 | doorbell_process_limit); | ||
111 | |||
112 | pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", | ||
113 | (uintptr_t)kfd->doorbell_base); | ||
114 | |||
115 | pr_debug("kfd: doorbell aperture size == 0x%08lX\n", | ||
116 | kfd->shared_resources.doorbell_aperture_size); | ||
117 | |||
118 | pr_debug("kfd: doorbell kernel address == 0x%08lX\n", | ||
119 | (uintptr_t)kfd->doorbell_kernel_ptr); | ||
120 | } | ||
121 | |||
122 | int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) | ||
123 | { | ||
124 | phys_addr_t address; | ||
125 | struct kfd_dev *dev; | ||
126 | |||
127 | /* | ||
128 | * For simplicitly we only allow mapping of the entire doorbell | ||
129 | * allocation of a single device & process. | ||
130 | */ | ||
131 | if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) | ||
132 | return -EINVAL; | ||
133 | |||
134 | /* Find kfd device according to gpu id */ | ||
135 | dev = kfd_device_by_id(vma->vm_pgoff); | ||
136 | if (dev == NULL) | ||
137 | return -EINVAL; | ||
138 | |||
139 | /* Find if pdd exists for combination of process and gpu id */ | ||
140 | if (!kfd_get_process_device_data(dev, process, 0)) | ||
141 | return -EINVAL; | ||
142 | |||
143 | /* Calculate physical address of doorbell */ | ||
144 | address = kfd_get_process_doorbells(dev, process); | ||
145 | |||
146 | vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | | ||
147 | VM_DONTDUMP | VM_PFNMAP; | ||
148 | |||
149 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
150 | |||
151 | pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" | ||
152 | " target user address == 0x%08llX\n" | ||
153 | " physical address == 0x%08llX\n" | ||
154 | " vm_flags == 0x%04lX\n" | ||
155 | " size == 0x%04lX\n", | ||
156 | (unsigned long long) vma->vm_start, address, vma->vm_flags, | ||
157 | doorbell_process_allocation()); | ||
158 | |||
159 | |||
160 | return io_remap_pfn_range(vma, | ||
161 | vma->vm_start, | ||
162 | address >> PAGE_SHIFT, | ||
163 | doorbell_process_allocation(), | ||
164 | vma->vm_page_prot); | ||
165 | } | ||
166 | |||
167 | |||
168 | /* get kernel iomem pointer for a doorbell */ | ||
169 | u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | ||
170 | unsigned int *doorbell_off) | ||
171 | { | ||
172 | u32 inx; | ||
173 | |||
174 | BUG_ON(!kfd || !doorbell_off); | ||
175 | |||
176 | mutex_lock(&doorbell_mutex); | ||
177 | inx = find_first_zero_bit(doorbell_available_index, | ||
178 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); | ||
179 | |||
180 | __set_bit(inx, doorbell_available_index); | ||
181 | mutex_unlock(&doorbell_mutex); | ||
182 | |||
183 | if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) | ||
184 | return NULL; | ||
185 | |||
186 | /* | ||
187 | * Calculating the kernel doorbell offset using "faked" kernel | ||
188 | * pasid that allocated for kernel queues only | ||
189 | */ | ||
190 | *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / | ||
191 | sizeof(u32)) + inx; | ||
192 | |||
193 | pr_debug("kfd: get kernel queue doorbell\n" | ||
194 | " doorbell offset == 0x%08d\n" | ||
195 | " kernel address == 0x%08lX\n", | ||
196 | *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); | ||
197 | |||
198 | return kfd->doorbell_kernel_ptr + inx; | ||
199 | } | ||
200 | |||
201 | void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) | ||
202 | { | ||
203 | unsigned int inx; | ||
204 | |||
205 | BUG_ON(!kfd || !db_addr); | ||
206 | |||
207 | inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); | ||
208 | |||
209 | mutex_lock(&doorbell_mutex); | ||
210 | __clear_bit(inx, doorbell_available_index); | ||
211 | mutex_unlock(&doorbell_mutex); | ||
212 | } | ||
213 | |||
214 | inline void write_kernel_doorbell(u32 __iomem *db, u32 value) | ||
215 | { | ||
216 | if (db) { | ||
217 | writel(value, db); | ||
218 | pr_debug("writing %d to doorbell address 0x%p\n", value, db); | ||
219 | } | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 | ||
224 | * to doorbells with the process's doorbell page | ||
225 | */ | ||
226 | unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, | ||
227 | struct kfd_process *process, | ||
228 | unsigned int queue_id) | ||
229 | { | ||
230 | /* | ||
231 | * doorbell_id_offset accounts for doorbells taken by KGD. | ||
232 | * pasid * doorbell_process_allocation/sizeof(u32) adjusts | ||
233 | * to the process's doorbells | ||
234 | */ | ||
235 | return kfd->doorbell_id_offset + | ||
236 | process->pasid * (doorbell_process_allocation()/sizeof(u32)) + | ||
237 | queue_id; | ||
238 | } | ||
239 | |||
240 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd) | ||
241 | { | ||
242 | uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - | ||
243 | kfd->shared_resources.doorbell_start_offset) / | ||
244 | doorbell_process_allocation() + 1; | ||
245 | |||
246 | return num_of_elems; | ||
247 | |||
248 | } | ||
249 | |||
250 | phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, | ||
251 | struct kfd_process *process) | ||
252 | { | ||
253 | return dev->doorbell_base + | ||
254 | process->pasid * doorbell_process_allocation(); | ||
255 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c new file mode 100644 index 000000000000..2dfc4c0e85a4 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | |||
@@ -0,0 +1,355 @@ | |||
1 | /* | ||
2 | * Copyright 2014 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include <linux/device.h> | ||
25 | #include <linux/export.h> | ||
26 | #include <linux/err.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/uaccess.h> | ||
31 | #include <linux/compat.h> | ||
32 | #include <uapi/linux/kfd_ioctl.h> | ||
33 | #include <linux/time.h> | ||
34 | #include "kfd_priv.h" | ||
35 | #include <linux/mm.h> | ||
36 | #include <uapi/asm-generic/mman-common.h> | ||
37 | #include <asm/processor.h> | ||
38 | |||
39 | /* | ||
40 | * The primary memory I/O features being added for revisions of gfxip | ||
41 | * beyond 7.0 (Kaveri) are: | ||
42 | * | ||
43 | * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b | ||
44 | * | ||
45 | * “Flat” shader memory access – These are new shader vector memory | ||
46 | * operations that do not reference a T#/V# so a “pointer” is what is | ||
47 | * sourced from the vector gprs for direct access to memory. | ||
48 | * This pointer space has the Shared(LDS) and Private(Scratch) memory | ||
49 | * mapped into this pointer space as apertures. | ||
50 | * The hardware then determines how to direct the memory request | ||
51 | * based on what apertures the request falls in. | ||
52 | * | ||
53 | * Unaligned support and alignment check | ||
54 | * | ||
55 | * | ||
56 | * System Unified Address - SUA | ||
57 | * | ||
58 | * The standard usage for GPU virtual addresses are that they are mapped by | ||
59 | * a set of page tables we call GPUVM and these page tables are managed by | ||
60 | * a combination of vidMM/driver software components. The current virtual | ||
61 | * address (VA) range for GPUVM is 40b. | ||
62 | * | ||
63 | * As of gfxip7.1 and beyond we’re adding the ability for compute memory | ||
64 | * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access | ||
65 | * the same page tables used by host x86 processors and that are managed by | ||
66 | * the operating system. This is via a technique and hardware called ATC/IOMMU. | ||
67 | * The GPU has the capability of accessing both the GPUVM and ATC address | ||
68 | * spaces for a given VMID (process) simultaneously and we call this feature | ||
69 | * system unified address (SUA). | ||
70 | * | ||
71 | * There are three fundamental address modes of operation for a given VMID | ||
72 | * (process) on the GPU: | ||
73 | * | ||
74 | * HSA64 – 64b pointers and the default address space is ATC | ||
75 | * HSA32 – 32b pointers and the default address space is ATC | ||
76 | * GPUVM – 64b pointers and the default address space is GPUVM (driver | ||
77 | * model mode) | ||
78 | * | ||
79 | * | ||
80 | * HSA64 - ATC/IOMMU 64b | ||
81 | * | ||
82 | * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized | ||
83 | * by the CPU so an AMD CPU can only access the high area | ||
84 | * (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space | ||
85 | * so the actual VA carried to translation is 48b. There is a “hole” in | ||
86 | * the middle of the 64b VA space. | ||
87 | * | ||
88 | * The GPU not only has access to all of the CPU accessible address space via | ||
89 | * ATC/IOMMU, but it also has access to the GPUVM address space. The “system | ||
90 | * unified address” feature (SUA) is the mapping of GPUVM and ATC address | ||
91 | * spaces into a unified pointer space. The method we take for 64b mode is | ||
92 | * to map the full 40b GPUVM address space into the hole of the 64b address | ||
93 | * space. | ||
94 | |||
95 | * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we | ||
96 | * direct requests to be translated via GPUVM page tables instead of the | ||
97 | * IOMMU path. | ||
98 | * | ||
99 | * | ||
100 | * 64b to 49b Address conversion | ||
101 | * | ||
102 | * Note that there are still significant portions of unused regions (holes) | ||
103 | * in the 64b address space even for the GPU. There are several places in | ||
104 | * the pipeline (sw and hw), we wish to compress the 64b virtual address | ||
105 | * to a 49b address. This 49b address is constituted of an “ATC” bit | ||
106 | * plus a 48b virtual address. This 49b address is what is passed to the | ||
107 | * translation hardware. ATC==0 means the 48b address is a GPUVM address | ||
108 | * (max of 2^40 – 1) intended to be translated via GPUVM page tables. | ||
109 | * ATC==1 means the 48b address is intended to be translated via IOMMU | ||
110 | * page tables. | ||
111 | * | ||
112 | * A 64b pointer is compared to the apertures that are defined (Base/Limit), in | ||
113 | * this case the GPUVM aperture (red) is defined and if a pointer falls in this | ||
114 | * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero | ||
115 | * as part of the 64b to 49b conversion. | ||
116 | * | ||
117 | * Where this 64b to 49b conversion is done is a function of the usage. | ||
118 | * Most GPU memory access is via memory objects where the driver builds | ||
119 | * a descriptor which consists of a base address and a memory access by | ||
120 | * the GPU usually consists of some kind of an offset or Cartesian coordinate | ||
121 | * that references this memory descriptor. This is the case for shader | ||
122 | * instructions that reference the T# or V# constants, or for specified | ||
123 | * locations of assets (ex. the shader program location). In these cases | ||
124 | * the driver is what handles the 64b to 49b conversion and the base | ||
125 | * address in the descriptor (ex. V# or T# or shader program location) | ||
126 | * is defined as a 48b address w/ an ATC bit. For this usage a given | ||
127 | * memory object cannot straddle multiple apertures in the 64b address | ||
128 | * space. For example a shader program cannot jump in/out between ATC | ||
129 | * and GPUVM space. | ||
130 | * | ||
131 | * In some cases we wish to pass a 64b pointer to the GPU hardware and | ||
132 | * the GPU hw does the 64b to 49b conversion before passing memory | ||
133 | * requests to the cache/memory system. This is the case for the | ||
134 | * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers | ||
135 | * in scalar and vector GPRs respectively. | ||
136 | * | ||
137 | * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip | ||
138 | * hardware sends a 48b address along w/ an ATC bit, to the memory controller | ||
139 | * on the memory request interfaces. | ||
140 | * | ||
141 | * <client>_MC_rdreq_atc // read request ATC bit | ||
142 | * | ||
143 | * 0 : <client>_MC_rdreq_addr is a GPUVM VA | ||
144 | * | ||
145 | * 1 : <client>_MC_rdreq_addr is a ATC VA | ||
146 | * | ||
147 | * | ||
148 | * “Spare” aperture (APE1) | ||
149 | * | ||
150 | * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use | ||
151 | * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the | ||
152 | * config tables for setting cache policies. The “spare” (APE1) aperture is | ||
153 | * motivated by getting a different Mtype from the default. | ||
154 | * The default aperture isn’t an actual base/limit aperture; it is just the | ||
155 | * address space that doesn’t hit any defined base/limit apertures. | ||
156 | * The following diagram is a complete picture of the gfxip7.x SUA apertures. | ||
157 | * The APE1 can be placed either below or above | ||
158 | * the hole (cannot be in the hole). | ||
159 | * | ||
160 | * | ||
161 | * General Aperture definitions and rules | ||
162 | * | ||
163 | * An aperture register definition consists of a Base, Limit, Mtype, and | ||
164 | * usually an ATC bit indicating which translation tables that aperture uses. | ||
165 | * In all cases (for SUA and DUA apertures discussed later), aperture base | ||
166 | * and limit definitions are 64KB aligned. | ||
167 | * | ||
168 | * <ape>_Base[63:0] = { <ape>_Base_register[63:16], 0x0000 } | ||
169 | * | ||
170 | * <ape>_Limit[63:0] = { <ape>_Limit_register[63:16], 0xFFFF } | ||
171 | * | ||
172 | * The base and limit are considered inclusive to an aperture so being | ||
173 | * inside an aperture means (address >= Base) AND (address <= Limit). | ||
174 | * | ||
175 | * In no case is a payload that straddles multiple apertures expected to work. | ||
176 | * For example a load_dword_x4 that starts in one aperture and ends in another, | ||
177 | * does not work. For the vector FLAT_* ops we have detection capability in | ||
178 | * the shader for reporting a “memory violation” back to the | ||
179 | * SQ block for use in traps. | ||
180 | * A memory violation results when an op falls into the hole, | ||
181 | * or a payload straddles multiple apertures. The S_LOAD instruction | ||
182 | * does not have this detection. | ||
183 | * | ||
184 | * Apertures cannot overlap. | ||
185 | * | ||
186 | * | ||
187 | * | ||
188 | * HSA32 - ATC/IOMMU 32b | ||
189 | * | ||
190 | * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR | ||
191 | * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b | ||
192 | * will not fit so there is only partial visibility to the GPUVM | ||
193 | * space (defined by the aperture) for S_LOAD and FLAT_* ops. | ||
194 | * There is no spare (APE1) aperture for HSA32 mode. | ||
195 | * | ||
196 | * | ||
197 | * GPUVM 64b mode (driver model) | ||
198 | * | ||
199 | * This mode is related to HSA64 in that the difference really is that | ||
200 | * the default aperture is GPUVM (ATC==0) and not ATC space. | ||
201 | * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for | ||
202 | * SUA GPUVM mode, but does not support HSA32/HSA64. | ||
203 | * | ||
204 | * | ||
205 | * Device Unified Address - DUA | ||
206 | * | ||
207 | * Device unified address (DUA) is the name of the feature that maps the | ||
208 | * Shared(LDS) memory and Private(Scratch) memory into the overall address | ||
209 | * space for use by the new FLAT_* vector memory ops. The Shared and | ||
210 | * Private memories are mapped as apertures into the address space, | ||
211 | * and the hardware detects when a FLAT_* memory request is to be redirected | ||
212 | * to the LDS or Scratch memory when it falls into one of these apertures. | ||
213 | * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and | ||
214 | * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes, | ||
215 | * the Shared/Private apertures are always placed in a limited selection of | ||
216 | * options in the hole of the 64b address space. For HSA32 mode, the | ||
217 | * Shared/Private apertures can be placed anywhere in the 32b space | ||
218 | * except at 0. | ||
219 | * | ||
220 | * | ||
221 | * HSA64 Apertures for FLAT_* vector ops | ||
222 | * | ||
223 | * For HSA64 SUA mode, the Shared and Private apertures are always placed | ||
224 | * in the hole w/ a limited selection of possible locations. The requests | ||
225 | * that fall in the private aperture are expanded as a function of the | ||
226 | * work-item id (tid) and redirected to the location of the | ||
227 | * “hidden private memory”. The hidden private can be placed in either GPUVM | ||
228 | * or ATC space. The addresses that fall in the shared aperture are | ||
229 | * re-directed to the on-chip LDS memory hardware. | ||
230 | * | ||
231 | * | ||
232 | * HSA32 Apertures for FLAT_* vector ops | ||
233 | * | ||
234 | * In HSA32 mode, the Private and Shared apertures can be placed anywhere | ||
235 | * in the 32b space except at 0 (Private or Shared Base at zero disables | ||
236 | * the apertures). If the base address of the apertures are non-zero | ||
237 | * (ie apertures exists), the size is always 64KB. | ||
238 | * | ||
239 | * | ||
240 | * GPUVM Apertures for FLAT_* vector ops | ||
241 | * | ||
242 | * In GPUVM mode, the Shared/Private apertures are specified identically | ||
243 | * to HSA64 mode where they are always in the hole at a limited selection | ||
244 | * of locations. | ||
245 | * | ||
246 | * | ||
247 | * Aperture Definitions for SUA and DUA | ||
248 | * | ||
249 | * The interpretation of the aperture register definitions for a given | ||
250 | * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or | ||
251 | * GPUVM64 discussed in previous sections. The mode is first decoded, and | ||
252 | * then the remaining register decode is a function of the mode. | ||
253 | * | ||
254 | * | ||
255 | * SUA Mode Decode | ||
256 | * | ||
257 | * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from | ||
258 | * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and | ||
259 | * the SH_MEM_CONFIG:PTR32 bits. | ||
260 | * | ||
261 | * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode | ||
262 | * | ||
263 | * 1 0 HSA64 | ||
264 | * | ||
265 | * 1 1 HSA32 | ||
266 | * | ||
267 | * 0 X GPUVM64 | ||
268 | * | ||
269 | * In general the hardware will ignore the PTR32 bit and treat | ||
270 | * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0 | ||
271 | * when DATA_ATC=0. | ||
272 | * | ||
273 | * The DATA_ATC bit is only set for compute dispatches. | ||
274 | * All “Draw” dispatches are hardcoded to GPUVM64 mode | ||
275 | * for FLAT_* / S_LOAD operations. | ||
276 | */ | ||
277 | |||
/*
 * Aperture address construction for 64-bit (HSA64/GPUVM64) processes.
 * The per-GPU node id (1-based) is placed in the three MSBs so that no
 * aperture base is ever 0. All macro arguments are fully parenthesized
 * so callers may pass arbitrary expressions (the original
 * MAKE_SCRATCH_APP_LIMIT omitted the parentheses around "base").
 */
#define MAKE_GPUVM_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x1000000000000)

/* Limit shares the base's top 24 bits; low 40 bits are all-ones. */
#define MAKE_GPUVM_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFF0000000000) | 0xFFFFFFFFFF)

#define MAKE_SCRATCH_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x100000000)

/* Limit shares the base's top 32 bits; low 32 bits are all-ones. */
#define MAKE_SCRATCH_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)

#define MAKE_LDS_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x0)

#define MAKE_LDS_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
295 | int kfd_init_apertures(struct kfd_process *process) | ||
296 | { | ||
297 | uint8_t id = 0; | ||
298 | struct kfd_dev *dev; | ||
299 | struct kfd_process_device *pdd; | ||
300 | |||
301 | mutex_lock(&process->mutex); | ||
302 | |||
303 | /*Iterating over all devices*/ | ||
304 | while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && | ||
305 | id < NUM_OF_SUPPORTED_GPUS) { | ||
306 | |||
307 | pdd = kfd_get_process_device_data(dev, process, 1); | ||
308 | |||
309 | /* | ||
310 | * For 64 bit process aperture will be statically reserved in | ||
311 | * the x86_64 non canonical process address space | ||
312 | * amdkfd doesn't currently support apertures for 32 bit process | ||
313 | */ | ||
314 | if (process->is_32bit_user_mode) { | ||
315 | pdd->lds_base = pdd->lds_limit = 0; | ||
316 | pdd->gpuvm_base = pdd->gpuvm_limit = 0; | ||
317 | pdd->scratch_base = pdd->scratch_limit = 0; | ||
318 | } else { | ||
319 | /* | ||
320 | * node id couldn't be 0 - the three MSB bits of | ||
321 | * aperture shoudn't be 0 | ||
322 | */ | ||
323 | pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); | ||
324 | |||
325 | pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); | ||
326 | |||
327 | pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); | ||
328 | |||
329 | pdd->gpuvm_limit = | ||
330 | MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); | ||
331 | |||
332 | pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); | ||
333 | |||
334 | pdd->scratch_limit = | ||
335 | MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); | ||
336 | } | ||
337 | |||
338 | dev_dbg(kfd_device, "node id %u\n", id); | ||
339 | dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); | ||
340 | dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); | ||
341 | dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); | ||
342 | dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); | ||
343 | dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); | ||
344 | dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); | ||
345 | dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit); | ||
346 | |||
347 | id++; | ||
348 | } | ||
349 | |||
350 | mutex_unlock(&process->mutex); | ||
351 | |||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | |||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 45654be039ff..a05116b0a07d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c | |||
@@ -22,7 +22,6 @@ | |||
22 | 22 | ||
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/notifier.h> | ||
26 | #include <linux/moduleparam.h> | 25 | #include <linux/moduleparam.h> |
27 | #include <linux/device.h> | 26 | #include <linux/device.h> |
28 | #include "kfd_priv.h" | 27 | #include "kfd_priv.h" |
@@ -46,6 +45,16 @@ static const struct kgd2kfd_calls kgd2kfd = { | |||
46 | .resume = kgd2kfd_resume, | 45 | .resume = kgd2kfd_resume, |
47 | }; | 46 | }; |
48 | 47 | ||
48 | int max_num_of_processes = KFD_MAX_NUM_OF_PROCESSES_DEFAULT; | ||
49 | module_param(max_num_of_processes, int, 0444); | ||
50 | MODULE_PARM_DESC(max_num_of_processes, | ||
51 | "Kernel cmdline parameter that defines the amdkfd maximum number of supported processes"); | ||
52 | |||
53 | int max_num_of_queues_per_process = KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT; | ||
54 | module_param(max_num_of_queues_per_process, int, 0444); | ||
55 | MODULE_PARM_DESC(max_num_of_queues_per_process, | ||
56 | "Kernel cmdline parameter that defines the amdkfd maximum number of supported queues per process"); | ||
57 | |||
49 | bool kgd2kfd_init(unsigned interface_version, | 58 | bool kgd2kfd_init(unsigned interface_version, |
50 | const struct kfd2kgd_calls *f2g, | 59 | const struct kfd2kgd_calls *f2g, |
51 | const struct kgd2kfd_calls **g2f) | 60 | const struct kgd2kfd_calls **g2f) |
@@ -57,6 +66,10 @@ bool kgd2kfd_init(unsigned interface_version, | |||
57 | if (interface_version != KFD_INTERFACE_VERSION) | 66 | if (interface_version != KFD_INTERFACE_VERSION) |
58 | return false; | 67 | return false; |
59 | 68 | ||
69 | /* Protection against multiple amd kgd loads */ | ||
70 | if (kfd2kgd) | ||
71 | return true; | ||
72 | |||
60 | kfd2kgd = f2g; | 73 | kfd2kgd = f2g; |
61 | *g2f = &kgd2kfd; | 74 | *g2f = &kgd2kfd; |
62 | 75 | ||
@@ -72,6 +85,26 @@ static int __init kfd_module_init(void) | |||
72 | { | 85 | { |
73 | int err; | 86 | int err; |
74 | 87 | ||
88 | kfd2kgd = NULL; | ||
89 | |||
90 | /* Verify module parameters */ | ||
91 | if ((max_num_of_processes < 0) || | ||
92 | (max_num_of_processes > KFD_MAX_NUM_OF_PROCESSES)) { | ||
93 | pr_err("kfd: max_num_of_processes must be between 0 to KFD_MAX_NUM_OF_PROCESSES\n"); | ||
94 | return -1; | ||
95 | } | ||
96 | |||
97 | if ((max_num_of_queues_per_process < 0) || | ||
98 | (max_num_of_queues_per_process > | ||
99 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)) { | ||
100 | pr_err("kfd: max_num_of_queues_per_process must be between 0 to KFD_MAX_NUM_OF_QUEUES_PER_PROCESS\n"); | ||
101 | return -1; | ||
102 | } | ||
103 | |||
104 | err = kfd_pasid_init(); | ||
105 | if (err < 0) | ||
106 | goto err_pasid; | ||
107 | |||
75 | err = kfd_chardev_init(); | 108 | err = kfd_chardev_init(); |
76 | if (err < 0) | 109 | if (err < 0) |
77 | goto err_ioctl; | 110 | goto err_ioctl; |
@@ -80,6 +113,8 @@ static int __init kfd_module_init(void) | |||
80 | if (err < 0) | 113 | if (err < 0) |
81 | goto err_topology; | 114 | goto err_topology; |
82 | 115 | ||
116 | kfd_process_create_wq(); | ||
117 | |||
83 | dev_info(kfd_device, "Initialized module\n"); | 118 | dev_info(kfd_device, "Initialized module\n"); |
84 | 119 | ||
85 | return 0; | 120 | return 0; |
@@ -87,13 +122,17 @@ static int __init kfd_module_init(void) | |||
87 | err_topology: | 122 | err_topology: |
88 | kfd_chardev_exit(); | 123 | kfd_chardev_exit(); |
89 | err_ioctl: | 124 | err_ioctl: |
125 | kfd_pasid_exit(); | ||
126 | err_pasid: | ||
90 | return err; | 127 | return err; |
91 | } | 128 | } |
92 | 129 | ||
93 | static void __exit kfd_module_exit(void) | 130 | static void __exit kfd_module_exit(void) |
94 | { | 131 | { |
132 | kfd_process_destroy_wq(); | ||
95 | kfd_topology_shutdown(); | 133 | kfd_topology_shutdown(); |
96 | kfd_chardev_exit(); | 134 | kfd_chardev_exit(); |
135 | kfd_pasid_exit(); | ||
97 | dev_info(kfd_device, "Removed module\n"); | 136 | dev_info(kfd_device, "Removed module\n"); |
98 | } | 137 | } |
99 | 138 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c new file mode 100644 index 000000000000..2458ab7c0c6e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | |||
@@ -0,0 +1,97 @@ | |||
1 | /* | ||
2 | * Copyright 2014 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #include <linux/slab.h> | ||
24 | #include <linux/types.h> | ||
25 | #include "kfd_priv.h" | ||
26 | |||
27 | static unsigned long *pasid_bitmap; | ||
28 | static unsigned int pasid_limit; | ||
29 | static DEFINE_MUTEX(pasid_mutex); | ||
30 | |||
31 | int kfd_pasid_init(void) | ||
32 | { | ||
33 | pasid_limit = max_num_of_processes; | ||
34 | |||
35 | pasid_bitmap = kzalloc(DIV_ROUND_UP(pasid_limit, BITS_PER_BYTE), | ||
36 | GFP_KERNEL); | ||
37 | if (!pasid_bitmap) | ||
38 | return -ENOMEM; | ||
39 | |||
40 | set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */ | ||
41 | |||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | void kfd_pasid_exit(void) | ||
46 | { | ||
47 | kfree(pasid_bitmap); | ||
48 | } | ||
49 | |||
50 | bool kfd_set_pasid_limit(unsigned int new_limit) | ||
51 | { | ||
52 | if (new_limit < pasid_limit) { | ||
53 | bool ok; | ||
54 | |||
55 | mutex_lock(&pasid_mutex); | ||
56 | |||
57 | /* ensure that no pasids >= new_limit are in-use */ | ||
58 | ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) == | ||
59 | pasid_limit); | ||
60 | if (ok) | ||
61 | pasid_limit = new_limit; | ||
62 | |||
63 | mutex_unlock(&pasid_mutex); | ||
64 | |||
65 | return ok; | ||
66 | } | ||
67 | |||
68 | return true; | ||
69 | } | ||
70 | |||
71 | inline unsigned int kfd_get_pasid_limit(void) | ||
72 | { | ||
73 | return pasid_limit; | ||
74 | } | ||
75 | |||
76 | unsigned int kfd_pasid_alloc(void) | ||
77 | { | ||
78 | unsigned int found; | ||
79 | |||
80 | mutex_lock(&pasid_mutex); | ||
81 | |||
82 | found = find_first_zero_bit(pasid_bitmap, pasid_limit); | ||
83 | if (found == pasid_limit) | ||
84 | found = 0; | ||
85 | else | ||
86 | set_bit(found, pasid_bitmap); | ||
87 | |||
88 | mutex_unlock(&pasid_mutex); | ||
89 | |||
90 | return found; | ||
91 | } | ||
92 | |||
/* Return @pasid to the pool. PASID 0 is reserved and must never be freed. */
void kfd_pasid_free(unsigned int pasid)
{
	BUG_ON(pasid == 0 || pasid >= pasid_limit);
	/* clear_bit is atomic, so pasid_mutex is not needed here. */
	clear_bit(pasid, pasid_bitmap);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b58b86dcc057..77d15dbebb0c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/atomic.h> | 30 | #include <linux/atomic.h> |
31 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
32 | #include <linux/spinlock.h> | 32 | #include <linux/spinlock.h> |
33 | #include <linux/kfd_ioctl.h> | ||
33 | #include <kgd_kfd_interface.h> | 34 | #include <kgd_kfd_interface.h> |
34 | 35 | ||
35 | #define KFD_SYSFS_FILE_MODE 0444 | 36 | #define KFD_SYSFS_FILE_MODE 0444 |
@@ -41,9 +42,26 @@ | |||
41 | #define kfd_alloc_struct(ptr_to_struct) \ | 42 | #define kfd_alloc_struct(ptr_to_struct) \ |
42 | ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) | 43 | ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) |
43 | 44 | ||
45 | /* Kernel module parameter to specify maximum number of supported processes */ | ||
46 | extern int max_num_of_processes; | ||
47 | |||
48 | #define KFD_MAX_NUM_OF_PROCESSES_DEFAULT 32 | ||
49 | #define KFD_MAX_NUM_OF_PROCESSES 512 | ||
50 | |||
51 | /* | ||
52 | * Kernel module parameter to specify maximum number of supported queues | ||
53 | * per process | ||
54 | */ | ||
55 | extern int max_num_of_queues_per_process; | ||
56 | |||
57 | #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS_DEFAULT 128 | ||
58 | #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 | ||
59 | |||
60 | |||
44 | struct kfd_device_info { | 61 | struct kfd_device_info { |
45 | unsigned int max_pasid_bits; | 62 | unsigned int max_pasid_bits; |
46 | size_t ih_ring_entry_size; | 63 | size_t ih_ring_entry_size; |
64 | uint16_t mqd_size_aligned; | ||
47 | }; | 65 | }; |
48 | 66 | ||
49 | struct kfd_dev { | 67 | struct kfd_dev { |
@@ -54,6 +72,21 @@ struct kfd_dev { | |||
54 | 72 | ||
55 | unsigned int id; /* topology stub index */ | 73 | unsigned int id; /* topology stub index */ |
56 | 74 | ||
75 | phys_addr_t doorbell_base; /* Start of actual doorbells used by | ||
76 | * KFD. It is aligned for mapping | ||
77 | * into user mode | ||
78 | */ | ||
79 | size_t doorbell_id_offset; /* Doorbell offset (from KFD doorbell | ||
80 | * to HW doorbell, GFX reserved some | ||
81 | * at the start) | ||
82 | */ | ||
83 | size_t doorbell_process_limit; /* Number of processes we have doorbell | ||
84 | * space for. | ||
85 | */ | ||
86 | u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells | ||
87 | * page used by kernel queue | ||
88 | */ | ||
89 | |||
57 | struct kgd2kfd_shared_resources shared_resources; | 90 | struct kgd2kfd_shared_resources shared_resources; |
58 | 91 | ||
59 | bool init_complete; | 92 | bool init_complete; |
@@ -69,15 +102,122 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd); | |||
69 | 102 | ||
70 | extern const struct kfd2kgd_calls *kfd2kgd; | 103 | extern const struct kfd2kgd_calls *kfd2kgd; |
71 | 104 | ||
105 | struct kfd_mem_obj { | ||
106 | void *bo; | ||
107 | uint64_t gpu_addr; | ||
108 | uint32_t *cpu_ptr; | ||
109 | }; | ||
110 | |||
111 | enum kfd_mempool { | ||
112 | KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, | ||
113 | KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, | ||
114 | KFD_MEMPOOL_FRAMEBUFFER = 3, | ||
115 | }; | ||
116 | |||
72 | /* Character device interface */ | 117 | /* Character device interface */ |
73 | int kfd_chardev_init(void); | 118 | int kfd_chardev_init(void); |
74 | void kfd_chardev_exit(void); | 119 | void kfd_chardev_exit(void); |
75 | struct device *kfd_chardev(void); | 120 | struct device *kfd_chardev(void); |
76 | 121 | ||
122 | |||
123 | /* Data that is per-process-per device. */ | ||
124 | struct kfd_process_device { | ||
125 | /* | ||
126 | * List of all per-device data for a process. | ||
127 | * Starts from kfd_process.per_device_data. | ||
128 | */ | ||
129 | struct list_head per_device_list; | ||
130 | |||
131 | /* The device that owns this data. */ | ||
132 | struct kfd_dev *dev; | ||
133 | |||
134 | |||
135 | /*Apertures*/ | ||
136 | uint64_t lds_base; | ||
137 | uint64_t lds_limit; | ||
138 | uint64_t gpuvm_base; | ||
139 | uint64_t gpuvm_limit; | ||
140 | uint64_t scratch_base; | ||
141 | uint64_t scratch_limit; | ||
142 | |||
143 | /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ | ||
144 | bool bound; | ||
145 | }; | ||
146 | |||
77 | /* Process data */ | 147 | /* Process data */ |
78 | struct kfd_process { | 148 | struct kfd_process { |
149 | /* | ||
150 | * kfd_process are stored in an mm_struct*->kfd_process* | ||
151 | * hash table (kfd_processes in kfd_process.c) | ||
152 | */ | ||
153 | struct hlist_node kfd_processes; | ||
154 | |||
155 | struct mm_struct *mm; | ||
156 | |||
157 | struct mutex mutex; | ||
158 | |||
159 | /* | ||
160 | * In any process, the thread that started main() is the lead | ||
161 | * thread and outlives the rest. | ||
162 | * It is here because amd_iommu_bind_pasid wants a task_struct. | ||
163 | */ | ||
164 | struct task_struct *lead_thread; | ||
165 | |||
166 | /* We want to receive a notification when the mm_struct is destroyed */ | ||
167 | struct mmu_notifier mmu_notifier; | ||
168 | |||
169 | /* Use for delayed freeing of kfd_process structure */ | ||
170 | struct rcu_head rcu; | ||
171 | |||
172 | unsigned int pasid; | ||
173 | |||
174 | /* | ||
175 | * List of kfd_process_device structures, | ||
176 | * one for each device the process is using. | ||
177 | */ | ||
178 | struct list_head per_device_data; | ||
179 | |||
180 | /* The process's queues. */ | ||
181 | size_t queue_array_size; | ||
182 | |||
183 | /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ | ||
184 | struct kfd_queue **queues; | ||
185 | |||
186 | unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; | ||
187 | |||
188 | /*Is the user space process 32 bit?*/ | ||
189 | bool is_32bit_user_mode; | ||
79 | }; | 190 | }; |
80 | 191 | ||
192 | void kfd_process_create_wq(void); | ||
193 | void kfd_process_destroy_wq(void); | ||
194 | struct kfd_process *kfd_create_process(const struct task_struct *); | ||
195 | struct kfd_process *kfd_get_process(const struct task_struct *); | ||
196 | |||
197 | struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, | ||
198 | struct kfd_process *p, | ||
199 | int create_pdd); | ||
200 | |||
201 | /* PASIDs */ | ||
202 | int kfd_pasid_init(void); | ||
203 | void kfd_pasid_exit(void); | ||
204 | bool kfd_set_pasid_limit(unsigned int new_limit); | ||
205 | unsigned int kfd_get_pasid_limit(void); | ||
206 | unsigned int kfd_pasid_alloc(void); | ||
207 | void kfd_pasid_free(unsigned int pasid); | ||
208 | |||
209 | /* Doorbells */ | ||
210 | void kfd_doorbell_init(struct kfd_dev *kfd); | ||
211 | int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); | ||
212 | u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | ||
213 | unsigned int *doorbell_off); | ||
214 | void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); | ||
215 | u32 read_kernel_doorbell(u32 __iomem *db); | ||
216 | void write_kernel_doorbell(u32 __iomem *db, u32 value); | ||
217 | unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, | ||
218 | struct kfd_process *process, | ||
219 | unsigned int queue_id); | ||
220 | |||
81 | extern struct device *kfd_device; | 221 | extern struct device *kfd_device; |
82 | 222 | ||
83 | /* Topology */ | 223 | /* Topology */ |
@@ -96,4 +236,11 @@ void kgd2kfd_interrupt(struct kfd_dev *dev, const void *ih_ring_entry); | |||
96 | void kgd2kfd_suspend(struct kfd_dev *dev); | 236 | void kgd2kfd_suspend(struct kfd_dev *dev); |
97 | int kgd2kfd_resume(struct kfd_dev *dev); | 237 | int kgd2kfd_resume(struct kfd_dev *dev); |
98 | 238 | ||
239 | /* amdkfd Apertures */ | ||
240 | int kfd_init_apertures(struct kfd_process *process); | ||
241 | |||
242 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd); | ||
243 | phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, | ||
244 | struct kfd_process *process); | ||
245 | |||
99 | #endif | 246 | #endif |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c new file mode 100644 index 000000000000..5596f698cc11 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c | |||
@@ -0,0 +1,383 @@ | |||
1 | /* | ||
2 | * Copyright 2014 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/log2.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/notifier.h> | ||
28 | struct mm_struct; | ||
29 | |||
30 | #include "kfd_priv.h" | ||
31 | |||
32 | /* | ||
33 | * Initial size for the array of queues. | ||
34 | * The allocated size is doubled each time | ||
35 | * it is exceeded up to MAX_PROCESS_QUEUES. | ||
36 | */ | ||
37 | #define INITIAL_QUEUE_ARRAY_SIZE 16 | ||
38 | |||
39 | /* | ||
40 | * List of struct kfd_process (field kfd_process). | ||
41 | * Unique/indexed by mm_struct* | ||
42 | */ | ||
43 | #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ | ||
44 | static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); | ||
45 | static DEFINE_MUTEX(kfd_processes_mutex); | ||
46 | |||
47 | DEFINE_STATIC_SRCU(kfd_processes_srcu); | ||
48 | |||
49 | static struct workqueue_struct *kfd_process_wq; | ||
50 | |||
51 | struct kfd_process_release_work { | ||
52 | struct work_struct kfd_work; | ||
53 | struct kfd_process *p; | ||
54 | }; | ||
55 | |||
56 | static struct kfd_process *find_process(const struct task_struct *thread); | ||
57 | static struct kfd_process *create_process(const struct task_struct *thread); | ||
58 | |||
59 | void kfd_process_create_wq(void) | ||
60 | { | ||
61 | if (!kfd_process_wq) | ||
62 | kfd_process_wq = create_workqueue("kfd_process_wq"); | ||
63 | } | ||
64 | |||
65 | void kfd_process_destroy_wq(void) | ||
66 | { | ||
67 | if (kfd_process_wq) { | ||
68 | flush_workqueue(kfd_process_wq); | ||
69 | destroy_workqueue(kfd_process_wq); | ||
70 | kfd_process_wq = NULL; | ||
71 | } | ||
72 | } | ||
73 | |||
/*
 * Look up, or create on first use, the kfd_process for @thread's
 * address space. Returns the process or an ERR_PTR.
 *
 * Lock order: mm->mmap_sem (write) is taken before kfd_processes_mutex
 * because create_process() calls __mmu_notifier_register, which
 * requires mmap_sem held for write.
 */
struct kfd_process *kfd_create_process(const struct task_struct *thread)
{
	struct kfd_process *process;

	/* kfd_process_create_wq() must have run at module init. */
	BUG_ON(!kfd_process_wq);

	/* Kernel threads (no mm) cannot use KFD. */
	if (thread->mm == NULL)
		return ERR_PTR(-EINVAL);

	/* Only the pthreads threading model is supported. */
	if (thread->group_leader->mm != thread->mm)
		return ERR_PTR(-EINVAL);

	/* Take mmap_sem because we call __mmu_notifier_register inside */
	down_write(&thread->mm->mmap_sem);

	/*
	 * take kfd processes mutex before starting of process creation
	 * so there won't be a case where two threads of the same process
	 * create two kfd_process structures
	 */
	mutex_lock(&kfd_processes_mutex);

	/* A prior open of /dev/kfd could have already created the process. */
	process = find_process(thread);
	if (process)
		pr_debug("kfd: process already found\n");

	if (!process)
		process = create_process(thread);

	mutex_unlock(&kfd_processes_mutex);

	up_write(&thread->mm->mmap_sem);

	return process;
}
111 | |||
112 | struct kfd_process *kfd_get_process(const struct task_struct *thread) | ||
113 | { | ||
114 | struct kfd_process *process; | ||
115 | |||
116 | if (thread->mm == NULL) | ||
117 | return ERR_PTR(-EINVAL); | ||
118 | |||
119 | /* Only the pthreads threading model is supported. */ | ||
120 | if (thread->group_leader->mm != thread->mm) | ||
121 | return ERR_PTR(-EINVAL); | ||
122 | |||
123 | process = find_process(thread); | ||
124 | |||
125 | return process; | ||
126 | } | ||
127 | |||
128 | static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) | ||
129 | { | ||
130 | struct kfd_process *process; | ||
131 | |||
132 | hash_for_each_possible_rcu(kfd_processes_table, process, | ||
133 | kfd_processes, (uintptr_t)mm) | ||
134 | if (process->mm == mm) | ||
135 | return process; | ||
136 | |||
137 | return NULL; | ||
138 | } | ||
139 | |||
140 | static struct kfd_process *find_process(const struct task_struct *thread) | ||
141 | { | ||
142 | struct kfd_process *p; | ||
143 | int idx; | ||
144 | |||
145 | idx = srcu_read_lock(&kfd_processes_srcu); | ||
146 | p = find_process_by_mm(thread->mm); | ||
147 | srcu_read_unlock(&kfd_processes_srcu, idx); | ||
148 | |||
149 | return p; | ||
150 | } | ||
151 | |||
152 | static void kfd_process_wq_release(struct work_struct *work) | ||
153 | { | ||
154 | struct kfd_process_release_work *my_work; | ||
155 | struct kfd_process_device *pdd, *temp; | ||
156 | struct kfd_process *p; | ||
157 | |||
158 | my_work = (struct kfd_process_release_work *) work; | ||
159 | |||
160 | p = my_work->p; | ||
161 | |||
162 | mutex_lock(&p->mutex); | ||
163 | |||
164 | list_for_each_entry_safe(pdd, temp, &p->per_device_data, | ||
165 | per_device_list) { | ||
166 | list_del(&pdd->per_device_list); | ||
167 | |||
168 | kfree(pdd); | ||
169 | } | ||
170 | |||
171 | kfd_pasid_free(p->pasid); | ||
172 | |||
173 | mutex_unlock(&p->mutex); | ||
174 | |||
175 | mutex_destroy(&p->mutex); | ||
176 | |||
177 | kfree(p->queues); | ||
178 | |||
179 | kfree(p); | ||
180 | |||
181 | kfree((void *)work); | ||
182 | } | ||
183 | |||
184 | static void kfd_process_destroy_delayed(struct rcu_head *rcu) | ||
185 | { | ||
186 | struct kfd_process_release_work *work; | ||
187 | struct kfd_process *p; | ||
188 | |||
189 | BUG_ON(!kfd_process_wq); | ||
190 | |||
191 | p = container_of(rcu, struct kfd_process, rcu); | ||
192 | BUG_ON(atomic_read(&p->mm->mm_count) <= 0); | ||
193 | |||
194 | mmdrop(p->mm); | ||
195 | |||
196 | work = (struct kfd_process_release_work *) | ||
197 | kmalloc(sizeof(struct kfd_process_release_work), GFP_KERNEL); | ||
198 | |||
199 | if (work) { | ||
200 | INIT_WORK((struct work_struct *) work, kfd_process_wq_release); | ||
201 | work->p = p; | ||
202 | queue_work(kfd_process_wq, (struct work_struct *) work); | ||
203 | } | ||
204 | } | ||
205 | |||
/*
 * mmu_notifier .release hook: the process address space is being torn
 * down. Unpublish the kfd_process from the hash table, wait out SRCU
 * readers, then schedule delayed destruction.
 */
static void kfd_process_notifier_release(struct mmu_notifier *mn,
					struct mm_struct *mm)
{
	struct kfd_process *p;

	/*
	 * The kfd_process structure can not be free because the
	 * mmu_notifier srcu is read locked
	 */
	p = container_of(mn, struct kfd_process, mmu_notifier);
	BUG_ON(p->mm != mm);

	/* Remove from the table, then wait for all SRCU readers to finish. */
	mutex_lock(&kfd_processes_mutex);
	hash_del_rcu(&p->kfd_processes);
	mutex_unlock(&kfd_processes_mutex);
	synchronize_srcu(&kfd_processes_srcu);

	/*
	 * Because we drop mm_count inside kfd_process_destroy_delayed
	 * and because the mmu_notifier_unregister function also drop
	 * mm_count we need to take an extra count here.
	 */
	atomic_inc(&p->mm->mm_count);
	mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
}
232 | |||
/* Only .release is needed: cleanup when the address space is destroyed. */
static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
	.release = kfd_process_notifier_release,
};
236 | |||
237 | static struct kfd_process *create_process(const struct task_struct *thread) | ||
238 | { | ||
239 | struct kfd_process *process; | ||
240 | int err = -ENOMEM; | ||
241 | |||
242 | process = kzalloc(sizeof(*process), GFP_KERNEL); | ||
243 | |||
244 | if (!process) | ||
245 | goto err_alloc_process; | ||
246 | |||
247 | process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, | ||
248 | sizeof(process->queues[0]), GFP_KERNEL); | ||
249 | if (!process->queues) | ||
250 | goto err_alloc_queues; | ||
251 | |||
252 | process->pasid = kfd_pasid_alloc(); | ||
253 | if (process->pasid == 0) | ||
254 | goto err_alloc_pasid; | ||
255 | |||
256 | mutex_init(&process->mutex); | ||
257 | |||
258 | process->mm = thread->mm; | ||
259 | |||
260 | /* register notifier */ | ||
261 | process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; | ||
262 | err = __mmu_notifier_register(&process->mmu_notifier, process->mm); | ||
263 | if (err) | ||
264 | goto err_mmu_notifier; | ||
265 | |||
266 | hash_add_rcu(kfd_processes_table, &process->kfd_processes, | ||
267 | (uintptr_t)process->mm); | ||
268 | |||
269 | process->lead_thread = thread->group_leader; | ||
270 | |||
271 | process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; | ||
272 | |||
273 | INIT_LIST_HEAD(&process->per_device_data); | ||
274 | |||
275 | return process; | ||
276 | |||
277 | err_mmu_notifier: | ||
278 | kfd_pasid_free(process->pasid); | ||
279 | err_alloc_pasid: | ||
280 | kfree(process->queues); | ||
281 | err_alloc_queues: | ||
282 | kfree(process); | ||
283 | err_alloc_process: | ||
284 | return ERR_PTR(err); | ||
285 | } | ||
286 | |||
287 | struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, | ||
288 | struct kfd_process *p, | ||
289 | int create_pdd) | ||
290 | { | ||
291 | struct kfd_process_device *pdd = NULL; | ||
292 | |||
293 | list_for_each_entry(pdd, &p->per_device_data, per_device_list) | ||
294 | if (pdd->dev == dev) | ||
295 | return pdd; | ||
296 | |||
297 | if (create_pdd) { | ||
298 | pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); | ||
299 | if (pdd != NULL) { | ||
300 | pdd->dev = dev; | ||
301 | list_add(&pdd->per_device_list, &p->per_device_data); | ||
302 | } | ||
303 | } | ||
304 | |||
305 | return pdd; | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * Direct the IOMMU to bind the process (specifically the pasid->mm) | ||
310 | * to the device. | ||
311 | * Unbinding occurs when the process dies or the device is removed. | ||
312 | * | ||
313 | * Assumes that the process lock is held. | ||
314 | */ | ||
315 | struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, | ||
316 | struct kfd_process *p) | ||
317 | { | ||
318 | struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p, 1); | ||
319 | |||
320 | if (pdd == NULL) | ||
321 | return ERR_PTR(-ENOMEM); | ||
322 | |||
323 | if (pdd->bound) | ||
324 | return pdd; | ||
325 | |||
326 | pdd->bound = true; | ||
327 | |||
328 | return pdd; | ||
329 | } | ||
330 | |||
331 | void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) | ||
332 | { | ||
333 | struct kfd_process *p; | ||
334 | struct kfd_process_device *pdd; | ||
335 | int idx, i; | ||
336 | |||
337 | BUG_ON(dev == NULL); | ||
338 | |||
339 | idx = srcu_read_lock(&kfd_processes_srcu); | ||
340 | |||
341 | hash_for_each_rcu(kfd_processes_table, i, p, kfd_processes) | ||
342 | if (p->pasid == pasid) | ||
343 | break; | ||
344 | |||
345 | srcu_read_unlock(&kfd_processes_srcu, idx); | ||
346 | |||
347 | BUG_ON(p->pasid != pasid); | ||
348 | |||
349 | mutex_lock(&p->mutex); | ||
350 | |||
351 | pdd = kfd_get_process_device_data(dev, p, 0); | ||
352 | |||
353 | /* | ||
354 | * Just mark pdd as unbound, because we still need it to call | ||
355 | * amd_iommu_unbind_pasid() in when the process exits. | ||
356 | * We don't call amd_iommu_unbind_pasid() here | ||
357 | * because the IOMMU called us. | ||
358 | */ | ||
359 | if (pdd) | ||
360 | pdd->bound = false; | ||
361 | |||
362 | mutex_unlock(&p->mutex); | ||
363 | } | ||
364 | |||
365 | struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) | ||
366 | { | ||
367 | return list_first_entry(&p->per_device_data, | ||
368 | struct kfd_process_device, | ||
369 | per_device_list); | ||
370 | } | ||
371 | |||
372 | struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, | ||
373 | struct kfd_process_device *pdd) | ||
374 | { | ||
375 | if (list_is_last(&pdd->per_device_list, &p->per_device_data)) | ||
376 | return NULL; | ||
377 | return list_next_entry(pdd, per_device_list); | ||
378 | } | ||
379 | |||
380 | bool kfd_has_process_device_data(struct kfd_process *p) | ||
381 | { | ||
382 | return !(list_empty(&p->per_device_data)); | ||
383 | } | ||