diff options
author | Dave Airlie <airlied@redhat.com> | 2018-05-15 01:59:10 -0400 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2018-05-15 02:06:08 -0400 |
commit | c76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb (patch) | |
tree | 1aeeb74795b2951952aa443f7104d6c090c58141 /drivers/gpu/drm/amd/amdgpu | |
parent | 444ac87becd8a2ff76f9e4194dd98da4f5d5586d (diff) | |
parent | af47b390273f1068bdb1d01263a81948c4e2f97a (diff) |
Merge tag 'drm-amdkfd-next-2018-05-14' of git://people.freedesktop.org/~gabbayo/linux into drm-next
This is the amdkfd pull request for 4.18. The major new features are:
- Add support for GFXv9 dGPUs (VEGA)
- Add support for userptr memory mapping
In addition, there are several small fixes and improvements, such as:
- Fix lock handling
- Fix rollback packet in kernel kfd_queue
- Optimize kfd signal handling
- Fix CP hang in APU
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180514070126.GA1827@odedg-x270
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/Makefile | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 26 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1043 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 572 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 111 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h | 11 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 38 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/soc15d.h | 5 |
13 files changed, 1766 insertions, 79 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 2ca2b5154d52..f3002020df6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile | |||
@@ -130,7 +130,8 @@ amdgpu-y += \ | |||
130 | amdgpu_amdkfd.o \ | 130 | amdgpu_amdkfd.o \ |
131 | amdgpu_amdkfd_fence.o \ | 131 | amdgpu_amdkfd_fence.o \ |
132 | amdgpu_amdkfd_gpuvm.o \ | 132 | amdgpu_amdkfd_gpuvm.o \ |
133 | amdgpu_amdkfd_gfx_v8.o | 133 | amdgpu_amdkfd_gfx_v8.o \ |
134 | amdgpu_amdkfd_gfx_v9.o | ||
134 | 135 | ||
135 | # add cgs | 136 | # add cgs |
136 | amdgpu-y += amdgpu_cgs.o | 137 | amdgpu-y += amdgpu_cgs.o |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 4d36203ffb11..cd0e8f192e6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | |||
@@ -92,6 +92,10 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) | |||
92 | case CHIP_POLARIS11: | 92 | case CHIP_POLARIS11: |
93 | kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); | 93 | kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); |
94 | break; | 94 | break; |
95 | case CHIP_VEGA10: | ||
96 | case CHIP_RAVEN: | ||
97 | kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); | ||
98 | break; | ||
95 | default: | 99 | default: |
96 | dev_dbg(adev->dev, "kfd not supported on this ASIC\n"); | 100 | dev_dbg(adev->dev, "kfd not supported on this ASIC\n"); |
97 | return; | 101 | return; |
@@ -175,6 +179,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) | |||
175 | &gpu_resources.doorbell_physical_address, | 179 | &gpu_resources.doorbell_physical_address, |
176 | &gpu_resources.doorbell_aperture_size, | 180 | &gpu_resources.doorbell_aperture_size, |
177 | &gpu_resources.doorbell_start_offset); | 181 | &gpu_resources.doorbell_start_offset); |
182 | if (adev->asic_type >= CHIP_VEGA10) { | ||
183 | /* On SOC15 the BIF is involved in routing | ||
184 | * doorbells using the low 12 bits of the | ||
185 | * address. Communicate the assignments to | ||
186 | * KFD. KFD uses two doorbell pages per | ||
187 | * process in case of 64-bit doorbells so we | ||
188 | * can use each doorbell assignment twice. | ||
189 | */ | ||
190 | gpu_resources.sdma_doorbell[0][0] = | ||
191 | AMDGPU_DOORBELL64_sDMA_ENGINE0; | ||
192 | gpu_resources.sdma_doorbell[0][1] = | ||
193 | AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; | ||
194 | gpu_resources.sdma_doorbell[1][0] = | ||
195 | AMDGPU_DOORBELL64_sDMA_ENGINE1; | ||
196 | gpu_resources.sdma_doorbell[1][1] = | ||
197 | AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; | ||
198 | /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for | ||
199 | * SDMA, IH and VCN. So don't use them for the CP. | ||
200 | */ | ||
201 | gpu_resources.reserved_doorbell_mask = 0x1f0; | ||
202 | gpu_resources.reserved_doorbell_val = 0x0f0; | ||
203 | } | ||
178 | 204 | ||
179 | kgd2kfd->device_init(adev->kfd, &gpu_resources); | 205 | kgd2kfd->device_init(adev->kfd, &gpu_resources); |
180 | } | 206 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index c2c2bea731e0..12367a9951e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/mmu_context.h> | 30 | #include <linux/mmu_context.h> |
31 | #include <linux/workqueue.h> | ||
31 | #include <kgd_kfd_interface.h> | 32 | #include <kgd_kfd_interface.h> |
32 | #include <drm/ttm/ttm_execbuf_util.h> | 33 | #include <drm/ttm/ttm_execbuf_util.h> |
33 | #include "amdgpu_sync.h" | 34 | #include "amdgpu_sync.h" |
@@ -59,7 +60,9 @@ struct kgd_mem { | |||
59 | 60 | ||
60 | uint32_t mapping_flags; | 61 | uint32_t mapping_flags; |
61 | 62 | ||
63 | atomic_t invalid; | ||
62 | struct amdkfd_process_info *process_info; | 64 | struct amdkfd_process_info *process_info; |
65 | struct page **user_pages; | ||
63 | 66 | ||
64 | struct amdgpu_sync sync; | 67 | struct amdgpu_sync sync; |
65 | 68 | ||
@@ -84,6 +87,9 @@ struct amdkfd_process_info { | |||
84 | struct list_head vm_list_head; | 87 | struct list_head vm_list_head; |
85 | /* List head for all KFD BOs that belong to a KFD process. */ | 88 | /* List head for all KFD BOs that belong to a KFD process. */ |
86 | struct list_head kfd_bo_list; | 89 | struct list_head kfd_bo_list; |
90 | /* List of userptr BOs that are valid or invalid */ | ||
91 | struct list_head userptr_valid_list; | ||
92 | struct list_head userptr_inval_list; | ||
87 | /* Lock to protect kfd_bo_list */ | 93 | /* Lock to protect kfd_bo_list */ |
88 | struct mutex lock; | 94 | struct mutex lock; |
89 | 95 | ||
@@ -91,6 +97,11 @@ struct amdkfd_process_info { | |||
91 | unsigned int n_vms; | 97 | unsigned int n_vms; |
92 | /* Eviction Fence */ | 98 | /* Eviction Fence */ |
93 | struct amdgpu_amdkfd_fence *eviction_fence; | 99 | struct amdgpu_amdkfd_fence *eviction_fence; |
100 | |||
101 | /* MMU-notifier related fields */ | ||
102 | atomic_t evicted_bos; | ||
103 | struct delayed_work restore_userptr_work; | ||
104 | struct pid *pid; | ||
94 | }; | 105 | }; |
95 | 106 | ||
96 | int amdgpu_amdkfd_init(void); | 107 | int amdgpu_amdkfd_init(void); |
@@ -104,12 +115,14 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); | |||
104 | void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); | 115 | void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); |
105 | void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); | 116 | void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); |
106 | 117 | ||
118 | int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); | ||
107 | int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, | 119 | int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, |
108 | uint32_t vmid, uint64_t gpu_addr, | 120 | uint32_t vmid, uint64_t gpu_addr, |
109 | uint32_t *ib_cmd, uint32_t ib_len); | 121 | uint32_t *ib_cmd, uint32_t ib_len); |
110 | 122 | ||
111 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); | 123 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); |
112 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); | 124 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); |
125 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); | ||
113 | 126 | ||
114 | bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); | 127 | bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); |
115 | 128 | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index ea54e53172b9..0ff36d45a597 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | |||
@@ -98,8 +98,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | |||
98 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | 98 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
99 | unsigned int vmid); | 99 | unsigned int vmid); |
100 | 100 | ||
101 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
102 | uint32_t hpd_size, uint64_t hpd_gpu_addr); | ||
103 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | 101 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); |
104 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | 102 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
105 | uint32_t queue_id, uint32_t __user *wptr, | 103 | uint32_t queue_id, uint32_t __user *wptr, |
@@ -183,7 +181,6 @@ static const struct kfd2kgd_calls kfd2kgd = { | |||
183 | .free_pasid = amdgpu_pasid_free, | 181 | .free_pasid = amdgpu_pasid_free, |
184 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | 182 | .program_sh_mem_settings = kgd_program_sh_mem_settings, |
185 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | 183 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, |
186 | .init_pipeline = kgd_init_pipeline, | ||
187 | .init_interrupts = kgd_init_interrupts, | 184 | .init_interrupts = kgd_init_interrupts, |
188 | .hqd_load = kgd_hqd_load, | 185 | .hqd_load = kgd_hqd_load, |
189 | .hqd_sdma_load = kgd_hqd_sdma_load, | 186 | .hqd_sdma_load = kgd_hqd_sdma_load, |
@@ -309,13 +306,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | |||
309 | return 0; | 306 | return 0; |
310 | } | 307 | } |
311 | 308 | ||
312 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
313 | uint32_t hpd_size, uint64_t hpd_gpu_addr) | ||
314 | { | ||
315 | /* amdgpu owns the per-pipe state */ | ||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | 309 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) |
320 | { | 310 | { |
321 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 311 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 89264c9a5e9f..6ef9762b4b00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | |||
@@ -57,8 +57,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | |||
57 | uint32_t sh_mem_bases); | 57 | uint32_t sh_mem_bases); |
58 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | 58 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
59 | unsigned int vmid); | 59 | unsigned int vmid); |
60 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
61 | uint32_t hpd_size, uint64_t hpd_gpu_addr); | ||
62 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | 60 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); |
63 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | 61 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
64 | uint32_t queue_id, uint32_t __user *wptr, | 62 | uint32_t queue_id, uint32_t __user *wptr, |
@@ -141,7 +139,6 @@ static const struct kfd2kgd_calls kfd2kgd = { | |||
141 | .free_pasid = amdgpu_pasid_free, | 139 | .free_pasid = amdgpu_pasid_free, |
142 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | 140 | .program_sh_mem_settings = kgd_program_sh_mem_settings, |
143 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | 141 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, |
144 | .init_pipeline = kgd_init_pipeline, | ||
145 | .init_interrupts = kgd_init_interrupts, | 142 | .init_interrupts = kgd_init_interrupts, |
146 | .hqd_load = kgd_hqd_load, | 143 | .hqd_load = kgd_hqd_load, |
147 | .hqd_sdma_load = kgd_hqd_sdma_load, | 144 | .hqd_sdma_load = kgd_hqd_sdma_load, |
@@ -270,13 +267,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | |||
270 | return 0; | 267 | return 0; |
271 | } | 268 | } |
272 | 269 | ||
273 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
274 | uint32_t hpd_size, uint64_t hpd_gpu_addr) | ||
275 | { | ||
276 | /* amdgpu owns the per-pipe state */ | ||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | 270 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) |
281 | { | 271 | { |
282 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 272 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c new file mode 100644 index 000000000000..8f37991df61b --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | |||
@@ -0,0 +1,1043 @@ | |||
1 | /* | ||
2 | * Copyright 2014-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #define pr_fmt(fmt) "kfd2kgd: " fmt | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/fdtable.h> | ||
27 | #include <linux/uaccess.h> | ||
28 | #include <linux/firmware.h> | ||
29 | #include <drm/drmP.h> | ||
30 | #include "amdgpu.h" | ||
31 | #include "amdgpu_amdkfd.h" | ||
32 | #include "amdgpu_ucode.h" | ||
33 | #include "soc15_hw_ip.h" | ||
34 | #include "gc/gc_9_0_offset.h" | ||
35 | #include "gc/gc_9_0_sh_mask.h" | ||
36 | #include "vega10_enum.h" | ||
37 | #include "sdma0/sdma0_4_0_offset.h" | ||
38 | #include "sdma0/sdma0_4_0_sh_mask.h" | ||
39 | #include "sdma1/sdma1_4_0_offset.h" | ||
40 | #include "sdma1/sdma1_4_0_sh_mask.h" | ||
41 | #include "athub/athub_1_0_offset.h" | ||
42 | #include "athub/athub_1_0_sh_mask.h" | ||
43 | #include "oss/osssys_4_0_offset.h" | ||
44 | #include "oss/osssys_4_0_sh_mask.h" | ||
45 | #include "soc15_common.h" | ||
46 | #include "v9_structs.h" | ||
47 | #include "soc15.h" | ||
48 | #include "soc15d.h" | ||
49 | |||
50 | /* HACK: MMHUB and GC both have VM-related register with the same | ||
51 | * names but different offsets. Define the MMHUB register we need here | ||
52 | * with a prefix. A proper solution would be to move the functions | ||
53 | * programming these registers into gfx_v9_0.c and mmhub_v1_0.c | ||
54 | * respectively. | ||
55 | */ | ||
56 | #define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 | ||
57 | #define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 | ||
58 | |||
59 | #define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 | ||
60 | #define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 | ||
61 | |||
62 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b | ||
63 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 | ||
64 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c | ||
65 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 | ||
66 | |||
67 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b | ||
68 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 | ||
69 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c | ||
70 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 | ||
71 | |||
72 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b | ||
73 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 | ||
74 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c | ||
75 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 | ||
76 | |||
77 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 | ||
78 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 | ||
79 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 | ||
80 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 | ||
81 | |||
82 | #define V9_PIPE_PER_MEC (4) | ||
83 | #define V9_QUEUES_PER_PIPE_MEC (8) | ||
84 | |||
85 | enum hqd_dequeue_request_type { | ||
86 | NO_ACTION = 0, | ||
87 | DRAIN_PIPE, | ||
88 | RESET_WAVES | ||
89 | }; | ||
90 | |||
91 | /* | ||
92 | * Register access functions | ||
93 | */ | ||
94 | |||
95 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | ||
96 | uint32_t sh_mem_config, | ||
97 | uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, | ||
98 | uint32_t sh_mem_bases); | ||
99 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | ||
100 | unsigned int vmid); | ||
101 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | ||
102 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | ||
103 | uint32_t queue_id, uint32_t __user *wptr, | ||
104 | uint32_t wptr_shift, uint32_t wptr_mask, | ||
105 | struct mm_struct *mm); | ||
106 | static int kgd_hqd_dump(struct kgd_dev *kgd, | ||
107 | uint32_t pipe_id, uint32_t queue_id, | ||
108 | uint32_t (**dump)[2], uint32_t *n_regs); | ||
109 | static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, | ||
110 | uint32_t __user *wptr, struct mm_struct *mm); | ||
111 | static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, | ||
112 | uint32_t engine_id, uint32_t queue_id, | ||
113 | uint32_t (**dump)[2], uint32_t *n_regs); | ||
114 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, | ||
115 | uint32_t pipe_id, uint32_t queue_id); | ||
116 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); | ||
117 | static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, | ||
118 | enum kfd_preempt_type reset_type, | ||
119 | unsigned int utimeout, uint32_t pipe_id, | ||
120 | uint32_t queue_id); | ||
121 | static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, | ||
122 | unsigned int utimeout); | ||
123 | static int kgd_address_watch_disable(struct kgd_dev *kgd); | ||
124 | static int kgd_address_watch_execute(struct kgd_dev *kgd, | ||
125 | unsigned int watch_point_id, | ||
126 | uint32_t cntl_val, | ||
127 | uint32_t addr_hi, | ||
128 | uint32_t addr_lo); | ||
129 | static int kgd_wave_control_execute(struct kgd_dev *kgd, | ||
130 | uint32_t gfx_index_val, | ||
131 | uint32_t sq_cmd); | ||
132 | static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, | ||
133 | unsigned int watch_point_id, | ||
134 | unsigned int reg_offset); | ||
135 | |||
136 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, | ||
137 | uint8_t vmid); | ||
138 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, | ||
139 | uint8_t vmid); | ||
140 | static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, | ||
141 | uint32_t page_table_base); | ||
142 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); | ||
143 | static void set_scratch_backing_va(struct kgd_dev *kgd, | ||
144 | uint64_t va, uint32_t vmid); | ||
145 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); | ||
146 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); | ||
147 | |||
148 | /* Because of REG_GET_FIELD() being used, we put this function in the | ||
149 | * asic specific file. | ||
150 | */ | ||
151 | static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, | ||
152 | struct tile_config *config) | ||
153 | { | ||
154 | struct amdgpu_device *adev = (struct amdgpu_device *)kgd; | ||
155 | |||
156 | config->gb_addr_config = adev->gfx.config.gb_addr_config; | ||
157 | |||
158 | config->tile_config_ptr = adev->gfx.config.tile_mode_array; | ||
159 | config->num_tile_configs = | ||
160 | ARRAY_SIZE(adev->gfx.config.tile_mode_array); | ||
161 | config->macro_tile_config_ptr = | ||
162 | adev->gfx.config.macrotile_mode_array; | ||
163 | config->num_macro_tile_configs = | ||
164 | ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); | ||
165 | |||
166 | return 0; | ||
167 | } | ||
168 | |||
169 | static const struct kfd2kgd_calls kfd2kgd = { | ||
170 | .init_gtt_mem_allocation = alloc_gtt_mem, | ||
171 | .free_gtt_mem = free_gtt_mem, | ||
172 | .get_local_mem_info = get_local_mem_info, | ||
173 | .get_gpu_clock_counter = get_gpu_clock_counter, | ||
174 | .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, | ||
175 | .alloc_pasid = amdgpu_pasid_alloc, | ||
176 | .free_pasid = amdgpu_pasid_free, | ||
177 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | ||
178 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | ||
179 | .init_interrupts = kgd_init_interrupts, | ||
180 | .hqd_load = kgd_hqd_load, | ||
181 | .hqd_sdma_load = kgd_hqd_sdma_load, | ||
182 | .hqd_dump = kgd_hqd_dump, | ||
183 | .hqd_sdma_dump = kgd_hqd_sdma_dump, | ||
184 | .hqd_is_occupied = kgd_hqd_is_occupied, | ||
185 | .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, | ||
186 | .hqd_destroy = kgd_hqd_destroy, | ||
187 | .hqd_sdma_destroy = kgd_hqd_sdma_destroy, | ||
188 | .address_watch_disable = kgd_address_watch_disable, | ||
189 | .address_watch_execute = kgd_address_watch_execute, | ||
190 | .wave_control_execute = kgd_wave_control_execute, | ||
191 | .address_watch_get_offset = kgd_address_watch_get_offset, | ||
192 | .get_atc_vmid_pasid_mapping_pasid = | ||
193 | get_atc_vmid_pasid_mapping_pasid, | ||
194 | .get_atc_vmid_pasid_mapping_valid = | ||
195 | get_atc_vmid_pasid_mapping_valid, | ||
196 | .get_fw_version = get_fw_version, | ||
197 | .set_scratch_backing_va = set_scratch_backing_va, | ||
198 | .get_tile_config = amdgpu_amdkfd_get_tile_config, | ||
199 | .get_cu_info = get_cu_info, | ||
200 | .get_vram_usage = amdgpu_amdkfd_get_vram_usage, | ||
201 | .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, | ||
202 | .acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm, | ||
203 | .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, | ||
204 | .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, | ||
205 | .set_vm_context_page_table_base = set_vm_context_page_table_base, | ||
206 | .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, | ||
207 | .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, | ||
208 | .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, | ||
209 | .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, | ||
210 | .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, | ||
211 | .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, | ||
212 | .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, | ||
213 | .invalidate_tlbs = invalidate_tlbs, | ||
214 | .invalidate_tlbs_vmid = invalidate_tlbs_vmid, | ||
215 | .submit_ib = amdgpu_amdkfd_submit_ib, | ||
216 | }; | ||
217 | |||
218 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void) | ||
219 | { | ||
220 | return (struct kfd2kgd_calls *)&kfd2kgd; | ||
221 | } | ||
222 | |||
223 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) | ||
224 | { | ||
225 | return (struct amdgpu_device *)kgd; | ||
226 | } | ||
227 | |||
228 | static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, | ||
229 | uint32_t queue, uint32_t vmid) | ||
230 | { | ||
231 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
232 | |||
233 | mutex_lock(&adev->srbm_mutex); | ||
234 | soc15_grbm_select(adev, mec, pipe, queue, vmid); | ||
235 | } | ||
236 | |||
237 | static void unlock_srbm(struct kgd_dev *kgd) | ||
238 | { | ||
239 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
240 | |||
241 | soc15_grbm_select(adev, 0, 0, 0, 0); | ||
242 | mutex_unlock(&adev->srbm_mutex); | ||
243 | } | ||
244 | |||
245 | static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, | ||
246 | uint32_t queue_id) | ||
247 | { | ||
248 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
249 | |||
250 | uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; | ||
251 | uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); | ||
252 | |||
253 | lock_srbm(kgd, mec, pipe, queue_id, 0); | ||
254 | } | ||
255 | |||
256 | static uint32_t get_queue_mask(struct amdgpu_device *adev, | ||
257 | uint32_t pipe_id, uint32_t queue_id) | ||
258 | { | ||
259 | unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe + | ||
260 | queue_id) & 31; | ||
261 | |||
262 | return ((uint32_t)1) << bit; | ||
263 | } | ||
264 | |||
265 | static void release_queue(struct kgd_dev *kgd) | ||
266 | { | ||
267 | unlock_srbm(kgd); | ||
268 | } | ||
269 | |||
270 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | ||
271 | uint32_t sh_mem_config, | ||
272 | uint32_t sh_mem_ape1_base, | ||
273 | uint32_t sh_mem_ape1_limit, | ||
274 | uint32_t sh_mem_bases) | ||
275 | { | ||
276 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
277 | |||
278 | lock_srbm(kgd, 0, 0, 0, vmid); | ||
279 | |||
280 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); | ||
281 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); | ||
282 | /* APE1 no longer exists on GFX9 */ | ||
283 | |||
284 | unlock_srbm(kgd); | ||
285 | } | ||
286 | |||
287 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | ||
288 | unsigned int vmid) | ||
289 | { | ||
290 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
291 | |||
292 | /* | ||
293 | * We have to assume that there is no outstanding mapping. | ||
294 | * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because | ||
295 | * a mapping is in progress or because a mapping finished | ||
296 | * and the SW cleared it. | ||
297 | * So the protocol is to always wait & clear. | ||
298 | */ | ||
299 | uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | | ||
300 | ATC_VMID0_PASID_MAPPING__VALID_MASK; | ||
301 | |||
302 | /* | ||
303 | * need to do this twice, once for gfx and once for mmhub | ||
304 | * for ATC add 16 to VMID for mmhub, for IH different registers. | ||
305 | * ATC_VMID0..15 registers are separate from ATC_VMID16..31. | ||
306 | */ | ||
307 | |||
308 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, | ||
309 | pasid_mapping); | ||
310 | |||
311 | while (!(RREG32(SOC15_REG_OFFSET( | ||
312 | ATHUB, 0, | ||
313 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & | ||
314 | (1U << vmid))) | ||
315 | cpu_relax(); | ||
316 | |||
317 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, | ||
318 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), | ||
319 | 1U << vmid); | ||
320 | |||
321 | /* Mapping vmid to pasid also for IH block */ | ||
322 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, | ||
323 | pasid_mapping); | ||
324 | |||
325 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, | ||
326 | pasid_mapping); | ||
327 | |||
328 | while (!(RREG32(SOC15_REG_OFFSET( | ||
329 | ATHUB, 0, | ||
330 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & | ||
331 | (1U << (vmid + 16)))) | ||
332 | cpu_relax(); | ||
333 | |||
334 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, | ||
335 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), | ||
336 | 1U << (vmid + 16)); | ||
337 | |||
338 | /* Mapping vmid to pasid also for IH block */ | ||
339 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, | ||
340 | pasid_mapping); | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | /* TODO - RING0 form of field is obsolete, seems to date back to SI | ||
345 | * but still works | ||
346 | */ | ||
347 | |||
348 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | ||
349 | { | ||
350 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
351 | uint32_t mec; | ||
352 | uint32_t pipe; | ||
353 | |||
354 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; | ||
355 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); | ||
356 | |||
357 | lock_srbm(kgd, mec, pipe, 0, 0); | ||
358 | |||
359 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), | ||
360 | CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | | ||
361 | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); | ||
362 | |||
363 | unlock_srbm(kgd); | ||
364 | |||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | static uint32_t get_sdma_base_addr(struct amdgpu_device *adev, | ||
369 | unsigned int engine_id, | ||
370 | unsigned int queue_id) | ||
371 | { | ||
372 | uint32_t base[2] = { | ||
373 | SOC15_REG_OFFSET(SDMA0, 0, | ||
374 | mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, | ||
375 | SOC15_REG_OFFSET(SDMA1, 0, | ||
376 | mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL | ||
377 | }; | ||
378 | uint32_t retval; | ||
379 | |||
380 | retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - | ||
381 | mmSDMA0_RLC0_RB_CNTL); | ||
382 | |||
383 | pr_debug("sdma base address: 0x%x\n", retval); | ||
384 | |||
385 | return retval; | ||
386 | } | ||
387 | |||
388 | static inline struct v9_mqd *get_mqd(void *mqd) | ||
389 | { | ||
390 | return (struct v9_mqd *)mqd; | ||
391 | } | ||
392 | |||
393 | static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) | ||
394 | { | ||
395 | return (struct v9_sdma_mqd *)mqd; | ||
396 | } | ||
397 | |||
398 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | ||
399 | uint32_t queue_id, uint32_t __user *wptr, | ||
400 | uint32_t wptr_shift, uint32_t wptr_mask, | ||
401 | struct mm_struct *mm) | ||
402 | { | ||
403 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
404 | struct v9_mqd *m; | ||
405 | uint32_t *mqd_hqd; | ||
406 | uint32_t reg, hqd_base, data; | ||
407 | |||
408 | m = get_mqd(mqd); | ||
409 | |||
410 | acquire_queue(kgd, pipe_id, queue_id); | ||
411 | |||
412 | /* HIQ is set during driver init period with vmid set to 0*/ | ||
413 | if (m->cp_hqd_vmid == 0) { | ||
414 | uint32_t value, mec, pipe; | ||
415 | |||
416 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; | ||
417 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); | ||
418 | |||
419 | pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", | ||
420 | mec, pipe, queue_id); | ||
421 | value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); | ||
422 | value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, | ||
423 | ((mec << 5) | (pipe << 3) | queue_id | 0x80)); | ||
424 | WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); | ||
425 | } | ||
426 | |||
427 | /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ | ||
428 | mqd_hqd = &m->cp_mqd_base_addr_lo; | ||
429 | hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); | ||
430 | |||
431 | for (reg = hqd_base; | ||
432 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) | ||
433 | WREG32(reg, mqd_hqd[reg - hqd_base]); | ||
434 | |||
435 | |||
436 | /* Activate doorbell logic before triggering WPTR poll. */ | ||
437 | data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, | ||
438 | CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); | ||
439 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); | ||
440 | |||
441 | if (wptr) { | ||
442 | /* Don't read wptr with get_user because the user | ||
443 | * context may not be accessible (if this function | ||
444 | * runs in a work queue). Instead trigger a one-shot | ||
445 | * polling read from memory in the CP. This assumes | ||
446 | * that wptr is GPU-accessible in the queue's VMID via | ||
447 | * ATC or SVM. WPTR==RPTR before starting the poll so | ||
448 | * the CP starts fetching new commands from the right | ||
449 | * place. | ||
450 | * | ||
451 | * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit | ||
452 | * tricky. Assume that the queue didn't overflow. The | ||
453 | * number of valid bits in the 32-bit RPTR depends on | ||
454 | * the queue size. The remaining bits are taken from | ||
455 | * the saved 64-bit WPTR. If the WPTR wrapped, add the | ||
456 | * queue size. | ||
457 | */ | ||
458 | uint32_t queue_size = | ||
459 | 2 << REG_GET_FIELD(m->cp_hqd_pq_control, | ||
460 | CP_HQD_PQ_CONTROL, QUEUE_SIZE); | ||
461 | uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); | ||
462 | |||
463 | if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) | ||
464 | guessed_wptr += queue_size; | ||
465 | guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); | ||
466 | guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; | ||
467 | |||
468 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), | ||
469 | lower_32_bits(guessed_wptr)); | ||
470 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), | ||
471 | upper_32_bits(guessed_wptr)); | ||
472 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), | ||
473 | lower_32_bits((uint64_t)wptr)); | ||
474 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), | ||
475 | upper_32_bits((uint64_t)wptr)); | ||
476 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), | ||
477 | get_queue_mask(adev, pipe_id, queue_id)); | ||
478 | } | ||
479 | |||
480 | /* Start the EOP fetcher */ | ||
481 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), | ||
482 | REG_SET_FIELD(m->cp_hqd_eop_rptr, | ||
483 | CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); | ||
484 | |||
485 | data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); | ||
486 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); | ||
487 | |||
488 | release_queue(kgd); | ||
489 | |||
490 | return 0; | ||
491 | } | ||
492 | |||
493 | static int kgd_hqd_dump(struct kgd_dev *kgd, | ||
494 | uint32_t pipe_id, uint32_t queue_id, | ||
495 | uint32_t (**dump)[2], uint32_t *n_regs) | ||
496 | { | ||
497 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
498 | uint32_t i = 0, reg; | ||
499 | #define HQD_N_REGS 56 | ||
500 | #define DUMP_REG(addr) do { \ | ||
501 | if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ | ||
502 | break; \ | ||
503 | (*dump)[i][0] = (addr) << 2; \ | ||
504 | (*dump)[i++][1] = RREG32(addr); \ | ||
505 | } while (0) | ||
506 | |||
507 | *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); | ||
508 | if (*dump == NULL) | ||
509 | return -ENOMEM; | ||
510 | |||
511 | acquire_queue(kgd, pipe_id, queue_id); | ||
512 | |||
513 | for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); | ||
514 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) | ||
515 | DUMP_REG(reg); | ||
516 | |||
517 | release_queue(kgd); | ||
518 | |||
519 | WARN_ON_ONCE(i != HQD_N_REGS); | ||
520 | *n_regs = i; | ||
521 | |||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, | ||
526 | uint32_t __user *wptr, struct mm_struct *mm) | ||
527 | { | ||
528 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
529 | struct v9_sdma_mqd *m; | ||
530 | uint32_t sdma_base_addr, sdmax_gfx_context_cntl; | ||
531 | unsigned long end_jiffies; | ||
532 | uint32_t data; | ||
533 | uint64_t data64; | ||
534 | uint64_t __user *wptr64 = (uint64_t __user *)wptr; | ||
535 | |||
536 | m = get_sdma_mqd(mqd); | ||
537 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, | ||
538 | m->sdma_queue_id); | ||
539 | sdmax_gfx_context_cntl = m->sdma_engine_id ? | ||
540 | SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : | ||
541 | SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); | ||
542 | |||
543 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, | ||
544 | m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); | ||
545 | |||
546 | end_jiffies = msecs_to_jiffies(2000) + jiffies; | ||
547 | while (true) { | ||
548 | data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); | ||
549 | if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) | ||
550 | break; | ||
551 | if (time_after(jiffies, end_jiffies)) | ||
552 | return -ETIME; | ||
553 | usleep_range(500, 1000); | ||
554 | } | ||
555 | data = RREG32(sdmax_gfx_context_cntl); | ||
556 | data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, | ||
557 | RESUME_CTX, 0); | ||
558 | WREG32(sdmax_gfx_context_cntl, data); | ||
559 | |||
560 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, | ||
561 | m->sdmax_rlcx_doorbell_offset); | ||
562 | |||
563 | data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, | ||
564 | ENABLE, 1); | ||
565 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); | ||
566 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); | ||
567 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, | ||
568 | m->sdmax_rlcx_rb_rptr_hi); | ||
569 | |||
570 | WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); | ||
571 | if (read_user_wptr(mm, wptr64, data64)) { | ||
572 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, | ||
573 | lower_32_bits(data64)); | ||
574 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, | ||
575 | upper_32_bits(data64)); | ||
576 | } else { | ||
577 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, | ||
578 | m->sdmax_rlcx_rb_rptr); | ||
579 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, | ||
580 | m->sdmax_rlcx_rb_rptr_hi); | ||
581 | } | ||
582 | WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); | ||
583 | |||
584 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); | ||
585 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, | ||
586 | m->sdmax_rlcx_rb_base_hi); | ||
587 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, | ||
588 | m->sdmax_rlcx_rb_rptr_addr_lo); | ||
589 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, | ||
590 | m->sdmax_rlcx_rb_rptr_addr_hi); | ||
591 | |||
592 | data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, | ||
593 | RB_ENABLE, 1); | ||
594 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); | ||
595 | |||
596 | return 0; | ||
597 | } | ||
598 | |||
599 | static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, | ||
600 | uint32_t engine_id, uint32_t queue_id, | ||
601 | uint32_t (**dump)[2], uint32_t *n_regs) | ||
602 | { | ||
603 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
604 | uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id); | ||
605 | uint32_t i = 0, reg; | ||
606 | #undef HQD_N_REGS | ||
607 | #define HQD_N_REGS (19+6+7+10) | ||
608 | |||
609 | *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); | ||
610 | if (*dump == NULL) | ||
611 | return -ENOMEM; | ||
612 | |||
613 | for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) | ||
614 | DUMP_REG(sdma_base_addr + reg); | ||
615 | for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) | ||
616 | DUMP_REG(sdma_base_addr + reg); | ||
617 | for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; | ||
618 | reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) | ||
619 | DUMP_REG(sdma_base_addr + reg); | ||
620 | for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; | ||
621 | reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) | ||
622 | DUMP_REG(sdma_base_addr + reg); | ||
623 | |||
624 | WARN_ON_ONCE(i != HQD_N_REGS); | ||
625 | *n_regs = i; | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, | ||
631 | uint32_t pipe_id, uint32_t queue_id) | ||
632 | { | ||
633 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
634 | uint32_t act; | ||
635 | bool retval = false; | ||
636 | uint32_t low, high; | ||
637 | |||
638 | acquire_queue(kgd, pipe_id, queue_id); | ||
639 | act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); | ||
640 | if (act) { | ||
641 | low = lower_32_bits(queue_address >> 8); | ||
642 | high = upper_32_bits(queue_address >> 8); | ||
643 | |||
644 | if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && | ||
645 | high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) | ||
646 | retval = true; | ||
647 | } | ||
648 | release_queue(kgd); | ||
649 | return retval; | ||
650 | } | ||
651 | |||
652 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) | ||
653 | { | ||
654 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
655 | struct v9_sdma_mqd *m; | ||
656 | uint32_t sdma_base_addr; | ||
657 | uint32_t sdma_rlc_rb_cntl; | ||
658 | |||
659 | m = get_sdma_mqd(mqd); | ||
660 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, | ||
661 | m->sdma_queue_id); | ||
662 | |||
663 | sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); | ||
664 | |||
665 | if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) | ||
666 | return true; | ||
667 | |||
668 | return false; | ||
669 | } | ||
670 | |||
671 | static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, | ||
672 | enum kfd_preempt_type reset_type, | ||
673 | unsigned int utimeout, uint32_t pipe_id, | ||
674 | uint32_t queue_id) | ||
675 | { | ||
676 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
677 | enum hqd_dequeue_request_type type; | ||
678 | unsigned long end_jiffies; | ||
679 | uint32_t temp; | ||
680 | struct v9_mqd *m = get_mqd(mqd); | ||
681 | |||
682 | acquire_queue(kgd, pipe_id, queue_id); | ||
683 | |||
684 | if (m->cp_hqd_vmid == 0) | ||
685 | WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); | ||
686 | |||
687 | switch (reset_type) { | ||
688 | case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: | ||
689 | type = DRAIN_PIPE; | ||
690 | break; | ||
691 | case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: | ||
692 | type = RESET_WAVES; | ||
693 | break; | ||
694 | default: | ||
695 | type = DRAIN_PIPE; | ||
696 | break; | ||
697 | } | ||
698 | |||
699 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); | ||
700 | |||
701 | end_jiffies = (utimeout * HZ / 1000) + jiffies; | ||
702 | while (true) { | ||
703 | temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); | ||
704 | if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) | ||
705 | break; | ||
706 | if (time_after(jiffies, end_jiffies)) { | ||
707 | pr_err("cp queue preemption time out.\n"); | ||
708 | release_queue(kgd); | ||
709 | return -ETIME; | ||
710 | } | ||
711 | usleep_range(500, 1000); | ||
712 | } | ||
713 | |||
714 | release_queue(kgd); | ||
715 | return 0; | ||
716 | } | ||
717 | |||
718 | static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, | ||
719 | unsigned int utimeout) | ||
720 | { | ||
721 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
722 | struct v9_sdma_mqd *m; | ||
723 | uint32_t sdma_base_addr; | ||
724 | uint32_t temp; | ||
725 | unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; | ||
726 | |||
727 | m = get_sdma_mqd(mqd); | ||
728 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, | ||
729 | m->sdma_queue_id); | ||
730 | |||
731 | temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); | ||
732 | temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; | ||
733 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); | ||
734 | |||
735 | while (true) { | ||
736 | temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); | ||
737 | if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) | ||
738 | break; | ||
739 | if (time_after(jiffies, end_jiffies)) | ||
740 | return -ETIME; | ||
741 | usleep_range(500, 1000); | ||
742 | } | ||
743 | |||
744 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); | ||
745 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, | ||
746 | RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | | ||
747 | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); | ||
748 | |||
749 | m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); | ||
750 | m->sdmax_rlcx_rb_rptr_hi = | ||
751 | RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); | ||
752 | |||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, | ||
757 | uint8_t vmid) | ||
758 | { | ||
759 | uint32_t reg; | ||
760 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
761 | |||
762 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) | ||
763 | + vmid); | ||
764 | return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; | ||
765 | } | ||
766 | |||
767 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, | ||
768 | uint8_t vmid) | ||
769 | { | ||
770 | uint32_t reg; | ||
771 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
772 | |||
773 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) | ||
774 | + vmid); | ||
775 | return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; | ||
776 | } | ||
777 | |||
778 | static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) | ||
779 | { | ||
780 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
781 | uint32_t req = (1 << vmid) | | ||
782 | (0 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* legacy */ | ||
783 | VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | | ||
784 | VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | | ||
785 | VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | | ||
786 | VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | | ||
787 | VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; | ||
788 | |||
789 | mutex_lock(&adev->srbm_mutex); | ||
790 | |||
791 | /* Use legacy mode tlb invalidation. | ||
792 | * | ||
793 | * Currently on Raven the code below is broken for anything but | ||
794 | * legacy mode due to a MMHUB power gating problem. A workaround | ||
795 | * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ | ||
796 | * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack | ||
797 | * bit. | ||
798 | * | ||
799 | * TODO 1: agree on the right set of invalidation registers for | ||
800 | * KFD use. Use the last one for now. Invalidate both GC and | ||
801 | * MMHUB. | ||
802 | * | ||
803 | * TODO 2: support range-based invalidation, requires kfg2kgd | ||
804 | * interface change | ||
805 | */ | ||
806 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), | ||
807 | 0xffffffff); | ||
808 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), | ||
809 | 0x0000001f); | ||
810 | |||
811 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, | ||
812 | mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), | ||
813 | 0xffffffff); | ||
814 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, | ||
815 | mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), | ||
816 | 0x0000001f); | ||
817 | |||
818 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); | ||
819 | |||
820 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), | ||
821 | req); | ||
822 | |||
823 | while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & | ||
824 | (1 << vmid))) | ||
825 | cpu_relax(); | ||
826 | |||
827 | while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, | ||
828 | mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & | ||
829 | (1 << vmid))) | ||
830 | cpu_relax(); | ||
831 | |||
832 | mutex_unlock(&adev->srbm_mutex); | ||
833 | |||
834 | } | ||
835 | |||
836 | static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) | ||
837 | { | ||
838 | signed long r; | ||
839 | uint32_t seq; | ||
840 | struct amdgpu_ring *ring = &adev->gfx.kiq.ring; | ||
841 | |||
842 | spin_lock(&adev->gfx.kiq.ring_lock); | ||
843 | amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ | ||
844 | amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); | ||
845 | amdgpu_ring_write(ring, | ||
846 | PACKET3_INVALIDATE_TLBS_DST_SEL(1) | | ||
847 | PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | | ||
848 | PACKET3_INVALIDATE_TLBS_PASID(pasid) | | ||
849 | PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */ | ||
850 | amdgpu_fence_emit_polling(ring, &seq); | ||
851 | amdgpu_ring_commit(ring); | ||
852 | spin_unlock(&adev->gfx.kiq.ring_lock); | ||
853 | |||
854 | r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); | ||
855 | if (r < 1) { | ||
856 | DRM_ERROR("wait for kiq fence error: %ld.\n", r); | ||
857 | return -ETIME; | ||
858 | } | ||
859 | |||
860 | return 0; | ||
861 | } | ||
862 | |||
863 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) | ||
864 | { | ||
865 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
866 | int vmid; | ||
867 | struct amdgpu_ring *ring = &adev->gfx.kiq.ring; | ||
868 | |||
869 | if (ring->ready) | ||
870 | return invalidate_tlbs_with_kiq(adev, pasid); | ||
871 | |||
872 | for (vmid = 0; vmid < 16; vmid++) { | ||
873 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) | ||
874 | continue; | ||
875 | if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { | ||
876 | if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) | ||
877 | == pasid) { | ||
878 | write_vmid_invalidate_request(kgd, vmid); | ||
879 | break; | ||
880 | } | ||
881 | } | ||
882 | } | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | |||
887 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) | ||
888 | { | ||
889 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
890 | |||
891 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { | ||
892 | pr_err("non kfd vmid %d\n", vmid); | ||
893 | return 0; | ||
894 | } | ||
895 | |||
896 | write_vmid_invalidate_request(kgd, vmid); | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | static int kgd_address_watch_disable(struct kgd_dev *kgd) | ||
901 | { | ||
902 | return 0; | ||
903 | } | ||
904 | |||
905 | static int kgd_address_watch_execute(struct kgd_dev *kgd, | ||
906 | unsigned int watch_point_id, | ||
907 | uint32_t cntl_val, | ||
908 | uint32_t addr_hi, | ||
909 | uint32_t addr_lo) | ||
910 | { | ||
911 | return 0; | ||
912 | } | ||
913 | |||
914 | static int kgd_wave_control_execute(struct kgd_dev *kgd, | ||
915 | uint32_t gfx_index_val, | ||
916 | uint32_t sq_cmd) | ||
917 | { | ||
918 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
919 | uint32_t data = 0; | ||
920 | |||
921 | mutex_lock(&adev->grbm_idx_mutex); | ||
922 | |||
923 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); | ||
924 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); | ||
925 | |||
926 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
927 | INSTANCE_BROADCAST_WRITES, 1); | ||
928 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
929 | SH_BROADCAST_WRITES, 1); | ||
930 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
931 | SE_BROADCAST_WRITES, 1); | ||
932 | |||
933 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); | ||
934 | mutex_unlock(&adev->grbm_idx_mutex); | ||
935 | |||
936 | return 0; | ||
937 | } | ||
938 | |||
939 | static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, | ||
940 | unsigned int watch_point_id, | ||
941 | unsigned int reg_offset) | ||
942 | { | ||
943 | return 0; | ||
944 | } | ||
945 | |||
946 | static void set_scratch_backing_va(struct kgd_dev *kgd, | ||
947 | uint64_t va, uint32_t vmid) | ||
948 | { | ||
949 | /* No longer needed on GFXv9. The scratch base address is | ||
950 | * passed to the shader by the CP. It's the user mode driver's | ||
951 | * responsibility. | ||
952 | */ | ||
953 | } | ||
954 | |||
955 | /* FIXME: Does this need to be ASIC-specific code? */ | ||
956 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) | ||
957 | { | ||
958 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
959 | const union amdgpu_firmware_header *hdr; | ||
960 | |||
961 | switch (type) { | ||
962 | case KGD_ENGINE_PFP: | ||
963 | hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; | ||
964 | break; | ||
965 | |||
966 | case KGD_ENGINE_ME: | ||
967 | hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; | ||
968 | break; | ||
969 | |||
970 | case KGD_ENGINE_CE: | ||
971 | hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; | ||
972 | break; | ||
973 | |||
974 | case KGD_ENGINE_MEC1: | ||
975 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; | ||
976 | break; | ||
977 | |||
978 | case KGD_ENGINE_MEC2: | ||
979 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; | ||
980 | break; | ||
981 | |||
982 | case KGD_ENGINE_RLC: | ||
983 | hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; | ||
984 | break; | ||
985 | |||
986 | case KGD_ENGINE_SDMA1: | ||
987 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; | ||
988 | break; | ||
989 | |||
990 | case KGD_ENGINE_SDMA2: | ||
991 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; | ||
992 | break; | ||
993 | |||
994 | default: | ||
995 | return 0; | ||
996 | } | ||
997 | |||
998 | if (hdr == NULL) | ||
999 | return 0; | ||
1000 | |||
1001 | /* Only 12 bit in use*/ | ||
1002 | return hdr->common.ucode_version; | ||
1003 | } | ||
1004 | |||
1005 | static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, | ||
1006 | uint32_t page_table_base) | ||
1007 | { | ||
1008 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
1009 | uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT | | ||
1010 | AMDGPU_PTE_VALID; | ||
1011 | |||
1012 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { | ||
1013 | pr_err("trying to set page table base for wrong VMID %u\n", | ||
1014 | vmid); | ||
1015 | return; | ||
1016 | } | ||
1017 | |||
1018 | /* TODO: take advantage of per-process address space size. For | ||
1019 | * now, all processes share the same address space size, like | ||
1020 | * on GFX8 and older. | ||
1021 | */ | ||
1022 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); | ||
1023 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); | ||
1024 | |||
1025 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), | ||
1026 | lower_32_bits(adev->vm_manager.max_pfn - 1)); | ||
1027 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), | ||
1028 | upper_32_bits(adev->vm_manager.max_pfn - 1)); | ||
1029 | |||
1030 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); | ||
1031 | WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); | ||
1032 | |||
1033 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0); | ||
1034 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0); | ||
1035 | |||
1036 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2), | ||
1037 | lower_32_bits(adev->vm_manager.max_pfn - 1)); | ||
1038 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2), | ||
1039 | upper_32_bits(adev->vm_manager.max_pfn - 1)); | ||
1040 | |||
1041 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base)); | ||
1042 | WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base)); | ||
1043 | } | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 1d6e1479da38..5296e24fd662 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #define pr_fmt(fmt) "kfd2kgd: " fmt | 23 | #define pr_fmt(fmt) "kfd2kgd: " fmt |
24 | 24 | ||
25 | #include <linux/list.h> | 25 | #include <linux/list.h> |
26 | #include <linux/sched/mm.h> | ||
26 | #include <drm/drmP.h> | 27 | #include <drm/drmP.h> |
27 | #include "amdgpu_object.h" | 28 | #include "amdgpu_object.h" |
28 | #include "amdgpu_vm.h" | 29 | #include "amdgpu_vm.h" |
@@ -33,10 +34,20 @@ | |||
33 | */ | 34 | */ |
34 | #define VI_BO_SIZE_ALIGN (0x8000) | 35 | #define VI_BO_SIZE_ALIGN (0x8000) |
35 | 36 | ||
37 | /* BO flag to indicate a KFD userptr BO */ | ||
38 | #define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) | ||
39 | |||
40 | /* Userptr restore delay, just long enough to allow consecutive VM | ||
41 | * changes to accumulate | ||
42 | */ | ||
43 | #define AMDGPU_USERPTR_RESTORE_DELAY_MS 1 | ||
44 | |||
36 | /* Impose limit on how much memory KFD can use */ | 45 | /* Impose limit on how much memory KFD can use */ |
37 | static struct { | 46 | static struct { |
38 | uint64_t max_system_mem_limit; | 47 | uint64_t max_system_mem_limit; |
48 | uint64_t max_userptr_mem_limit; | ||
39 | int64_t system_mem_used; | 49 | int64_t system_mem_used; |
50 | int64_t userptr_mem_used; | ||
40 | spinlock_t mem_limit_lock; | 51 | spinlock_t mem_limit_lock; |
41 | } kfd_mem_limit; | 52 | } kfd_mem_limit; |
42 | 53 | ||
@@ -57,6 +68,7 @@ static const char * const domain_bit_to_string[] = { | |||
57 | 68 | ||
58 | #define domain_string(domain) domain_bit_to_string[ffs(domain)-1] | 69 | #define domain_string(domain) domain_bit_to_string[ffs(domain)-1] |
59 | 70 | ||
71 | static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); | ||
60 | 72 | ||
61 | 73 | ||
62 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) | 74 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) |
@@ -78,6 +90,7 @@ static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, | |||
78 | 90 | ||
79 | /* Set memory usage limits. Current, limits are | 91 | /* Set memory usage limits. Current, limits are |
80 | * System (kernel) memory - 3/8th System RAM | 92 | * System (kernel) memory - 3/8th System RAM |
93 | * Userptr memory - 3/4th System RAM | ||
81 | */ | 94 | */ |
82 | void amdgpu_amdkfd_gpuvm_init_mem_limits(void) | 95 | void amdgpu_amdkfd_gpuvm_init_mem_limits(void) |
83 | { | 96 | { |
@@ -90,8 +103,10 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void) | |||
90 | 103 | ||
91 | spin_lock_init(&kfd_mem_limit.mem_limit_lock); | 104 | spin_lock_init(&kfd_mem_limit.mem_limit_lock); |
92 | kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3); | 105 | kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3); |
93 | pr_debug("Kernel memory limit %lluM\n", | 106 | kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 2); |
94 | (kfd_mem_limit.max_system_mem_limit >> 20)); | 107 | pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", |
108 | (kfd_mem_limit.max_system_mem_limit >> 20), | ||
109 | (kfd_mem_limit.max_userptr_mem_limit >> 20)); | ||
95 | } | 110 | } |
96 | 111 | ||
97 | static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, | 112 | static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, |
@@ -111,6 +126,16 @@ static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, | |||
111 | goto err_no_mem; | 126 | goto err_no_mem; |
112 | } | 127 | } |
113 | kfd_mem_limit.system_mem_used += (acc_size + size); | 128 | kfd_mem_limit.system_mem_used += (acc_size + size); |
129 | } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { | ||
130 | if ((kfd_mem_limit.system_mem_used + acc_size > | ||
131 | kfd_mem_limit.max_system_mem_limit) || | ||
132 | (kfd_mem_limit.userptr_mem_used + (size + acc_size) > | ||
133 | kfd_mem_limit.max_userptr_mem_limit)) { | ||
134 | ret = -ENOMEM; | ||
135 | goto err_no_mem; | ||
136 | } | ||
137 | kfd_mem_limit.system_mem_used += acc_size; | ||
138 | kfd_mem_limit.userptr_mem_used += size; | ||
114 | } | 139 | } |
115 | err_no_mem: | 140 | err_no_mem: |
116 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 141 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
@@ -126,10 +151,16 @@ static void unreserve_system_mem_limit(struct amdgpu_device *adev, | |||
126 | sizeof(struct amdgpu_bo)); | 151 | sizeof(struct amdgpu_bo)); |
127 | 152 | ||
128 | spin_lock(&kfd_mem_limit.mem_limit_lock); | 153 | spin_lock(&kfd_mem_limit.mem_limit_lock); |
129 | if (domain == AMDGPU_GEM_DOMAIN_GTT) | 154 | if (domain == AMDGPU_GEM_DOMAIN_GTT) { |
130 | kfd_mem_limit.system_mem_used -= (acc_size + size); | 155 | kfd_mem_limit.system_mem_used -= (acc_size + size); |
156 | } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { | ||
157 | kfd_mem_limit.system_mem_used -= acc_size; | ||
158 | kfd_mem_limit.userptr_mem_used -= size; | ||
159 | } | ||
131 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, | 160 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, |
132 | "kfd system memory accounting unbalanced"); | 161 | "kfd system memory accounting unbalanced"); |
162 | WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, | ||
163 | "kfd userptr memory accounting unbalanced"); | ||
133 | 164 | ||
134 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 165 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
135 | } | 166 | } |
@@ -138,12 +169,17 @@ void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) | |||
138 | { | 169 | { |
139 | spin_lock(&kfd_mem_limit.mem_limit_lock); | 170 | spin_lock(&kfd_mem_limit.mem_limit_lock); |
140 | 171 | ||
141 | if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { | 172 | if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { |
173 | kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; | ||
174 | kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); | ||
175 | } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { | ||
142 | kfd_mem_limit.system_mem_used -= | 176 | kfd_mem_limit.system_mem_used -= |
143 | (bo->tbo.acc_size + amdgpu_bo_size(bo)); | 177 | (bo->tbo.acc_size + amdgpu_bo_size(bo)); |
144 | } | 178 | } |
145 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, | 179 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, |
146 | "kfd system memory accounting unbalanced"); | 180 | "kfd system memory accounting unbalanced"); |
181 | WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, | ||
182 | "kfd userptr memory accounting unbalanced"); | ||
147 | 183 | ||
148 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 184 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
149 | } | 185 | } |
@@ -506,7 +542,8 @@ static void remove_bo_from_vm(struct amdgpu_device *adev, | |||
506 | } | 542 | } |
507 | 543 | ||
508 | static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, | 544 | static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, |
509 | struct amdkfd_process_info *process_info) | 545 | struct amdkfd_process_info *process_info, |
546 | bool userptr) | ||
510 | { | 547 | { |
511 | struct ttm_validate_buffer *entry = &mem->validate_list; | 548 | struct ttm_validate_buffer *entry = &mem->validate_list; |
512 | struct amdgpu_bo *bo = mem->bo; | 549 | struct amdgpu_bo *bo = mem->bo; |
@@ -515,10 +552,95 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, | |||
515 | entry->shared = true; | 552 | entry->shared = true; |
516 | entry->bo = &bo->tbo; | 553 | entry->bo = &bo->tbo; |
517 | mutex_lock(&process_info->lock); | 554 | mutex_lock(&process_info->lock); |
518 | list_add_tail(&entry->head, &process_info->kfd_bo_list); | 555 | if (userptr) |
556 | list_add_tail(&entry->head, &process_info->userptr_valid_list); | ||
557 | else | ||
558 | list_add_tail(&entry->head, &process_info->kfd_bo_list); | ||
519 | mutex_unlock(&process_info->lock); | 559 | mutex_unlock(&process_info->lock); |
520 | } | 560 | } |
521 | 561 | ||
562 | /* Initializes user pages. It registers the MMU notifier and validates | ||
563 | * the userptr BO in the GTT domain. | ||
564 | * | ||
565 | * The BO must already be on the userptr_valid_list. Otherwise an | ||
566 | * eviction and restore may happen that leaves the new BO unmapped | ||
567 | * with the user mode queues running. | ||
568 | * | ||
569 | * Takes the process_info->lock to protect against concurrent restore | ||
570 | * workers. | ||
571 | * | ||
572 | * Returns 0 for success, negative errno for errors. | ||
573 | */ | ||
574 | static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, | ||
575 | uint64_t user_addr) | ||
576 | { | ||
577 | struct amdkfd_process_info *process_info = mem->process_info; | ||
578 | struct amdgpu_bo *bo = mem->bo; | ||
579 | struct ttm_operation_ctx ctx = { true, false }; | ||
580 | int ret = 0; | ||
581 | |||
582 | mutex_lock(&process_info->lock); | ||
583 | |||
584 | ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); | ||
585 | if (ret) { | ||
586 | pr_err("%s: Failed to set userptr: %d\n", __func__, ret); | ||
587 | goto out; | ||
588 | } | ||
589 | |||
590 | ret = amdgpu_mn_register(bo, user_addr); | ||
591 | if (ret) { | ||
592 | pr_err("%s: Failed to register MMU notifier: %d\n", | ||
593 | __func__, ret); | ||
594 | goto out; | ||
595 | } | ||
596 | |||
597 | /* If no restore worker is running concurrently, user_pages | ||
598 | * should not be allocated | ||
599 | */ | ||
600 | WARN(mem->user_pages, "Leaking user_pages array"); | ||
601 | |||
602 | mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, | ||
603 | sizeof(struct page *), | ||
604 | GFP_KERNEL | __GFP_ZERO); | ||
605 | if (!mem->user_pages) { | ||
606 | pr_err("%s: Failed to allocate pages array\n", __func__); | ||
607 | ret = -ENOMEM; | ||
608 | goto unregister_out; | ||
609 | } | ||
610 | |||
611 | ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); | ||
612 | if (ret) { | ||
613 | pr_err("%s: Failed to get user pages: %d\n", __func__, ret); | ||
614 | goto free_out; | ||
615 | } | ||
616 | |||
617 | amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); | ||
618 | |||
619 | ret = amdgpu_bo_reserve(bo, true); | ||
620 | if (ret) { | ||
621 | pr_err("%s: Failed to reserve BO\n", __func__); | ||
622 | goto release_out; | ||
623 | } | ||
624 | amdgpu_ttm_placement_from_domain(bo, mem->domain); | ||
625 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
626 | if (ret) | ||
627 | pr_err("%s: failed to validate BO\n", __func__); | ||
628 | amdgpu_bo_unreserve(bo); | ||
629 | |||
630 | release_out: | ||
631 | if (ret) | ||
632 | release_pages(mem->user_pages, bo->tbo.ttm->num_pages); | ||
633 | free_out: | ||
634 | kvfree(mem->user_pages); | ||
635 | mem->user_pages = NULL; | ||
636 | unregister_out: | ||
637 | if (ret) | ||
638 | amdgpu_mn_unregister(bo); | ||
639 | out: | ||
640 | mutex_unlock(&process_info->lock); | ||
641 | return ret; | ||
642 | } | ||
643 | |||
522 | /* Reserving a BO and its page table BOs must happen atomically to | 644 | /* Reserving a BO and its page table BOs must happen atomically to |
523 | * avoid deadlocks. Some operations update multiple VMs at once. Track | 645 | * avoid deadlocks. Some operations update multiple VMs at once. Track |
524 | * all the reservation info in a context structure. Optionally a sync | 646 | * all the reservation info in a context structure. Optionally a sync |
@@ -748,7 +870,8 @@ static int update_gpuvm_pte(struct amdgpu_device *adev, | |||
748 | } | 870 | } |
749 | 871 | ||
750 | static int map_bo_to_gpuvm(struct amdgpu_device *adev, | 872 | static int map_bo_to_gpuvm(struct amdgpu_device *adev, |
751 | struct kfd_bo_va_list *entry, struct amdgpu_sync *sync) | 873 | struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, |
874 | bool no_update_pte) | ||
752 | { | 875 | { |
753 | int ret; | 876 | int ret; |
754 | 877 | ||
@@ -762,6 +885,9 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev, | |||
762 | return ret; | 885 | return ret; |
763 | } | 886 | } |
764 | 887 | ||
888 | if (no_update_pte) | ||
889 | return 0; | ||
890 | |||
765 | ret = update_gpuvm_pte(adev, entry, sync); | 891 | ret = update_gpuvm_pte(adev, entry, sync); |
766 | if (ret) { | 892 | if (ret) { |
767 | pr_err("update_gpuvm_pte() failed\n"); | 893 | pr_err("update_gpuvm_pte() failed\n"); |
@@ -820,6 +946,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, | |||
820 | mutex_init(&info->lock); | 946 | mutex_init(&info->lock); |
821 | INIT_LIST_HEAD(&info->vm_list_head); | 947 | INIT_LIST_HEAD(&info->vm_list_head); |
822 | INIT_LIST_HEAD(&info->kfd_bo_list); | 948 | INIT_LIST_HEAD(&info->kfd_bo_list); |
949 | INIT_LIST_HEAD(&info->userptr_valid_list); | ||
950 | INIT_LIST_HEAD(&info->userptr_inval_list); | ||
823 | 951 | ||
824 | info->eviction_fence = | 952 | info->eviction_fence = |
825 | amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), | 953 | amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), |
@@ -830,6 +958,11 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, | |||
830 | goto create_evict_fence_fail; | 958 | goto create_evict_fence_fail; |
831 | } | 959 | } |
832 | 960 | ||
961 | info->pid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
962 | atomic_set(&info->evicted_bos, 0); | ||
963 | INIT_DELAYED_WORK(&info->restore_userptr_work, | ||
964 | amdgpu_amdkfd_restore_userptr_worker); | ||
965 | |||
833 | *process_info = info; | 966 | *process_info = info; |
834 | *ef = dma_fence_get(&info->eviction_fence->base); | 967 | *ef = dma_fence_get(&info->eviction_fence->base); |
835 | } | 968 | } |
@@ -872,6 +1005,7 @@ reserve_pd_fail: | |||
872 | dma_fence_put(*ef); | 1005 | dma_fence_put(*ef); |
873 | *ef = NULL; | 1006 | *ef = NULL; |
874 | *process_info = NULL; | 1007 | *process_info = NULL; |
1008 | put_pid(info->pid); | ||
875 | create_evict_fence_fail: | 1009 | create_evict_fence_fail: |
876 | mutex_destroy(&info->lock); | 1010 | mutex_destroy(&info->lock); |
877 | kfree(info); | 1011 | kfree(info); |
@@ -967,8 +1101,12 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, | |||
967 | /* Release per-process resources when last compute VM is destroyed */ | 1101 | /* Release per-process resources when last compute VM is destroyed */ |
968 | if (!process_info->n_vms) { | 1102 | if (!process_info->n_vms) { |
969 | WARN_ON(!list_empty(&process_info->kfd_bo_list)); | 1103 | WARN_ON(!list_empty(&process_info->kfd_bo_list)); |
1104 | WARN_ON(!list_empty(&process_info->userptr_valid_list)); | ||
1105 | WARN_ON(!list_empty(&process_info->userptr_inval_list)); | ||
970 | 1106 | ||
971 | dma_fence_put(&process_info->eviction_fence->base); | 1107 | dma_fence_put(&process_info->eviction_fence->base); |
1108 | cancel_delayed_work_sync(&process_info->restore_userptr_work); | ||
1109 | put_pid(process_info->pid); | ||
972 | mutex_destroy(&process_info->lock); | 1110 | mutex_destroy(&process_info->lock); |
973 | kfree(process_info); | 1111 | kfree(process_info); |
974 | } | 1112 | } |
@@ -1003,9 +1141,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1003 | { | 1141 | { |
1004 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 1142 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
1005 | struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; | 1143 | struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; |
1144 | uint64_t user_addr = 0; | ||
1006 | struct amdgpu_bo *bo; | 1145 | struct amdgpu_bo *bo; |
1007 | int byte_align; | 1146 | int byte_align; |
1008 | u32 alloc_domain; | 1147 | u32 domain, alloc_domain; |
1009 | u64 alloc_flags; | 1148 | u64 alloc_flags; |
1010 | uint32_t mapping_flags; | 1149 | uint32_t mapping_flags; |
1011 | int ret; | 1150 | int ret; |
@@ -1014,14 +1153,21 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1014 | * Check on which domain to allocate BO | 1153 | * Check on which domain to allocate BO |
1015 | */ | 1154 | */ |
1016 | if (flags & ALLOC_MEM_FLAGS_VRAM) { | 1155 | if (flags & ALLOC_MEM_FLAGS_VRAM) { |
1017 | alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; | 1156 | domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; |
1018 | alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; | 1157 | alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; |
1019 | alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? | 1158 | alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? |
1020 | AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : | 1159 | AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : |
1021 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; | 1160 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; |
1022 | } else if (flags & ALLOC_MEM_FLAGS_GTT) { | 1161 | } else if (flags & ALLOC_MEM_FLAGS_GTT) { |
1023 | alloc_domain = AMDGPU_GEM_DOMAIN_GTT; | 1162 | domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT; |
1163 | alloc_flags = 0; | ||
1164 | } else if (flags & ALLOC_MEM_FLAGS_USERPTR) { | ||
1165 | domain = AMDGPU_GEM_DOMAIN_GTT; | ||
1166 | alloc_domain = AMDGPU_GEM_DOMAIN_CPU; | ||
1024 | alloc_flags = 0; | 1167 | alloc_flags = 0; |
1168 | if (!offset || !*offset) | ||
1169 | return -EINVAL; | ||
1170 | user_addr = *offset; | ||
1025 | } else { | 1171 | } else { |
1026 | return -EINVAL; | 1172 | return -EINVAL; |
1027 | } | 1173 | } |
@@ -1078,18 +1224,34 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1078 | } | 1224 | } |
1079 | bo->kfd_bo = *mem; | 1225 | bo->kfd_bo = *mem; |
1080 | (*mem)->bo = bo; | 1226 | (*mem)->bo = bo; |
1227 | if (user_addr) | ||
1228 | bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; | ||
1081 | 1229 | ||
1082 | (*mem)->va = va; | 1230 | (*mem)->va = va; |
1083 | (*mem)->domain = alloc_domain; | 1231 | (*mem)->domain = domain; |
1084 | (*mem)->mapped_to_gpu_memory = 0; | 1232 | (*mem)->mapped_to_gpu_memory = 0; |
1085 | (*mem)->process_info = avm->process_info; | 1233 | (*mem)->process_info = avm->process_info; |
1086 | add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info); | 1234 | add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr); |
1235 | |||
1236 | if (user_addr) { | ||
1237 | ret = init_user_pages(*mem, current->mm, user_addr); | ||
1238 | if (ret) { | ||
1239 | mutex_lock(&avm->process_info->lock); | ||
1240 | list_del(&(*mem)->validate_list.head); | ||
1241 | mutex_unlock(&avm->process_info->lock); | ||
1242 | goto allocate_init_user_pages_failed; | ||
1243 | } | ||
1244 | } | ||
1087 | 1245 | ||
1088 | if (offset) | 1246 | if (offset) |
1089 | *offset = amdgpu_bo_mmap_offset(bo); | 1247 | *offset = amdgpu_bo_mmap_offset(bo); |
1090 | 1248 | ||
1091 | return 0; | 1249 | return 0; |
1092 | 1250 | ||
1251 | allocate_init_user_pages_failed: | ||
1252 | amdgpu_bo_unref(&bo); | ||
1253 | /* Don't unreserve system mem limit twice */ | ||
1254 | goto err_reserve_system_mem; | ||
1093 | err_bo_create: | 1255 | err_bo_create: |
1094 | unreserve_system_mem_limit(adev, size, alloc_domain); | 1256 | unreserve_system_mem_limit(adev, size, alloc_domain); |
1095 | err_reserve_system_mem: | 1257 | err_reserve_system_mem: |
@@ -1122,12 +1284,24 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( | |||
1122 | * be freed anyway | 1284 | * be freed anyway |
1123 | */ | 1285 | */ |
1124 | 1286 | ||
1287 | /* No more MMU notifiers */ | ||
1288 | amdgpu_mn_unregister(mem->bo); | ||
1289 | |||
1125 | /* Make sure restore workers don't access the BO any more */ | 1290 | /* Make sure restore workers don't access the BO any more */ |
1126 | bo_list_entry = &mem->validate_list; | 1291 | bo_list_entry = &mem->validate_list; |
1127 | mutex_lock(&process_info->lock); | 1292 | mutex_lock(&process_info->lock); |
1128 | list_del(&bo_list_entry->head); | 1293 | list_del(&bo_list_entry->head); |
1129 | mutex_unlock(&process_info->lock); | 1294 | mutex_unlock(&process_info->lock); |
1130 | 1295 | ||
1296 | /* Free user pages if necessary */ | ||
1297 | if (mem->user_pages) { | ||
1298 | pr_debug("%s: Freeing user_pages array\n", __func__); | ||
1299 | if (mem->user_pages[0]) | ||
1300 | release_pages(mem->user_pages, | ||
1301 | mem->bo->tbo.ttm->num_pages); | ||
1302 | kvfree(mem->user_pages); | ||
1303 | } | ||
1304 | |||
1131 | ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); | 1305 | ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); |
1132 | if (unlikely(ret)) | 1306 | if (unlikely(ret)) |
1133 | return ret; | 1307 | return ret; |
@@ -1173,21 +1347,32 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1173 | struct kfd_bo_va_list *bo_va_entry = NULL; | 1347 | struct kfd_bo_va_list *bo_va_entry = NULL; |
1174 | struct kfd_bo_va_list *bo_va_entry_aql = NULL; | 1348 | struct kfd_bo_va_list *bo_va_entry_aql = NULL; |
1175 | unsigned long bo_size; | 1349 | unsigned long bo_size; |
1176 | 1350 | bool is_invalid_userptr = false; | |
1177 | /* Make sure restore is not running concurrently. | ||
1178 | */ | ||
1179 | mutex_lock(&mem->process_info->lock); | ||
1180 | |||
1181 | mutex_lock(&mem->lock); | ||
1182 | 1351 | ||
1183 | bo = mem->bo; | 1352 | bo = mem->bo; |
1184 | |||
1185 | if (!bo) { | 1353 | if (!bo) { |
1186 | pr_err("Invalid BO when mapping memory to GPU\n"); | 1354 | pr_err("Invalid BO when mapping memory to GPU\n"); |
1187 | ret = -EINVAL; | 1355 | return -EINVAL; |
1188 | goto out; | ||
1189 | } | 1356 | } |
1190 | 1357 | ||
1358 | /* Make sure restore is not running concurrently. Since we | ||
1359 | * don't map invalid userptr BOs, we rely on the next restore | ||
1360 | * worker to do the mapping | ||
1361 | */ | ||
1362 | mutex_lock(&mem->process_info->lock); | ||
1363 | |||
1364 | /* Lock mmap-sem. If we find an invalid userptr BO, we can be | ||
1365 | * sure that the MMU notifier is no longer running | ||
1366 | * concurrently and the queues are actually stopped | ||
1367 | */ | ||
1368 | if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { | ||
1369 | down_write(¤t->mm->mmap_sem); | ||
1370 | is_invalid_userptr = atomic_read(&mem->invalid); | ||
1371 | up_write(¤t->mm->mmap_sem); | ||
1372 | } | ||
1373 | |||
1374 | mutex_lock(&mem->lock); | ||
1375 | |||
1191 | domain = mem->domain; | 1376 | domain = mem->domain; |
1192 | bo_size = bo->tbo.mem.size; | 1377 | bo_size = bo->tbo.mem.size; |
1193 | 1378 | ||
@@ -1200,6 +1385,14 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1200 | if (unlikely(ret)) | 1385 | if (unlikely(ret)) |
1201 | goto out; | 1386 | goto out; |
1202 | 1387 | ||
1388 | /* Userptr can be marked as "not invalid", but not actually be | ||
1389 | * validated yet (still in the system domain). In that case | ||
1390 | * the queues are still stopped and we can leave mapping for | ||
1391 | * the next restore worker | ||
1392 | */ | ||
1393 | if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) | ||
1394 | is_invalid_userptr = true; | ||
1395 | |||
1203 | if (check_if_add_bo_to_vm(avm, mem)) { | 1396 | if (check_if_add_bo_to_vm(avm, mem)) { |
1204 | ret = add_bo_to_vm(adev, mem, avm, false, | 1397 | ret = add_bo_to_vm(adev, mem, avm, false, |
1205 | &bo_va_entry); | 1398 | &bo_va_entry); |
@@ -1217,7 +1410,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1217 | goto add_bo_to_vm_failed; | 1410 | goto add_bo_to_vm_failed; |
1218 | } | 1411 | } |
1219 | 1412 | ||
1220 | if (mem->mapped_to_gpu_memory == 0) { | 1413 | if (mem->mapped_to_gpu_memory == 0 && |
1414 | !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { | ||
1221 | /* Validate BO only once. The eviction fence gets added to BO | 1415 | /* Validate BO only once. The eviction fence gets added to BO |
1222 | * the first time it is mapped. Validate will wait for all | 1416 | * the first time it is mapped. Validate will wait for all |
1223 | * background evictions to complete. | 1417 | * background evictions to complete. |
@@ -1235,7 +1429,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1235 | entry->va, entry->va + bo_size, | 1429 | entry->va, entry->va + bo_size, |
1236 | entry); | 1430 | entry); |
1237 | 1431 | ||
1238 | ret = map_bo_to_gpuvm(adev, entry, ctx.sync); | 1432 | ret = map_bo_to_gpuvm(adev, entry, ctx.sync, |
1433 | is_invalid_userptr); | ||
1239 | if (ret) { | 1434 | if (ret) { |
1240 | pr_err("Failed to map radeon bo to gpuvm\n"); | 1435 | pr_err("Failed to map radeon bo to gpuvm\n"); |
1241 | goto map_bo_to_gpuvm_failed; | 1436 | goto map_bo_to_gpuvm_failed; |
@@ -1418,6 +1613,337 @@ bo_reserve_failed: | |||
1418 | return ret; | 1613 | return ret; |
1419 | } | 1614 | } |
1420 | 1615 | ||
1616 | /* Evict a userptr BO by stopping the queues if necessary | ||
1617 | * | ||
1618 | * Runs in MMU notifier, may be in RECLAIM_FS context. This means it | ||
1619 | * cannot do any memory allocations, and cannot take any locks that | ||
1620 | * are held elsewhere while allocating memory. Therefore this is as | ||
1621 | * simple as possible, using atomic counters. | ||
1622 | * | ||
1623 | * It doesn't do anything to the BO itself. The real work happens in | ||
1624 | * restore, where we get updated page addresses. This function only | ||
1625 | * ensures that GPU access to the BO is stopped. | ||
1626 | */ | ||
1627 | int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, | ||
1628 | struct mm_struct *mm) | ||
1629 | { | ||
1630 | struct amdkfd_process_info *process_info = mem->process_info; | ||
1631 | int invalid, evicted_bos; | ||
1632 | int r = 0; | ||
1633 | |||
1634 | invalid = atomic_inc_return(&mem->invalid); | ||
1635 | evicted_bos = atomic_inc_return(&process_info->evicted_bos); | ||
1636 | if (evicted_bos == 1) { | ||
1637 | /* First eviction, stop the queues */ | ||
1638 | r = kgd2kfd->quiesce_mm(mm); | ||
1639 | if (r) | ||
1640 | pr_err("Failed to quiesce KFD\n"); | ||
1641 | schedule_delayed_work(&process_info->restore_userptr_work, | ||
1642 | msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); | ||
1643 | } | ||
1644 | |||
1645 | return r; | ||
1646 | } | ||
1647 | |||
1648 | /* Update invalid userptr BOs | ||
1649 | * | ||
1650 | * Moves invalidated (evicted) userptr BOs from userptr_valid_list to | ||
1651 | * userptr_inval_list and updates user pages for all BOs that have | ||
1652 | * been invalidated since their last update. | ||
1653 | */ | ||
1654 | static int update_invalid_user_pages(struct amdkfd_process_info *process_info, | ||
1655 | struct mm_struct *mm) | ||
1656 | { | ||
1657 | struct kgd_mem *mem, *tmp_mem; | ||
1658 | struct amdgpu_bo *bo; | ||
1659 | struct ttm_operation_ctx ctx = { false, false }; | ||
1660 | int invalid, ret; | ||
1661 | |||
1662 | /* Move all invalidated BOs to the userptr_inval_list and | ||
1663 | * release their user pages by migration to the CPU domain | ||
1664 | */ | ||
1665 | list_for_each_entry_safe(mem, tmp_mem, | ||
1666 | &process_info->userptr_valid_list, | ||
1667 | validate_list.head) { | ||
1668 | if (!atomic_read(&mem->invalid)) | ||
1669 | continue; /* BO is still valid */ | ||
1670 | |||
1671 | bo = mem->bo; | ||
1672 | |||
1673 | if (amdgpu_bo_reserve(bo, true)) | ||
1674 | return -EAGAIN; | ||
1675 | amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); | ||
1676 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
1677 | amdgpu_bo_unreserve(bo); | ||
1678 | if (ret) { | ||
1679 | pr_err("%s: Failed to invalidate userptr BO\n", | ||
1680 | __func__); | ||
1681 | return -EAGAIN; | ||
1682 | } | ||
1683 | |||
1684 | list_move_tail(&mem->validate_list.head, | ||
1685 | &process_info->userptr_inval_list); | ||
1686 | } | ||
1687 | |||
1688 | if (list_empty(&process_info->userptr_inval_list)) | ||
1689 | return 0; /* All evicted userptr BOs were freed */ | ||
1690 | |||
1691 | /* Go through userptr_inval_list and update any invalid user_pages */ | ||
1692 | list_for_each_entry(mem, &process_info->userptr_inval_list, | ||
1693 | validate_list.head) { | ||
1694 | invalid = atomic_read(&mem->invalid); | ||
1695 | if (!invalid) | ||
1696 | /* BO hasn't been invalidated since the last | ||
1697 | * revalidation attempt. Keep its BO list. | ||
1698 | */ | ||
1699 | continue; | ||
1700 | |||
1701 | bo = mem->bo; | ||
1702 | |||
1703 | if (!mem->user_pages) { | ||
1704 | mem->user_pages = | ||
1705 | kvmalloc_array(bo->tbo.ttm->num_pages, | ||
1706 | sizeof(struct page *), | ||
1707 | GFP_KERNEL | __GFP_ZERO); | ||
1708 | if (!mem->user_pages) { | ||
1709 | pr_err("%s: Failed to allocate pages array\n", | ||
1710 | __func__); | ||
1711 | return -ENOMEM; | ||
1712 | } | ||
1713 | } else if (mem->user_pages[0]) { | ||
1714 | release_pages(mem->user_pages, bo->tbo.ttm->num_pages); | ||
1715 | } | ||
1716 | |||
1717 | /* Get updated user pages */ | ||
1718 | ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, | ||
1719 | mem->user_pages); | ||
1720 | if (ret) { | ||
1721 | mem->user_pages[0] = NULL; | ||
1722 | pr_info("%s: Failed to get user pages: %d\n", | ||
1723 | __func__, ret); | ||
1724 | /* Pretend it succeeded. It will fail later | ||
1725 | * with a VM fault if the GPU tries to access | ||
1726 | * it. Better than hanging indefinitely with | ||
1727 | * stalled user mode queues. | ||
1728 | */ | ||
1729 | } | ||
1730 | |||
1731 | /* Mark the BO as valid unless it was invalidated | ||
1732 | * again concurrently | ||
1733 | */ | ||
1734 | if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) | ||
1735 | return -EAGAIN; | ||
1736 | } | ||
1737 | |||
1738 | return 0; | ||
1739 | } | ||
1740 | |||
1741 | /* Validate invalid userptr BOs | ||
1742 | * | ||
1743 | * Validates BOs on the userptr_inval_list, and moves them back to the | ||
1744 | * userptr_valid_list. Also updates GPUVM page tables with new page | ||
1745 | * addresses and waits for the page table updates to complete. | ||
1746 | */ | ||
1747 | static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) | ||
1748 | { | ||
1749 | struct amdgpu_bo_list_entry *pd_bo_list_entries; | ||
1750 | struct list_head resv_list, duplicates; | ||
1751 | struct ww_acquire_ctx ticket; | ||
1752 | struct amdgpu_sync sync; | ||
1753 | |||
1754 | struct amdgpu_vm *peer_vm; | ||
1755 | struct kgd_mem *mem, *tmp_mem; | ||
1756 | struct amdgpu_bo *bo; | ||
1757 | struct ttm_operation_ctx ctx = { false, false }; | ||
1758 | int i, ret; | ||
1759 | |||
1760 | pd_bo_list_entries = kcalloc(process_info->n_vms, | ||
1761 | sizeof(struct amdgpu_bo_list_entry), | ||
1762 | GFP_KERNEL); | ||
1763 | if (!pd_bo_list_entries) { | ||
1764 | pr_err("%s: Failed to allocate PD BO list entries\n", __func__); | ||
1765 | return -ENOMEM; | ||
1766 | } | ||
1767 | |||
1768 | INIT_LIST_HEAD(&resv_list); | ||
1769 | INIT_LIST_HEAD(&duplicates); | ||
1770 | |||
1771 | /* Get all the page directory BOs that need to be reserved */ | ||
1772 | i = 0; | ||
1773 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1774 | vm_list_node) | ||
1775 | amdgpu_vm_get_pd_bo(peer_vm, &resv_list, | ||
1776 | &pd_bo_list_entries[i++]); | ||
1777 | /* Add the userptr_inval_list entries to resv_list */ | ||
1778 | list_for_each_entry(mem, &process_info->userptr_inval_list, | ||
1779 | validate_list.head) { | ||
1780 | list_add_tail(&mem->resv_list.head, &resv_list); | ||
1781 | mem->resv_list.bo = mem->validate_list.bo; | ||
1782 | mem->resv_list.shared = mem->validate_list.shared; | ||
1783 | } | ||
1784 | |||
1785 | /* Reserve all BOs and page tables for validation */ | ||
1786 | ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); | ||
1787 | WARN(!list_empty(&duplicates), "Duplicates should be empty"); | ||
1788 | if (ret) | ||
1789 | goto out; | ||
1790 | |||
1791 | amdgpu_sync_create(&sync); | ||
1792 | |||
1793 | /* Avoid triggering eviction fences when unmapping invalid | ||
1794 | * userptr BOs (waits for all fences, doesn't use | ||
1795 | * FENCE_OWNER_VM) | ||
1796 | */ | ||
1797 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1798 | vm_list_node) | ||
1799 | amdgpu_amdkfd_remove_eviction_fence(peer_vm->root.base.bo, | ||
1800 | process_info->eviction_fence, | ||
1801 | NULL, NULL); | ||
1802 | |||
1803 | ret = process_validate_vms(process_info); | ||
1804 | if (ret) | ||
1805 | goto unreserve_out; | ||
1806 | |||
1807 | /* Validate BOs and update GPUVM page tables */ | ||
1808 | list_for_each_entry_safe(mem, tmp_mem, | ||
1809 | &process_info->userptr_inval_list, | ||
1810 | validate_list.head) { | ||
1811 | struct kfd_bo_va_list *bo_va_entry; | ||
1812 | |||
1813 | bo = mem->bo; | ||
1814 | |||
1815 | /* Copy pages array and validate the BO if we got user pages */ | ||
1816 | if (mem->user_pages[0]) { | ||
1817 | amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, | ||
1818 | mem->user_pages); | ||
1819 | amdgpu_ttm_placement_from_domain(bo, mem->domain); | ||
1820 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
1821 | if (ret) { | ||
1822 | pr_err("%s: failed to validate BO\n", __func__); | ||
1823 | goto unreserve_out; | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | /* Validate succeeded, now the BO owns the pages, free | ||
1828 | * our copy of the pointer array. Put this BO back on | ||
1829 | * the userptr_valid_list. If we need to revalidate | ||
1830 | * it, we need to start from scratch. | ||
1831 | */ | ||
1832 | kvfree(mem->user_pages); | ||
1833 | mem->user_pages = NULL; | ||
1834 | list_move_tail(&mem->validate_list.head, | ||
1835 | &process_info->userptr_valid_list); | ||
1836 | |||
1837 | /* Update mapping. If the BO was not validated | ||
1838 | * (because we couldn't get user pages), this will | ||
1839 | * clear the page table entries, which will result in | ||
1840 | * VM faults if the GPU tries to access the invalid | ||
1841 | * memory. | ||
1842 | */ | ||
1843 | list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { | ||
1844 | if (!bo_va_entry->is_mapped) | ||
1845 | continue; | ||
1846 | |||
1847 | ret = update_gpuvm_pte((struct amdgpu_device *) | ||
1848 | bo_va_entry->kgd_dev, | ||
1849 | bo_va_entry, &sync); | ||
1850 | if (ret) { | ||
1851 | pr_err("%s: update PTE failed\n", __func__); | ||
1852 | /* make sure this gets validated again */ | ||
1853 | atomic_inc(&mem->invalid); | ||
1854 | goto unreserve_out; | ||
1855 | } | ||
1856 | } | ||
1857 | } | ||
1858 | |||
1859 | /* Update page directories */ | ||
1860 | ret = process_update_pds(process_info, &sync); | ||
1861 | |||
1862 | unreserve_out: | ||
1863 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1864 | vm_list_node) | ||
1865 | amdgpu_bo_fence(peer_vm->root.base.bo, | ||
1866 | &process_info->eviction_fence->base, true); | ||
1867 | ttm_eu_backoff_reservation(&ticket, &resv_list); | ||
1868 | amdgpu_sync_wait(&sync, false); | ||
1869 | amdgpu_sync_free(&sync); | ||
1870 | out: | ||
1871 | kfree(pd_bo_list_entries); | ||
1872 | |||
1873 | return ret; | ||
1874 | } | ||
1875 | |||
1876 | /* Worker callback to restore evicted userptr BOs | ||
1877 | * | ||
1878 | * Tries to update and validate all userptr BOs. If successful and no | ||
1879 | * concurrent evictions happened, the queues are restarted. Otherwise, | ||
1880 | * reschedule for another attempt later. | ||
1881 | */ | ||
1882 | static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) | ||
1883 | { | ||
1884 | struct delayed_work *dwork = to_delayed_work(work); | ||
1885 | struct amdkfd_process_info *process_info = | ||
1886 | container_of(dwork, struct amdkfd_process_info, | ||
1887 | restore_userptr_work); | ||
1888 | struct task_struct *usertask; | ||
1889 | struct mm_struct *mm; | ||
1890 | int evicted_bos; | ||
1891 | |||
1892 | evicted_bos = atomic_read(&process_info->evicted_bos); | ||
1893 | if (!evicted_bos) | ||
1894 | return; | ||
1895 | |||
1896 | /* Reference task and mm in case of concurrent process termination */ | ||
1897 | usertask = get_pid_task(process_info->pid, PIDTYPE_PID); | ||
1898 | if (!usertask) | ||
1899 | return; | ||
1900 | mm = get_task_mm(usertask); | ||
1901 | if (!mm) { | ||
1902 | put_task_struct(usertask); | ||
1903 | return; | ||
1904 | } | ||
1905 | |||
1906 | mutex_lock(&process_info->lock); | ||
1907 | |||
1908 | if (update_invalid_user_pages(process_info, mm)) | ||
1909 | goto unlock_out; | ||
1910 | /* userptr_inval_list can be empty if all evicted userptr BOs | ||
1911 | * have been freed. In that case there is nothing to validate | ||
1912 | * and we can just restart the queues. | ||
1913 | */ | ||
1914 | if (!list_empty(&process_info->userptr_inval_list)) { | ||
1915 | if (atomic_read(&process_info->evicted_bos) != evicted_bos) | ||
1916 | goto unlock_out; /* Concurrent eviction, try again */ | ||
1917 | |||
1918 | if (validate_invalid_user_pages(process_info)) | ||
1919 | goto unlock_out; | ||
1920 | } | ||
1921 | /* Final check for concurrent evicton and atomic update. If | ||
1922 | * another eviction happens after successful update, it will | ||
1923 | * be a first eviction that calls quiesce_mm. The eviction | ||
1924 | * reference counting inside KFD will handle this case. | ||
1925 | */ | ||
1926 | if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != | ||
1927 | evicted_bos) | ||
1928 | goto unlock_out; | ||
1929 | evicted_bos = 0; | ||
1930 | if (kgd2kfd->resume_mm(mm)) { | ||
1931 | pr_err("%s: Failed to resume KFD\n", __func__); | ||
1932 | /* No recovery from this failure. Probably the CP is | ||
1933 | * hanging. No point trying again. | ||
1934 | */ | ||
1935 | } | ||
1936 | unlock_out: | ||
1937 | mutex_unlock(&process_info->lock); | ||
1938 | mmput(mm); | ||
1939 | put_task_struct(usertask); | ||
1940 | |||
1941 | /* If validation failed, reschedule another attempt */ | ||
1942 | if (evicted_bos) | ||
1943 | schedule_delayed_work(&process_info->restore_userptr_work, | ||
1944 | msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); | ||
1945 | } | ||
1946 | |||
1421 | /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given | 1947 | /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given |
1422 | * KFD process identified by process_info | 1948 | * KFD process identified by process_info |
1423 | * | 1949 | * |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index dc34b50e6b29..8e66f3702b7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | |||
@@ -536,7 +536,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, | |||
536 | if (p->bo_list) { | 536 | if (p->bo_list) { |
537 | amdgpu_bo_list_get_list(p->bo_list, &p->validated); | 537 | amdgpu_bo_list_get_list(p->bo_list, &p->validated); |
538 | if (p->bo_list->first_userptr != p->bo_list->num_entries) | 538 | if (p->bo_list->first_userptr != p->bo_list->num_entries) |
539 | p->mn = amdgpu_mn_get(p->adev); | 539 | p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX); |
540 | } | 540 | } |
541 | 541 | ||
542 | INIT_LIST_HEAD(&duplicates); | 542 | INIT_LIST_HEAD(&duplicates); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index bd67f4cb8e6c..83e344fbb50a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | |||
@@ -36,12 +36,14 @@ | |||
36 | #include <drm/drm.h> | 36 | #include <drm/drm.h> |
37 | 37 | ||
38 | #include "amdgpu.h" | 38 | #include "amdgpu.h" |
39 | #include "amdgpu_amdkfd.h" | ||
39 | 40 | ||
40 | struct amdgpu_mn { | 41 | struct amdgpu_mn { |
41 | /* constant after initialisation */ | 42 | /* constant after initialisation */ |
42 | struct amdgpu_device *adev; | 43 | struct amdgpu_device *adev; |
43 | struct mm_struct *mm; | 44 | struct mm_struct *mm; |
44 | struct mmu_notifier mn; | 45 | struct mmu_notifier mn; |
46 | enum amdgpu_mn_type type; | ||
45 | 47 | ||
46 | /* only used on destruction */ | 48 | /* only used on destruction */ |
47 | struct work_struct work; | 49 | struct work_struct work; |
@@ -185,7 +187,7 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, | |||
185 | } | 187 | } |
186 | 188 | ||
187 | /** | 189 | /** |
188 | * amdgpu_mn_invalidate_range_start - callback to notify about mm change | 190 | * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change |
189 | * | 191 | * |
190 | * @mn: our notifier | 192 | * @mn: our notifier |
191 | * @mm: the mm this callback is about | 193 | * @mm: the mm this callback is about |
@@ -195,10 +197,10 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, | |||
195 | * We block for all BOs between start and end to be idle and | 197 | * We block for all BOs between start and end to be idle and |
196 | * unmap them by move them into system domain again. | 198 | * unmap them by move them into system domain again. |
197 | */ | 199 | */ |
198 | static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, | 200 | static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, |
199 | struct mm_struct *mm, | 201 | struct mm_struct *mm, |
200 | unsigned long start, | 202 | unsigned long start, |
201 | unsigned long end) | 203 | unsigned long end) |
202 | { | 204 | { |
203 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); | 205 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); |
204 | struct interval_tree_node *it; | 206 | struct interval_tree_node *it; |
@@ -220,6 +222,49 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, | |||
220 | } | 222 | } |
221 | 223 | ||
222 | /** | 224 | /** |
225 | * amdgpu_mn_invalidate_range_start_hsa - callback to notify about mm change | ||
226 | * | ||
227 | * @mn: our notifier | ||
228 | * @mn: the mm this callback is about | ||
229 | * @start: start of updated range | ||
230 | * @end: end of updated range | ||
231 | * | ||
232 | * We temporarily evict all BOs between start and end. This | ||
233 | * necessitates evicting all user-mode queues of the process. The BOs | ||
234 | * are restored in amdgpu_mn_invalidate_range_end_hsa. | ||
235 | */ | ||
236 | static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, | ||
237 | struct mm_struct *mm, | ||
238 | unsigned long start, | ||
239 | unsigned long end) | ||
240 | { | ||
241 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); | ||
242 | struct interval_tree_node *it; | ||
243 | |||
244 | /* notification is exclusive, but interval is inclusive */ | ||
245 | end -= 1; | ||
246 | |||
247 | amdgpu_mn_read_lock(rmn); | ||
248 | |||
249 | it = interval_tree_iter_first(&rmn->objects, start, end); | ||
250 | while (it) { | ||
251 | struct amdgpu_mn_node *node; | ||
252 | struct amdgpu_bo *bo; | ||
253 | |||
254 | node = container_of(it, struct amdgpu_mn_node, it); | ||
255 | it = interval_tree_iter_next(it, start, end); | ||
256 | |||
257 | list_for_each_entry(bo, &node->bos, mn_list) { | ||
258 | struct kgd_mem *mem = bo->kfd_bo; | ||
259 | |||
260 | if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, | ||
261 | start, end)) | ||
262 | amdgpu_amdkfd_evict_userptr(mem, mm); | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | |||
267 | /** | ||
223 | * amdgpu_mn_invalidate_range_end - callback to notify about mm change | 268 | * amdgpu_mn_invalidate_range_end - callback to notify about mm change |
224 | * | 269 | * |
225 | * @mn: our notifier | 270 | * @mn: our notifier |
@@ -239,23 +284,39 @@ static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn, | |||
239 | amdgpu_mn_read_unlock(rmn); | 284 | amdgpu_mn_read_unlock(rmn); |
240 | } | 285 | } |
241 | 286 | ||
242 | static const struct mmu_notifier_ops amdgpu_mn_ops = { | 287 | static const struct mmu_notifier_ops amdgpu_mn_ops[] = { |
243 | .release = amdgpu_mn_release, | 288 | [AMDGPU_MN_TYPE_GFX] = { |
244 | .invalidate_range_start = amdgpu_mn_invalidate_range_start, | 289 | .release = amdgpu_mn_release, |
245 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | 290 | .invalidate_range_start = amdgpu_mn_invalidate_range_start_gfx, |
291 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | ||
292 | }, | ||
293 | [AMDGPU_MN_TYPE_HSA] = { | ||
294 | .release = amdgpu_mn_release, | ||
295 | .invalidate_range_start = amdgpu_mn_invalidate_range_start_hsa, | ||
296 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | ||
297 | }, | ||
246 | }; | 298 | }; |
247 | 299 | ||
300 | /* Low bits of any reasonable mm pointer will be unused due to struct | ||
301 | * alignment. Use these bits to make a unique key from the mm pointer | ||
302 | * and notifier type. | ||
303 | */ | ||
304 | #define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type)) | ||
305 | |||
248 | /** | 306 | /** |
249 | * amdgpu_mn_get - create notifier context | 307 | * amdgpu_mn_get - create notifier context |
250 | * | 308 | * |
251 | * @adev: amdgpu device pointer | 309 | * @adev: amdgpu device pointer |
310 | * @type: type of MMU notifier context | ||
252 | * | 311 | * |
253 | * Creates a notifier context for current->mm. | 312 | * Creates a notifier context for current->mm. |
254 | */ | 313 | */ |
255 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | 314 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
315 | enum amdgpu_mn_type type) | ||
256 | { | 316 | { |
257 | struct mm_struct *mm = current->mm; | 317 | struct mm_struct *mm = current->mm; |
258 | struct amdgpu_mn *rmn; | 318 | struct amdgpu_mn *rmn; |
319 | unsigned long key = AMDGPU_MN_KEY(mm, type); | ||
259 | int r; | 320 | int r; |
260 | 321 | ||
261 | mutex_lock(&adev->mn_lock); | 322 | mutex_lock(&adev->mn_lock); |
@@ -264,8 +325,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
264 | return ERR_PTR(-EINTR); | 325 | return ERR_PTR(-EINTR); |
265 | } | 326 | } |
266 | 327 | ||
267 | hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) | 328 | hash_for_each_possible(adev->mn_hash, rmn, node, key) |
268 | if (rmn->mm == mm) | 329 | if (AMDGPU_MN_KEY(rmn->mm, rmn->type) == key) |
269 | goto release_locks; | 330 | goto release_locks; |
270 | 331 | ||
271 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); | 332 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); |
@@ -276,8 +337,9 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
276 | 337 | ||
277 | rmn->adev = adev; | 338 | rmn->adev = adev; |
278 | rmn->mm = mm; | 339 | rmn->mm = mm; |
279 | rmn->mn.ops = &amdgpu_mn_ops; | ||
280 | init_rwsem(&rmn->lock); | 340 | init_rwsem(&rmn->lock); |
341 | rmn->type = type; | ||
342 | rmn->mn.ops = &amdgpu_mn_ops[type]; | ||
281 | rmn->objects = RB_ROOT_CACHED; | 343 | rmn->objects = RB_ROOT_CACHED; |
282 | mutex_init(&rmn->read_lock); | 344 | mutex_init(&rmn->read_lock); |
283 | atomic_set(&rmn->recursion, 0); | 345 | atomic_set(&rmn->recursion, 0); |
@@ -286,7 +348,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
286 | if (r) | 348 | if (r) |
287 | goto free_rmn; | 349 | goto free_rmn; |
288 | 350 | ||
289 | hash_add(adev->mn_hash, &rmn->node, (unsigned long)mm); | 351 | hash_add(adev->mn_hash, &rmn->node, AMDGPU_MN_KEY(mm, type)); |
290 | 352 | ||
291 | release_locks: | 353 | release_locks: |
292 | up_write(&mm->mmap_sem); | 354 | up_write(&mm->mmap_sem); |
@@ -315,15 +377,21 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) | |||
315 | { | 377 | { |
316 | unsigned long end = addr + amdgpu_bo_size(bo) - 1; | 378 | unsigned long end = addr + amdgpu_bo_size(bo) - 1; |
317 | struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); | 379 | struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); |
380 | enum amdgpu_mn_type type = | ||
381 | bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX; | ||
318 | struct amdgpu_mn *rmn; | 382 | struct amdgpu_mn *rmn; |
319 | struct amdgpu_mn_node *node = NULL; | 383 | struct amdgpu_mn_node *node = NULL, *new_node; |
320 | struct list_head bos; | 384 | struct list_head bos; |
321 | struct interval_tree_node *it; | 385 | struct interval_tree_node *it; |
322 | 386 | ||
323 | rmn = amdgpu_mn_get(adev); | 387 | rmn = amdgpu_mn_get(adev, type); |
324 | if (IS_ERR(rmn)) | 388 | if (IS_ERR(rmn)) |
325 | return PTR_ERR(rmn); | 389 | return PTR_ERR(rmn); |
326 | 390 | ||
391 | new_node = kmalloc(sizeof(*new_node), GFP_KERNEL); | ||
392 | if (!new_node) | ||
393 | return -ENOMEM; | ||
394 | |||
327 | INIT_LIST_HEAD(&bos); | 395 | INIT_LIST_HEAD(&bos); |
328 | 396 | ||
329 | down_write(&rmn->lock); | 397 | down_write(&rmn->lock); |
@@ -337,13 +405,10 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) | |||
337 | list_splice(&node->bos, &bos); | 405 | list_splice(&node->bos, &bos); |
338 | } | 406 | } |
339 | 407 | ||
340 | if (!node) { | 408 | if (!node) |
341 | node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); | 409 | node = new_node; |
342 | if (!node) { | 410 | else |
343 | up_write(&rmn->lock); | 411 | kfree(new_node); |
344 | return -ENOMEM; | ||
345 | } | ||
346 | } | ||
347 | 412 | ||
348 | bo->mn = rmn; | 413 | bo->mn = rmn; |
349 | 414 | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h index d0095a3793b8..eb0f432f78fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h | |||
@@ -29,16 +29,23 @@ | |||
29 | */ | 29 | */ |
30 | struct amdgpu_mn; | 30 | struct amdgpu_mn; |
31 | 31 | ||
32 | enum amdgpu_mn_type { | ||
33 | AMDGPU_MN_TYPE_GFX, | ||
34 | AMDGPU_MN_TYPE_HSA, | ||
35 | }; | ||
36 | |||
32 | #if defined(CONFIG_MMU_NOTIFIER) | 37 | #if defined(CONFIG_MMU_NOTIFIER) |
33 | void amdgpu_mn_lock(struct amdgpu_mn *mn); | 38 | void amdgpu_mn_lock(struct amdgpu_mn *mn); |
34 | void amdgpu_mn_unlock(struct amdgpu_mn *mn); | 39 | void amdgpu_mn_unlock(struct amdgpu_mn *mn); |
35 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev); | 40 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
41 | enum amdgpu_mn_type type); | ||
36 | int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); | 42 | int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); |
37 | void amdgpu_mn_unregister(struct amdgpu_bo *bo); | 43 | void amdgpu_mn_unregister(struct amdgpu_bo *bo); |
38 | #else | 44 | #else |
39 | static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} | 45 | static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} |
40 | static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} | 46 | static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} |
41 | static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | 47 | static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
48 | enum amdgpu_mn_type type) | ||
42 | { | 49 | { |
43 | return NULL; | 50 | return NULL; |
44 | } | 51 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 205da3ff9cd0..c713d30cba86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | |||
@@ -695,7 +695,7 @@ struct amdgpu_ttm_tt { | |||
695 | struct ttm_dma_tt ttm; | 695 | struct ttm_dma_tt ttm; |
696 | u64 offset; | 696 | u64 offset; |
697 | uint64_t userptr; | 697 | uint64_t userptr; |
698 | struct mm_struct *usermm; | 698 | struct task_struct *usertask; |
699 | uint32_t userflags; | 699 | uint32_t userflags; |
700 | spinlock_t guptasklock; | 700 | spinlock_t guptasklock; |
701 | struct list_head guptasks; | 701 | struct list_head guptasks; |
@@ -706,14 +706,18 @@ struct amdgpu_ttm_tt { | |||
706 | int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | 706 | int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) |
707 | { | 707 | { |
708 | struct amdgpu_ttm_tt *gtt = (void *)ttm; | 708 | struct amdgpu_ttm_tt *gtt = (void *)ttm; |
709 | struct mm_struct *mm = gtt->usertask->mm; | ||
709 | unsigned int flags = 0; | 710 | unsigned int flags = 0; |
710 | unsigned pinned = 0; | 711 | unsigned pinned = 0; |
711 | int r; | 712 | int r; |
712 | 713 | ||
714 | if (!mm) /* Happens during process shutdown */ | ||
715 | return -ESRCH; | ||
716 | |||
713 | if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) | 717 | if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) |
714 | flags |= FOLL_WRITE; | 718 | flags |= FOLL_WRITE; |
715 | 719 | ||
716 | down_read(¤t->mm->mmap_sem); | 720 | down_read(&mm->mmap_sem); |
717 | 721 | ||
718 | if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { | 722 | if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { |
719 | /* check that we only use anonymous memory | 723 | /* check that we only use anonymous memory |
@@ -721,9 +725,9 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
721 | unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; | 725 | unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; |
722 | struct vm_area_struct *vma; | 726 | struct vm_area_struct *vma; |
723 | 727 | ||
724 | vma = find_vma(gtt->usermm, gtt->userptr); | 728 | vma = find_vma(mm, gtt->userptr); |
725 | if (!vma || vma->vm_file || vma->vm_end < end) { | 729 | if (!vma || vma->vm_file || vma->vm_end < end) { |
726 | up_read(¤t->mm->mmap_sem); | 730 | up_read(&mm->mmap_sem); |
727 | return -EPERM; | 731 | return -EPERM; |
728 | } | 732 | } |
729 | } | 733 | } |
@@ -739,7 +743,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
739 | list_add(&guptask.list, >t->guptasks); | 743 | list_add(&guptask.list, >t->guptasks); |
740 | spin_unlock(>t->guptasklock); | 744 | spin_unlock(>t->guptasklock); |
741 | 745 | ||
742 | r = get_user_pages(userptr, num_pages, flags, p, NULL); | 746 | if (mm == current->mm) |
747 | r = get_user_pages(userptr, num_pages, flags, p, NULL); | ||
748 | else | ||
749 | r = get_user_pages_remote(gtt->usertask, | ||
750 | mm, userptr, num_pages, | ||
751 | flags, p, NULL, NULL); | ||
743 | 752 | ||
744 | spin_lock(>t->guptasklock); | 753 | spin_lock(>t->guptasklock); |
745 | list_del(&guptask.list); | 754 | list_del(&guptask.list); |
@@ -752,12 +761,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
752 | 761 | ||
753 | } while (pinned < ttm->num_pages); | 762 | } while (pinned < ttm->num_pages); |
754 | 763 | ||
755 | up_read(¤t->mm->mmap_sem); | 764 | up_read(&mm->mmap_sem); |
756 | return 0; | 765 | return 0; |
757 | 766 | ||
758 | release_pages: | 767 | release_pages: |
759 | release_pages(pages, pinned); | 768 | release_pages(pages, pinned); |
760 | up_read(¤t->mm->mmap_sem); | 769 | up_read(&mm->mmap_sem); |
761 | return r; | 770 | return r; |
762 | } | 771 | } |
763 | 772 | ||
@@ -978,6 +987,9 @@ static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm) | |||
978 | { | 987 | { |
979 | struct amdgpu_ttm_tt *gtt = (void *)ttm; | 988 | struct amdgpu_ttm_tt *gtt = (void *)ttm; |
980 | 989 | ||
990 | if (gtt->usertask) | ||
991 | put_task_struct(gtt->usertask); | ||
992 | |||
981 | ttm_dma_tt_fini(>t->ttm); | 993 | ttm_dma_tt_fini(>t->ttm); |
982 | kfree(gtt); | 994 | kfree(gtt); |
983 | } | 995 | } |
@@ -1079,8 +1091,13 @@ int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, | |||
1079 | return -EINVAL; | 1091 | return -EINVAL; |
1080 | 1092 | ||
1081 | gtt->userptr = addr; | 1093 | gtt->userptr = addr; |
1082 | gtt->usermm = current->mm; | ||
1083 | gtt->userflags = flags; | 1094 | gtt->userflags = flags; |
1095 | |||
1096 | if (gtt->usertask) | ||
1097 | put_task_struct(gtt->usertask); | ||
1098 | gtt->usertask = current->group_leader; | ||
1099 | get_task_struct(gtt->usertask); | ||
1100 | |||
1084 | spin_lock_init(>t->guptasklock); | 1101 | spin_lock_init(>t->guptasklock); |
1085 | INIT_LIST_HEAD(>t->guptasks); | 1102 | INIT_LIST_HEAD(>t->guptasks); |
1086 | atomic_set(>t->mmu_invalidations, 0); | 1103 | atomic_set(>t->mmu_invalidations, 0); |
@@ -1096,7 +1113,10 @@ struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm) | |||
1096 | if (gtt == NULL) | 1113 | if (gtt == NULL) |
1097 | return NULL; | 1114 | return NULL; |
1098 | 1115 | ||
1099 | return gtt->usermm; | 1116 | if (gtt->usertask == NULL) |
1117 | return NULL; | ||
1118 | |||
1119 | return gtt->usertask->mm; | ||
1100 | } | 1120 | } |
1101 | 1121 | ||
1102 | bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, | 1122 | bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, |
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9d39fd5b1822..e5962e61beb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | |||
@@ -4686,6 +4686,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev, | |||
4686 | 4686 | ||
4687 | cu_info->number = active_cu_number; | 4687 | cu_info->number = active_cu_number; |
4688 | cu_info->ao_cu_mask = ao_cu_mask; | 4688 | cu_info->ao_cu_mask = ao_cu_mask; |
4689 | cu_info->simd_per_cu = NUM_SIMD_PER_CU; | ||
4689 | 4690 | ||
4690 | return 0; | 4691 | return 0; |
4691 | } | 4692 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h index 7f408f85fdb6..f22f7a88ce0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h | |||
@@ -268,6 +268,11 @@ | |||
268 | * x=1: tmz_end | 268 | * x=1: tmz_end |
269 | */ | 269 | */ |
270 | 270 | ||
271 | #define PACKET3_INVALIDATE_TLBS 0x98 | ||
272 | # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) | ||
273 | # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) | ||
274 | # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) | ||
275 | # define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) | ||
271 | #define PACKET3_SET_RESOURCES 0xA0 | 276 | #define PACKET3_SET_RESOURCES 0xA0 |
272 | /* 1. header | 277 | /* 1. header |
273 | * 2. CONTROL | 278 | * 2. CONTROL |