diff options
author | Dave Airlie <airlied@redhat.com> | 2018-05-15 01:59:10 -0400 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2018-05-15 02:06:08 -0400 |
commit | c76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb (patch) | |
tree | 1aeeb74795b2951952aa443f7104d6c090c58141 | |
parent | 444ac87becd8a2ff76f9e4194dd98da4f5d5586d (diff) | |
parent | af47b390273f1068bdb1d01263a81948c4e2f97a (diff) |
Merge tag 'drm-amdkfd-next-2018-05-14' of git://people.freedesktop.org/~gabbayo/linux into drm-next
This is amdkfd pull for 4.18. The major new features are:
- Add support for GFXv9 dGPUs (VEGA)
- Add support for userptr memory mapping
In addition, there are a couple of small fixes and improvements, such as:
- Fix lock handling
- Fix rollback packet in kernel kfd_queue
- Optimize kfd signal handling
- Fix CP hang in APU
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180514070126.GA1827@odedg-x270
52 files changed, 6222 insertions, 858 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 8daa96a99eac..ac1215a5561e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -767,12 +767,14 @@ F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | |||
767 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 767 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |
768 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 768 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |
769 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 769 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |
770 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | ||
770 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 771 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c |
771 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 772 | F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |
772 | F: drivers/gpu/drm/amd/amdkfd/ | 773 | F: drivers/gpu/drm/amd/amdkfd/ |
773 | F: drivers/gpu/drm/amd/include/cik_structs.h | 774 | F: drivers/gpu/drm/amd/include/cik_structs.h |
774 | F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 775 | F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h |
775 | F: drivers/gpu/drm/amd/include/vi_structs.h | 776 | F: drivers/gpu/drm/amd/include/vi_structs.h |
777 | F: drivers/gpu/drm/amd/include/v9_structs.h | ||
776 | F: include/uapi/linux/kfd_ioctl.h | 778 | F: include/uapi/linux/kfd_ioctl.h |
777 | 779 | ||
778 | AMD SEATTLE DEVICE TREE SUPPORT | 780 | AMD SEATTLE DEVICE TREE SUPPORT |
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 2ca2b5154d52..f3002020df6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile | |||
@@ -130,7 +130,8 @@ amdgpu-y += \ | |||
130 | amdgpu_amdkfd.o \ | 130 | amdgpu_amdkfd.o \ |
131 | amdgpu_amdkfd_fence.o \ | 131 | amdgpu_amdkfd_fence.o \ |
132 | amdgpu_amdkfd_gpuvm.o \ | 132 | amdgpu_amdkfd_gpuvm.o \ |
133 | amdgpu_amdkfd_gfx_v8.o | 133 | amdgpu_amdkfd_gfx_v8.o \ |
134 | amdgpu_amdkfd_gfx_v9.o | ||
134 | 135 | ||
135 | # add cgs | 136 | # add cgs |
136 | amdgpu-y += amdgpu_cgs.o | 137 | amdgpu-y += amdgpu_cgs.o |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 4d36203ffb11..cd0e8f192e6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | |||
@@ -92,6 +92,10 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) | |||
92 | case CHIP_POLARIS11: | 92 | case CHIP_POLARIS11: |
93 | kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); | 93 | kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); |
94 | break; | 94 | break; |
95 | case CHIP_VEGA10: | ||
96 | case CHIP_RAVEN: | ||
97 | kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); | ||
98 | break; | ||
95 | default: | 99 | default: |
96 | dev_dbg(adev->dev, "kfd not supported on this ASIC\n"); | 100 | dev_dbg(adev->dev, "kfd not supported on this ASIC\n"); |
97 | return; | 101 | return; |
@@ -175,6 +179,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) | |||
175 | &gpu_resources.doorbell_physical_address, | 179 | &gpu_resources.doorbell_physical_address, |
176 | &gpu_resources.doorbell_aperture_size, | 180 | &gpu_resources.doorbell_aperture_size, |
177 | &gpu_resources.doorbell_start_offset); | 181 | &gpu_resources.doorbell_start_offset); |
182 | if (adev->asic_type >= CHIP_VEGA10) { | ||
183 | /* On SOC15 the BIF is involved in routing | ||
184 | * doorbells using the low 12 bits of the | ||
185 | * address. Communicate the assignments to | ||
186 | * KFD. KFD uses two doorbell pages per | ||
187 | * process in case of 64-bit doorbells so we | ||
188 | * can use each doorbell assignment twice. | ||
189 | */ | ||
190 | gpu_resources.sdma_doorbell[0][0] = | ||
191 | AMDGPU_DOORBELL64_sDMA_ENGINE0; | ||
192 | gpu_resources.sdma_doorbell[0][1] = | ||
193 | AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; | ||
194 | gpu_resources.sdma_doorbell[1][0] = | ||
195 | AMDGPU_DOORBELL64_sDMA_ENGINE1; | ||
196 | gpu_resources.sdma_doorbell[1][1] = | ||
197 | AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; | ||
198 | /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for | ||
199 | * SDMA, IH and VCN. So don't use them for the CP. | ||
200 | */ | ||
201 | gpu_resources.reserved_doorbell_mask = 0x1f0; | ||
202 | gpu_resources.reserved_doorbell_val = 0x0f0; | ||
203 | } | ||
178 | 204 | ||
179 | kgd2kfd->device_init(adev->kfd, &gpu_resources); | 205 | kgd2kfd->device_init(adev->kfd, &gpu_resources); |
180 | } | 206 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index c2c2bea731e0..12367a9951e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/mmu_context.h> | 30 | #include <linux/mmu_context.h> |
31 | #include <linux/workqueue.h> | ||
31 | #include <kgd_kfd_interface.h> | 32 | #include <kgd_kfd_interface.h> |
32 | #include <drm/ttm/ttm_execbuf_util.h> | 33 | #include <drm/ttm/ttm_execbuf_util.h> |
33 | #include "amdgpu_sync.h" | 34 | #include "amdgpu_sync.h" |
@@ -59,7 +60,9 @@ struct kgd_mem { | |||
59 | 60 | ||
60 | uint32_t mapping_flags; | 61 | uint32_t mapping_flags; |
61 | 62 | ||
63 | atomic_t invalid; | ||
62 | struct amdkfd_process_info *process_info; | 64 | struct amdkfd_process_info *process_info; |
65 | struct page **user_pages; | ||
63 | 66 | ||
64 | struct amdgpu_sync sync; | 67 | struct amdgpu_sync sync; |
65 | 68 | ||
@@ -84,6 +87,9 @@ struct amdkfd_process_info { | |||
84 | struct list_head vm_list_head; | 87 | struct list_head vm_list_head; |
85 | /* List head for all KFD BOs that belong to a KFD process. */ | 88 | /* List head for all KFD BOs that belong to a KFD process. */ |
86 | struct list_head kfd_bo_list; | 89 | struct list_head kfd_bo_list; |
90 | /* List of userptr BOs that are valid or invalid */ | ||
91 | struct list_head userptr_valid_list; | ||
92 | struct list_head userptr_inval_list; | ||
87 | /* Lock to protect kfd_bo_list */ | 93 | /* Lock to protect kfd_bo_list */ |
88 | struct mutex lock; | 94 | struct mutex lock; |
89 | 95 | ||
@@ -91,6 +97,11 @@ struct amdkfd_process_info { | |||
91 | unsigned int n_vms; | 97 | unsigned int n_vms; |
92 | /* Eviction Fence */ | 98 | /* Eviction Fence */ |
93 | struct amdgpu_amdkfd_fence *eviction_fence; | 99 | struct amdgpu_amdkfd_fence *eviction_fence; |
100 | |||
101 | /* MMU-notifier related fields */ | ||
102 | atomic_t evicted_bos; | ||
103 | struct delayed_work restore_userptr_work; | ||
104 | struct pid *pid; | ||
94 | }; | 105 | }; |
95 | 106 | ||
96 | int amdgpu_amdkfd_init(void); | 107 | int amdgpu_amdkfd_init(void); |
@@ -104,12 +115,14 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); | |||
104 | void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); | 115 | void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); |
105 | void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); | 116 | void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); |
106 | 117 | ||
118 | int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); | ||
107 | int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, | 119 | int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, |
108 | uint32_t vmid, uint64_t gpu_addr, | 120 | uint32_t vmid, uint64_t gpu_addr, |
109 | uint32_t *ib_cmd, uint32_t ib_len); | 121 | uint32_t *ib_cmd, uint32_t ib_len); |
110 | 122 | ||
111 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); | 123 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); |
112 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); | 124 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); |
125 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); | ||
113 | 126 | ||
114 | bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); | 127 | bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); |
115 | 128 | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index ea54e53172b9..0ff36d45a597 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | |||
@@ -98,8 +98,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | |||
98 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | 98 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
99 | unsigned int vmid); | 99 | unsigned int vmid); |
100 | 100 | ||
101 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
102 | uint32_t hpd_size, uint64_t hpd_gpu_addr); | ||
103 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | 101 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); |
104 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | 102 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
105 | uint32_t queue_id, uint32_t __user *wptr, | 103 | uint32_t queue_id, uint32_t __user *wptr, |
@@ -183,7 +181,6 @@ static const struct kfd2kgd_calls kfd2kgd = { | |||
183 | .free_pasid = amdgpu_pasid_free, | 181 | .free_pasid = amdgpu_pasid_free, |
184 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | 182 | .program_sh_mem_settings = kgd_program_sh_mem_settings, |
185 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | 183 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, |
186 | .init_pipeline = kgd_init_pipeline, | ||
187 | .init_interrupts = kgd_init_interrupts, | 184 | .init_interrupts = kgd_init_interrupts, |
188 | .hqd_load = kgd_hqd_load, | 185 | .hqd_load = kgd_hqd_load, |
189 | .hqd_sdma_load = kgd_hqd_sdma_load, | 186 | .hqd_sdma_load = kgd_hqd_sdma_load, |
@@ -309,13 +306,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | |||
309 | return 0; | 306 | return 0; |
310 | } | 307 | } |
311 | 308 | ||
312 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
313 | uint32_t hpd_size, uint64_t hpd_gpu_addr) | ||
314 | { | ||
315 | /* amdgpu owns the per-pipe state */ | ||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | 309 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) |
320 | { | 310 | { |
321 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 311 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 89264c9a5e9f..6ef9762b4b00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | |||
@@ -57,8 +57,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | |||
57 | uint32_t sh_mem_bases); | 57 | uint32_t sh_mem_bases); |
58 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | 58 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
59 | unsigned int vmid); | 59 | unsigned int vmid); |
60 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
61 | uint32_t hpd_size, uint64_t hpd_gpu_addr); | ||
62 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | 60 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); |
63 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | 61 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
64 | uint32_t queue_id, uint32_t __user *wptr, | 62 | uint32_t queue_id, uint32_t __user *wptr, |
@@ -141,7 +139,6 @@ static const struct kfd2kgd_calls kfd2kgd = { | |||
141 | .free_pasid = amdgpu_pasid_free, | 139 | .free_pasid = amdgpu_pasid_free, |
142 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | 140 | .program_sh_mem_settings = kgd_program_sh_mem_settings, |
143 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | 141 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, |
144 | .init_pipeline = kgd_init_pipeline, | ||
145 | .init_interrupts = kgd_init_interrupts, | 142 | .init_interrupts = kgd_init_interrupts, |
146 | .hqd_load = kgd_hqd_load, | 143 | .hqd_load = kgd_hqd_load, |
147 | .hqd_sdma_load = kgd_hqd_sdma_load, | 144 | .hqd_sdma_load = kgd_hqd_sdma_load, |
@@ -270,13 +267,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | |||
270 | return 0; | 267 | return 0; |
271 | } | 268 | } |
272 | 269 | ||
273 | static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, | ||
274 | uint32_t hpd_size, uint64_t hpd_gpu_addr) | ||
275 | { | ||
276 | /* amdgpu owns the per-pipe state */ | ||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | 270 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) |
281 | { | 271 | { |
282 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 272 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c new file mode 100644 index 000000000000..8f37991df61b --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | |||
@@ -0,0 +1,1043 @@ | |||
1 | /* | ||
2 | * Copyright 2014-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #define pr_fmt(fmt) "kfd2kgd: " fmt | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/fdtable.h> | ||
27 | #include <linux/uaccess.h> | ||
28 | #include <linux/firmware.h> | ||
29 | #include <drm/drmP.h> | ||
30 | #include "amdgpu.h" | ||
31 | #include "amdgpu_amdkfd.h" | ||
32 | #include "amdgpu_ucode.h" | ||
33 | #include "soc15_hw_ip.h" | ||
34 | #include "gc/gc_9_0_offset.h" | ||
35 | #include "gc/gc_9_0_sh_mask.h" | ||
36 | #include "vega10_enum.h" | ||
37 | #include "sdma0/sdma0_4_0_offset.h" | ||
38 | #include "sdma0/sdma0_4_0_sh_mask.h" | ||
39 | #include "sdma1/sdma1_4_0_offset.h" | ||
40 | #include "sdma1/sdma1_4_0_sh_mask.h" | ||
41 | #include "athub/athub_1_0_offset.h" | ||
42 | #include "athub/athub_1_0_sh_mask.h" | ||
43 | #include "oss/osssys_4_0_offset.h" | ||
44 | #include "oss/osssys_4_0_sh_mask.h" | ||
45 | #include "soc15_common.h" | ||
46 | #include "v9_structs.h" | ||
47 | #include "soc15.h" | ||
48 | #include "soc15d.h" | ||
49 | |||
50 | /* HACK: MMHUB and GC both have VM-related register with the same | ||
51 | * names but different offsets. Define the MMHUB register we need here | ||
52 | * with a prefix. A proper solution would be to move the functions | ||
53 | * programming these registers into gfx_v9_0.c and mmhub_v1_0.c | ||
54 | * respectively. | ||
55 | */ | ||
56 | #define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 | ||
57 | #define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 | ||
58 | |||
59 | #define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 | ||
60 | #define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 | ||
61 | |||
62 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b | ||
63 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 | ||
64 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c | ||
65 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 | ||
66 | |||
67 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b | ||
68 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 | ||
69 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c | ||
70 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 | ||
71 | |||
72 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b | ||
73 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 | ||
74 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c | ||
75 | #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 | ||
76 | |||
77 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 | ||
78 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 | ||
79 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 | ||
80 | #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 | ||
81 | |||
82 | #define V9_PIPE_PER_MEC (4) | ||
83 | #define V9_QUEUES_PER_PIPE_MEC (8) | ||
84 | |||
85 | enum hqd_dequeue_request_type { | ||
86 | NO_ACTION = 0, | ||
87 | DRAIN_PIPE, | ||
88 | RESET_WAVES | ||
89 | }; | ||
90 | |||
91 | /* | ||
92 | * Register access functions | ||
93 | */ | ||
94 | |||
95 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | ||
96 | uint32_t sh_mem_config, | ||
97 | uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, | ||
98 | uint32_t sh_mem_bases); | ||
99 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | ||
100 | unsigned int vmid); | ||
101 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); | ||
102 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | ||
103 | uint32_t queue_id, uint32_t __user *wptr, | ||
104 | uint32_t wptr_shift, uint32_t wptr_mask, | ||
105 | struct mm_struct *mm); | ||
106 | static int kgd_hqd_dump(struct kgd_dev *kgd, | ||
107 | uint32_t pipe_id, uint32_t queue_id, | ||
108 | uint32_t (**dump)[2], uint32_t *n_regs); | ||
109 | static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, | ||
110 | uint32_t __user *wptr, struct mm_struct *mm); | ||
111 | static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, | ||
112 | uint32_t engine_id, uint32_t queue_id, | ||
113 | uint32_t (**dump)[2], uint32_t *n_regs); | ||
114 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, | ||
115 | uint32_t pipe_id, uint32_t queue_id); | ||
116 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); | ||
117 | static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, | ||
118 | enum kfd_preempt_type reset_type, | ||
119 | unsigned int utimeout, uint32_t pipe_id, | ||
120 | uint32_t queue_id); | ||
121 | static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, | ||
122 | unsigned int utimeout); | ||
123 | static int kgd_address_watch_disable(struct kgd_dev *kgd); | ||
124 | static int kgd_address_watch_execute(struct kgd_dev *kgd, | ||
125 | unsigned int watch_point_id, | ||
126 | uint32_t cntl_val, | ||
127 | uint32_t addr_hi, | ||
128 | uint32_t addr_lo); | ||
129 | static int kgd_wave_control_execute(struct kgd_dev *kgd, | ||
130 | uint32_t gfx_index_val, | ||
131 | uint32_t sq_cmd); | ||
132 | static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, | ||
133 | unsigned int watch_point_id, | ||
134 | unsigned int reg_offset); | ||
135 | |||
136 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, | ||
137 | uint8_t vmid); | ||
138 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, | ||
139 | uint8_t vmid); | ||
140 | static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, | ||
141 | uint32_t page_table_base); | ||
142 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); | ||
143 | static void set_scratch_backing_va(struct kgd_dev *kgd, | ||
144 | uint64_t va, uint32_t vmid); | ||
145 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); | ||
146 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); | ||
147 | |||
148 | /* Because of REG_GET_FIELD() being used, we put this function in the | ||
149 | * asic specific file. | ||
150 | */ | ||
151 | static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, | ||
152 | struct tile_config *config) | ||
153 | { | ||
154 | struct amdgpu_device *adev = (struct amdgpu_device *)kgd; | ||
155 | |||
156 | config->gb_addr_config = adev->gfx.config.gb_addr_config; | ||
157 | |||
158 | config->tile_config_ptr = adev->gfx.config.tile_mode_array; | ||
159 | config->num_tile_configs = | ||
160 | ARRAY_SIZE(adev->gfx.config.tile_mode_array); | ||
161 | config->macro_tile_config_ptr = | ||
162 | adev->gfx.config.macrotile_mode_array; | ||
163 | config->num_macro_tile_configs = | ||
164 | ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); | ||
165 | |||
166 | return 0; | ||
167 | } | ||
168 | |||
169 | static const struct kfd2kgd_calls kfd2kgd = { | ||
170 | .init_gtt_mem_allocation = alloc_gtt_mem, | ||
171 | .free_gtt_mem = free_gtt_mem, | ||
172 | .get_local_mem_info = get_local_mem_info, | ||
173 | .get_gpu_clock_counter = get_gpu_clock_counter, | ||
174 | .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, | ||
175 | .alloc_pasid = amdgpu_pasid_alloc, | ||
176 | .free_pasid = amdgpu_pasid_free, | ||
177 | .program_sh_mem_settings = kgd_program_sh_mem_settings, | ||
178 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, | ||
179 | .init_interrupts = kgd_init_interrupts, | ||
180 | .hqd_load = kgd_hqd_load, | ||
181 | .hqd_sdma_load = kgd_hqd_sdma_load, | ||
182 | .hqd_dump = kgd_hqd_dump, | ||
183 | .hqd_sdma_dump = kgd_hqd_sdma_dump, | ||
184 | .hqd_is_occupied = kgd_hqd_is_occupied, | ||
185 | .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, | ||
186 | .hqd_destroy = kgd_hqd_destroy, | ||
187 | .hqd_sdma_destroy = kgd_hqd_sdma_destroy, | ||
188 | .address_watch_disable = kgd_address_watch_disable, | ||
189 | .address_watch_execute = kgd_address_watch_execute, | ||
190 | .wave_control_execute = kgd_wave_control_execute, | ||
191 | .address_watch_get_offset = kgd_address_watch_get_offset, | ||
192 | .get_atc_vmid_pasid_mapping_pasid = | ||
193 | get_atc_vmid_pasid_mapping_pasid, | ||
194 | .get_atc_vmid_pasid_mapping_valid = | ||
195 | get_atc_vmid_pasid_mapping_valid, | ||
196 | .get_fw_version = get_fw_version, | ||
197 | .set_scratch_backing_va = set_scratch_backing_va, | ||
198 | .get_tile_config = amdgpu_amdkfd_get_tile_config, | ||
199 | .get_cu_info = get_cu_info, | ||
200 | .get_vram_usage = amdgpu_amdkfd_get_vram_usage, | ||
201 | .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, | ||
202 | .acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm, | ||
203 | .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, | ||
204 | .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, | ||
205 | .set_vm_context_page_table_base = set_vm_context_page_table_base, | ||
206 | .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, | ||
207 | .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, | ||
208 | .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, | ||
209 | .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, | ||
210 | .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, | ||
211 | .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, | ||
212 | .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, | ||
213 | .invalidate_tlbs = invalidate_tlbs, | ||
214 | .invalidate_tlbs_vmid = invalidate_tlbs_vmid, | ||
215 | .submit_ib = amdgpu_amdkfd_submit_ib, | ||
216 | }; | ||
217 | |||
218 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void) | ||
219 | { | ||
220 | return (struct kfd2kgd_calls *)&kfd2kgd; | ||
221 | } | ||
222 | |||
223 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) | ||
224 | { | ||
225 | return (struct amdgpu_device *)kgd; | ||
226 | } | ||
227 | |||
228 | static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, | ||
229 | uint32_t queue, uint32_t vmid) | ||
230 | { | ||
231 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
232 | |||
233 | mutex_lock(&adev->srbm_mutex); | ||
234 | soc15_grbm_select(adev, mec, pipe, queue, vmid); | ||
235 | } | ||
236 | |||
237 | static void unlock_srbm(struct kgd_dev *kgd) | ||
238 | { | ||
239 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
240 | |||
241 | soc15_grbm_select(adev, 0, 0, 0, 0); | ||
242 | mutex_unlock(&adev->srbm_mutex); | ||
243 | } | ||
244 | |||
245 | static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, | ||
246 | uint32_t queue_id) | ||
247 | { | ||
248 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
249 | |||
250 | uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; | ||
251 | uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); | ||
252 | |||
253 | lock_srbm(kgd, mec, pipe, queue_id, 0); | ||
254 | } | ||
255 | |||
256 | static uint32_t get_queue_mask(struct amdgpu_device *adev, | ||
257 | uint32_t pipe_id, uint32_t queue_id) | ||
258 | { | ||
259 | unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe + | ||
260 | queue_id) & 31; | ||
261 | |||
262 | return ((uint32_t)1) << bit; | ||
263 | } | ||
264 | |||
265 | static void release_queue(struct kgd_dev *kgd) | ||
266 | { | ||
267 | unlock_srbm(kgd); | ||
268 | } | ||
269 | |||
270 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, | ||
271 | uint32_t sh_mem_config, | ||
272 | uint32_t sh_mem_ape1_base, | ||
273 | uint32_t sh_mem_ape1_limit, | ||
274 | uint32_t sh_mem_bases) | ||
275 | { | ||
276 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
277 | |||
278 | lock_srbm(kgd, 0, 0, 0, vmid); | ||
279 | |||
280 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); | ||
281 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); | ||
282 | /* APE1 no longer exists on GFX9 */ | ||
283 | |||
284 | unlock_srbm(kgd); | ||
285 | } | ||
286 | |||
287 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, | ||
288 | unsigned int vmid) | ||
289 | { | ||
290 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
291 | |||
292 | /* | ||
293 | * We have to assume that there is no outstanding mapping. | ||
294 | * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because | ||
295 | * a mapping is in progress or because a mapping finished | ||
296 | * and the SW cleared it. | ||
297 | * So the protocol is to always wait & clear. | ||
298 | */ | ||
299 | uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | | ||
300 | ATC_VMID0_PASID_MAPPING__VALID_MASK; | ||
301 | |||
302 | /* | ||
303 | * need to do this twice, once for gfx and once for mmhub | ||
304 | * for ATC add 16 to VMID for mmhub, for IH different registers. | ||
305 | * ATC_VMID0..15 registers are separate from ATC_VMID16..31. | ||
306 | */ | ||
307 | |||
308 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, | ||
309 | pasid_mapping); | ||
310 | |||
311 | while (!(RREG32(SOC15_REG_OFFSET( | ||
312 | ATHUB, 0, | ||
313 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & | ||
314 | (1U << vmid))) | ||
315 | cpu_relax(); | ||
316 | |||
317 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, | ||
318 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), | ||
319 | 1U << vmid); | ||
320 | |||
321 | /* Mapping vmid to pasid also for IH block */ | ||
322 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, | ||
323 | pasid_mapping); | ||
324 | |||
325 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, | ||
326 | pasid_mapping); | ||
327 | |||
328 | while (!(RREG32(SOC15_REG_OFFSET( | ||
329 | ATHUB, 0, | ||
330 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & | ||
331 | (1U << (vmid + 16)))) | ||
332 | cpu_relax(); | ||
333 | |||
334 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, | ||
335 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), | ||
336 | 1U << (vmid + 16)); | ||
337 | |||
338 | /* Mapping vmid to pasid also for IH block */ | ||
339 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, | ||
340 | pasid_mapping); | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | /* TODO - RING0 form of field is obsolete, seems to date back to SI | ||
345 | * but still works | ||
346 | */ | ||
347 | |||
348 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) | ||
349 | { | ||
350 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
351 | uint32_t mec; | ||
352 | uint32_t pipe; | ||
353 | |||
354 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; | ||
355 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); | ||
356 | |||
357 | lock_srbm(kgd, mec, pipe, 0, 0); | ||
358 | |||
359 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), | ||
360 | CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | | ||
361 | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); | ||
362 | |||
363 | unlock_srbm(kgd); | ||
364 | |||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | static uint32_t get_sdma_base_addr(struct amdgpu_device *adev, | ||
369 | unsigned int engine_id, | ||
370 | unsigned int queue_id) | ||
371 | { | ||
372 | uint32_t base[2] = { | ||
373 | SOC15_REG_OFFSET(SDMA0, 0, | ||
374 | mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, | ||
375 | SOC15_REG_OFFSET(SDMA1, 0, | ||
376 | mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL | ||
377 | }; | ||
378 | uint32_t retval; | ||
379 | |||
380 | retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - | ||
381 | mmSDMA0_RLC0_RB_CNTL); | ||
382 | |||
383 | pr_debug("sdma base address: 0x%x\n", retval); | ||
384 | |||
385 | return retval; | ||
386 | } | ||
387 | |||
388 | static inline struct v9_mqd *get_mqd(void *mqd) | ||
389 | { | ||
390 | return (struct v9_mqd *)mqd; | ||
391 | } | ||
392 | |||
393 | static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) | ||
394 | { | ||
395 | return (struct v9_sdma_mqd *)mqd; | ||
396 | } | ||
397 | |||
/* Load a compute MQD into its HQD and activate the queue.
 *
 * Programs the HQD register block from the MQD image, enables the
 * doorbell, optionally arms a one-shot CP poll of the user-mode write
 * pointer, starts the EOP fetcher and finally sets CP_HQD_ACTIVE.
 * The whole sequence runs with the SRBM selecting (pipe_id, queue_id)
 * via acquire_queue()/release_queue().
 *
 * wptr_shift/wptr_mask are unused on GFXv9 (64-bit wptr lives in the
 * MQD); they remain for kfd2kgd interface compatibility.
 *
 * Returns 0.
 */
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0*/
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			mec, pipe, queue_id);
		/* Point scheduler1 at this queue; 0x80 appears to be an
		 * enable/valid bit -- TODO confirm against RLC docs.
		 */
		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	/* The MQD fields mirror the register layout, so copy 1:1 up to
	 * (and including) CP_HQD_PQ_WPTR_HI.
	 */
	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	/* Finally mark the queue active; everything above must be
	 * programmed before this point.
	 */
	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}
492 | |||
/* Snapshot the compute HQD register range as (dword address, value)
 * pairs for debug dumping.
 *
 * *dump is kmalloc'ed here and owned by the caller. Returns 0 on
 * success, -ENOMEM on allocation failure.
 */
static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
	/* NOTE: DUMP_REG is deliberately left defined after this
	 * function; kgd_hqd_sdma_dump() below reuses it (with
	 * HQD_N_REGS redefined there).
	 */
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	/* Same contiguous register range that kgd_hqd_load() programs. */
	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}
524 | |||
/* Load an SDMA MQD into its RLC queue registers and start the ring.
 *
 * Sequence: disable the ring buffer, wait up to 2 s for the queue
 * context to report idle, clear RESUME_CTX, program the doorbell and
 * ring pointers from the MQD, then re-enable the ring buffer.
 *
 * Returns 0 on success, -ETIME if the context never goes idle.
 */
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
	sdmax_gfx_context_cntl = m->sdma_engine_id ?
		SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
		SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);

	/* Disable the ring buffer before touching the queue state. */
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	/* Wait for the queue context to drain (2 s timeout). */
	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}
	data = RREG32(sdmax_gfx_context_cntl);
	data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
			     RESUME_CTX, 0);
	WREG32(sdmax_gfx_context_cntl, data);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	/* Restore the 64-bit WPTR from user memory if it is readable;
	 * otherwise fall back to RPTR so WPTR == RPTR and the engine
	 * fetches nothing. MINOR_PTR_UPDATE brackets the WPTR write.
	 */
	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	/* All queue state programmed; turn the ring buffer back on. */
	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}
598 | |||
/* Snapshot the SDMA RLC queue registers for debug dumping.
 *
 * Dumps four discontiguous register ranges as (dword address, value)
 * pairs; HQD_N_REGS counts the four ranges. *dump is kmalloc'ed and
 * owned by the caller. Reuses the DUMP_REG() macro defined in
 * kgd_hqd_dump() above. Returns 0 or -ENOMEM.
 */
static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_base_addr + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}
629 | |||
630 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, | ||
631 | uint32_t pipe_id, uint32_t queue_id) | ||
632 | { | ||
633 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
634 | uint32_t act; | ||
635 | bool retval = false; | ||
636 | uint32_t low, high; | ||
637 | |||
638 | acquire_queue(kgd, pipe_id, queue_id); | ||
639 | act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); | ||
640 | if (act) { | ||
641 | low = lower_32_bits(queue_address >> 8); | ||
642 | high = upper_32_bits(queue_address >> 8); | ||
643 | |||
644 | if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && | ||
645 | high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) | ||
646 | retval = true; | ||
647 | } | ||
648 | release_queue(kgd); | ||
649 | return retval; | ||
650 | } | ||
651 | |||
652 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) | ||
653 | { | ||
654 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
655 | struct v9_sdma_mqd *m; | ||
656 | uint32_t sdma_base_addr; | ||
657 | uint32_t sdma_rlc_rb_cntl; | ||
658 | |||
659 | m = get_sdma_mqd(mqd); | ||
660 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, | ||
661 | m->sdma_queue_id); | ||
662 | |||
663 | sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); | ||
664 | |||
665 | if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) | ||
666 | return true; | ||
667 | |||
668 | return false; | ||
669 | } | ||
670 | |||
/* Preempt or destroy a compute queue's HQD.
 *
 * Issues a CP dequeue request of the requested type, then polls
 * CP_HQD_ACTIVE until the queue deactivates or utimeout (in ms)
 * expires. Returns 0 on success, -ETIME on timeout.
 */
static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
			   enum kfd_preempt_type reset_type,
			   unsigned int utimeout, uint32_t pipe_id,
			   uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v9_mqd *m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	/* Clear the HIQ scheduler mapping that kgd_hqd_load() set up. */
	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	/* Map the KFD preemption type onto a CP dequeue request;
	 * unknown types fall back to a drain.
	 */
	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}
717 | |||
/* Stop an SDMA queue and save its state back into the MQD.
 *
 * Disables the ring buffer, waits for the queue context to go idle
 * (utimeout in ms), clears the doorbell, and records the final RPTR
 * in the MQD so the queue can be restored later by kgd_hqd_sdma_load().
 * Returns 0 on success, -ETIME on timeout.
 */
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	/* Disable the ring buffer to stop command fetch. */
	temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	/* Preserve the read pointer for a later restore. */
	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}
755 | |||
756 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, | ||
757 | uint8_t vmid) | ||
758 | { | ||
759 | uint32_t reg; | ||
760 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
761 | |||
762 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) | ||
763 | + vmid); | ||
764 | return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; | ||
765 | } | ||
766 | |||
767 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, | ||
768 | uint8_t vmid) | ||
769 | { | ||
770 | uint32_t reg; | ||
771 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
772 | |||
773 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) | ||
774 | + vmid); | ||
775 | return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; | ||
776 | } | ||
777 | |||
/* Flush all TLB entries of one VMID on both the GC and MM hubs.
 *
 * Uses invalidation engine 16 in legacy mode, requests both hubs, and
 * busy-waits for each hub's per-VMID ACK bit. Serialized by
 * adev->srbm_mutex.
 */
static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	/* Select this VMID, legacy flush type, and all page-table levels. */
	uint32_t req = (1 << vmid) |
		(0 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* legacy */
		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;

	mutex_lock(&adev->srbm_mutex);

	/* Use legacy mode tlb invalidation.
	 *
	 * Currently on Raven the code below is broken for anything but
	 * legacy mode due to a MMHUB power gating problem. A workaround
	 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
	 * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
	 * bit.
	 *
	 * TODO 1: agree on the right set of invalidation registers for
	 * KFD use. Use the last one for now. Invalidate both GC and
	 * MMHUB.
	 *
	 * TODO 2: support range-based invalidation, requires kfg2kgd
	 * interface change
	 */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
				0xffffffff);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
				0x0000001f);

	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
				0xffffffff);
	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
				0x0000001f);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);

	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
				req);

	/* Busy-wait for both hubs to acknowledge the flush. */
	while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
					(1 << vmid)))
		cpu_relax();

	while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
					mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
					(1 << vmid)))
		cpu_relax();

	mutex_unlock(&adev->srbm_mutex);

}
835 | |||
/* Invalidate TLBs for a PASID across all hubs via the KIQ ring.
 *
 * Emits an INVALIDATE_TLBS packet followed by a polling fence on the
 * KIQ and waits for the fence. Ring submission is serialized by the
 * KIQ ring_lock; the fence wait happens outside it. Returns 0 on
 * success, -ETIME if the fence does not signal within usec_timeout
 * (r == 0 is timeout, r < 0 an error).
 */
static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
{
	signed long r;
	uint32_t seq;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	spin_lock(&adev->gfx.kiq.ring_lock);
	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/
	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
	amdgpu_ring_write(ring,
			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
			PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
			PACKET3_INVALIDATE_TLBS_PASID(pasid) |
			PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock(&adev->gfx.kiq.ring_lock);

	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
	if (r < 1) {
		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
		return -ETIME;
	}

	return 0;
}
862 | |||
863 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) | ||
864 | { | ||
865 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
866 | int vmid; | ||
867 | struct amdgpu_ring *ring = &adev->gfx.kiq.ring; | ||
868 | |||
869 | if (ring->ready) | ||
870 | return invalidate_tlbs_with_kiq(adev, pasid); | ||
871 | |||
872 | for (vmid = 0; vmid < 16; vmid++) { | ||
873 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) | ||
874 | continue; | ||
875 | if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { | ||
876 | if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) | ||
877 | == pasid) { | ||
878 | write_vmid_invalidate_request(kgd, vmid); | ||
879 | break; | ||
880 | } | ||
881 | } | ||
882 | } | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | |||
887 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) | ||
888 | { | ||
889 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
890 | |||
891 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { | ||
892 | pr_err("non kfd vmid %d\n", vmid); | ||
893 | return 0; | ||
894 | } | ||
895 | |||
896 | write_vmid_invalidate_request(kgd, vmid); | ||
897 | return 0; | ||
898 | } | ||
899 | |||
/* Address watch is not implemented for GFXv9 in this version; stub. */
static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}
904 | |||
/* Address watch is not implemented for GFXv9 in this version; stub. */
static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}
913 | |||
914 | static int kgd_wave_control_execute(struct kgd_dev *kgd, | ||
915 | uint32_t gfx_index_val, | ||
916 | uint32_t sq_cmd) | ||
917 | { | ||
918 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | ||
919 | uint32_t data = 0; | ||
920 | |||
921 | mutex_lock(&adev->grbm_idx_mutex); | ||
922 | |||
923 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); | ||
924 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); | ||
925 | |||
926 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
927 | INSTANCE_BROADCAST_WRITES, 1); | ||
928 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
929 | SH_BROADCAST_WRITES, 1); | ||
930 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, | ||
931 | SE_BROADCAST_WRITES, 1); | ||
932 | |||
933 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); | ||
934 | mutex_unlock(&adev->grbm_idx_mutex); | ||
935 | |||
936 | return 0; | ||
937 | } | ||
938 | |||
/* Address watch is not implemented for GFXv9 in this version; stub. */
static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}
945 | |||
/* Intentional no-op on GFXv9; kept to satisfy the kfd2kgd interface. */
static void set_scratch_backing_va(struct kgd_dev *kgd,
					uint64_t va, uint32_t vmid)
{
	/* No longer needed on GFXv9. The scratch base address is
	 * passed to the shader by the CP. It's the user mode driver's
	 * responsibility.
	 */
}
954 | |||
955 | /* FIXME: Does this need to be ASIC-specific code? */ | ||
956 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) | ||
957 | { | ||
958 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; | ||
959 | const union amdgpu_firmware_header *hdr; | ||
960 | |||
961 | switch (type) { | ||
962 | case KGD_ENGINE_PFP: | ||
963 | hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; | ||
964 | break; | ||
965 | |||
966 | case KGD_ENGINE_ME: | ||
967 | hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; | ||
968 | break; | ||
969 | |||
970 | case KGD_ENGINE_CE: | ||
971 | hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; | ||
972 | break; | ||
973 | |||
974 | case KGD_ENGINE_MEC1: | ||
975 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; | ||
976 | break; | ||
977 | |||
978 | case KGD_ENGINE_MEC2: | ||
979 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; | ||
980 | break; | ||
981 | |||
982 | case KGD_ENGINE_RLC: | ||
983 | hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; | ||
984 | break; | ||
985 | |||
986 | case KGD_ENGINE_SDMA1: | ||
987 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; | ||
988 | break; | ||
989 | |||
990 | case KGD_ENGINE_SDMA2: | ||
991 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; | ||
992 | break; | ||
993 | |||
994 | default: | ||
995 | return 0; | ||
996 | } | ||
997 | |||
998 | if (hdr == NULL) | ||
999 | return 0; | ||
1000 | |||
1001 | /* Only 12 bit in use*/ | ||
1002 | return hdr->common.ucode_version; | ||
1003 | } | ||
1004 | |||
/* Program a KFD VMID's page-table base and address range on both the
 * GC and MM hubs.
 *
 * page_table_base is a page-frame number; it is shifted into a GPU
 * address and tagged with AMDGPU_PTE_VALID. Refuses VMIDs not owned
 * by KFD. Each VM context owns a pair of LO/HI registers, hence the
 * (vmid*2) register stride.
 */
static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint32_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
		AMDGPU_PTE_VALID;

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* TODO: take advantage of per-process address space size. For
	 * now, all processes share the same address space size, like
	 * on GFX8 and older.
	 */
	/* MMHUB: address range [0, max_pfn) and the page-table base. */
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);

	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
			lower_32_bits(adev->vm_manager.max_pfn - 1));
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
			upper_32_bits(adev->vm_manager.max_pfn - 1));

	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));

	/* GC hub: identical programming. */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
			lower_32_bits(adev->vm_manager.max_pfn - 1));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
			upper_32_bits(adev->vm_manager.max_pfn - 1));

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 1d6e1479da38..5296e24fd662 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #define pr_fmt(fmt) "kfd2kgd: " fmt | 23 | #define pr_fmt(fmt) "kfd2kgd: " fmt |
24 | 24 | ||
25 | #include <linux/list.h> | 25 | #include <linux/list.h> |
26 | #include <linux/sched/mm.h> | ||
26 | #include <drm/drmP.h> | 27 | #include <drm/drmP.h> |
27 | #include "amdgpu_object.h" | 28 | #include "amdgpu_object.h" |
28 | #include "amdgpu_vm.h" | 29 | #include "amdgpu_vm.h" |
@@ -33,10 +34,20 @@ | |||
33 | */ | 34 | */ |
34 | #define VI_BO_SIZE_ALIGN (0x8000) | 35 | #define VI_BO_SIZE_ALIGN (0x8000) |
35 | 36 | ||
37 | /* BO flag to indicate a KFD userptr BO */ | ||
38 | #define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) | ||
39 | |||
40 | /* Userptr restore delay, just long enough to allow consecutive VM | ||
41 | * changes to accumulate | ||
42 | */ | ||
43 | #define AMDGPU_USERPTR_RESTORE_DELAY_MS 1 | ||
44 | |||
36 | /* Impose limit on how much memory KFD can use */ | 45 | /* Impose limit on how much memory KFD can use */ |
37 | static struct { | 46 | static struct { |
38 | uint64_t max_system_mem_limit; | 47 | uint64_t max_system_mem_limit; |
48 | uint64_t max_userptr_mem_limit; | ||
39 | int64_t system_mem_used; | 49 | int64_t system_mem_used; |
50 | int64_t userptr_mem_used; | ||
40 | spinlock_t mem_limit_lock; | 51 | spinlock_t mem_limit_lock; |
41 | } kfd_mem_limit; | 52 | } kfd_mem_limit; |
42 | 53 | ||
@@ -57,6 +68,7 @@ static const char * const domain_bit_to_string[] = { | |||
57 | 68 | ||
58 | #define domain_string(domain) domain_bit_to_string[ffs(domain)-1] | 69 | #define domain_string(domain) domain_bit_to_string[ffs(domain)-1] |
59 | 70 | ||
71 | static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); | ||
60 | 72 | ||
61 | 73 | ||
62 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) | 74 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) |
@@ -78,6 +90,7 @@ static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm, | |||
78 | 90 | ||
79 | /* Set memory usage limits. Current, limits are | 91 | /* Set memory usage limits. Current, limits are |
80 | * System (kernel) memory - 3/8th System RAM | 92 | * System (kernel) memory - 3/8th System RAM |
93 | * Userptr memory - 3/4th System RAM | ||
81 | */ | 94 | */ |
82 | void amdgpu_amdkfd_gpuvm_init_mem_limits(void) | 95 | void amdgpu_amdkfd_gpuvm_init_mem_limits(void) |
83 | { | 96 | { |
@@ -90,8 +103,10 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void) | |||
90 | 103 | ||
91 | spin_lock_init(&kfd_mem_limit.mem_limit_lock); | 104 | spin_lock_init(&kfd_mem_limit.mem_limit_lock); |
92 | kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3); | 105 | kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3); |
93 | pr_debug("Kernel memory limit %lluM\n", | 106 | kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 2); |
94 | (kfd_mem_limit.max_system_mem_limit >> 20)); | 107 | pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", |
108 | (kfd_mem_limit.max_system_mem_limit >> 20), | ||
109 | (kfd_mem_limit.max_userptr_mem_limit >> 20)); | ||
95 | } | 110 | } |
96 | 111 | ||
97 | static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, | 112 | static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, |
@@ -111,6 +126,16 @@ static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, | |||
111 | goto err_no_mem; | 126 | goto err_no_mem; |
112 | } | 127 | } |
113 | kfd_mem_limit.system_mem_used += (acc_size + size); | 128 | kfd_mem_limit.system_mem_used += (acc_size + size); |
129 | } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { | ||
130 | if ((kfd_mem_limit.system_mem_used + acc_size > | ||
131 | kfd_mem_limit.max_system_mem_limit) || | ||
132 | (kfd_mem_limit.userptr_mem_used + (size + acc_size) > | ||
133 | kfd_mem_limit.max_userptr_mem_limit)) { | ||
134 | ret = -ENOMEM; | ||
135 | goto err_no_mem; | ||
136 | } | ||
137 | kfd_mem_limit.system_mem_used += acc_size; | ||
138 | kfd_mem_limit.userptr_mem_used += size; | ||
114 | } | 139 | } |
115 | err_no_mem: | 140 | err_no_mem: |
116 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 141 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
@@ -126,10 +151,16 @@ static void unreserve_system_mem_limit(struct amdgpu_device *adev, | |||
126 | sizeof(struct amdgpu_bo)); | 151 | sizeof(struct amdgpu_bo)); |
127 | 152 | ||
128 | spin_lock(&kfd_mem_limit.mem_limit_lock); | 153 | spin_lock(&kfd_mem_limit.mem_limit_lock); |
129 | if (domain == AMDGPU_GEM_DOMAIN_GTT) | 154 | if (domain == AMDGPU_GEM_DOMAIN_GTT) { |
130 | kfd_mem_limit.system_mem_used -= (acc_size + size); | 155 | kfd_mem_limit.system_mem_used -= (acc_size + size); |
156 | } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { | ||
157 | kfd_mem_limit.system_mem_used -= acc_size; | ||
158 | kfd_mem_limit.userptr_mem_used -= size; | ||
159 | } | ||
131 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, | 160 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, |
132 | "kfd system memory accounting unbalanced"); | 161 | "kfd system memory accounting unbalanced"); |
162 | WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, | ||
163 | "kfd userptr memory accounting unbalanced"); | ||
133 | 164 | ||
134 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 165 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
135 | } | 166 | } |
@@ -138,12 +169,17 @@ void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo) | |||
138 | { | 169 | { |
139 | spin_lock(&kfd_mem_limit.mem_limit_lock); | 170 | spin_lock(&kfd_mem_limit.mem_limit_lock); |
140 | 171 | ||
141 | if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { | 172 | if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { |
173 | kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; | ||
174 | kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); | ||
175 | } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { | ||
142 | kfd_mem_limit.system_mem_used -= | 176 | kfd_mem_limit.system_mem_used -= |
143 | (bo->tbo.acc_size + amdgpu_bo_size(bo)); | 177 | (bo->tbo.acc_size + amdgpu_bo_size(bo)); |
144 | } | 178 | } |
145 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, | 179 | WARN_ONCE(kfd_mem_limit.system_mem_used < 0, |
146 | "kfd system memory accounting unbalanced"); | 180 | "kfd system memory accounting unbalanced"); |
181 | WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, | ||
182 | "kfd userptr memory accounting unbalanced"); | ||
147 | 183 | ||
148 | spin_unlock(&kfd_mem_limit.mem_limit_lock); | 184 | spin_unlock(&kfd_mem_limit.mem_limit_lock); |
149 | } | 185 | } |
@@ -506,7 +542,8 @@ static void remove_bo_from_vm(struct amdgpu_device *adev, | |||
506 | } | 542 | } |
507 | 543 | ||
508 | static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, | 544 | static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, |
509 | struct amdkfd_process_info *process_info) | 545 | struct amdkfd_process_info *process_info, |
546 | bool userptr) | ||
510 | { | 547 | { |
511 | struct ttm_validate_buffer *entry = &mem->validate_list; | 548 | struct ttm_validate_buffer *entry = &mem->validate_list; |
512 | struct amdgpu_bo *bo = mem->bo; | 549 | struct amdgpu_bo *bo = mem->bo; |
@@ -515,10 +552,95 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, | |||
515 | entry->shared = true; | 552 | entry->shared = true; |
516 | entry->bo = &bo->tbo; | 553 | entry->bo = &bo->tbo; |
517 | mutex_lock(&process_info->lock); | 554 | mutex_lock(&process_info->lock); |
518 | list_add_tail(&entry->head, &process_info->kfd_bo_list); | 555 | if (userptr) |
556 | list_add_tail(&entry->head, &process_info->userptr_valid_list); | ||
557 | else | ||
558 | list_add_tail(&entry->head, &process_info->kfd_bo_list); | ||
519 | mutex_unlock(&process_info->lock); | 559 | mutex_unlock(&process_info->lock); |
520 | } | 560 | } |
521 | 561 | ||
562 | /* Initializes user pages. It registers the MMU notifier and validates | ||
563 | * the userptr BO in the GTT domain. | ||
564 | * | ||
565 | * The BO must already be on the userptr_valid_list. Otherwise an | ||
566 | * eviction and restore may happen that leaves the new BO unmapped | ||
567 | * with the user mode queues running. | ||
568 | * | ||
569 | * Takes the process_info->lock to protect against concurrent restore | ||
570 | * workers. | ||
571 | * | ||
572 | * Returns 0 for success, negative errno for errors. | ||
573 | */ | ||
574 | static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, | ||
575 | uint64_t user_addr) | ||
576 | { | ||
577 | struct amdkfd_process_info *process_info = mem->process_info; | ||
578 | struct amdgpu_bo *bo = mem->bo; | ||
579 | struct ttm_operation_ctx ctx = { true, false }; | ||
580 | int ret = 0; | ||
581 | |||
582 | mutex_lock(&process_info->lock); | ||
583 | |||
584 | ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); | ||
585 | if (ret) { | ||
586 | pr_err("%s: Failed to set userptr: %d\n", __func__, ret); | ||
587 | goto out; | ||
588 | } | ||
589 | |||
590 | ret = amdgpu_mn_register(bo, user_addr); | ||
591 | if (ret) { | ||
592 | pr_err("%s: Failed to register MMU notifier: %d\n", | ||
593 | __func__, ret); | ||
594 | goto out; | ||
595 | } | ||
596 | |||
597 | /* If no restore worker is running concurrently, user_pages | ||
598 | * should not be allocated | ||
599 | */ | ||
600 | WARN(mem->user_pages, "Leaking user_pages array"); | ||
601 | |||
602 | mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, | ||
603 | sizeof(struct page *), | ||
604 | GFP_KERNEL | __GFP_ZERO); | ||
605 | if (!mem->user_pages) { | ||
606 | pr_err("%s: Failed to allocate pages array\n", __func__); | ||
607 | ret = -ENOMEM; | ||
608 | goto unregister_out; | ||
609 | } | ||
610 | |||
611 | ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); | ||
612 | if (ret) { | ||
613 | pr_err("%s: Failed to get user pages: %d\n", __func__, ret); | ||
614 | goto free_out; | ||
615 | } | ||
616 | |||
617 | amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); | ||
618 | |||
619 | ret = amdgpu_bo_reserve(bo, true); | ||
620 | if (ret) { | ||
621 | pr_err("%s: Failed to reserve BO\n", __func__); | ||
622 | goto release_out; | ||
623 | } | ||
624 | amdgpu_ttm_placement_from_domain(bo, mem->domain); | ||
625 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
626 | if (ret) | ||
627 | pr_err("%s: failed to validate BO\n", __func__); | ||
628 | amdgpu_bo_unreserve(bo); | ||
629 | |||
630 | release_out: | ||
631 | if (ret) | ||
632 | release_pages(mem->user_pages, bo->tbo.ttm->num_pages); | ||
633 | free_out: | ||
634 | kvfree(mem->user_pages); | ||
635 | mem->user_pages = NULL; | ||
636 | unregister_out: | ||
637 | if (ret) | ||
638 | amdgpu_mn_unregister(bo); | ||
639 | out: | ||
640 | mutex_unlock(&process_info->lock); | ||
641 | return ret; | ||
642 | } | ||
643 | |||
522 | /* Reserving a BO and its page table BOs must happen atomically to | 644 | /* Reserving a BO and its page table BOs must happen atomically to |
523 | * avoid deadlocks. Some operations update multiple VMs at once. Track | 645 | * avoid deadlocks. Some operations update multiple VMs at once. Track |
524 | * all the reservation info in a context structure. Optionally a sync | 646 | * all the reservation info in a context structure. Optionally a sync |
@@ -748,7 +870,8 @@ static int update_gpuvm_pte(struct amdgpu_device *adev, | |||
748 | } | 870 | } |
749 | 871 | ||
750 | static int map_bo_to_gpuvm(struct amdgpu_device *adev, | 872 | static int map_bo_to_gpuvm(struct amdgpu_device *adev, |
751 | struct kfd_bo_va_list *entry, struct amdgpu_sync *sync) | 873 | struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, |
874 | bool no_update_pte) | ||
752 | { | 875 | { |
753 | int ret; | 876 | int ret; |
754 | 877 | ||
@@ -762,6 +885,9 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev, | |||
762 | return ret; | 885 | return ret; |
763 | } | 886 | } |
764 | 887 | ||
888 | if (no_update_pte) | ||
889 | return 0; | ||
890 | |||
765 | ret = update_gpuvm_pte(adev, entry, sync); | 891 | ret = update_gpuvm_pte(adev, entry, sync); |
766 | if (ret) { | 892 | if (ret) { |
767 | pr_err("update_gpuvm_pte() failed\n"); | 893 | pr_err("update_gpuvm_pte() failed\n"); |
@@ -820,6 +946,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, | |||
820 | mutex_init(&info->lock); | 946 | mutex_init(&info->lock); |
821 | INIT_LIST_HEAD(&info->vm_list_head); | 947 | INIT_LIST_HEAD(&info->vm_list_head); |
822 | INIT_LIST_HEAD(&info->kfd_bo_list); | 948 | INIT_LIST_HEAD(&info->kfd_bo_list); |
949 | INIT_LIST_HEAD(&info->userptr_valid_list); | ||
950 | INIT_LIST_HEAD(&info->userptr_inval_list); | ||
823 | 951 | ||
824 | info->eviction_fence = | 952 | info->eviction_fence = |
825 | amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), | 953 | amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), |
@@ -830,6 +958,11 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, | |||
830 | goto create_evict_fence_fail; | 958 | goto create_evict_fence_fail; |
831 | } | 959 | } |
832 | 960 | ||
961 | info->pid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
962 | atomic_set(&info->evicted_bos, 0); | ||
963 | INIT_DELAYED_WORK(&info->restore_userptr_work, | ||
964 | amdgpu_amdkfd_restore_userptr_worker); | ||
965 | |||
833 | *process_info = info; | 966 | *process_info = info; |
834 | *ef = dma_fence_get(&info->eviction_fence->base); | 967 | *ef = dma_fence_get(&info->eviction_fence->base); |
835 | } | 968 | } |
@@ -872,6 +1005,7 @@ reserve_pd_fail: | |||
872 | dma_fence_put(*ef); | 1005 | dma_fence_put(*ef); |
873 | *ef = NULL; | 1006 | *ef = NULL; |
874 | *process_info = NULL; | 1007 | *process_info = NULL; |
1008 | put_pid(info->pid); | ||
875 | create_evict_fence_fail: | 1009 | create_evict_fence_fail: |
876 | mutex_destroy(&info->lock); | 1010 | mutex_destroy(&info->lock); |
877 | kfree(info); | 1011 | kfree(info); |
@@ -967,8 +1101,12 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, | |||
967 | /* Release per-process resources when last compute VM is destroyed */ | 1101 | /* Release per-process resources when last compute VM is destroyed */ |
968 | if (!process_info->n_vms) { | 1102 | if (!process_info->n_vms) { |
969 | WARN_ON(!list_empty(&process_info->kfd_bo_list)); | 1103 | WARN_ON(!list_empty(&process_info->kfd_bo_list)); |
1104 | WARN_ON(!list_empty(&process_info->userptr_valid_list)); | ||
1105 | WARN_ON(!list_empty(&process_info->userptr_inval_list)); | ||
970 | 1106 | ||
971 | dma_fence_put(&process_info->eviction_fence->base); | 1107 | dma_fence_put(&process_info->eviction_fence->base); |
1108 | cancel_delayed_work_sync(&process_info->restore_userptr_work); | ||
1109 | put_pid(process_info->pid); | ||
972 | mutex_destroy(&process_info->lock); | 1110 | mutex_destroy(&process_info->lock); |
973 | kfree(process_info); | 1111 | kfree(process_info); |
974 | } | 1112 | } |
@@ -1003,9 +1141,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1003 | { | 1141 | { |
1004 | struct amdgpu_device *adev = get_amdgpu_device(kgd); | 1142 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
1005 | struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; | 1143 | struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; |
1144 | uint64_t user_addr = 0; | ||
1006 | struct amdgpu_bo *bo; | 1145 | struct amdgpu_bo *bo; |
1007 | int byte_align; | 1146 | int byte_align; |
1008 | u32 alloc_domain; | 1147 | u32 domain, alloc_domain; |
1009 | u64 alloc_flags; | 1148 | u64 alloc_flags; |
1010 | uint32_t mapping_flags; | 1149 | uint32_t mapping_flags; |
1011 | int ret; | 1150 | int ret; |
@@ -1014,14 +1153,21 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1014 | * Check on which domain to allocate BO | 1153 | * Check on which domain to allocate BO |
1015 | */ | 1154 | */ |
1016 | if (flags & ALLOC_MEM_FLAGS_VRAM) { | 1155 | if (flags & ALLOC_MEM_FLAGS_VRAM) { |
1017 | alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; | 1156 | domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; |
1018 | alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; | 1157 | alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; |
1019 | alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? | 1158 | alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? |
1020 | AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : | 1159 | AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : |
1021 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; | 1160 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; |
1022 | } else if (flags & ALLOC_MEM_FLAGS_GTT) { | 1161 | } else if (flags & ALLOC_MEM_FLAGS_GTT) { |
1023 | alloc_domain = AMDGPU_GEM_DOMAIN_GTT; | 1162 | domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT; |
1163 | alloc_flags = 0; | ||
1164 | } else if (flags & ALLOC_MEM_FLAGS_USERPTR) { | ||
1165 | domain = AMDGPU_GEM_DOMAIN_GTT; | ||
1166 | alloc_domain = AMDGPU_GEM_DOMAIN_CPU; | ||
1024 | alloc_flags = 0; | 1167 | alloc_flags = 0; |
1168 | if (!offset || !*offset) | ||
1169 | return -EINVAL; | ||
1170 | user_addr = *offset; | ||
1025 | } else { | 1171 | } else { |
1026 | return -EINVAL; | 1172 | return -EINVAL; |
1027 | } | 1173 | } |
@@ -1078,18 +1224,34 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( | |||
1078 | } | 1224 | } |
1079 | bo->kfd_bo = *mem; | 1225 | bo->kfd_bo = *mem; |
1080 | (*mem)->bo = bo; | 1226 | (*mem)->bo = bo; |
1227 | if (user_addr) | ||
1228 | bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; | ||
1081 | 1229 | ||
1082 | (*mem)->va = va; | 1230 | (*mem)->va = va; |
1083 | (*mem)->domain = alloc_domain; | 1231 | (*mem)->domain = domain; |
1084 | (*mem)->mapped_to_gpu_memory = 0; | 1232 | (*mem)->mapped_to_gpu_memory = 0; |
1085 | (*mem)->process_info = avm->process_info; | 1233 | (*mem)->process_info = avm->process_info; |
1086 | add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info); | 1234 | add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr); |
1235 | |||
1236 | if (user_addr) { | ||
1237 | ret = init_user_pages(*mem, current->mm, user_addr); | ||
1238 | if (ret) { | ||
1239 | mutex_lock(&avm->process_info->lock); | ||
1240 | list_del(&(*mem)->validate_list.head); | ||
1241 | mutex_unlock(&avm->process_info->lock); | ||
1242 | goto allocate_init_user_pages_failed; | ||
1243 | } | ||
1244 | } | ||
1087 | 1245 | ||
1088 | if (offset) | 1246 | if (offset) |
1089 | *offset = amdgpu_bo_mmap_offset(bo); | 1247 | *offset = amdgpu_bo_mmap_offset(bo); |
1090 | 1248 | ||
1091 | return 0; | 1249 | return 0; |
1092 | 1250 | ||
1251 | allocate_init_user_pages_failed: | ||
1252 | amdgpu_bo_unref(&bo); | ||
1253 | /* Don't unreserve system mem limit twice */ | ||
1254 | goto err_reserve_system_mem; | ||
1093 | err_bo_create: | 1255 | err_bo_create: |
1094 | unreserve_system_mem_limit(adev, size, alloc_domain); | 1256 | unreserve_system_mem_limit(adev, size, alloc_domain); |
1095 | err_reserve_system_mem: | 1257 | err_reserve_system_mem: |
@@ -1122,12 +1284,24 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( | |||
1122 | * be freed anyway | 1284 | * be freed anyway |
1123 | */ | 1285 | */ |
1124 | 1286 | ||
1287 | /* No more MMU notifiers */ | ||
1288 | amdgpu_mn_unregister(mem->bo); | ||
1289 | |||
1125 | /* Make sure restore workers don't access the BO any more */ | 1290 | /* Make sure restore workers don't access the BO any more */ |
1126 | bo_list_entry = &mem->validate_list; | 1291 | bo_list_entry = &mem->validate_list; |
1127 | mutex_lock(&process_info->lock); | 1292 | mutex_lock(&process_info->lock); |
1128 | list_del(&bo_list_entry->head); | 1293 | list_del(&bo_list_entry->head); |
1129 | mutex_unlock(&process_info->lock); | 1294 | mutex_unlock(&process_info->lock); |
1130 | 1295 | ||
1296 | /* Free user pages if necessary */ | ||
1297 | if (mem->user_pages) { | ||
1298 | pr_debug("%s: Freeing user_pages array\n", __func__); | ||
1299 | if (mem->user_pages[0]) | ||
1300 | release_pages(mem->user_pages, | ||
1301 | mem->bo->tbo.ttm->num_pages); | ||
1302 | kvfree(mem->user_pages); | ||
1303 | } | ||
1304 | |||
1131 | ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); | 1305 | ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); |
1132 | if (unlikely(ret)) | 1306 | if (unlikely(ret)) |
1133 | return ret; | 1307 | return ret; |
@@ -1173,21 +1347,32 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1173 | struct kfd_bo_va_list *bo_va_entry = NULL; | 1347 | struct kfd_bo_va_list *bo_va_entry = NULL; |
1174 | struct kfd_bo_va_list *bo_va_entry_aql = NULL; | 1348 | struct kfd_bo_va_list *bo_va_entry_aql = NULL; |
1175 | unsigned long bo_size; | 1349 | unsigned long bo_size; |
1176 | 1350 | bool is_invalid_userptr = false; | |
1177 | /* Make sure restore is not running concurrently. | ||
1178 | */ | ||
1179 | mutex_lock(&mem->process_info->lock); | ||
1180 | |||
1181 | mutex_lock(&mem->lock); | ||
1182 | 1351 | ||
1183 | bo = mem->bo; | 1352 | bo = mem->bo; |
1184 | |||
1185 | if (!bo) { | 1353 | if (!bo) { |
1186 | pr_err("Invalid BO when mapping memory to GPU\n"); | 1354 | pr_err("Invalid BO when mapping memory to GPU\n"); |
1187 | ret = -EINVAL; | 1355 | return -EINVAL; |
1188 | goto out; | ||
1189 | } | 1356 | } |
1190 | 1357 | ||
1358 | /* Make sure restore is not running concurrently. Since we | ||
1359 | * don't map invalid userptr BOs, we rely on the next restore | ||
1360 | * worker to do the mapping | ||
1361 | */ | ||
1362 | mutex_lock(&mem->process_info->lock); | ||
1363 | |||
1364 | /* Lock mmap-sem. If we find an invalid userptr BO, we can be | ||
1365 | * sure that the MMU notifier is no longer running | ||
1366 | * concurrently and the queues are actually stopped | ||
1367 | */ | ||
1368 | if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { | ||
1369 | down_write(¤t->mm->mmap_sem); | ||
1370 | is_invalid_userptr = atomic_read(&mem->invalid); | ||
1371 | up_write(¤t->mm->mmap_sem); | ||
1372 | } | ||
1373 | |||
1374 | mutex_lock(&mem->lock); | ||
1375 | |||
1191 | domain = mem->domain; | 1376 | domain = mem->domain; |
1192 | bo_size = bo->tbo.mem.size; | 1377 | bo_size = bo->tbo.mem.size; |
1193 | 1378 | ||
@@ -1200,6 +1385,14 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1200 | if (unlikely(ret)) | 1385 | if (unlikely(ret)) |
1201 | goto out; | 1386 | goto out; |
1202 | 1387 | ||
1388 | /* Userptr can be marked as "not invalid", but not actually be | ||
1389 | * validated yet (still in the system domain). In that case | ||
1390 | * the queues are still stopped and we can leave mapping for | ||
1391 | * the next restore worker | ||
1392 | */ | ||
1393 | if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) | ||
1394 | is_invalid_userptr = true; | ||
1395 | |||
1203 | if (check_if_add_bo_to_vm(avm, mem)) { | 1396 | if (check_if_add_bo_to_vm(avm, mem)) { |
1204 | ret = add_bo_to_vm(adev, mem, avm, false, | 1397 | ret = add_bo_to_vm(adev, mem, avm, false, |
1205 | &bo_va_entry); | 1398 | &bo_va_entry); |
@@ -1217,7 +1410,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1217 | goto add_bo_to_vm_failed; | 1410 | goto add_bo_to_vm_failed; |
1218 | } | 1411 | } |
1219 | 1412 | ||
1220 | if (mem->mapped_to_gpu_memory == 0) { | 1413 | if (mem->mapped_to_gpu_memory == 0 && |
1414 | !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { | ||
1221 | /* Validate BO only once. The eviction fence gets added to BO | 1415 | /* Validate BO only once. The eviction fence gets added to BO |
1222 | * the first time it is mapped. Validate will wait for all | 1416 | * the first time it is mapped. Validate will wait for all |
1223 | * background evictions to complete. | 1417 | * background evictions to complete. |
@@ -1235,7 +1429,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( | |||
1235 | entry->va, entry->va + bo_size, | 1429 | entry->va, entry->va + bo_size, |
1236 | entry); | 1430 | entry); |
1237 | 1431 | ||
1238 | ret = map_bo_to_gpuvm(adev, entry, ctx.sync); | 1432 | ret = map_bo_to_gpuvm(adev, entry, ctx.sync, |
1433 | is_invalid_userptr); | ||
1239 | if (ret) { | 1434 | if (ret) { |
1240 | pr_err("Failed to map radeon bo to gpuvm\n"); | 1435 | pr_err("Failed to map radeon bo to gpuvm\n"); |
1241 | goto map_bo_to_gpuvm_failed; | 1436 | goto map_bo_to_gpuvm_failed; |
@@ -1418,6 +1613,337 @@ bo_reserve_failed: | |||
1418 | return ret; | 1613 | return ret; |
1419 | } | 1614 | } |
1420 | 1615 | ||
1616 | /* Evict a userptr BO by stopping the queues if necessary | ||
1617 | * | ||
1618 | * Runs in MMU notifier, may be in RECLAIM_FS context. This means it | ||
1619 | * cannot do any memory allocations, and cannot take any locks that | ||
1620 | * are held elsewhere while allocating memory. Therefore this is as | ||
1621 | * simple as possible, using atomic counters. | ||
1622 | * | ||
1623 | * It doesn't do anything to the BO itself. The real work happens in | ||
1624 | * restore, where we get updated page addresses. This function only | ||
1625 | * ensures that GPU access to the BO is stopped. | ||
1626 | */ | ||
1627 | int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, | ||
1628 | struct mm_struct *mm) | ||
1629 | { | ||
1630 | struct amdkfd_process_info *process_info = mem->process_info; | ||
1631 | int invalid, evicted_bos; | ||
1632 | int r = 0; | ||
1633 | |||
1634 | invalid = atomic_inc_return(&mem->invalid); | ||
1635 | evicted_bos = atomic_inc_return(&process_info->evicted_bos); | ||
1636 | if (evicted_bos == 1) { | ||
1637 | /* First eviction, stop the queues */ | ||
1638 | r = kgd2kfd->quiesce_mm(mm); | ||
1639 | if (r) | ||
1640 | pr_err("Failed to quiesce KFD\n"); | ||
1641 | schedule_delayed_work(&process_info->restore_userptr_work, | ||
1642 | msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); | ||
1643 | } | ||
1644 | |||
1645 | return r; | ||
1646 | } | ||
1647 | |||
1648 | /* Update invalid userptr BOs | ||
1649 | * | ||
1650 | * Moves invalidated (evicted) userptr BOs from userptr_valid_list to | ||
1651 | * userptr_inval_list and updates user pages for all BOs that have | ||
1652 | * been invalidated since their last update. | ||
1653 | */ | ||
1654 | static int update_invalid_user_pages(struct amdkfd_process_info *process_info, | ||
1655 | struct mm_struct *mm) | ||
1656 | { | ||
1657 | struct kgd_mem *mem, *tmp_mem; | ||
1658 | struct amdgpu_bo *bo; | ||
1659 | struct ttm_operation_ctx ctx = { false, false }; | ||
1660 | int invalid, ret; | ||
1661 | |||
1662 | /* Move all invalidated BOs to the userptr_inval_list and | ||
1663 | * release their user pages by migration to the CPU domain | ||
1664 | */ | ||
1665 | list_for_each_entry_safe(mem, tmp_mem, | ||
1666 | &process_info->userptr_valid_list, | ||
1667 | validate_list.head) { | ||
1668 | if (!atomic_read(&mem->invalid)) | ||
1669 | continue; /* BO is still valid */ | ||
1670 | |||
1671 | bo = mem->bo; | ||
1672 | |||
1673 | if (amdgpu_bo_reserve(bo, true)) | ||
1674 | return -EAGAIN; | ||
1675 | amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); | ||
1676 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
1677 | amdgpu_bo_unreserve(bo); | ||
1678 | if (ret) { | ||
1679 | pr_err("%s: Failed to invalidate userptr BO\n", | ||
1680 | __func__); | ||
1681 | return -EAGAIN; | ||
1682 | } | ||
1683 | |||
1684 | list_move_tail(&mem->validate_list.head, | ||
1685 | &process_info->userptr_inval_list); | ||
1686 | } | ||
1687 | |||
1688 | if (list_empty(&process_info->userptr_inval_list)) | ||
1689 | return 0; /* All evicted userptr BOs were freed */ | ||
1690 | |||
1691 | /* Go through userptr_inval_list and update any invalid user_pages */ | ||
1692 | list_for_each_entry(mem, &process_info->userptr_inval_list, | ||
1693 | validate_list.head) { | ||
1694 | invalid = atomic_read(&mem->invalid); | ||
1695 | if (!invalid) | ||
1696 | /* BO hasn't been invalidated since the last | ||
1697 | * revalidation attempt. Keep its BO list. | ||
1698 | */ | ||
1699 | continue; | ||
1700 | |||
1701 | bo = mem->bo; | ||
1702 | |||
1703 | if (!mem->user_pages) { | ||
1704 | mem->user_pages = | ||
1705 | kvmalloc_array(bo->tbo.ttm->num_pages, | ||
1706 | sizeof(struct page *), | ||
1707 | GFP_KERNEL | __GFP_ZERO); | ||
1708 | if (!mem->user_pages) { | ||
1709 | pr_err("%s: Failed to allocate pages array\n", | ||
1710 | __func__); | ||
1711 | return -ENOMEM; | ||
1712 | } | ||
1713 | } else if (mem->user_pages[0]) { | ||
1714 | release_pages(mem->user_pages, bo->tbo.ttm->num_pages); | ||
1715 | } | ||
1716 | |||
1717 | /* Get updated user pages */ | ||
1718 | ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, | ||
1719 | mem->user_pages); | ||
1720 | if (ret) { | ||
1721 | mem->user_pages[0] = NULL; | ||
1722 | pr_info("%s: Failed to get user pages: %d\n", | ||
1723 | __func__, ret); | ||
1724 | /* Pretend it succeeded. It will fail later | ||
1725 | * with a VM fault if the GPU tries to access | ||
1726 | * it. Better than hanging indefinitely with | ||
1727 | * stalled user mode queues. | ||
1728 | */ | ||
1729 | } | ||
1730 | |||
1731 | /* Mark the BO as valid unless it was invalidated | ||
1732 | * again concurrently | ||
1733 | */ | ||
1734 | if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) | ||
1735 | return -EAGAIN; | ||
1736 | } | ||
1737 | |||
1738 | return 0; | ||
1739 | } | ||
1740 | |||
1741 | /* Validate invalid userptr BOs | ||
1742 | * | ||
1743 | * Validates BOs on the userptr_inval_list, and moves them back to the | ||
1744 | * userptr_valid_list. Also updates GPUVM page tables with new page | ||
1745 | * addresses and waits for the page table updates to complete. | ||
1746 | */ | ||
1747 | static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) | ||
1748 | { | ||
1749 | struct amdgpu_bo_list_entry *pd_bo_list_entries; | ||
1750 | struct list_head resv_list, duplicates; | ||
1751 | struct ww_acquire_ctx ticket; | ||
1752 | struct amdgpu_sync sync; | ||
1753 | |||
1754 | struct amdgpu_vm *peer_vm; | ||
1755 | struct kgd_mem *mem, *tmp_mem; | ||
1756 | struct amdgpu_bo *bo; | ||
1757 | struct ttm_operation_ctx ctx = { false, false }; | ||
1758 | int i, ret; | ||
1759 | |||
1760 | pd_bo_list_entries = kcalloc(process_info->n_vms, | ||
1761 | sizeof(struct amdgpu_bo_list_entry), | ||
1762 | GFP_KERNEL); | ||
1763 | if (!pd_bo_list_entries) { | ||
1764 | pr_err("%s: Failed to allocate PD BO list entries\n", __func__); | ||
1765 | return -ENOMEM; | ||
1766 | } | ||
1767 | |||
1768 | INIT_LIST_HEAD(&resv_list); | ||
1769 | INIT_LIST_HEAD(&duplicates); | ||
1770 | |||
1771 | /* Get all the page directory BOs that need to be reserved */ | ||
1772 | i = 0; | ||
1773 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1774 | vm_list_node) | ||
1775 | amdgpu_vm_get_pd_bo(peer_vm, &resv_list, | ||
1776 | &pd_bo_list_entries[i++]); | ||
1777 | /* Add the userptr_inval_list entries to resv_list */ | ||
1778 | list_for_each_entry(mem, &process_info->userptr_inval_list, | ||
1779 | validate_list.head) { | ||
1780 | list_add_tail(&mem->resv_list.head, &resv_list); | ||
1781 | mem->resv_list.bo = mem->validate_list.bo; | ||
1782 | mem->resv_list.shared = mem->validate_list.shared; | ||
1783 | } | ||
1784 | |||
1785 | /* Reserve all BOs and page tables for validation */ | ||
1786 | ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); | ||
1787 | WARN(!list_empty(&duplicates), "Duplicates should be empty"); | ||
1788 | if (ret) | ||
1789 | goto out; | ||
1790 | |||
1791 | amdgpu_sync_create(&sync); | ||
1792 | |||
1793 | /* Avoid triggering eviction fences when unmapping invalid | ||
1794 | * userptr BOs (waits for all fences, doesn't use | ||
1795 | * FENCE_OWNER_VM) | ||
1796 | */ | ||
1797 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1798 | vm_list_node) | ||
1799 | amdgpu_amdkfd_remove_eviction_fence(peer_vm->root.base.bo, | ||
1800 | process_info->eviction_fence, | ||
1801 | NULL, NULL); | ||
1802 | |||
1803 | ret = process_validate_vms(process_info); | ||
1804 | if (ret) | ||
1805 | goto unreserve_out; | ||
1806 | |||
1807 | /* Validate BOs and update GPUVM page tables */ | ||
1808 | list_for_each_entry_safe(mem, tmp_mem, | ||
1809 | &process_info->userptr_inval_list, | ||
1810 | validate_list.head) { | ||
1811 | struct kfd_bo_va_list *bo_va_entry; | ||
1812 | |||
1813 | bo = mem->bo; | ||
1814 | |||
1815 | /* Copy pages array and validate the BO if we got user pages */ | ||
1816 | if (mem->user_pages[0]) { | ||
1817 | amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, | ||
1818 | mem->user_pages); | ||
1819 | amdgpu_ttm_placement_from_domain(bo, mem->domain); | ||
1820 | ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | ||
1821 | if (ret) { | ||
1822 | pr_err("%s: failed to validate BO\n", __func__); | ||
1823 | goto unreserve_out; | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | /* Validate succeeded, now the BO owns the pages, free | ||
1828 | * our copy of the pointer array. Put this BO back on | ||
1829 | * the userptr_valid_list. If we need to revalidate | ||
1830 | * it, we need to start from scratch. | ||
1831 | */ | ||
1832 | kvfree(mem->user_pages); | ||
1833 | mem->user_pages = NULL; | ||
1834 | list_move_tail(&mem->validate_list.head, | ||
1835 | &process_info->userptr_valid_list); | ||
1836 | |||
1837 | /* Update mapping. If the BO was not validated | ||
1838 | * (because we couldn't get user pages), this will | ||
1839 | * clear the page table entries, which will result in | ||
1840 | * VM faults if the GPU tries to access the invalid | ||
1841 | * memory. | ||
1842 | */ | ||
1843 | list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { | ||
1844 | if (!bo_va_entry->is_mapped) | ||
1845 | continue; | ||
1846 | |||
1847 | ret = update_gpuvm_pte((struct amdgpu_device *) | ||
1848 | bo_va_entry->kgd_dev, | ||
1849 | bo_va_entry, &sync); | ||
1850 | if (ret) { | ||
1851 | pr_err("%s: update PTE failed\n", __func__); | ||
1852 | /* make sure this gets validated again */ | ||
1853 | atomic_inc(&mem->invalid); | ||
1854 | goto unreserve_out; | ||
1855 | } | ||
1856 | } | ||
1857 | } | ||
1858 | |||
1859 | /* Update page directories */ | ||
1860 | ret = process_update_pds(process_info, &sync); | ||
1861 | |||
1862 | unreserve_out: | ||
1863 | list_for_each_entry(peer_vm, &process_info->vm_list_head, | ||
1864 | vm_list_node) | ||
1865 | amdgpu_bo_fence(peer_vm->root.base.bo, | ||
1866 | &process_info->eviction_fence->base, true); | ||
1867 | ttm_eu_backoff_reservation(&ticket, &resv_list); | ||
1868 | amdgpu_sync_wait(&sync, false); | ||
1869 | amdgpu_sync_free(&sync); | ||
1870 | out: | ||
1871 | kfree(pd_bo_list_entries); | ||
1872 | |||
1873 | return ret; | ||
1874 | } | ||
1875 | |||
1876 | /* Worker callback to restore evicted userptr BOs | ||
1877 | * | ||
1878 | * Tries to update and validate all userptr BOs. If successful and no | ||
1879 | * concurrent evictions happened, the queues are restarted. Otherwise, | ||
1880 | * reschedule for another attempt later. | ||
1881 | */ | ||
1882 | static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) | ||
1883 | { | ||
1884 | struct delayed_work *dwork = to_delayed_work(work); | ||
1885 | struct amdkfd_process_info *process_info = | ||
1886 | container_of(dwork, struct amdkfd_process_info, | ||
1887 | restore_userptr_work); | ||
1888 | struct task_struct *usertask; | ||
1889 | struct mm_struct *mm; | ||
1890 | int evicted_bos; | ||
1891 | |||
1892 | evicted_bos = atomic_read(&process_info->evicted_bos); | ||
1893 | if (!evicted_bos) | ||
1894 | return; | ||
1895 | |||
1896 | /* Reference task and mm in case of concurrent process termination */ | ||
1897 | usertask = get_pid_task(process_info->pid, PIDTYPE_PID); | ||
1898 | if (!usertask) | ||
1899 | return; | ||
1900 | mm = get_task_mm(usertask); | ||
1901 | if (!mm) { | ||
1902 | put_task_struct(usertask); | ||
1903 | return; | ||
1904 | } | ||
1905 | |||
1906 | mutex_lock(&process_info->lock); | ||
1907 | |||
1908 | if (update_invalid_user_pages(process_info, mm)) | ||
1909 | goto unlock_out; | ||
1910 | /* userptr_inval_list can be empty if all evicted userptr BOs | ||
1911 | * have been freed. In that case there is nothing to validate | ||
1912 | * and we can just restart the queues. | ||
1913 | */ | ||
1914 | if (!list_empty(&process_info->userptr_inval_list)) { | ||
1915 | if (atomic_read(&process_info->evicted_bos) != evicted_bos) | ||
1916 | goto unlock_out; /* Concurrent eviction, try again */ | ||
1917 | |||
1918 | if (validate_invalid_user_pages(process_info)) | ||
1919 | goto unlock_out; | ||
1920 | } | ||
1921 | /* Final check for concurrent evicton and atomic update. If | ||
1922 | * another eviction happens after successful update, it will | ||
1923 | * be a first eviction that calls quiesce_mm. The eviction | ||
1924 | * reference counting inside KFD will handle this case. | ||
1925 | */ | ||
1926 | if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != | ||
1927 | evicted_bos) | ||
1928 | goto unlock_out; | ||
1929 | evicted_bos = 0; | ||
1930 | if (kgd2kfd->resume_mm(mm)) { | ||
1931 | pr_err("%s: Failed to resume KFD\n", __func__); | ||
1932 | /* No recovery from this failure. Probably the CP is | ||
1933 | * hanging. No point trying again. | ||
1934 | */ | ||
1935 | } | ||
1936 | unlock_out: | ||
1937 | mutex_unlock(&process_info->lock); | ||
1938 | mmput(mm); | ||
1939 | put_task_struct(usertask); | ||
1940 | |||
1941 | /* If validation failed, reschedule another attempt */ | ||
1942 | if (evicted_bos) | ||
1943 | schedule_delayed_work(&process_info->restore_userptr_work, | ||
1944 | msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); | ||
1945 | } | ||
1946 | |||
1421 | /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given | 1947 | /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given |
1422 | * KFD process identified by process_info | 1948 | * KFD process identified by process_info |
1423 | * | 1949 | * |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index dc34b50e6b29..8e66f3702b7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | |||
@@ -536,7 +536,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, | |||
536 | if (p->bo_list) { | 536 | if (p->bo_list) { |
537 | amdgpu_bo_list_get_list(p->bo_list, &p->validated); | 537 | amdgpu_bo_list_get_list(p->bo_list, &p->validated); |
538 | if (p->bo_list->first_userptr != p->bo_list->num_entries) | 538 | if (p->bo_list->first_userptr != p->bo_list->num_entries) |
539 | p->mn = amdgpu_mn_get(p->adev); | 539 | p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX); |
540 | } | 540 | } |
541 | 541 | ||
542 | INIT_LIST_HEAD(&duplicates); | 542 | INIT_LIST_HEAD(&duplicates); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index bd67f4cb8e6c..83e344fbb50a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | |||
@@ -36,12 +36,14 @@ | |||
36 | #include <drm/drm.h> | 36 | #include <drm/drm.h> |
37 | 37 | ||
38 | #include "amdgpu.h" | 38 | #include "amdgpu.h" |
39 | #include "amdgpu_amdkfd.h" | ||
39 | 40 | ||
40 | struct amdgpu_mn { | 41 | struct amdgpu_mn { |
41 | /* constant after initialisation */ | 42 | /* constant after initialisation */ |
42 | struct amdgpu_device *adev; | 43 | struct amdgpu_device *adev; |
43 | struct mm_struct *mm; | 44 | struct mm_struct *mm; |
44 | struct mmu_notifier mn; | 45 | struct mmu_notifier mn; |
46 | enum amdgpu_mn_type type; | ||
45 | 47 | ||
46 | /* only used on destruction */ | 48 | /* only used on destruction */ |
47 | struct work_struct work; | 49 | struct work_struct work; |
@@ -185,7 +187,7 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, | |||
185 | } | 187 | } |
186 | 188 | ||
187 | /** | 189 | /** |
188 | * amdgpu_mn_invalidate_range_start - callback to notify about mm change | 190 | * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change |
189 | * | 191 | * |
190 | * @mn: our notifier | 192 | * @mn: our notifier |
191 | * @mn: the mm this callback is about | 193 | * @mn: the mm this callback is about |
@@ -195,10 +197,10 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, | |||
195 | * We block for all BOs between start and end to be idle and | 197 | * We block for all BOs between start and end to be idle and |
196 | * unmap them by move them into system domain again. | 198 | * unmap them by move them into system domain again. |
197 | */ | 199 | */ |
198 | static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, | 200 | static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, |
199 | struct mm_struct *mm, | 201 | struct mm_struct *mm, |
200 | unsigned long start, | 202 | unsigned long start, |
201 | unsigned long end) | 203 | unsigned long end) |
202 | { | 204 | { |
203 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); | 205 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); |
204 | struct interval_tree_node *it; | 206 | struct interval_tree_node *it; |
@@ -220,6 +222,49 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, | |||
220 | } | 222 | } |
221 | 223 | ||
222 | /** | 224 | /** |
225 | * amdgpu_mn_invalidate_range_start_hsa - callback to notify about mm change | ||
226 | * | ||
227 | * @mn: our notifier | ||
228 | * @mn: the mm this callback is about | ||
229 | * @start: start of updated range | ||
230 | * @end: end of updated range | ||
231 | * | ||
232 | * We temporarily evict all BOs between start and end. This | ||
233 | * necessitates evicting all user-mode queues of the process. The BOs | ||
234 | * are restorted in amdgpu_mn_invalidate_range_end_hsa. | ||
235 | */ | ||
236 | static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, | ||
237 | struct mm_struct *mm, | ||
238 | unsigned long start, | ||
239 | unsigned long end) | ||
240 | { | ||
241 | struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); | ||
242 | struct interval_tree_node *it; | ||
243 | |||
244 | /* notification is exclusive, but interval is inclusive */ | ||
245 | end -= 1; | ||
246 | |||
247 | amdgpu_mn_read_lock(rmn); | ||
248 | |||
249 | it = interval_tree_iter_first(&rmn->objects, start, end); | ||
250 | while (it) { | ||
251 | struct amdgpu_mn_node *node; | ||
252 | struct amdgpu_bo *bo; | ||
253 | |||
254 | node = container_of(it, struct amdgpu_mn_node, it); | ||
255 | it = interval_tree_iter_next(it, start, end); | ||
256 | |||
257 | list_for_each_entry(bo, &node->bos, mn_list) { | ||
258 | struct kgd_mem *mem = bo->kfd_bo; | ||
259 | |||
260 | if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, | ||
261 | start, end)) | ||
262 | amdgpu_amdkfd_evict_userptr(mem, mm); | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | |||
267 | /** | ||
223 | * amdgpu_mn_invalidate_range_end - callback to notify about mm change | 268 | * amdgpu_mn_invalidate_range_end - callback to notify about mm change |
224 | * | 269 | * |
225 | * @mn: our notifier | 270 | * @mn: our notifier |
@@ -239,23 +284,39 @@ static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn, | |||
239 | amdgpu_mn_read_unlock(rmn); | 284 | amdgpu_mn_read_unlock(rmn); |
240 | } | 285 | } |
241 | 286 | ||
242 | static const struct mmu_notifier_ops amdgpu_mn_ops = { | 287 | static const struct mmu_notifier_ops amdgpu_mn_ops[] = { |
243 | .release = amdgpu_mn_release, | 288 | [AMDGPU_MN_TYPE_GFX] = { |
244 | .invalidate_range_start = amdgpu_mn_invalidate_range_start, | 289 | .release = amdgpu_mn_release, |
245 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | 290 | .invalidate_range_start = amdgpu_mn_invalidate_range_start_gfx, |
291 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | ||
292 | }, | ||
293 | [AMDGPU_MN_TYPE_HSA] = { | ||
294 | .release = amdgpu_mn_release, | ||
295 | .invalidate_range_start = amdgpu_mn_invalidate_range_start_hsa, | ||
296 | .invalidate_range_end = amdgpu_mn_invalidate_range_end, | ||
297 | }, | ||
246 | }; | 298 | }; |
247 | 299 | ||
300 | /* Low bits of any reasonable mm pointer will be unused due to struct | ||
301 | * alignment. Use these bits to make a unique key from the mm pointer | ||
302 | * and notifier type. | ||
303 | */ | ||
304 | #define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type)) | ||
305 | |||
248 | /** | 306 | /** |
249 | * amdgpu_mn_get - create notifier context | 307 | * amdgpu_mn_get - create notifier context |
250 | * | 308 | * |
251 | * @adev: amdgpu device pointer | 309 | * @adev: amdgpu device pointer |
310 | * @type: type of MMU notifier context | ||
252 | * | 311 | * |
253 | * Creates a notifier context for current->mm. | 312 | * Creates a notifier context for current->mm. |
254 | */ | 313 | */ |
255 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | 314 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
315 | enum amdgpu_mn_type type) | ||
256 | { | 316 | { |
257 | struct mm_struct *mm = current->mm; | 317 | struct mm_struct *mm = current->mm; |
258 | struct amdgpu_mn *rmn; | 318 | struct amdgpu_mn *rmn; |
319 | unsigned long key = AMDGPU_MN_KEY(mm, type); | ||
259 | int r; | 320 | int r; |
260 | 321 | ||
261 | mutex_lock(&adev->mn_lock); | 322 | mutex_lock(&adev->mn_lock); |
@@ -264,8 +325,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
264 | return ERR_PTR(-EINTR); | 325 | return ERR_PTR(-EINTR); |
265 | } | 326 | } |
266 | 327 | ||
267 | hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) | 328 | hash_for_each_possible(adev->mn_hash, rmn, node, key) |
268 | if (rmn->mm == mm) | 329 | if (AMDGPU_MN_KEY(rmn->mm, rmn->type) == key) |
269 | goto release_locks; | 330 | goto release_locks; |
270 | 331 | ||
271 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); | 332 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); |
@@ -276,8 +337,9 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
276 | 337 | ||
277 | rmn->adev = adev; | 338 | rmn->adev = adev; |
278 | rmn->mm = mm; | 339 | rmn->mm = mm; |
279 | rmn->mn.ops = &amdgpu_mn_ops; | ||
280 | init_rwsem(&rmn->lock); | 340 | init_rwsem(&rmn->lock); |
341 | rmn->type = type; | ||
342 | rmn->mn.ops = &amdgpu_mn_ops[type]; | ||
281 | rmn->objects = RB_ROOT_CACHED; | 343 | rmn->objects = RB_ROOT_CACHED; |
282 | mutex_init(&rmn->read_lock); | 344 | mutex_init(&rmn->read_lock); |
283 | atomic_set(&rmn->recursion, 0); | 345 | atomic_set(&rmn->recursion, 0); |
@@ -286,7 +348,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | |||
286 | if (r) | 348 | if (r) |
287 | goto free_rmn; | 349 | goto free_rmn; |
288 | 350 | ||
289 | hash_add(adev->mn_hash, &rmn->node, (unsigned long)mm); | 351 | hash_add(adev->mn_hash, &rmn->node, AMDGPU_MN_KEY(mm, type)); |
290 | 352 | ||
291 | release_locks: | 353 | release_locks: |
292 | up_write(&mm->mmap_sem); | 354 | up_write(&mm->mmap_sem); |
@@ -315,15 +377,21 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) | |||
315 | { | 377 | { |
316 | unsigned long end = addr + amdgpu_bo_size(bo) - 1; | 378 | unsigned long end = addr + amdgpu_bo_size(bo) - 1; |
317 | struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); | 379 | struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); |
380 | enum amdgpu_mn_type type = | ||
381 | bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX; | ||
318 | struct amdgpu_mn *rmn; | 382 | struct amdgpu_mn *rmn; |
319 | struct amdgpu_mn_node *node = NULL; | 383 | struct amdgpu_mn_node *node = NULL, *new_node; |
320 | struct list_head bos; | 384 | struct list_head bos; |
321 | struct interval_tree_node *it; | 385 | struct interval_tree_node *it; |
322 | 386 | ||
323 | rmn = amdgpu_mn_get(adev); | 387 | rmn = amdgpu_mn_get(adev, type); |
324 | if (IS_ERR(rmn)) | 388 | if (IS_ERR(rmn)) |
325 | return PTR_ERR(rmn); | 389 | return PTR_ERR(rmn); |
326 | 390 | ||
391 | new_node = kmalloc(sizeof(*new_node), GFP_KERNEL); | ||
392 | if (!new_node) | ||
393 | return -ENOMEM; | ||
394 | |||
327 | INIT_LIST_HEAD(&bos); | 395 | INIT_LIST_HEAD(&bos); |
328 | 396 | ||
329 | down_write(&rmn->lock); | 397 | down_write(&rmn->lock); |
@@ -337,13 +405,10 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) | |||
337 | list_splice(&node->bos, &bos); | 405 | list_splice(&node->bos, &bos); |
338 | } | 406 | } |
339 | 407 | ||
340 | if (!node) { | 408 | if (!node) |
341 | node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); | 409 | node = new_node; |
342 | if (!node) { | 410 | else |
343 | up_write(&rmn->lock); | 411 | kfree(new_node); |
344 | return -ENOMEM; | ||
345 | } | ||
346 | } | ||
347 | 412 | ||
348 | bo->mn = rmn; | 413 | bo->mn = rmn; |
349 | 414 | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h index d0095a3793b8..eb0f432f78fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h | |||
@@ -29,16 +29,23 @@ | |||
29 | */ | 29 | */ |
30 | struct amdgpu_mn; | 30 | struct amdgpu_mn; |
31 | 31 | ||
32 | enum amdgpu_mn_type { | ||
33 | AMDGPU_MN_TYPE_GFX, | ||
34 | AMDGPU_MN_TYPE_HSA, | ||
35 | }; | ||
36 | |||
32 | #if defined(CONFIG_MMU_NOTIFIER) | 37 | #if defined(CONFIG_MMU_NOTIFIER) |
33 | void amdgpu_mn_lock(struct amdgpu_mn *mn); | 38 | void amdgpu_mn_lock(struct amdgpu_mn *mn); |
34 | void amdgpu_mn_unlock(struct amdgpu_mn *mn); | 39 | void amdgpu_mn_unlock(struct amdgpu_mn *mn); |
35 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev); | 40 | struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
41 | enum amdgpu_mn_type type); | ||
36 | int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); | 42 | int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); |
37 | void amdgpu_mn_unregister(struct amdgpu_bo *bo); | 43 | void amdgpu_mn_unregister(struct amdgpu_bo *bo); |
38 | #else | 44 | #else |
39 | static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} | 45 | static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} |
40 | static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} | 46 | static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} |
41 | static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) | 47 | static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, |
48 | enum amdgpu_mn_type type) | ||
42 | { | 49 | { |
43 | return NULL; | 50 | return NULL; |
44 | } | 51 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 205da3ff9cd0..c713d30cba86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | |||
@@ -695,7 +695,7 @@ struct amdgpu_ttm_tt { | |||
695 | struct ttm_dma_tt ttm; | 695 | struct ttm_dma_tt ttm; |
696 | u64 offset; | 696 | u64 offset; |
697 | uint64_t userptr; | 697 | uint64_t userptr; |
698 | struct mm_struct *usermm; | 698 | struct task_struct *usertask; |
699 | uint32_t userflags; | 699 | uint32_t userflags; |
700 | spinlock_t guptasklock; | 700 | spinlock_t guptasklock; |
701 | struct list_head guptasks; | 701 | struct list_head guptasks; |
@@ -706,14 +706,18 @@ struct amdgpu_ttm_tt { | |||
706 | int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | 706 | int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) |
707 | { | 707 | { |
708 | struct amdgpu_ttm_tt *gtt = (void *)ttm; | 708 | struct amdgpu_ttm_tt *gtt = (void *)ttm; |
709 | struct mm_struct *mm = gtt->usertask->mm; | ||
709 | unsigned int flags = 0; | 710 | unsigned int flags = 0; |
710 | unsigned pinned = 0; | 711 | unsigned pinned = 0; |
711 | int r; | 712 | int r; |
712 | 713 | ||
714 | if (!mm) /* Happens during process shutdown */ | ||
715 | return -ESRCH; | ||
716 | |||
713 | if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) | 717 | if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) |
714 | flags |= FOLL_WRITE; | 718 | flags |= FOLL_WRITE; |
715 | 719 | ||
716 | down_read(¤t->mm->mmap_sem); | 720 | down_read(&mm->mmap_sem); |
717 | 721 | ||
718 | if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { | 722 | if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { |
719 | /* check that we only use anonymous memory | 723 | /* check that we only use anonymous memory |
@@ -721,9 +725,9 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
721 | unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; | 725 | unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; |
722 | struct vm_area_struct *vma; | 726 | struct vm_area_struct *vma; |
723 | 727 | ||
724 | vma = find_vma(gtt->usermm, gtt->userptr); | 728 | vma = find_vma(mm, gtt->userptr); |
725 | if (!vma || vma->vm_file || vma->vm_end < end) { | 729 | if (!vma || vma->vm_file || vma->vm_end < end) { |
726 | up_read(¤t->mm->mmap_sem); | 730 | up_read(&mm->mmap_sem); |
727 | return -EPERM; | 731 | return -EPERM; |
728 | } | 732 | } |
729 | } | 733 | } |
@@ -739,7 +743,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
739 | list_add(&guptask.list, >t->guptasks); | 743 | list_add(&guptask.list, >t->guptasks); |
740 | spin_unlock(>t->guptasklock); | 744 | spin_unlock(>t->guptasklock); |
741 | 745 | ||
742 | r = get_user_pages(userptr, num_pages, flags, p, NULL); | 746 | if (mm == current->mm) |
747 | r = get_user_pages(userptr, num_pages, flags, p, NULL); | ||
748 | else | ||
749 | r = get_user_pages_remote(gtt->usertask, | ||
750 | mm, userptr, num_pages, | ||
751 | flags, p, NULL, NULL); | ||
743 | 752 | ||
744 | spin_lock(>t->guptasklock); | 753 | spin_lock(>t->guptasklock); |
745 | list_del(&guptask.list); | 754 | list_del(&guptask.list); |
@@ -752,12 +761,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | |||
752 | 761 | ||
753 | } while (pinned < ttm->num_pages); | 762 | } while (pinned < ttm->num_pages); |
754 | 763 | ||
755 | up_read(¤t->mm->mmap_sem); | 764 | up_read(&mm->mmap_sem); |
756 | return 0; | 765 | return 0; |
757 | 766 | ||
758 | release_pages: | 767 | release_pages: |
759 | release_pages(pages, pinned); | 768 | release_pages(pages, pinned); |
760 | up_read(¤t->mm->mmap_sem); | 769 | up_read(&mm->mmap_sem); |
761 | return r; | 770 | return r; |
762 | } | 771 | } |
763 | 772 | ||
@@ -978,6 +987,9 @@ static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm) | |||
978 | { | 987 | { |
979 | struct amdgpu_ttm_tt *gtt = (void *)ttm; | 988 | struct amdgpu_ttm_tt *gtt = (void *)ttm; |
980 | 989 | ||
990 | if (gtt->usertask) | ||
991 | put_task_struct(gtt->usertask); | ||
992 | |||
981 | ttm_dma_tt_fini(>t->ttm); | 993 | ttm_dma_tt_fini(>t->ttm); |
982 | kfree(gtt); | 994 | kfree(gtt); |
983 | } | 995 | } |
@@ -1079,8 +1091,13 @@ int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, | |||
1079 | return -EINVAL; | 1091 | return -EINVAL; |
1080 | 1092 | ||
1081 | gtt->userptr = addr; | 1093 | gtt->userptr = addr; |
1082 | gtt->usermm = current->mm; | ||
1083 | gtt->userflags = flags; | 1094 | gtt->userflags = flags; |
1095 | |||
1096 | if (gtt->usertask) | ||
1097 | put_task_struct(gtt->usertask); | ||
1098 | gtt->usertask = current->group_leader; | ||
1099 | get_task_struct(gtt->usertask); | ||
1100 | |||
1084 | spin_lock_init(>t->guptasklock); | 1101 | spin_lock_init(>t->guptasklock); |
1085 | INIT_LIST_HEAD(>t->guptasks); | 1102 | INIT_LIST_HEAD(>t->guptasks); |
1086 | atomic_set(>t->mmu_invalidations, 0); | 1103 | atomic_set(>t->mmu_invalidations, 0); |
@@ -1096,7 +1113,10 @@ struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm) | |||
1096 | if (gtt == NULL) | 1113 | if (gtt == NULL) |
1097 | return NULL; | 1114 | return NULL; |
1098 | 1115 | ||
1099 | return gtt->usermm; | 1116 | if (gtt->usertask == NULL) |
1117 | return NULL; | ||
1118 | |||
1119 | return gtt->usertask->mm; | ||
1100 | } | 1120 | } |
1101 | 1121 | ||
1102 | bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, | 1122 | bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, |
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9d39fd5b1822..e5962e61beb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | |||
@@ -4686,6 +4686,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev, | |||
4686 | 4686 | ||
4687 | cu_info->number = active_cu_number; | 4687 | cu_info->number = active_cu_number; |
4688 | cu_info->ao_cu_mask = ao_cu_mask; | 4688 | cu_info->ao_cu_mask = ao_cu_mask; |
4689 | cu_info->simd_per_cu = NUM_SIMD_PER_CU; | ||
4689 | 4690 | ||
4690 | return 0; | 4691 | return 0; |
4691 | } | 4692 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h index 7f408f85fdb6..f22f7a88ce0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h | |||
@@ -268,6 +268,11 @@ | |||
268 | * x=1: tmz_end | 268 | * x=1: tmz_end |
269 | */ | 269 | */ |
270 | 270 | ||
271 | #define PACKET3_INVALIDATE_TLBS 0x98 | ||
272 | # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) | ||
273 | # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) | ||
274 | # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) | ||
275 | # define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) | ||
271 | #define PACKET3_SET_RESOURCES 0xA0 | 276 | #define PACKET3_SET_RESOURCES 0xA0 |
272 | /* 1. header | 277 | /* 1. header |
273 | * 2. CONTROL | 278 | * 2. CONTROL |
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 0d0242240c47..ffd096fffc1c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile | |||
@@ -30,12 +30,14 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ | |||
30 | kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ | 30 | kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ |
31 | kfd_process.o kfd_queue.o kfd_mqd_manager.o \ | 31 | kfd_process.o kfd_queue.o kfd_mqd_manager.o \ |
32 | kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ | 32 | kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ |
33 | kfd_mqd_manager_v9.o \ | ||
33 | kfd_kernel_queue.o kfd_kernel_queue_cik.o \ | 34 | kfd_kernel_queue.o kfd_kernel_queue_cik.o \ |
34 | kfd_kernel_queue_vi.o kfd_packet_manager.o \ | 35 | kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ |
35 | kfd_process_queue_manager.o kfd_device_queue_manager.o \ | 36 | kfd_packet_manager.o kfd_process_queue_manager.o \ |
36 | kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ | 37 | kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ |
38 | kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ | ||
37 | kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ | 39 | kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ |
38 | kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o | 40 | kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o |
39 | 41 | ||
40 | ifneq ($(CONFIG_AMD_IOMMU_V2),) | 42 | ifneq ($(CONFIG_AMD_IOMMU_V2),) |
41 | amdkfd-y += kfd_iommu.o | 43 | amdkfd-y += kfd_iommu.o |
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 3d5ccb3755d4..49df6c791cfc 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | |||
@@ -27,18 +27,28 @@ | |||
27 | static bool cik_event_interrupt_isr(struct kfd_dev *dev, | 27 | static bool cik_event_interrupt_isr(struct kfd_dev *dev, |
28 | const uint32_t *ih_ring_entry) | 28 | const uint32_t *ih_ring_entry) |
29 | { | 29 | { |
30 | unsigned int pasid; | ||
31 | const struct cik_ih_ring_entry *ihre = | 30 | const struct cik_ih_ring_entry *ihre = |
32 | (const struct cik_ih_ring_entry *)ih_ring_entry; | 31 | (const struct cik_ih_ring_entry *)ih_ring_entry; |
32 | unsigned int vmid, pasid; | ||
33 | |||
34 | /* Only handle interrupts from KFD VMIDs */ | ||
35 | vmid = (ihre->ring_id & 0x0000ff00) >> 8; | ||
36 | if (vmid < dev->vm_info.first_vmid_kfd || | ||
37 | vmid > dev->vm_info.last_vmid_kfd) | ||
38 | return 0; | ||
33 | 39 | ||
40 | /* If there is no valid PASID, it's likely a firmware bug */ | ||
34 | pasid = (ihre->ring_id & 0xffff0000) >> 16; | 41 | pasid = (ihre->ring_id & 0xffff0000) >> 16; |
42 | if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) | ||
43 | return 0; | ||
35 | 44 | ||
36 | /* Do not process in ISR, just request it to be forwarded to WQ. */ | 45 | /* Interrupt types we care about: various signals and faults. |
37 | return (pasid != 0) && | 46 | * They will be forwarded to a work queue (see below). |
38 | (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || | 47 | */ |
48 | return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || | ||
39 | ihre->source_id == CIK_INTSRC_SDMA_TRAP || | 49 | ihre->source_id == CIK_INTSRC_SDMA_TRAP || |
40 | ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || | 50 | ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || |
41 | ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); | 51 | ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; |
42 | } | 52 | } |
43 | 53 | ||
44 | static void cik_event_interrupt_wq(struct kfd_dev *dev, | 54 | static void cik_event_interrupt_wq(struct kfd_dev *dev, |
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h index 48769d12dd7b..37ce6dd65391 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h | |||
@@ -33,7 +33,8 @@ | |||
33 | #define APE1_MTYPE(x) ((x) << 7) | 33 | #define APE1_MTYPE(x) ((x) << 7) |
34 | 34 | ||
35 | /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ | 35 | /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ |
36 | #define MTYPE_CACHED 0 | 36 | #define MTYPE_CACHED_NV 0 |
37 | #define MTYPE_CACHED 1 | ||
37 | #define MTYPE_NONCACHED 3 | 38 | #define MTYPE_NONCACHED 3 |
38 | 39 | ||
39 | #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) | 40 | #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) |
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h new file mode 100644 index 000000000000..f68aef02fc1f --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | |||
@@ -0,0 +1,560 @@ | |||
1 | /* | ||
2 | * Copyright 2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | static const uint32_t cwsr_trap_gfx8_hex[] = { | ||
24 | 0xbf820001, 0xbf820125, | ||
25 | 0xb8f4f802, 0x89748674, | ||
26 | 0xb8f5f803, 0x8675ff75, | ||
27 | 0x00000400, 0xbf850011, | ||
28 | 0xc00a1e37, 0x00000000, | ||
29 | 0xbf8c007f, 0x87777978, | ||
30 | 0xbf840002, 0xb974f802, | ||
31 | 0xbe801d78, 0xb8f5f803, | ||
32 | 0x8675ff75, 0x000001ff, | ||
33 | 0xbf850002, 0x80708470, | ||
34 | 0x82718071, 0x8671ff71, | ||
35 | 0x0000ffff, 0xb974f802, | ||
36 | 0xbe801f70, 0xb8f5f803, | ||
37 | 0x8675ff75, 0x00000100, | ||
38 | 0xbf840006, 0xbefa0080, | ||
39 | 0xb97a0203, 0x8671ff71, | ||
40 | 0x0000ffff, 0x80f08870, | ||
41 | 0x82f18071, 0xbefa0080, | ||
42 | 0xb97a0283, 0xbef60068, | ||
43 | 0xbef70069, 0xb8fa1c07, | ||
44 | 0x8e7a9c7a, 0x87717a71, | ||
45 | 0xb8fa03c7, 0x8e7a9b7a, | ||
46 | 0x87717a71, 0xb8faf807, | ||
47 | 0x867aff7a, 0x00007fff, | ||
48 | 0xb97af807, 0xbef2007e, | ||
49 | 0xbef3007f, 0xbefe0180, | ||
50 | 0xbf900004, 0x877a8474, | ||
51 | 0xb97af802, 0xbf8e0002, | ||
52 | 0xbf88fffe, 0xbef8007e, | ||
53 | 0x8679ff7f, 0x0000ffff, | ||
54 | 0x8779ff79, 0x00040000, | ||
55 | 0xbefa0080, 0xbefb00ff, | ||
56 | 0x00807fac, 0x867aff7f, | ||
57 | 0x08000000, 0x8f7a837a, | ||
58 | 0x877b7a7b, 0x867aff7f, | ||
59 | 0x70000000, 0x8f7a817a, | ||
60 | 0x877b7a7b, 0xbeef007c, | ||
61 | 0xbeee0080, 0xb8ee2a05, | ||
62 | 0x806e816e, 0x8e6e8a6e, | ||
63 | 0xb8fa1605, 0x807a817a, | ||
64 | 0x8e7a867a, 0x806e7a6e, | ||
65 | 0xbefa0084, 0xbefa00ff, | ||
66 | 0x01000000, 0xbefe007c, | ||
67 | 0xbefc006e, 0xc0611bfc, | ||
68 | 0x0000007c, 0x806e846e, | ||
69 | 0xbefc007e, 0xbefe007c, | ||
70 | 0xbefc006e, 0xc0611c3c, | ||
71 | 0x0000007c, 0x806e846e, | ||
72 | 0xbefc007e, 0xbefe007c, | ||
73 | 0xbefc006e, 0xc0611c7c, | ||
74 | 0x0000007c, 0x806e846e, | ||
75 | 0xbefc007e, 0xbefe007c, | ||
76 | 0xbefc006e, 0xc0611cbc, | ||
77 | 0x0000007c, 0x806e846e, | ||
78 | 0xbefc007e, 0xbefe007c, | ||
79 | 0xbefc006e, 0xc0611cfc, | ||
80 | 0x0000007c, 0x806e846e, | ||
81 | 0xbefc007e, 0xbefe007c, | ||
82 | 0xbefc006e, 0xc0611d3c, | ||
83 | 0x0000007c, 0x806e846e, | ||
84 | 0xbefc007e, 0xb8f5f803, | ||
85 | 0xbefe007c, 0xbefc006e, | ||
86 | 0xc0611d7c, 0x0000007c, | ||
87 | 0x806e846e, 0xbefc007e, | ||
88 | 0xbefe007c, 0xbefc006e, | ||
89 | 0xc0611dbc, 0x0000007c, | ||
90 | 0x806e846e, 0xbefc007e, | ||
91 | 0xbefe007c, 0xbefc006e, | ||
92 | 0xc0611dfc, 0x0000007c, | ||
93 | 0x806e846e, 0xbefc007e, | ||
94 | 0xb8eff801, 0xbefe007c, | ||
95 | 0xbefc006e, 0xc0611bfc, | ||
96 | 0x0000007c, 0x806e846e, | ||
97 | 0xbefc007e, 0xbefe007c, | ||
98 | 0xbefc006e, 0xc0611b3c, | ||
99 | 0x0000007c, 0x806e846e, | ||
100 | 0xbefc007e, 0xbefe007c, | ||
101 | 0xbefc006e, 0xc0611b7c, | ||
102 | 0x0000007c, 0x806e846e, | ||
103 | 0xbefc007e, 0x867aff7f, | ||
104 | 0x04000000, 0xbef30080, | ||
105 | 0x8773737a, 0xb8ee2a05, | ||
106 | 0x806e816e, 0x8e6e8a6e, | ||
107 | 0xb8f51605, 0x80758175, | ||
108 | 0x8e758475, 0x8e7a8275, | ||
109 | 0xbefa00ff, 0x01000000, | ||
110 | 0xbef60178, 0x80786e78, | ||
111 | 0x82798079, 0xbefc0080, | ||
112 | 0xbe802b00, 0xbe822b02, | ||
113 | 0xbe842b04, 0xbe862b06, | ||
114 | 0xbe882b08, 0xbe8a2b0a, | ||
115 | 0xbe8c2b0c, 0xbe8e2b0e, | ||
116 | 0xc06b003c, 0x00000000, | ||
117 | 0xc06b013c, 0x00000010, | ||
118 | 0xc06b023c, 0x00000020, | ||
119 | 0xc06b033c, 0x00000030, | ||
120 | 0x8078c078, 0x82798079, | ||
121 | 0x807c907c, 0xbf0a757c, | ||
122 | 0xbf85ffeb, 0xbef80176, | ||
123 | 0xbeee0080, 0xbefe00c1, | ||
124 | 0xbeff00c1, 0xbefa00ff, | ||
125 | 0x01000000, 0xe0724000, | ||
126 | 0x6e1e0000, 0xe0724100, | ||
127 | 0x6e1e0100, 0xe0724200, | ||
128 | 0x6e1e0200, 0xe0724300, | ||
129 | 0x6e1e0300, 0xbefe00c1, | ||
130 | 0xbeff00c1, 0xb8f54306, | ||
131 | 0x8675c175, 0xbf84002c, | ||
132 | 0xbf8a0000, 0x867aff73, | ||
133 | 0x04000000, 0xbf840028, | ||
134 | 0x8e758675, 0x8e758275, | ||
135 | 0xbefa0075, 0xb8ee2a05, | ||
136 | 0x806e816e, 0x8e6e8a6e, | ||
137 | 0xb8fa1605, 0x807a817a, | ||
138 | 0x8e7a867a, 0x806e7a6e, | ||
139 | 0x806eff6e, 0x00000080, | ||
140 | 0xbefa00ff, 0x01000000, | ||
141 | 0xbefc0080, 0xd28c0002, | ||
142 | 0x000100c1, 0xd28d0003, | ||
143 | 0x000204c1, 0xd1060002, | ||
144 | 0x00011103, 0x7e0602ff, | ||
145 | 0x00000200, 0xbefc00ff, | ||
146 | 0x00010000, 0xbe80007b, | ||
147 | 0x867bff7b, 0xff7fffff, | ||
148 | 0x877bff7b, 0x00058000, | ||
149 | 0xd8ec0000, 0x00000002, | ||
150 | 0xbf8c007f, 0xe0765000, | ||
151 | 0x6e1e0002, 0x32040702, | ||
152 | 0xd0c9006a, 0x0000eb02, | ||
153 | 0xbf87fff7, 0xbefb0000, | ||
154 | 0xbeee00ff, 0x00000400, | ||
155 | 0xbefe00c1, 0xbeff00c1, | ||
156 | 0xb8f52a05, 0x80758175, | ||
157 | 0x8e758275, 0x8e7a8875, | ||
158 | 0xbefa00ff, 0x01000000, | ||
159 | 0xbefc0084, 0xbf0a757c, | ||
160 | 0xbf840015, 0xbf11017c, | ||
161 | 0x8075ff75, 0x00001000, | ||
162 | 0x7e000300, 0x7e020301, | ||
163 | 0x7e040302, 0x7e060303, | ||
164 | 0xe0724000, 0x6e1e0000, | ||
165 | 0xe0724100, 0x6e1e0100, | ||
166 | 0xe0724200, 0x6e1e0200, | ||
167 | 0xe0724300, 0x6e1e0300, | ||
168 | 0x807c847c, 0x806eff6e, | ||
169 | 0x00000400, 0xbf0a757c, | ||
170 | 0xbf85ffef, 0xbf9c0000, | ||
171 | 0xbf8200ca, 0xbef8007e, | ||
172 | 0x8679ff7f, 0x0000ffff, | ||
173 | 0x8779ff79, 0x00040000, | ||
174 | 0xbefa0080, 0xbefb00ff, | ||
175 | 0x00807fac, 0x8676ff7f, | ||
176 | 0x08000000, 0x8f768376, | ||
177 | 0x877b767b, 0x8676ff7f, | ||
178 | 0x70000000, 0x8f768176, | ||
179 | 0x877b767b, 0x8676ff7f, | ||
180 | 0x04000000, 0xbf84001e, | ||
181 | 0xbefe00c1, 0xbeff00c1, | ||
182 | 0xb8f34306, 0x8673c173, | ||
183 | 0xbf840019, 0x8e738673, | ||
184 | 0x8e738273, 0xbefa0073, | ||
185 | 0xb8f22a05, 0x80728172, | ||
186 | 0x8e728a72, 0xb8f61605, | ||
187 | 0x80768176, 0x8e768676, | ||
188 | 0x80727672, 0x8072ff72, | ||
189 | 0x00000080, 0xbefa00ff, | ||
190 | 0x01000000, 0xbefc0080, | ||
191 | 0xe0510000, 0x721e0000, | ||
192 | 0xe0510100, 0x721e0000, | ||
193 | 0x807cff7c, 0x00000200, | ||
194 | 0x8072ff72, 0x00000200, | ||
195 | 0xbf0a737c, 0xbf85fff6, | ||
196 | 0xbef20080, 0xbefe00c1, | ||
197 | 0xbeff00c1, 0xb8f32a05, | ||
198 | 0x80738173, 0x8e738273, | ||
199 | 0x8e7a8873, 0xbefa00ff, | ||
200 | 0x01000000, 0xbef60072, | ||
201 | 0x8072ff72, 0x00000400, | ||
202 | 0xbefc0084, 0xbf11087c, | ||
203 | 0x8073ff73, 0x00008000, | ||
204 | 0xe0524000, 0x721e0000, | ||
205 | 0xe0524100, 0x721e0100, | ||
206 | 0xe0524200, 0x721e0200, | ||
207 | 0xe0524300, 0x721e0300, | ||
208 | 0xbf8c0f70, 0x7e000300, | ||
209 | 0x7e020301, 0x7e040302, | ||
210 | 0x7e060303, 0x807c847c, | ||
211 | 0x8072ff72, 0x00000400, | ||
212 | 0xbf0a737c, 0xbf85ffee, | ||
213 | 0xbf9c0000, 0xe0524000, | ||
214 | 0x761e0000, 0xe0524100, | ||
215 | 0x761e0100, 0xe0524200, | ||
216 | 0x761e0200, 0xe0524300, | ||
217 | 0x761e0300, 0xb8f22a05, | ||
218 | 0x80728172, 0x8e728a72, | ||
219 | 0xb8f61605, 0x80768176, | ||
220 | 0x8e768676, 0x80727672, | ||
221 | 0x80f2c072, 0xb8f31605, | ||
222 | 0x80738173, 0x8e738473, | ||
223 | 0x8e7a8273, 0xbefa00ff, | ||
224 | 0x01000000, 0xbefc0073, | ||
225 | 0xc031003c, 0x00000072, | ||
226 | 0x80f2c072, 0xbf8c007f, | ||
227 | 0x80fc907c, 0xbe802d00, | ||
228 | 0xbe822d02, 0xbe842d04, | ||
229 | 0xbe862d06, 0xbe882d08, | ||
230 | 0xbe8a2d0a, 0xbe8c2d0c, | ||
231 | 0xbe8e2d0e, 0xbf06807c, | ||
232 | 0xbf84fff1, 0xb8f22a05, | ||
233 | 0x80728172, 0x8e728a72, | ||
234 | 0xb8f61605, 0x80768176, | ||
235 | 0x8e768676, 0x80727672, | ||
236 | 0xbefa0084, 0xbefa00ff, | ||
237 | 0x01000000, 0xc0211cfc, | ||
238 | 0x00000072, 0x80728472, | ||
239 | 0xc0211c3c, 0x00000072, | ||
240 | 0x80728472, 0xc0211c7c, | ||
241 | 0x00000072, 0x80728472, | ||
242 | 0xc0211bbc, 0x00000072, | ||
243 | 0x80728472, 0xc0211bfc, | ||
244 | 0x00000072, 0x80728472, | ||
245 | 0xc0211d3c, 0x00000072, | ||
246 | 0x80728472, 0xc0211d7c, | ||
247 | 0x00000072, 0x80728472, | ||
248 | 0xc0211a3c, 0x00000072, | ||
249 | 0x80728472, 0xc0211a7c, | ||
250 | 0x00000072, 0x80728472, | ||
251 | 0xc0211dfc, 0x00000072, | ||
252 | 0x80728472, 0xc0211b3c, | ||
253 | 0x00000072, 0x80728472, | ||
254 | 0xc0211b7c, 0x00000072, | ||
255 | 0x80728472, 0xbf8c007f, | ||
256 | 0xbefc0073, 0xbefe006e, | ||
257 | 0xbeff006f, 0x867375ff, | ||
258 | 0x000003ff, 0xb9734803, | ||
259 | 0x867375ff, 0xfffff800, | ||
260 | 0x8f738b73, 0xb973a2c3, | ||
261 | 0xb977f801, 0x8673ff71, | ||
262 | 0xf0000000, 0x8f739c73, | ||
263 | 0x8e739073, 0xbef60080, | ||
264 | 0x87767376, 0x8673ff71, | ||
265 | 0x08000000, 0x8f739b73, | ||
266 | 0x8e738f73, 0x87767376, | ||
267 | 0x8673ff74, 0x00800000, | ||
268 | 0x8f739773, 0xb976f807, | ||
269 | 0x8671ff71, 0x0000ffff, | ||
270 | 0x86fe7e7e, 0x86ea6a6a, | ||
271 | 0xb974f802, 0xbf8a0000, | ||
272 | 0x95807370, 0xbf810000, | ||
273 | }; | ||
274 | |||
275 | |||
276 | static const uint32_t cwsr_trap_gfx9_hex[] = { | ||
277 | 0xbf820001, 0xbf82015a, | ||
278 | 0xb8f8f802, 0x89788678, | ||
279 | 0xb8f1f803, 0x866eff71, | ||
280 | 0x00000400, 0xbf850034, | ||
281 | 0x866eff71, 0x00000800, | ||
282 | 0xbf850003, 0x866eff71, | ||
283 | 0x00000100, 0xbf840008, | ||
284 | 0x866eff78, 0x00002000, | ||
285 | 0xbf840001, 0xbf810000, | ||
286 | 0x8778ff78, 0x00002000, | ||
287 | 0x80ec886c, 0x82ed806d, | ||
288 | 0xb8eef807, 0x866fff6e, | ||
289 | 0x001f8000, 0x8e6f8b6f, | ||
290 | 0x8977ff77, 0xfc000000, | ||
291 | 0x87776f77, 0x896eff6e, | ||
292 | 0x001f8000, 0xb96ef807, | ||
293 | 0xb8f0f812, 0xb8f1f813, | ||
294 | 0x8ef08870, 0xc0071bb8, | ||
295 | 0x00000000, 0xbf8cc07f, | ||
296 | 0xc0071c38, 0x00000008, | ||
297 | 0xbf8cc07f, 0x86ee6e6e, | ||
298 | 0xbf840001, 0xbe801d6e, | ||
299 | 0xb8f1f803, 0x8671ff71, | ||
300 | 0x000001ff, 0xbf850002, | ||
301 | 0x806c846c, 0x826d806d, | ||
302 | 0x866dff6d, 0x0000ffff, | ||
303 | 0x8f6e8b77, 0x866eff6e, | ||
304 | 0x001f8000, 0xb96ef807, | ||
305 | 0x86fe7e7e, 0x86ea6a6a, | ||
306 | 0xb978f802, 0xbe801f6c, | ||
307 | 0x866dff6d, 0x0000ffff, | ||
308 | 0xbef00080, 0xb9700283, | ||
309 | 0xb8f02407, 0x8e709c70, | ||
310 | 0x876d706d, 0xb8f003c7, | ||
311 | 0x8e709b70, 0x876d706d, | ||
312 | 0xb8f0f807, 0x8670ff70, | ||
313 | 0x00007fff, 0xb970f807, | ||
314 | 0xbeee007e, 0xbeef007f, | ||
315 | 0xbefe0180, 0xbf900004, | ||
316 | 0x87708478, 0xb970f802, | ||
317 | 0xbf8e0002, 0xbf88fffe, | ||
318 | 0xb8f02a05, 0x80708170, | ||
319 | 0x8e708a70, 0xb8f11605, | ||
320 | 0x80718171, 0x8e718671, | ||
321 | 0x80707170, 0x80707e70, | ||
322 | 0x8271807f, 0x8671ff71, | ||
323 | 0x0000ffff, 0xc0471cb8, | ||
324 | 0x00000040, 0xbf8cc07f, | ||
325 | 0xc04b1d38, 0x00000048, | ||
326 | 0xbf8cc07f, 0xc0431e78, | ||
327 | 0x00000058, 0xbf8cc07f, | ||
328 | 0xc0471eb8, 0x0000005c, | ||
329 | 0xbf8cc07f, 0xbef4007e, | ||
330 | 0x8675ff7f, 0x0000ffff, | ||
331 | 0x8775ff75, 0x00040000, | ||
332 | 0xbef60080, 0xbef700ff, | ||
333 | 0x00807fac, 0x8670ff7f, | ||
334 | 0x08000000, 0x8f708370, | ||
335 | 0x87777077, 0x8670ff7f, | ||
336 | 0x70000000, 0x8f708170, | ||
337 | 0x87777077, 0xbefb007c, | ||
338 | 0xbefa0080, 0xb8fa2a05, | ||
339 | 0x807a817a, 0x8e7a8a7a, | ||
340 | 0xb8f01605, 0x80708170, | ||
341 | 0x8e708670, 0x807a707a, | ||
342 | 0xbef60084, 0xbef600ff, | ||
343 | 0x01000000, 0xbefe007c, | ||
344 | 0xbefc007a, 0xc0611efa, | ||
345 | 0x0000007c, 0xbf8cc07f, | ||
346 | 0x807a847a, 0xbefc007e, | ||
347 | 0xbefe007c, 0xbefc007a, | ||
348 | 0xc0611b3a, 0x0000007c, | ||
349 | 0xbf8cc07f, 0x807a847a, | ||
350 | 0xbefc007e, 0xbefe007c, | ||
351 | 0xbefc007a, 0xc0611b7a, | ||
352 | 0x0000007c, 0xbf8cc07f, | ||
353 | 0x807a847a, 0xbefc007e, | ||
354 | 0xbefe007c, 0xbefc007a, | ||
355 | 0xc0611bba, 0x0000007c, | ||
356 | 0xbf8cc07f, 0x807a847a, | ||
357 | 0xbefc007e, 0xbefe007c, | ||
358 | 0xbefc007a, 0xc0611bfa, | ||
359 | 0x0000007c, 0xbf8cc07f, | ||
360 | 0x807a847a, 0xbefc007e, | ||
361 | 0xbefe007c, 0xbefc007a, | ||
362 | 0xc0611e3a, 0x0000007c, | ||
363 | 0xbf8cc07f, 0x807a847a, | ||
364 | 0xbefc007e, 0xb8f1f803, | ||
365 | 0xbefe007c, 0xbefc007a, | ||
366 | 0xc0611c7a, 0x0000007c, | ||
367 | 0xbf8cc07f, 0x807a847a, | ||
368 | 0xbefc007e, 0xbefe007c, | ||
369 | 0xbefc007a, 0xc0611a3a, | ||
370 | 0x0000007c, 0xbf8cc07f, | ||
371 | 0x807a847a, 0xbefc007e, | ||
372 | 0xbefe007c, 0xbefc007a, | ||
373 | 0xc0611a7a, 0x0000007c, | ||
374 | 0xbf8cc07f, 0x807a847a, | ||
375 | 0xbefc007e, 0xb8fbf801, | ||
376 | 0xbefe007c, 0xbefc007a, | ||
377 | 0xc0611efa, 0x0000007c, | ||
378 | 0xbf8cc07f, 0x807a847a, | ||
379 | 0xbefc007e, 0x8670ff7f, | ||
380 | 0x04000000, 0xbeef0080, | ||
381 | 0x876f6f70, 0xb8fa2a05, | ||
382 | 0x807a817a, 0x8e7a8a7a, | ||
383 | 0xb8f11605, 0x80718171, | ||
384 | 0x8e718471, 0x8e768271, | ||
385 | 0xbef600ff, 0x01000000, | ||
386 | 0xbef20174, 0x80747a74, | ||
387 | 0x82758075, 0xbefc0080, | ||
388 | 0xbf800000, 0xbe802b00, | ||
389 | 0xbe822b02, 0xbe842b04, | ||
390 | 0xbe862b06, 0xbe882b08, | ||
391 | 0xbe8a2b0a, 0xbe8c2b0c, | ||
392 | 0xbe8e2b0e, 0xc06b003a, | ||
393 | 0x00000000, 0xbf8cc07f, | ||
394 | 0xc06b013a, 0x00000010, | ||
395 | 0xbf8cc07f, 0xc06b023a, | ||
396 | 0x00000020, 0xbf8cc07f, | ||
397 | 0xc06b033a, 0x00000030, | ||
398 | 0xbf8cc07f, 0x8074c074, | ||
399 | 0x82758075, 0x807c907c, | ||
400 | 0xbf0a717c, 0xbf85ffe7, | ||
401 | 0xbef40172, 0xbefa0080, | ||
402 | 0xbefe00c1, 0xbeff00c1, | ||
403 | 0xbee80080, 0xbee90080, | ||
404 | 0xbef600ff, 0x01000000, | ||
405 | 0xe0724000, 0x7a1d0000, | ||
406 | 0xe0724100, 0x7a1d0100, | ||
407 | 0xe0724200, 0x7a1d0200, | ||
408 | 0xe0724300, 0x7a1d0300, | ||
409 | 0xbefe00c1, 0xbeff00c1, | ||
410 | 0xb8f14306, 0x8671c171, | ||
411 | 0xbf84002c, 0xbf8a0000, | ||
412 | 0x8670ff6f, 0x04000000, | ||
413 | 0xbf840028, 0x8e718671, | ||
414 | 0x8e718271, 0xbef60071, | ||
415 | 0xb8fa2a05, 0x807a817a, | ||
416 | 0x8e7a8a7a, 0xb8f01605, | ||
417 | 0x80708170, 0x8e708670, | ||
418 | 0x807a707a, 0x807aff7a, | ||
419 | 0x00000080, 0xbef600ff, | ||
420 | 0x01000000, 0xbefc0080, | ||
421 | 0xd28c0002, 0x000100c1, | ||
422 | 0xd28d0003, 0x000204c1, | ||
423 | 0xd1060002, 0x00011103, | ||
424 | 0x7e0602ff, 0x00000200, | ||
425 | 0xbefc00ff, 0x00010000, | ||
426 | 0xbe800077, 0x8677ff77, | ||
427 | 0xff7fffff, 0x8777ff77, | ||
428 | 0x00058000, 0xd8ec0000, | ||
429 | 0x00000002, 0xbf8cc07f, | ||
430 | 0xe0765000, 0x7a1d0002, | ||
431 | 0x68040702, 0xd0c9006a, | ||
432 | 0x0000e302, 0xbf87fff7, | ||
433 | 0xbef70000, 0xbefa00ff, | ||
434 | 0x00000400, 0xbefe00c1, | ||
435 | 0xbeff00c1, 0xb8f12a05, | ||
436 | 0x80718171, 0x8e718271, | ||
437 | 0x8e768871, 0xbef600ff, | ||
438 | 0x01000000, 0xbefc0084, | ||
439 | 0xbf0a717c, 0xbf840015, | ||
440 | 0xbf11017c, 0x8071ff71, | ||
441 | 0x00001000, 0x7e000300, | ||
442 | 0x7e020301, 0x7e040302, | ||
443 | 0x7e060303, 0xe0724000, | ||
444 | 0x7a1d0000, 0xe0724100, | ||
445 | 0x7a1d0100, 0xe0724200, | ||
446 | 0x7a1d0200, 0xe0724300, | ||
447 | 0x7a1d0300, 0x807c847c, | ||
448 | 0x807aff7a, 0x00000400, | ||
449 | 0xbf0a717c, 0xbf85ffef, | ||
450 | 0xbf9c0000, 0xbf8200d9, | ||
451 | 0xbef4007e, 0x8675ff7f, | ||
452 | 0x0000ffff, 0x8775ff75, | ||
453 | 0x00040000, 0xbef60080, | ||
454 | 0xbef700ff, 0x00807fac, | ||
455 | 0x866eff7f, 0x08000000, | ||
456 | 0x8f6e836e, 0x87776e77, | ||
457 | 0x866eff7f, 0x70000000, | ||
458 | 0x8f6e816e, 0x87776e77, | ||
459 | 0x866eff7f, 0x04000000, | ||
460 | 0xbf84001e, 0xbefe00c1, | ||
461 | 0xbeff00c1, 0xb8ef4306, | ||
462 | 0x866fc16f, 0xbf840019, | ||
463 | 0x8e6f866f, 0x8e6f826f, | ||
464 | 0xbef6006f, 0xb8f82a05, | ||
465 | 0x80788178, 0x8e788a78, | ||
466 | 0xb8ee1605, 0x806e816e, | ||
467 | 0x8e6e866e, 0x80786e78, | ||
468 | 0x8078ff78, 0x00000080, | ||
469 | 0xbef600ff, 0x01000000, | ||
470 | 0xbefc0080, 0xe0510000, | ||
471 | 0x781d0000, 0xe0510100, | ||
472 | 0x781d0000, 0x807cff7c, | ||
473 | 0x00000200, 0x8078ff78, | ||
474 | 0x00000200, 0xbf0a6f7c, | ||
475 | 0xbf85fff6, 0xbef80080, | ||
476 | 0xbefe00c1, 0xbeff00c1, | ||
477 | 0xb8ef2a05, 0x806f816f, | ||
478 | 0x8e6f826f, 0x8e76886f, | ||
479 | 0xbef600ff, 0x01000000, | ||
480 | 0xbeee0078, 0x8078ff78, | ||
481 | 0x00000400, 0xbefc0084, | ||
482 | 0xbf11087c, 0x806fff6f, | ||
483 | 0x00008000, 0xe0524000, | ||
484 | 0x781d0000, 0xe0524100, | ||
485 | 0x781d0100, 0xe0524200, | ||
486 | 0x781d0200, 0xe0524300, | ||
487 | 0x781d0300, 0xbf8c0f70, | ||
488 | 0x7e000300, 0x7e020301, | ||
489 | 0x7e040302, 0x7e060303, | ||
490 | 0x807c847c, 0x8078ff78, | ||
491 | 0x00000400, 0xbf0a6f7c, | ||
492 | 0xbf85ffee, 0xbf9c0000, | ||
493 | 0xe0524000, 0x6e1d0000, | ||
494 | 0xe0524100, 0x6e1d0100, | ||
495 | 0xe0524200, 0x6e1d0200, | ||
496 | 0xe0524300, 0x6e1d0300, | ||
497 | 0xb8f82a05, 0x80788178, | ||
498 | 0x8e788a78, 0xb8ee1605, | ||
499 | 0x806e816e, 0x8e6e866e, | ||
500 | 0x80786e78, 0x80f8c078, | ||
501 | 0xb8ef1605, 0x806f816f, | ||
502 | 0x8e6f846f, 0x8e76826f, | ||
503 | 0xbef600ff, 0x01000000, | ||
504 | 0xbefc006f, 0xc031003a, | ||
505 | 0x00000078, 0x80f8c078, | ||
506 | 0xbf8cc07f, 0x80fc907c, | ||
507 | 0xbf800000, 0xbe802d00, | ||
508 | 0xbe822d02, 0xbe842d04, | ||
509 | 0xbe862d06, 0xbe882d08, | ||
510 | 0xbe8a2d0a, 0xbe8c2d0c, | ||
511 | 0xbe8e2d0e, 0xbf06807c, | ||
512 | 0xbf84fff0, 0xb8f82a05, | ||
513 | 0x80788178, 0x8e788a78, | ||
514 | 0xb8ee1605, 0x806e816e, | ||
515 | 0x8e6e866e, 0x80786e78, | ||
516 | 0xbef60084, 0xbef600ff, | ||
517 | 0x01000000, 0xc0211bfa, | ||
518 | 0x00000078, 0x80788478, | ||
519 | 0xc0211b3a, 0x00000078, | ||
520 | 0x80788478, 0xc0211b7a, | ||
521 | 0x00000078, 0x80788478, | ||
522 | 0xc0211eba, 0x00000078, | ||
523 | 0x80788478, 0xc0211efa, | ||
524 | 0x00000078, 0x80788478, | ||
525 | 0xc0211c3a, 0x00000078, | ||
526 | 0x80788478, 0xc0211c7a, | ||
527 | 0x00000078, 0x80788478, | ||
528 | 0xc0211a3a, 0x00000078, | ||
529 | 0x80788478, 0xc0211a7a, | ||
530 | 0x00000078, 0x80788478, | ||
531 | 0xc0211cfa, 0x00000078, | ||
532 | 0x80788478, 0xbf8cc07f, | ||
533 | 0xbefc006f, 0xbefe007a, | ||
534 | 0xbeff007b, 0x866f71ff, | ||
535 | 0x000003ff, 0xb96f4803, | ||
536 | 0x866f71ff, 0xfffff800, | ||
537 | 0x8f6f8b6f, 0xb96fa2c3, | ||
538 | 0xb973f801, 0xb8ee2a05, | ||
539 | 0x806e816e, 0x8e6e8a6e, | ||
540 | 0xb8ef1605, 0x806f816f, | ||
541 | 0x8e6f866f, 0x806e6f6e, | ||
542 | 0x806e746e, 0x826f8075, | ||
543 | 0x866fff6f, 0x0000ffff, | ||
544 | 0xc0071cb7, 0x00000040, | ||
545 | 0xc00b1d37, 0x00000048, | ||
546 | 0xc0031e77, 0x00000058, | ||
547 | 0xc0071eb7, 0x0000005c, | ||
548 | 0xbf8cc07f, 0x866fff6d, | ||
549 | 0xf0000000, 0x8f6f9c6f, | ||
550 | 0x8e6f906f, 0xbeee0080, | ||
551 | 0x876e6f6e, 0x866fff6d, | ||
552 | 0x08000000, 0x8f6f9b6f, | ||
553 | 0x8e6f8f6f, 0x876e6f6e, | ||
554 | 0x866fff70, 0x00800000, | ||
555 | 0x8f6f976f, 0xb96ef807, | ||
556 | 0x866dff6d, 0x0000ffff, | ||
557 | 0x86fe7e7e, 0x86ea6a6a, | ||
558 | 0xb970f802, 0xbf8a0000, | ||
559 | 0x95806f6c, 0xbf810000, | ||
560 | }; | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index 997a383dcb8b..a2a04bb64096 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | |||
@@ -20,9 +20,12 @@ | |||
20 | * OTHER DEALINGS IN THE SOFTWARE. | 20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #if 0 | 23 | /* To compile this assembly code: |
24 | HW (VI) source code for CWSR trap handler | 24 | * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex |
25 | #Version 18 + multiple trap handler | 25 | */ |
26 | |||
27 | /* HW (VI) source code for CWSR trap handler */ | ||
28 | /* Version 18 + multiple trap handler */ | ||
26 | 29 | ||
27 | // this performance-optimal version was originally from Seven Xu at SRDC | 30 | // this performance-optimal version was originally from Seven Xu at SRDC |
28 | 31 | ||
@@ -98,6 +101,7 @@ var SWIZZLE_EN = 0 //whether we use swi | |||
98 | /**************************************************************************/ | 101 | /**************************************************************************/ |
99 | var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 | 102 | var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 |
100 | var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 | 103 | var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 |
104 | var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 | ||
101 | var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 | 105 | var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 |
102 | 106 | ||
103 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 | 107 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 |
@@ -149,7 +153,7 @@ var s_save_spi_init_lo = exec_lo | |||
149 | var s_save_spi_init_hi = exec_hi | 153 | var s_save_spi_init_hi = exec_hi |
150 | 154 | ||
151 | //tba_lo and tba_hi need to be saved/restored | 155 | //tba_lo and tba_hi need to be saved/restored |
152 | var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} | 156 | var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} |
153 | var s_save_pc_hi = ttmp1 | 157 | var s_save_pc_hi = ttmp1 |
154 | var s_save_exec_lo = ttmp2 | 158 | var s_save_exec_lo = ttmp2 |
155 | var s_save_exec_hi = ttmp3 | 159 | var s_save_exec_hi = ttmp3 |
@@ -319,6 +323,10 @@ end | |||
319 | s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC | 323 | s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC |
320 | end | 324 | end |
321 | 325 | ||
326 | // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. | ||
327 | s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) | ||
328 | s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp | ||
329 | |||
322 | L_SLEEP: | 330 | L_SLEEP: |
323 | s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 | 331 | s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 |
324 | 332 | ||
@@ -1007,8 +1015,6 @@ end | |||
1007 | 1015 | ||
1008 | s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS | 1016 | s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS |
1009 | 1017 | ||
1010 | s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS | ||
1011 | |||
1012 | //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: | 1018 | //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: |
1013 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) | 1019 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) |
1014 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) | 1020 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) |
@@ -1044,6 +1050,7 @@ end | |||
1044 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT | 1050 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT |
1045 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp | 1051 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp |
1046 | 1052 | ||
1053 | s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS | ||
1047 | s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 | 1054 | s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 |
1048 | s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 | 1055 | s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 |
1049 | s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu | 1056 | s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu |
@@ -1127,258 +1134,3 @@ end | |||
1127 | function get_hwreg_size_bytes | 1134 | function get_hwreg_size_bytes |
1128 | return 128 //HWREG size 128 bytes | 1135 | return 128 //HWREG size 128 bytes |
1129 | end | 1136 | end |
1130 | |||
1131 | |||
1132 | #endif | ||
1133 | |||
1134 | static const uint32_t cwsr_trap_gfx8_hex[] = { | ||
1135 | 0xbf820001, 0xbf820123, | ||
1136 | 0xb8f4f802, 0x89748674, | ||
1137 | 0xb8f5f803, 0x8675ff75, | ||
1138 | 0x00000400, 0xbf850011, | ||
1139 | 0xc00a1e37, 0x00000000, | ||
1140 | 0xbf8c007f, 0x87777978, | ||
1141 | 0xbf840002, 0xb974f802, | ||
1142 | 0xbe801d78, 0xb8f5f803, | ||
1143 | 0x8675ff75, 0x000001ff, | ||
1144 | 0xbf850002, 0x80708470, | ||
1145 | 0x82718071, 0x8671ff71, | ||
1146 | 0x0000ffff, 0xb974f802, | ||
1147 | 0xbe801f70, 0xb8f5f803, | ||
1148 | 0x8675ff75, 0x00000100, | ||
1149 | 0xbf840006, 0xbefa0080, | ||
1150 | 0xb97a0203, 0x8671ff71, | ||
1151 | 0x0000ffff, 0x80f08870, | ||
1152 | 0x82f18071, 0xbefa0080, | ||
1153 | 0xb97a0283, 0xbef60068, | ||
1154 | 0xbef70069, 0xb8fa1c07, | ||
1155 | 0x8e7a9c7a, 0x87717a71, | ||
1156 | 0xb8fa03c7, 0x8e7a9b7a, | ||
1157 | 0x87717a71, 0xb8faf807, | ||
1158 | 0x867aff7a, 0x00007fff, | ||
1159 | 0xb97af807, 0xbef2007e, | ||
1160 | 0xbef3007f, 0xbefe0180, | ||
1161 | 0xbf900004, 0xbf8e0002, | ||
1162 | 0xbf88fffe, 0xbef8007e, | ||
1163 | 0x8679ff7f, 0x0000ffff, | ||
1164 | 0x8779ff79, 0x00040000, | ||
1165 | 0xbefa0080, 0xbefb00ff, | ||
1166 | 0x00807fac, 0x867aff7f, | ||
1167 | 0x08000000, 0x8f7a837a, | ||
1168 | 0x877b7a7b, 0x867aff7f, | ||
1169 | 0x70000000, 0x8f7a817a, | ||
1170 | 0x877b7a7b, 0xbeef007c, | ||
1171 | 0xbeee0080, 0xb8ee2a05, | ||
1172 | 0x806e816e, 0x8e6e8a6e, | ||
1173 | 0xb8fa1605, 0x807a817a, | ||
1174 | 0x8e7a867a, 0x806e7a6e, | ||
1175 | 0xbefa0084, 0xbefa00ff, | ||
1176 | 0x01000000, 0xbefe007c, | ||
1177 | 0xbefc006e, 0xc0611bfc, | ||
1178 | 0x0000007c, 0x806e846e, | ||
1179 | 0xbefc007e, 0xbefe007c, | ||
1180 | 0xbefc006e, 0xc0611c3c, | ||
1181 | 0x0000007c, 0x806e846e, | ||
1182 | 0xbefc007e, 0xbefe007c, | ||
1183 | 0xbefc006e, 0xc0611c7c, | ||
1184 | 0x0000007c, 0x806e846e, | ||
1185 | 0xbefc007e, 0xbefe007c, | ||
1186 | 0xbefc006e, 0xc0611cbc, | ||
1187 | 0x0000007c, 0x806e846e, | ||
1188 | 0xbefc007e, 0xbefe007c, | ||
1189 | 0xbefc006e, 0xc0611cfc, | ||
1190 | 0x0000007c, 0x806e846e, | ||
1191 | 0xbefc007e, 0xbefe007c, | ||
1192 | 0xbefc006e, 0xc0611d3c, | ||
1193 | 0x0000007c, 0x806e846e, | ||
1194 | 0xbefc007e, 0xb8f5f803, | ||
1195 | 0xbefe007c, 0xbefc006e, | ||
1196 | 0xc0611d7c, 0x0000007c, | ||
1197 | 0x806e846e, 0xbefc007e, | ||
1198 | 0xbefe007c, 0xbefc006e, | ||
1199 | 0xc0611dbc, 0x0000007c, | ||
1200 | 0x806e846e, 0xbefc007e, | ||
1201 | 0xbefe007c, 0xbefc006e, | ||
1202 | 0xc0611dfc, 0x0000007c, | ||
1203 | 0x806e846e, 0xbefc007e, | ||
1204 | 0xb8eff801, 0xbefe007c, | ||
1205 | 0xbefc006e, 0xc0611bfc, | ||
1206 | 0x0000007c, 0x806e846e, | ||
1207 | 0xbefc007e, 0xbefe007c, | ||
1208 | 0xbefc006e, 0xc0611b3c, | ||
1209 | 0x0000007c, 0x806e846e, | ||
1210 | 0xbefc007e, 0xbefe007c, | ||
1211 | 0xbefc006e, 0xc0611b7c, | ||
1212 | 0x0000007c, 0x806e846e, | ||
1213 | 0xbefc007e, 0x867aff7f, | ||
1214 | 0x04000000, 0xbef30080, | ||
1215 | 0x8773737a, 0xb8ee2a05, | ||
1216 | 0x806e816e, 0x8e6e8a6e, | ||
1217 | 0xb8f51605, 0x80758175, | ||
1218 | 0x8e758475, 0x8e7a8275, | ||
1219 | 0xbefa00ff, 0x01000000, | ||
1220 | 0xbef60178, 0x80786e78, | ||
1221 | 0x82798079, 0xbefc0080, | ||
1222 | 0xbe802b00, 0xbe822b02, | ||
1223 | 0xbe842b04, 0xbe862b06, | ||
1224 | 0xbe882b08, 0xbe8a2b0a, | ||
1225 | 0xbe8c2b0c, 0xbe8e2b0e, | ||
1226 | 0xc06b003c, 0x00000000, | ||
1227 | 0xc06b013c, 0x00000010, | ||
1228 | 0xc06b023c, 0x00000020, | ||
1229 | 0xc06b033c, 0x00000030, | ||
1230 | 0x8078c078, 0x82798079, | ||
1231 | 0x807c907c, 0xbf0a757c, | ||
1232 | 0xbf85ffeb, 0xbef80176, | ||
1233 | 0xbeee0080, 0xbefe00c1, | ||
1234 | 0xbeff00c1, 0xbefa00ff, | ||
1235 | 0x01000000, 0xe0724000, | ||
1236 | 0x6e1e0000, 0xe0724100, | ||
1237 | 0x6e1e0100, 0xe0724200, | ||
1238 | 0x6e1e0200, 0xe0724300, | ||
1239 | 0x6e1e0300, 0xbefe00c1, | ||
1240 | 0xbeff00c1, 0xb8f54306, | ||
1241 | 0x8675c175, 0xbf84002c, | ||
1242 | 0xbf8a0000, 0x867aff73, | ||
1243 | 0x04000000, 0xbf840028, | ||
1244 | 0x8e758675, 0x8e758275, | ||
1245 | 0xbefa0075, 0xb8ee2a05, | ||
1246 | 0x806e816e, 0x8e6e8a6e, | ||
1247 | 0xb8fa1605, 0x807a817a, | ||
1248 | 0x8e7a867a, 0x806e7a6e, | ||
1249 | 0x806eff6e, 0x00000080, | ||
1250 | 0xbefa00ff, 0x01000000, | ||
1251 | 0xbefc0080, 0xd28c0002, | ||
1252 | 0x000100c1, 0xd28d0003, | ||
1253 | 0x000204c1, 0xd1060002, | ||
1254 | 0x00011103, 0x7e0602ff, | ||
1255 | 0x00000200, 0xbefc00ff, | ||
1256 | 0x00010000, 0xbe80007b, | ||
1257 | 0x867bff7b, 0xff7fffff, | ||
1258 | 0x877bff7b, 0x00058000, | ||
1259 | 0xd8ec0000, 0x00000002, | ||
1260 | 0xbf8c007f, 0xe0765000, | ||
1261 | 0x6e1e0002, 0x32040702, | ||
1262 | 0xd0c9006a, 0x0000eb02, | ||
1263 | 0xbf87fff7, 0xbefb0000, | ||
1264 | 0xbeee00ff, 0x00000400, | ||
1265 | 0xbefe00c1, 0xbeff00c1, | ||
1266 | 0xb8f52a05, 0x80758175, | ||
1267 | 0x8e758275, 0x8e7a8875, | ||
1268 | 0xbefa00ff, 0x01000000, | ||
1269 | 0xbefc0084, 0xbf0a757c, | ||
1270 | 0xbf840015, 0xbf11017c, | ||
1271 | 0x8075ff75, 0x00001000, | ||
1272 | 0x7e000300, 0x7e020301, | ||
1273 | 0x7e040302, 0x7e060303, | ||
1274 | 0xe0724000, 0x6e1e0000, | ||
1275 | 0xe0724100, 0x6e1e0100, | ||
1276 | 0xe0724200, 0x6e1e0200, | ||
1277 | 0xe0724300, 0x6e1e0300, | ||
1278 | 0x807c847c, 0x806eff6e, | ||
1279 | 0x00000400, 0xbf0a757c, | ||
1280 | 0xbf85ffef, 0xbf9c0000, | ||
1281 | 0xbf8200ca, 0xbef8007e, | ||
1282 | 0x8679ff7f, 0x0000ffff, | ||
1283 | 0x8779ff79, 0x00040000, | ||
1284 | 0xbefa0080, 0xbefb00ff, | ||
1285 | 0x00807fac, 0x8676ff7f, | ||
1286 | 0x08000000, 0x8f768376, | ||
1287 | 0x877b767b, 0x8676ff7f, | ||
1288 | 0x70000000, 0x8f768176, | ||
1289 | 0x877b767b, 0x8676ff7f, | ||
1290 | 0x04000000, 0xbf84001e, | ||
1291 | 0xbefe00c1, 0xbeff00c1, | ||
1292 | 0xb8f34306, 0x8673c173, | ||
1293 | 0xbf840019, 0x8e738673, | ||
1294 | 0x8e738273, 0xbefa0073, | ||
1295 | 0xb8f22a05, 0x80728172, | ||
1296 | 0x8e728a72, 0xb8f61605, | ||
1297 | 0x80768176, 0x8e768676, | ||
1298 | 0x80727672, 0x8072ff72, | ||
1299 | 0x00000080, 0xbefa00ff, | ||
1300 | 0x01000000, 0xbefc0080, | ||
1301 | 0xe0510000, 0x721e0000, | ||
1302 | 0xe0510100, 0x721e0000, | ||
1303 | 0x807cff7c, 0x00000200, | ||
1304 | 0x8072ff72, 0x00000200, | ||
1305 | 0xbf0a737c, 0xbf85fff6, | ||
1306 | 0xbef20080, 0xbefe00c1, | ||
1307 | 0xbeff00c1, 0xb8f32a05, | ||
1308 | 0x80738173, 0x8e738273, | ||
1309 | 0x8e7a8873, 0xbefa00ff, | ||
1310 | 0x01000000, 0xbef60072, | ||
1311 | 0x8072ff72, 0x00000400, | ||
1312 | 0xbefc0084, 0xbf11087c, | ||
1313 | 0x8073ff73, 0x00008000, | ||
1314 | 0xe0524000, 0x721e0000, | ||
1315 | 0xe0524100, 0x721e0100, | ||
1316 | 0xe0524200, 0x721e0200, | ||
1317 | 0xe0524300, 0x721e0300, | ||
1318 | 0xbf8c0f70, 0x7e000300, | ||
1319 | 0x7e020301, 0x7e040302, | ||
1320 | 0x7e060303, 0x807c847c, | ||
1321 | 0x8072ff72, 0x00000400, | ||
1322 | 0xbf0a737c, 0xbf85ffee, | ||
1323 | 0xbf9c0000, 0xe0524000, | ||
1324 | 0x761e0000, 0xe0524100, | ||
1325 | 0x761e0100, 0xe0524200, | ||
1326 | 0x761e0200, 0xe0524300, | ||
1327 | 0x761e0300, 0xb8f22a05, | ||
1328 | 0x80728172, 0x8e728a72, | ||
1329 | 0xb8f61605, 0x80768176, | ||
1330 | 0x8e768676, 0x80727672, | ||
1331 | 0x80f2c072, 0xb8f31605, | ||
1332 | 0x80738173, 0x8e738473, | ||
1333 | 0x8e7a8273, 0xbefa00ff, | ||
1334 | 0x01000000, 0xbefc0073, | ||
1335 | 0xc031003c, 0x00000072, | ||
1336 | 0x80f2c072, 0xbf8c007f, | ||
1337 | 0x80fc907c, 0xbe802d00, | ||
1338 | 0xbe822d02, 0xbe842d04, | ||
1339 | 0xbe862d06, 0xbe882d08, | ||
1340 | 0xbe8a2d0a, 0xbe8c2d0c, | ||
1341 | 0xbe8e2d0e, 0xbf06807c, | ||
1342 | 0xbf84fff1, 0xb8f22a05, | ||
1343 | 0x80728172, 0x8e728a72, | ||
1344 | 0xb8f61605, 0x80768176, | ||
1345 | 0x8e768676, 0x80727672, | ||
1346 | 0xbefa0084, 0xbefa00ff, | ||
1347 | 0x01000000, 0xc0211cfc, | ||
1348 | 0x00000072, 0x80728472, | ||
1349 | 0xc0211c3c, 0x00000072, | ||
1350 | 0x80728472, 0xc0211c7c, | ||
1351 | 0x00000072, 0x80728472, | ||
1352 | 0xc0211bbc, 0x00000072, | ||
1353 | 0x80728472, 0xc0211bfc, | ||
1354 | 0x00000072, 0x80728472, | ||
1355 | 0xc0211d3c, 0x00000072, | ||
1356 | 0x80728472, 0xc0211d7c, | ||
1357 | 0x00000072, 0x80728472, | ||
1358 | 0xc0211a3c, 0x00000072, | ||
1359 | 0x80728472, 0xc0211a7c, | ||
1360 | 0x00000072, 0x80728472, | ||
1361 | 0xc0211dfc, 0x00000072, | ||
1362 | 0x80728472, 0xc0211b3c, | ||
1363 | 0x00000072, 0x80728472, | ||
1364 | 0xc0211b7c, 0x00000072, | ||
1365 | 0x80728472, 0xbf8c007f, | ||
1366 | 0x8671ff71, 0x0000ffff, | ||
1367 | 0xbefc0073, 0xbefe006e, | ||
1368 | 0xbeff006f, 0x867375ff, | ||
1369 | 0x000003ff, 0xb9734803, | ||
1370 | 0x867375ff, 0xfffff800, | ||
1371 | 0x8f738b73, 0xb973a2c3, | ||
1372 | 0xb977f801, 0x8673ff71, | ||
1373 | 0xf0000000, 0x8f739c73, | ||
1374 | 0x8e739073, 0xbef60080, | ||
1375 | 0x87767376, 0x8673ff71, | ||
1376 | 0x08000000, 0x8f739b73, | ||
1377 | 0x8e738f73, 0x87767376, | ||
1378 | 0x8673ff74, 0x00800000, | ||
1379 | 0x8f739773, 0xb976f807, | ||
1380 | 0x86fe7e7e, 0x86ea6a6a, | ||
1381 | 0xb974f802, 0xbf8a0000, | ||
1382 | 0x95807370, 0xbf810000, | ||
1383 | }; | ||
1384 | |||
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm new file mode 100644 index 000000000000..998be96be736 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | |||
@@ -0,0 +1,1214 @@ | |||
1 | /* | ||
2 | * Copyright 2016 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | /* To compile this assembly code: | ||
24 | * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex | ||
25 | */ | ||
26 | |||
27 | /* HW (GFX9) source code for CWSR trap handler */ | ||
28 | /* Version 18 + multiple trap handler */ | ||
29 | |||
30 | // this performance-optimal version was originally from Seven Xu at SRDC | ||
31 | |||
32 | // Revison #18 --... | ||
33 | /* Rev History | ||
34 | ** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) | ||
35 | ** #4. SR Memory Layout: | ||
36 | ** 1. VGPR-SGPR-HWREG-{LDS} | ||
37 | ** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. | ||
38 | ** #5. Update: 1. Accurate g8sr_ts_save_d timestamp | ||
39 | ** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) | ||
40 | ** #7. Update: 1. don't barrier if noLDS | ||
41 | ** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version | ||
42 | ** 2. Fix SQ issue by s_sleep 2 | ||
43 | ** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last | ||
44 | ** 2. optimize s_buffer save by burst 16sgprs... | ||
45 | ** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. | ||
46 | ** #11. Update 1. Add 2 more timestamp for debug version | ||
47 | ** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance | ||
48 | ** #13. Integ 1. Always use MUBUF for PV trap shader... | ||
49 | ** #14. Update 1. s_buffer_store soft clause... | ||
50 | ** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. | ||
51 | ** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree | ||
52 | ** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] | ||
53 | ** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... | ||
54 | ** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 | ||
55 | ** 2. FUNC - Handle non-CWSR traps | ||
56 | */ | ||
57 | |||
58 | var G8SR_WDMEM_HWREG_OFFSET = 0 | ||
59 | var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes | ||
60 | |||
61 | // Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. | ||
62 | |||
63 | var G8SR_DEBUG_TIMESTAMP = 0 | ||
64 | var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset | ||
65 | var s_g8sr_ts_save_s = s[34:35] // save start | ||
66 | var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi | ||
67 | var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ | ||
68 | var s_g8sr_ts_save_d = s[40:41] // save end | ||
69 | var s_g8sr_ts_restore_s = s[42:43] // restore start | ||
70 | var s_g8sr_ts_restore_d = s[44:45] // restore end | ||
71 | |||
72 | var G8SR_VGPR_SR_IN_DWX4 = 0 | ||
73 | var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes | ||
74 | var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 | ||
75 | |||
76 | |||
77 | /*************************************************************************/ | ||
78 | /* control on how to run the shader */ | ||
79 | /*************************************************************************/ | ||
80 | //any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) | ||
81 | var EMU_RUN_HACK = 0 | ||
82 | var EMU_RUN_HACK_RESTORE_NORMAL = 0 | ||
83 | var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 | ||
84 | var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 | ||
85 | var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK | ||
86 | var SAVE_LDS = 1 | ||
87 | var WG_BASE_ADDR_LO = 0x9000a000 | ||
88 | var WG_BASE_ADDR_HI = 0x0 | ||
89 | var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem | ||
90 | var CTX_SAVE_CONTROL = 0x0 | ||
91 | var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL | ||
92 | var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) | ||
93 | var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write | ||
94 | var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes | ||
95 | var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing | ||
96 | var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency | ||
97 | |||
98 | /**************************************************************************/ | ||
99 | /* variables */ | ||
100 | /**************************************************************************/ | ||
101 | var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 | ||
102 | var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 | ||
103 | var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 | ||
104 | var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 | ||
105 | var SQ_WAVE_STATUS_HALT_MASK = 0x2000 | ||
106 | |||
107 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 | ||
108 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 | ||
109 | var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 | ||
110 | var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 | ||
111 | var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 | ||
112 | var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits | ||
113 | |||
114 | var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 | ||
115 | var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask | ||
116 | var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 | ||
117 | var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 | ||
118 | var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 | ||
119 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF | ||
120 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 | ||
121 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 | ||
122 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 | ||
123 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 | ||
124 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 | ||
125 | var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 | ||
126 | |||
127 | var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME | ||
128 | var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME | ||
129 | var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 | ||
130 | var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME | ||
131 | |||
132 | var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 | ||
133 | var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 | ||
134 | |||
135 | var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data | ||
136 | var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 | ||
137 | |||
138 | /* Save */ | ||
139 | var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes | ||
140 | var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE | ||
141 | |||
142 | var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit | ||
143 | var S_SAVE_SPI_INIT_ATC_SHIFT = 27 | ||
144 | var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype | ||
145 | var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 | ||
146 | var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG | ||
147 | var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 | ||
148 | |||
149 | var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used | ||
150 | var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME | ||
151 | var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME | ||
152 | var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME | ||
153 | |||
154 | var s_save_spi_init_lo = exec_lo | ||
155 | var s_save_spi_init_hi = exec_hi | ||
156 | |||
157 | var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} | ||
158 | var s_save_pc_hi = ttmp1 | ||
159 | var s_save_exec_lo = ttmp2 | ||
160 | var s_save_exec_hi = ttmp3 | ||
161 | var s_save_tmp = ttmp4 | ||
162 | var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine | ||
163 | var s_save_xnack_mask_lo = ttmp6 | ||
164 | var s_save_xnack_mask_hi = ttmp7 | ||
165 | var s_save_buf_rsrc0 = ttmp8 | ||
166 | var s_save_buf_rsrc1 = ttmp9 | ||
167 | var s_save_buf_rsrc2 = ttmp10 | ||
168 | var s_save_buf_rsrc3 = ttmp11 | ||
169 | var s_save_status = ttmp12 | ||
170 | var s_save_mem_offset = ttmp14 | ||
171 | var s_save_alloc_size = s_save_trapsts //conflict | ||
172 | var s_save_m0 = ttmp15 | ||
173 | var s_save_ttmps_lo = s_save_tmp //no conflict | ||
174 | var s_save_ttmps_hi = s_save_trapsts //no conflict | ||
175 | |||
176 | /* Restore */ | ||
177 | var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE | ||
178 | var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC | ||
179 | |||
180 | var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit | ||
181 | var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 | ||
182 | var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype | ||
183 | var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 | ||
184 | var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG | ||
185 | var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 | ||
186 | |||
187 | var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT | ||
188 | var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK | ||
189 | var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
190 | var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK | ||
191 | |||
192 | var s_restore_spi_init_lo = exec_lo | ||
193 | var s_restore_spi_init_hi = exec_hi | ||
194 | |||
195 | var s_restore_mem_offset = ttmp12 | ||
196 | var s_restore_alloc_size = ttmp3 | ||
197 | var s_restore_tmp = ttmp2 | ||
198 | var s_restore_mem_offset_save = s_restore_tmp //no conflict | ||
199 | |||
200 | var s_restore_m0 = s_restore_alloc_size //no conflict | ||
201 | |||
202 | var s_restore_mode = ttmp7 | ||
203 | |||
204 | var s_restore_pc_lo = ttmp0 | ||
205 | var s_restore_pc_hi = ttmp1 | ||
206 | var s_restore_exec_lo = ttmp14 | ||
207 | var s_restore_exec_hi = ttmp15 | ||
208 | var s_restore_status = ttmp4 | ||
209 | var s_restore_trapsts = ttmp5 | ||
210 | var s_restore_xnack_mask_lo = xnack_mask_lo | ||
211 | var s_restore_xnack_mask_hi = xnack_mask_hi | ||
212 | var s_restore_buf_rsrc0 = ttmp8 | ||
213 | var s_restore_buf_rsrc1 = ttmp9 | ||
214 | var s_restore_buf_rsrc2 = ttmp10 | ||
215 | var s_restore_buf_rsrc3 = ttmp11 | ||
216 | var s_restore_ttmps_lo = s_restore_tmp //no conflict | ||
217 | var s_restore_ttmps_hi = s_restore_alloc_size //no conflict | ||
218 | |||
219 | /**************************************************************************/ | ||
220 | /* trap handler entry points */ | ||
221 | /**************************************************************************/ | ||
222 | /* Shader Main*/ | ||
223 | |||
224 | shader main | ||
225 | asic(GFX9) | ||
226 | type(CS) | ||
227 | |||
228 | |||
229 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore | ||
230 | //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC | ||
231 | s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC | ||
232 | s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. | ||
233 | s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE | ||
234 | //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE | ||
235 | s_branch L_SKIP_RESTORE //NOT restore, SAVE actually | ||
236 | else | ||
237 | s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save | ||
238 | end | ||
239 | |||
240 | L_JUMP_TO_RESTORE: | ||
241 | s_branch L_RESTORE //restore | ||
242 | |||
243 | L_SKIP_RESTORE: | ||
244 | |||
245 | s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC | ||
246 | s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save | ||
247 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
248 | s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save | ||
249 | s_cbranch_scc1 L_SAVE //this is the operation for save | ||
250 | |||
251 | // ********* Handle non-CWSR traps ******************* | ||
252 | if (!EMU_RUN_HACK) | ||
253 | // Illegal instruction is a non-maskable exception which blocks context save. | ||
254 | // Halt the wavefront and return from the trap. | ||
255 | s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK | ||
256 | s_cbranch_scc1 L_HALT_WAVE | ||
257 | |||
258 | // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. | ||
259 | // Instead, halt the wavefront and return from the trap. | ||
260 | s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | ||
261 | s_cbranch_scc0 L_FETCH_2ND_TRAP | ||
262 | |||
263 | L_HALT_WAVE: | ||
264 | // If STATUS.HALT is set then this fault must come from SQC instruction fetch. | ||
265 | // We cannot prevent further faults so just terminate the wavefront. | ||
266 | s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK | ||
267 | s_cbranch_scc0 L_NOT_ALREADY_HALTED | ||
268 | s_endpgm | ||
269 | L_NOT_ALREADY_HALTED: | ||
270 | s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK | ||
271 | |||
272 | // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. | ||
273 | // Rewind the PC to prevent this from occurring. The debugger compensates for this. | ||
274 | s_sub_u32 ttmp0, ttmp0, 0x8 | ||
275 | s_subb_u32 ttmp1, ttmp1, 0x0 | ||
276 | |||
277 | L_FETCH_2ND_TRAP: | ||
278 | // Preserve and clear scalar XNACK state before issuing scalar reads. | ||
279 | // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. | ||
280 | s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) | ||
281 | s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK | ||
282 | s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) | ||
283 | s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK | ||
284 | s_or_b32 ttmp11, ttmp11, ttmp3 | ||
285 | |||
286 | s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK | ||
287 | s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 | ||
288 | |||
289 | // Read second-level TBA/TMA from first-level TMA and jump if available. | ||
290 | // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) | ||
291 | // ttmp12 holds SQ_WAVE_STATUS | ||
292 | s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) | ||
293 | s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) | ||
294 | s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 | ||
295 | s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA | ||
296 | s_waitcnt lgkmcnt(0) | ||
297 | s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA | ||
298 | s_waitcnt lgkmcnt(0) | ||
299 | s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] | ||
300 | s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set | ||
301 | s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler | ||
302 | |||
303 | L_NO_NEXT_TRAP: | ||
304 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
305 | s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception | ||
306 | s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. | ||
307 | s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 | ||
308 | s_addc_u32 ttmp1, ttmp1, 0 | ||
309 | L_EXCP_CASE: | ||
310 | s_and_b32 ttmp1, ttmp1, 0xFFFF | ||
311 | |||
312 | // Restore SQ_WAVE_IB_STS. | ||
313 | s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) | ||
314 | s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK | ||
315 | s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 | ||
316 | |||
317 | // Restore SQ_WAVE_STATUS. | ||
318 | s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 | ||
319 | s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 | ||
320 | s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status | ||
321 | |||
322 | s_rfe_b64 [ttmp0, ttmp1] | ||
323 | end | ||
324 | // ********* End handling of non-CWSR traps ******************* | ||
325 | |||
326 | /**************************************************************************/ | ||
327 | /* save routine */ | ||
328 | /**************************************************************************/ | ||
329 | |||
330 | L_SAVE: | ||
331 | |||
332 | if G8SR_DEBUG_TIMESTAMP | ||
333 | s_memrealtime s_g8sr_ts_save_s | ||
334 | s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? | ||
335 | end | ||
336 | |||
337 | s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] | ||
338 | |||
339 | s_mov_b32 s_save_tmp, 0 //clear saveCtx bit | ||
340 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit | ||
341 | |||
342 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT | ||
343 | s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT | ||
344 | s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp | ||
345 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY | ||
346 | s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
347 | s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp | ||
348 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS | ||
349 | s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG | ||
350 | |||
351 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp | ||
352 | |||
353 | /* inform SPI the readiness and wait for SPI's go signal */ | ||
354 | s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI | ||
355 | s_mov_b32 s_save_exec_hi, exec_hi | ||
356 | s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive | ||
357 | |||
358 | if G8SR_DEBUG_TIMESTAMP | ||
359 | s_memrealtime s_g8sr_ts_sq_save_msg | ||
360 | s_waitcnt lgkmcnt(0) | ||
361 | end | ||
362 | |||
363 | if (EMU_RUN_HACK) | ||
364 | |||
365 | else | ||
366 | s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC | ||
367 | end | ||
368 | |||
369 | // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. | ||
370 | s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) | ||
371 | s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp | ||
372 | |||
373 | L_SLEEP: | ||
374 | s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 | ||
375 | |||
376 | if (EMU_RUN_HACK) | ||
377 | |||
378 | else | ||
379 | s_cbranch_execz L_SLEEP | ||
380 | end | ||
381 | |||
382 | if G8SR_DEBUG_TIMESTAMP | ||
383 | s_memrealtime s_g8sr_ts_spi_wrexec | ||
384 | s_waitcnt lgkmcnt(0) | ||
385 | end | ||
386 | |||
387 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) | ||
388 | //calculate wd_addr using absolute thread id | ||
389 | v_readlane_b32 s_save_tmp, v9, 0 | ||
390 | s_lshr_b32 s_save_tmp, s_save_tmp, 6 | ||
391 | s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE | ||
392 | s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO | ||
393 | s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI | ||
394 | s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL | ||
395 | else | ||
396 | end | ||
397 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) | ||
398 | s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO | ||
399 | s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI | ||
400 | s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL | ||
401 | else | ||
402 | end | ||
403 | |||
404 | // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic | ||
405 | // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 | ||
406 | get_vgpr_size_bytes(s_save_ttmps_lo) | ||
407 | get_sgpr_size_bytes(s_save_ttmps_hi) | ||
408 | s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi | ||
409 | s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo | ||
410 | s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 | ||
411 | s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF | ||
412 | s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 | ||
413 | ack_sqc_store_workaround() | ||
414 | s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 | ||
415 | ack_sqc_store_workaround() | ||
416 | s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 | ||
417 | ack_sqc_store_workaround() | ||
418 | s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 | ||
419 | ack_sqc_store_workaround() | ||
420 | |||
421 | /* setup Resource Contants */ | ||
422 | s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo | ||
423 | s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi | ||
424 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE | ||
425 | s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited | ||
426 | s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC | ||
427 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK | ||
428 | s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position | ||
429 | s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC | ||
430 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK | ||
431 | s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position | ||
432 | s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE | ||
433 | |||
434 | //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) | ||
435 | s_mov_b32 s_save_m0, m0 //save M0 | ||
436 | |||
437 | /* global mem offset */ | ||
438 | s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 | ||
439 | |||
440 | |||
441 | |||
442 | |||
443 | /* save HW registers */ | ||
444 | ////////////////////////////// | ||
445 | |||
446 | L_SAVE_HWREG: | ||
447 | // HWREG SR memory offset : size(VGPR)+size(SGPR) | ||
448 | get_vgpr_size_bytes(s_save_mem_offset) | ||
449 | get_sgpr_size_bytes(s_save_tmp) | ||
450 | s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp | ||
451 | |||
452 | |||
453 | s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes | ||
454 | if (SWIZZLE_EN) | ||
455 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
456 | else | ||
457 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
458 | end | ||
459 | |||
460 | |||
461 | write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 | ||
462 | |||
463 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) | ||
464 | s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 | ||
465 | s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over | ||
466 | end | ||
467 | |||
468 | write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC | ||
469 | write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) | ||
470 | write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC | ||
471 | write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) | ||
472 | write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS | ||
473 | |||
474 | //s_save_trapsts conflicts with s_save_alloc_size | ||
475 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
476 | write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS | ||
477 | |||
478 | write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO | ||
479 | write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI | ||
480 | |||
481 | //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 | ||
482 | s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE | ||
483 | write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) | ||
484 | |||
485 | |||
486 | |||
487 | /* the first wave in the threadgroup */ | ||
488 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit | ||
489 | s_mov_b32 s_save_exec_hi, 0x0 | ||
490 | s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] | ||
491 | |||
492 | |||
493 | /* save SGPRs */ | ||
494 | // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... | ||
495 | ////////////////////////////// | ||
496 | |||
497 | // SGPR SR memory offset : size(VGPR) | ||
498 | get_vgpr_size_bytes(s_save_mem_offset) | ||
499 | // TODO, change RSRC word to rearrange memory layout for SGPRS | ||
500 | |||
501 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size | ||
502 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 | ||
503 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) | ||
504 | |||
505 | if (SGPR_SAVE_USE_SQC) | ||
506 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes | ||
507 | else | ||
508 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) | ||
509 | end | ||
510 | |||
511 | if (SWIZZLE_EN) | ||
512 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
513 | else | ||
514 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
515 | end | ||
516 | |||
517 | |||
518 | // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 | ||
519 | //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 | ||
520 | s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 | ||
521 | s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset | ||
522 | s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 | ||
523 | |||
524 | s_mov_b32 m0, 0x0 //SGPR initial index value =0 | ||
525 | s_nop 0x0 //Manually inserted wait states | ||
526 | L_SAVE_SGPR_LOOP: | ||
527 | // SGPR is allocated in 16 SGPR granularity | ||
528 | s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] | ||
529 | s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] | ||
530 | s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] | ||
531 | s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] | ||
532 | s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] | ||
533 | s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] | ||
534 | s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] | ||
535 | s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] | ||
536 | |||
537 | write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 | ||
538 | s_add_u32 m0, m0, 16 //next sgpr index | ||
539 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 | ||
540 | s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? | ||
541 | // restore s_save_buf_rsrc0,1 | ||
542 | //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo | ||
543 | s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo | ||
544 | |||
545 | |||
546 | |||
547 | |||
548 | /* save first 4 VGPR, then LDS save could use */ | ||
549 | // each wave will alloc 4 vgprs at least... | ||
550 | ///////////////////////////////////////////////////////////////////////////////////// | ||
551 | |||
552 | s_mov_b32 s_save_mem_offset, 0 | ||
553 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
554 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
555 | s_mov_b32 xnack_mask_lo, 0x0 | ||
556 | s_mov_b32 xnack_mask_hi, 0x0 | ||
557 | |||
558 | if (SWIZZLE_EN) | ||
559 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
560 | else | ||
561 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
562 | end | ||
563 | |||
564 | |||
565 | // VGPR Allocated in 4-GPR granularity | ||
566 | |||
567 | if G8SR_VGPR_SR_IN_DWX4 | ||
568 | // the const stride for DWx4 is 4*4 bytes | ||
569 | s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
570 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes | ||
571 | |||
572 | buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
573 | |||
574 | s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
575 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes | ||
576 | else | ||
577 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
578 | buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 | ||
579 | buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 | ||
580 | buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 | ||
581 | end | ||
582 | |||
583 | |||
584 | |||
585 | /* save LDS */ | ||
586 | ////////////////////////////// | ||
587 | |||
588 | L_SAVE_LDS: | ||
589 | |||
590 | // Change EXEC to all threads... | ||
591 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
592 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
593 | |||
594 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size | ||
595 | s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? | ||
596 | s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE | ||
597 | |||
598 | s_barrier //LDS is used? wait for other waves in the same TG | ||
599 | s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here | ||
600 | s_cbranch_scc0 L_SAVE_LDS_DONE | ||
601 | |||
602 | // first wave do LDS save; | ||
603 | |||
604 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw | ||
605 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes | ||
606 | s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes | ||
607 | |||
608 | // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) | ||
609 | // | ||
610 | get_vgpr_size_bytes(s_save_mem_offset) | ||
611 | get_sgpr_size_bytes(s_save_tmp) | ||
612 | s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp | ||
613 | s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() | ||
614 | |||
615 | |||
616 | if (SWIZZLE_EN) | ||
617 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
618 | else | ||
619 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
620 | end | ||
621 | |||
622 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 | ||
623 | |||
624 | |||
625 | var LDS_DMA_ENABLE = 0 | ||
626 | var UNROLL = 0 | ||
627 | if UNROLL==0 && LDS_DMA_ENABLE==1 | ||
628 | s_mov_b32 s3, 256*2 | ||
629 | s_nop 0 | ||
630 | s_nop 0 | ||
631 | s_nop 0 | ||
632 | L_SAVE_LDS_LOOP: | ||
633 | //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? | ||
634 | if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity | ||
635 | buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW | ||
636 | buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW | ||
637 | end | ||
638 | |||
639 | s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes | ||
640 | s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes | ||
641 | s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 | ||
642 | s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? | ||
643 | |||
644 | elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss | ||
645 | // store from higest LDS address to lowest | ||
646 | s_mov_b32 s3, 256*2 | ||
647 | s_sub_u32 m0, s_save_alloc_size, s3 | ||
648 | s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 | ||
649 | s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... | ||
650 | s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest | ||
651 | s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction | ||
652 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc | ||
653 | s_nop 0 | ||
654 | s_nop 0 | ||
655 | s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes | ||
656 | s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved | ||
657 | s_add_u32 s0, s0,s_save_alloc_size | ||
658 | s_addc_u32 s1, s1, 0 | ||
659 | s_setpc_b64 s[0:1] | ||
660 | |||
661 | |||
662 | for var i =0; i< 128; i++ | ||
663 | // be careful to make here a 64Byte aligned address, which could improve performance... | ||
664 | buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW | ||
665 | buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW | ||
666 | |||
667 | if i!=127 | ||
668 | s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline | ||
669 | s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 | ||
670 | end | ||
671 | end | ||
672 | |||
673 | else // BUFFER_STORE | ||
674 | v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 | ||
675 | v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid | ||
676 | v_mul_i32_i24 v2, v3, 8 // tid*8 | ||
677 | v_mov_b32 v3, 256*2 | ||
678 | s_mov_b32 m0, 0x10000 | ||
679 | s_mov_b32 s0, s_save_buf_rsrc3 | ||
680 | s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid | ||
681 | s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT | ||
682 | |||
683 | L_SAVE_LDS_LOOP_VECTOR: | ||
684 | ds_read_b64 v[0:1], v2 //x =LDS[a], byte address | ||
685 | s_waitcnt lgkmcnt(0) | ||
686 | buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 | ||
687 | // s_waitcnt vmcnt(0) | ||
688 | // v_add_u32 v2, vcc[0:1], v2, v3 | ||
689 | v_add_u32 v2, v2, v3 | ||
690 | v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size | ||
691 | s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR | ||
692 | |||
693 | // restore rsrc3 | ||
694 | s_mov_b32 s_save_buf_rsrc3, s0 | ||
695 | |||
696 | end | ||
697 | |||
698 | L_SAVE_LDS_DONE: | ||
699 | |||
700 | |||
701 | /* save VGPRs - set the Rest VGPRs */ | ||
702 | ////////////////////////////////////////////////////////////////////////////////////// | ||
703 | L_SAVE_VGPR: | ||
704 | // VGPR SR memory offset: 0 | ||
705 | // TODO rearrange the RSRC words to use swizzle for VGPR save... | ||
706 | |||
707 | s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs | ||
708 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
709 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
710 | |||
711 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
712 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 | ||
713 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible | ||
714 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
715 | if (SWIZZLE_EN) | ||
716 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
717 | else | ||
718 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
719 | end | ||
720 | |||
721 | |||
722 | // VGPR Allocated in 4-GPR granularity | ||
723 | |||
724 | if G8SR_VGPR_SR_IN_DWX4 | ||
725 | // the const stride for DWx4 is 4*4 bytes | ||
726 | s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
727 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes | ||
728 | |||
729 | s_mov_b32 m0, 4 // skip first 4 VGPRs | ||
730 | s_cmp_lt_u32 m0, s_save_alloc_size | ||
731 | s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs | ||
732 | |||
733 | s_set_gpr_idx_on m0, 0x1 // This will change M0 | ||
734 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 | ||
735 | L_SAVE_VGPR_LOOP: | ||
736 | v_mov_b32 v0, v0 // v0 = v[0+m0] | ||
737 | v_mov_b32 v1, v1 | ||
738 | v_mov_b32 v2, v2 | ||
739 | v_mov_b32 v3, v3 | ||
740 | |||
741 | |||
742 | buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
743 | s_add_u32 m0, m0, 4 | ||
744 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 | ||
745 | s_cmp_lt_u32 m0, s_save_alloc_size | ||
746 | s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? | ||
747 | s_set_gpr_idx_off | ||
748 | L_SAVE_VGPR_LOOP_END: | ||
749 | |||
750 | s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
751 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes | ||
752 | else | ||
753 | // VGPR store using dw burst | ||
754 | s_mov_b32 m0, 0x4 //VGPR initial index value =0 | ||
755 | s_cmp_lt_u32 m0, s_save_alloc_size | ||
756 | s_cbranch_scc0 L_SAVE_VGPR_END | ||
757 | |||
758 | |||
759 | s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 | ||
760 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later | ||
761 | |||
762 | L_SAVE_VGPR_LOOP: | ||
763 | v_mov_b32 v0, v0 //v0 = v[0+m0] | ||
764 | v_mov_b32 v1, v1 //v0 = v[0+m0] | ||
765 | v_mov_b32 v2, v2 //v0 = v[0+m0] | ||
766 | v_mov_b32 v3, v3 //v0 = v[0+m0] | ||
767 | |||
768 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
769 | tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
770 | else | ||
771 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
772 | buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 | ||
773 | buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 | ||
774 | buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 | ||
775 | end | ||
776 | |||
777 | s_add_u32 m0, m0, 4 //next vgpr index | ||
778 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes | ||
779 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 | ||
780 | s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? | ||
781 | s_set_gpr_idx_off | ||
782 | end | ||
783 | |||
784 | L_SAVE_VGPR_END: | ||
785 | |||
786 | |||
787 | |||
788 | |||
789 | |||
790 | |||
791 | /* S_PGM_END_SAVED */ //FIXME graphics ONLY | ||
792 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) | ||
793 | s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] | ||
794 | s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 | ||
795 | s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over | ||
796 | s_rfe_b64 s_save_pc_lo //Return to the main shader program | ||
797 | else | ||
798 | end | ||
799 | |||
800 | // Save Done timestamp | ||
801 | if G8SR_DEBUG_TIMESTAMP | ||
802 | s_memrealtime s_g8sr_ts_save_d | ||
803 | // SGPR SR memory offset : size(VGPR) | ||
804 | get_vgpr_size_bytes(s_save_mem_offset) | ||
805 | s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET | ||
806 | s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? | ||
807 | // Need reset rsrc2?? | ||
808 | s_mov_b32 m0, s_save_mem_offset | ||
809 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
810 | s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 | ||
811 | end | ||
812 | |||
813 | |||
814 | s_branch L_END_PGM | ||
815 | |||
816 | |||
817 | |||
818 | /**************************************************************************/ | ||
819 | /* restore routine */ | ||
820 | /**************************************************************************/ | ||
821 | |||
822 | L_RESTORE: | ||
823 | /* Setup Resource Contants */ | ||
824 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) | ||
825 | //calculate wd_addr using absolute thread id | ||
826 | v_readlane_b32 s_restore_tmp, v9, 0 | ||
827 | s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 | ||
828 | s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE | ||
829 | s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO | ||
830 | s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI | ||
831 | s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL | ||
832 | else | ||
833 | end | ||
834 | |||
835 | if G8SR_DEBUG_TIMESTAMP | ||
836 | s_memrealtime s_g8sr_ts_restore_s | ||
837 | s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? | ||
838 | // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... | ||
839 | s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] | ||
840 | s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. | ||
841 | end | ||
842 | |||
843 | |||
844 | |||
845 | s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo | ||
846 | s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi | ||
847 | s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE | ||
848 | s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) | ||
849 | s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC | ||
850 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK | ||
851 | s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position | ||
852 | s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC | ||
853 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK | ||
854 | s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position | ||
855 | s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE | ||
856 | |||
857 | /* global mem offset */ | ||
858 | // s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 | ||
859 | |||
860 | /* the first wave in the threadgroup */ | ||
861 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK | ||
862 | s_cbranch_scc0 L_RESTORE_VGPR | ||
863 | |||
864 | /* restore LDS */ | ||
865 | ////////////////////////////// | ||
866 | L_RESTORE_LDS: | ||
867 | |||
868 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead | ||
869 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
870 | |||
871 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size | ||
872 | s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? | ||
873 | s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR | ||
874 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw | ||
875 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes | ||
876 | s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes | ||
877 | |||
878 | // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) | ||
879 | // | ||
880 | get_vgpr_size_bytes(s_restore_mem_offset) | ||
881 | get_sgpr_size_bytes(s_restore_tmp) | ||
882 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp | ||
883 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? | ||
884 | |||
885 | |||
886 | if (SWIZZLE_EN) | ||
887 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
888 | else | ||
889 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
890 | end | ||
891 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 | ||
892 | |||
893 | L_RESTORE_LDS_LOOP: | ||
894 | if (SAVE_LDS) | ||
895 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW | ||
896 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW | ||
897 | end | ||
898 | s_add_u32 m0, m0, 256*2 // 128 DW | ||
899 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW | ||
900 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 | ||
901 | s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? | ||
902 | |||
903 | |||
904 | /* restore VGPRs */ | ||
905 | ////////////////////////////// | ||
906 | L_RESTORE_VGPR: | ||
907 | // VGPR SR memory offset : 0 | ||
908 | s_mov_b32 s_restore_mem_offset, 0x0 | ||
909 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead | ||
910 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
911 | |||
912 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
913 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | ||
914 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) | ||
915 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
916 | if (SWIZZLE_EN) | ||
917 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
918 | else | ||
919 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
920 | end | ||
921 | |||
922 | if G8SR_VGPR_SR_IN_DWX4 | ||
923 | get_vgpr_size_bytes(s_restore_mem_offset) | ||
924 | s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 | ||
925 | |||
926 | // the const stride for DWx4 is 4*4 bytes | ||
927 | s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
928 | s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes | ||
929 | |||
930 | s_mov_b32 m0, s_restore_alloc_size | ||
931 | s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 | ||
932 | |||
933 | L_RESTORE_VGPR_LOOP: | ||
934 | buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 | ||
935 | s_waitcnt vmcnt(0) | ||
936 | s_sub_u32 m0, m0, 4 | ||
937 | v_mov_b32 v0, v0 // v[0+m0] = v0 | ||
938 | v_mov_b32 v1, v1 | ||
939 | v_mov_b32 v2, v2 | ||
940 | v_mov_b32 v3, v3 | ||
941 | s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 | ||
942 | s_cmp_eq_u32 m0, 0x8000 | ||
943 | s_cbranch_scc0 L_RESTORE_VGPR_LOOP | ||
944 | s_set_gpr_idx_off | ||
945 | |||
946 | s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 | ||
947 | s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes | ||
948 | |||
949 | else | ||
950 | // VGPR load using dw burst | ||
951 | s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last | ||
952 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 | ||
953 | s_mov_b32 m0, 4 //VGPR initial index value = 1 | ||
954 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 | ||
955 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later | ||
956 | |||
957 | L_RESTORE_VGPR_LOOP: | ||
958 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
959 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
960 | else | ||
961 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 | ||
962 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 | ||
963 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 | ||
964 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 | ||
965 | end | ||
966 | s_waitcnt vmcnt(0) //ensure data ready | ||
967 | v_mov_b32 v0, v0 //v[0+m0] = v0 | ||
968 | v_mov_b32 v1, v1 | ||
969 | v_mov_b32 v2, v2 | ||
970 | v_mov_b32 v3, v3 | ||
971 | s_add_u32 m0, m0, 4 //next vgpr index | ||
972 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes | ||
973 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 | ||
974 | s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? | ||
975 | s_set_gpr_idx_off | ||
976 | /* VGPR restore on v0 */ | ||
977 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
978 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
979 | else | ||
980 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 | ||
981 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 | ||
982 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 | ||
983 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 | ||
984 | end | ||
985 | |||
986 | end | ||
987 | |||
988 | /* restore SGPRs */ | ||
989 | ////////////////////////////// | ||
990 | |||
991 | // SGPR SR memory offset : size(VGPR) | ||
992 | get_vgpr_size_bytes(s_restore_mem_offset) | ||
993 | get_sgpr_size_bytes(s_restore_tmp) | ||
994 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp | ||
995 | s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group | ||
996 | // TODO, change RSRC word to rearrange memory layout for SGPRS | ||
997 | |||
998 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size | ||
999 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | ||
1000 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) | ||
1001 | |||
1002 | if (SGPR_SAVE_USE_SQC) | ||
1003 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes | ||
1004 | else | ||
1005 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) | ||
1006 | end | ||
1007 | if (SWIZZLE_EN) | ||
1008 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
1009 | else | ||
1010 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
1011 | end | ||
1012 | |||
1013 | s_mov_b32 m0, s_restore_alloc_size | ||
1014 | |||
1015 | L_RESTORE_SGPR_LOOP: | ||
1016 | read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made | ||
1017 | s_waitcnt lgkmcnt(0) //ensure data ready | ||
1018 | |||
1019 | s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] | ||
1020 | s_nop 0 // hazard SALU M0=> S_MOVREL | ||
1021 | |||
1022 | s_movreld_b64 s0, s0 //s[0+m0] = s0 | ||
1023 | s_movreld_b64 s2, s2 | ||
1024 | s_movreld_b64 s4, s4 | ||
1025 | s_movreld_b64 s6, s6 | ||
1026 | s_movreld_b64 s8, s8 | ||
1027 | s_movreld_b64 s10, s10 | ||
1028 | s_movreld_b64 s12, s12 | ||
1029 | s_movreld_b64 s14, s14 | ||
1030 | |||
1031 | s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 | ||
1032 | s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? | ||
1033 | |||
1034 | /* restore HW registers */ | ||
1035 | ////////////////////////////// | ||
1036 | L_RESTORE_HWREG: | ||
1037 | |||
1038 | |||
1039 | if G8SR_DEBUG_TIMESTAMP | ||
1040 | s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo | ||
1041 | s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi | ||
1042 | end | ||
1043 | |||
1044 | // HWREG SR memory offset : size(VGPR)+size(SGPR) | ||
1045 | get_vgpr_size_bytes(s_restore_mem_offset) | ||
1046 | get_sgpr_size_bytes(s_restore_tmp) | ||
1047 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp | ||
1048 | |||
1049 | |||
1050 | s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes | ||
1051 | if (SWIZZLE_EN) | ||
1052 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
1053 | else | ||
1054 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
1055 | end | ||
1056 | |||
1057 | read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 | ||
1058 | read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC | ||
1059 | read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) | ||
1060 | read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC | ||
1061 | read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) | ||
1062 | read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS | ||
1063 | read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS | ||
1064 | read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO | ||
1065 | read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI | ||
1066 | read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE | ||
1067 | |||
1068 | s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS | ||
1069 | |||
1070 | //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: | ||
1071 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) | ||
1072 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) | ||
1073 | s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over | ||
1074 | end | ||
1075 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) | ||
1076 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal | ||
1077 | s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over | ||
1078 | end | ||
1079 | |||
1080 | s_mov_b32 m0, s_restore_m0 | ||
1081 | s_mov_b32 exec_lo, s_restore_exec_lo | ||
1082 | s_mov_b32 exec_hi, s_restore_exec_hi | ||
1083 | |||
1084 | s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts | ||
1085 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 | ||
1086 | s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts | ||
1087 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT | ||
1088 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 | ||
1089 | //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore | ||
1090 | s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode | ||
1091 | |||
1092 | // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic | ||
1093 | // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 | ||
1094 | get_vgpr_size_bytes(s_restore_ttmps_lo) | ||
1095 | get_sgpr_size_bytes(s_restore_ttmps_hi) | ||
1096 | s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi | ||
1097 | s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 | ||
1098 | s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 | ||
1099 | s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF | ||
1100 | s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 | ||
1101 | s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 | ||
1102 | s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 | ||
1103 | s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 | ||
1104 | s_waitcnt lgkmcnt(0) | ||
1105 | |||
1106 | //reuse s_restore_m0 as a temp register | ||
1107 | s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK | ||
1108 | s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT | ||
1109 | s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT | ||
1110 | s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero | ||
1111 | s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 | ||
1112 | s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK | ||
1113 | s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
1114 | s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT | ||
1115 | s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 | ||
1116 | s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK | ||
1117 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT | ||
1118 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp | ||
1119 | |||
1120 | s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS | ||
1121 | s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 | ||
1122 | s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 | ||
1123 | s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu | ||
1124 | |||
1125 | s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time | ||
1126 | |||
1127 | if G8SR_DEBUG_TIMESTAMP | ||
1128 | s_memrealtime s_g8sr_ts_restore_d | ||
1129 | s_waitcnt lgkmcnt(0) | ||
1130 | end | ||
1131 | |||
1132 | // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution | ||
1133 | s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc | ||
1134 | |||
1135 | |||
1136 | /**************************************************************************/ | ||
1137 | /* the END */ | ||
1138 | /**************************************************************************/ | ||
1139 | L_END_PGM: | ||
1140 | s_endpgm | ||
1141 | |||
1142 | end | ||
1143 | |||
1144 | |||
1145 | /**************************************************************************/ | ||
1146 | /* the helper functions */ | ||
1147 | /**************************************************************************/ | ||
1148 | |||
1149 | //Only for save hwreg to mem | ||
1150 | function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) | ||
1151 | s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on | ||
1152 | s_mov_b32 m0, s_mem_offset | ||
1153 | s_buffer_store_dword s, s_rsrc, m0 glc:1 | ||
1154 | ack_sqc_store_workaround() | ||
1155 | s_add_u32 s_mem_offset, s_mem_offset, 4 | ||
1156 | s_mov_b32 m0, exec_lo | ||
1157 | end | ||
1158 | |||
1159 | |||
1160 | // HWREG are saved before SGPRs, so all HWREG could be use. | ||
1161 | function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) | ||
1162 | |||
1163 | s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 | ||
1164 | ack_sqc_store_workaround() | ||
1165 | s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 | ||
1166 | ack_sqc_store_workaround() | ||
1167 | s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 | ||
1168 | ack_sqc_store_workaround() | ||
1169 | s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 | ||
1170 | ack_sqc_store_workaround() | ||
1171 | s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 | ||
1172 | s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc | ||
1173 | end | ||
1174 | |||
1175 | |||
1176 | function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) | ||
1177 | s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 | ||
1178 | s_add_u32 s_mem_offset, s_mem_offset, 4 | ||
1179 | end | ||
1180 | |||
1181 | function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) | ||
1182 | s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 | ||
1183 | s_sub_u32 s_mem_offset, s_mem_offset, 4*16 | ||
1184 | end | ||
1185 | |||
1186 | |||
1187 | |||
1188 | function get_lds_size_bytes(s_lds_size_byte) | ||
1189 | // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW | ||
1190 | s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size | ||
1191 | s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW | ||
1192 | end | ||
1193 | |||
1194 | function get_vgpr_size_bytes(s_vgpr_size_byte) | ||
1195 | s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
1196 | s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 | ||
1197 | s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible | ||
1198 | end | ||
1199 | |||
1200 | function get_sgpr_size_bytes(s_sgpr_size_byte) | ||
1201 | s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size | ||
1202 | s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 | ||
1203 | s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) | ||
1204 | end | ||
1205 | |||
1206 | function get_hwreg_size_bytes | ||
1207 | return 128 //HWREG size 128 bytes | ||
1208 | end | ||
1209 | |||
1210 | function ack_sqc_store_workaround | ||
1211 | if ACK_SQC_STORE | ||
1212 | s_waitcnt lgkmcnt(0) | ||
1213 | end | ||
1214 | end | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 59808a39ecf4..f64c5551cdba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | |||
@@ -233,7 +233,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, | |||
233 | pr_debug("Queue Size: 0x%llX, %u\n", | 233 | pr_debug("Queue Size: 0x%llX, %u\n", |
234 | q_properties->queue_size, args->ring_size); | 234 | q_properties->queue_size, args->ring_size); |
235 | 235 | ||
236 | pr_debug("Queue r/w Pointers: %p, %p\n", | 236 | pr_debug("Queue r/w Pointers: %px, %px\n", |
237 | q_properties->read_ptr, | 237 | q_properties->read_ptr, |
238 | q_properties->write_ptr); | 238 | q_properties->write_ptr); |
239 | 239 | ||
@@ -292,8 +292,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, | |||
292 | 292 | ||
293 | 293 | ||
294 | /* Return gpu_id as doorbell offset for mmap usage */ | 294 | /* Return gpu_id as doorbell offset for mmap usage */ |
295 | args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); | 295 | args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; |
296 | args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); | ||
296 | args->doorbell_offset <<= PAGE_SHIFT; | 297 | args->doorbell_offset <<= PAGE_SHIFT; |
298 | if (KFD_IS_SOC15(dev->device_info->asic_family)) | ||
299 | /* On SOC15 ASICs, doorbell allocation must be | ||
300 | * per-device, and independent from the per-process | ||
301 | * queue_id. Return the doorbell offset within the | ||
302 | * doorbell aperture to user mode. | ||
303 | */ | ||
304 | args->doorbell_offset |= q_properties.doorbell_off; | ||
297 | 305 | ||
298 | mutex_unlock(&p->mutex); | 306 | mutex_unlock(&p->mutex); |
299 | 307 | ||
@@ -1296,8 +1304,8 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, | |||
1296 | return -EINVAL; | 1304 | return -EINVAL; |
1297 | } | 1305 | } |
1298 | 1306 | ||
1299 | devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), | 1307 | devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), |
1300 | GFP_KERNEL); | 1308 | GFP_KERNEL); |
1301 | if (!devices_arr) | 1309 | if (!devices_arr) |
1302 | return -ENOMEM; | 1310 | return -ENOMEM; |
1303 | 1311 | ||
@@ -1405,8 +1413,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, | |||
1405 | return -EINVAL; | 1413 | return -EINVAL; |
1406 | } | 1414 | } |
1407 | 1415 | ||
1408 | devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), | 1416 | devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), |
1409 | GFP_KERNEL); | 1417 | GFP_KERNEL); |
1410 | if (!devices_arr) | 1418 | if (!devices_arr) |
1411 | return -ENOMEM; | 1419 | return -ENOMEM; |
1412 | 1420 | ||
@@ -1645,23 +1653,33 @@ err_i1: | |||
1645 | static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) | 1653 | static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) |
1646 | { | 1654 | { |
1647 | struct kfd_process *process; | 1655 | struct kfd_process *process; |
1656 | struct kfd_dev *dev = NULL; | ||
1657 | unsigned long vm_pgoff; | ||
1658 | unsigned int gpu_id; | ||
1648 | 1659 | ||
1649 | process = kfd_get_process(current); | 1660 | process = kfd_get_process(current); |
1650 | if (IS_ERR(process)) | 1661 | if (IS_ERR(process)) |
1651 | return PTR_ERR(process); | 1662 | return PTR_ERR(process); |
1652 | 1663 | ||
1653 | if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == | 1664 | vm_pgoff = vma->vm_pgoff; |
1654 | KFD_MMAP_DOORBELL_MASK) { | 1665 | vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff); |
1655 | vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; | 1666 | gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff); |
1656 | return kfd_doorbell_mmap(process, vma); | 1667 | if (gpu_id) |
1657 | } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == | 1668 | dev = kfd_device_by_id(gpu_id); |
1658 | KFD_MMAP_EVENTS_MASK) { | 1669 | |
1659 | vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; | 1670 | switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { |
1671 | case KFD_MMAP_TYPE_DOORBELL: | ||
1672 | if (!dev) | ||
1673 | return -ENODEV; | ||
1674 | return kfd_doorbell_mmap(dev, process, vma); | ||
1675 | |||
1676 | case KFD_MMAP_TYPE_EVENTS: | ||
1660 | return kfd_event_mmap(process, vma); | 1677 | return kfd_event_mmap(process, vma); |
1661 | } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == | 1678 | |
1662 | KFD_MMAP_RESERVED_MEM_MASK) { | 1679 | case KFD_MMAP_TYPE_RESERVED_MEM: |
1663 | vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; | 1680 | if (!dev) |
1664 | return kfd_reserved_mem_mmap(process, vma); | 1681 | return -ENODEV; |
1682 | return kfd_reserved_mem_mmap(dev, process, vma); | ||
1665 | } | 1683 | } |
1666 | 1684 | ||
1667 | return -EFAULT; | 1685 | return -EFAULT; |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 4f126ef6139b..296b3f230280 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c | |||
@@ -132,6 +132,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { | |||
132 | #define fiji_cache_info carrizo_cache_info | 132 | #define fiji_cache_info carrizo_cache_info |
133 | #define polaris10_cache_info carrizo_cache_info | 133 | #define polaris10_cache_info carrizo_cache_info |
134 | #define polaris11_cache_info carrizo_cache_info | 134 | #define polaris11_cache_info carrizo_cache_info |
135 | /* TODO - check & update Vega10 cache details */ | ||
136 | #define vega10_cache_info carrizo_cache_info | ||
137 | #define raven_cache_info carrizo_cache_info | ||
135 | 138 | ||
136 | static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, | 139 | static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, |
137 | struct crat_subtype_computeunit *cu) | 140 | struct crat_subtype_computeunit *cu) |
@@ -603,6 +606,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, | |||
603 | pcache_info = polaris11_cache_info; | 606 | pcache_info = polaris11_cache_info; |
604 | num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); | 607 | num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); |
605 | break; | 608 | break; |
609 | case CHIP_VEGA10: | ||
610 | pcache_info = vega10_cache_info; | ||
611 | num_of_cache_types = ARRAY_SIZE(vega10_cache_info); | ||
612 | break; | ||
613 | case CHIP_RAVEN: | ||
614 | pcache_info = raven_cache_info; | ||
615 | num_of_cache_types = ARRAY_SIZE(raven_cache_info); | ||
616 | break; | ||
606 | default: | 617 | default: |
607 | return -EINVAL; | 618 | return -EINVAL; |
608 | } | 619 | } |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 3346699960dd..7ee6cec2c060 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c | |||
@@ -20,16 +20,13 @@ | |||
20 | * OTHER DEALINGS IN THE SOFTWARE. | 20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) | ||
24 | #include <linux/amd-iommu.h> | ||
25 | #endif | ||
26 | #include <linux/bsearch.h> | 23 | #include <linux/bsearch.h> |
27 | #include <linux/pci.h> | 24 | #include <linux/pci.h> |
28 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
29 | #include "kfd_priv.h" | 26 | #include "kfd_priv.h" |
30 | #include "kfd_device_queue_manager.h" | 27 | #include "kfd_device_queue_manager.h" |
31 | #include "kfd_pm4_headers_vi.h" | 28 | #include "kfd_pm4_headers_vi.h" |
32 | #include "cwsr_trap_handler_gfx8.asm" | 29 | #include "cwsr_trap_handler.h" |
33 | #include "kfd_iommu.h" | 30 | #include "kfd_iommu.h" |
34 | 31 | ||
35 | #define MQD_SIZE_ALIGNED 768 | 32 | #define MQD_SIZE_ALIGNED 768 |
@@ -41,6 +38,7 @@ static const struct kfd_device_info kaveri_device_info = { | |||
41 | .max_pasid_bits = 16, | 38 | .max_pasid_bits = 16, |
42 | /* max num of queues for KV.TODO should be a dynamic value */ | 39 | /* max num of queues for KV.TODO should be a dynamic value */ |
43 | .max_no_of_hqd = 24, | 40 | .max_no_of_hqd = 24, |
41 | .doorbell_size = 4, | ||
44 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 42 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
45 | .event_interrupt_class = &event_interrupt_class_cik, | 43 | .event_interrupt_class = &event_interrupt_class_cik, |
46 | .num_of_watch_points = 4, | 44 | .num_of_watch_points = 4, |
@@ -55,6 +53,7 @@ static const struct kfd_device_info carrizo_device_info = { | |||
55 | .max_pasid_bits = 16, | 53 | .max_pasid_bits = 16, |
56 | /* max num of queues for CZ.TODO should be a dynamic value */ | 54 | /* max num of queues for CZ.TODO should be a dynamic value */ |
57 | .max_no_of_hqd = 24, | 55 | .max_no_of_hqd = 24, |
56 | .doorbell_size = 4, | ||
58 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 57 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
59 | .event_interrupt_class = &event_interrupt_class_cik, | 58 | .event_interrupt_class = &event_interrupt_class_cik, |
60 | .num_of_watch_points = 4, | 59 | .num_of_watch_points = 4, |
@@ -70,6 +69,7 @@ static const struct kfd_device_info hawaii_device_info = { | |||
70 | .max_pasid_bits = 16, | 69 | .max_pasid_bits = 16, |
71 | /* max num of queues for KV.TODO should be a dynamic value */ | 70 | /* max num of queues for KV.TODO should be a dynamic value */ |
72 | .max_no_of_hqd = 24, | 71 | .max_no_of_hqd = 24, |
72 | .doorbell_size = 4, | ||
73 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 73 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
74 | .event_interrupt_class = &event_interrupt_class_cik, | 74 | .event_interrupt_class = &event_interrupt_class_cik, |
75 | .num_of_watch_points = 4, | 75 | .num_of_watch_points = 4, |
@@ -83,6 +83,7 @@ static const struct kfd_device_info tonga_device_info = { | |||
83 | .asic_family = CHIP_TONGA, | 83 | .asic_family = CHIP_TONGA, |
84 | .max_pasid_bits = 16, | 84 | .max_pasid_bits = 16, |
85 | .max_no_of_hqd = 24, | 85 | .max_no_of_hqd = 24, |
86 | .doorbell_size = 4, | ||
86 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 87 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
87 | .event_interrupt_class = &event_interrupt_class_cik, | 88 | .event_interrupt_class = &event_interrupt_class_cik, |
88 | .num_of_watch_points = 4, | 89 | .num_of_watch_points = 4, |
@@ -96,6 +97,7 @@ static const struct kfd_device_info tonga_vf_device_info = { | |||
96 | .asic_family = CHIP_TONGA, | 97 | .asic_family = CHIP_TONGA, |
97 | .max_pasid_bits = 16, | 98 | .max_pasid_bits = 16, |
98 | .max_no_of_hqd = 24, | 99 | .max_no_of_hqd = 24, |
100 | .doorbell_size = 4, | ||
99 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 101 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
100 | .event_interrupt_class = &event_interrupt_class_cik, | 102 | .event_interrupt_class = &event_interrupt_class_cik, |
101 | .num_of_watch_points = 4, | 103 | .num_of_watch_points = 4, |
@@ -109,6 +111,7 @@ static const struct kfd_device_info fiji_device_info = { | |||
109 | .asic_family = CHIP_FIJI, | 111 | .asic_family = CHIP_FIJI, |
110 | .max_pasid_bits = 16, | 112 | .max_pasid_bits = 16, |
111 | .max_no_of_hqd = 24, | 113 | .max_no_of_hqd = 24, |
114 | .doorbell_size = 4, | ||
112 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 115 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
113 | .event_interrupt_class = &event_interrupt_class_cik, | 116 | .event_interrupt_class = &event_interrupt_class_cik, |
114 | .num_of_watch_points = 4, | 117 | .num_of_watch_points = 4, |
@@ -122,6 +125,7 @@ static const struct kfd_device_info fiji_vf_device_info = { | |||
122 | .asic_family = CHIP_FIJI, | 125 | .asic_family = CHIP_FIJI, |
123 | .max_pasid_bits = 16, | 126 | .max_pasid_bits = 16, |
124 | .max_no_of_hqd = 24, | 127 | .max_no_of_hqd = 24, |
128 | .doorbell_size = 4, | ||
125 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 129 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
126 | .event_interrupt_class = &event_interrupt_class_cik, | 130 | .event_interrupt_class = &event_interrupt_class_cik, |
127 | .num_of_watch_points = 4, | 131 | .num_of_watch_points = 4, |
@@ -136,6 +140,7 @@ static const struct kfd_device_info polaris10_device_info = { | |||
136 | .asic_family = CHIP_POLARIS10, | 140 | .asic_family = CHIP_POLARIS10, |
137 | .max_pasid_bits = 16, | 141 | .max_pasid_bits = 16, |
138 | .max_no_of_hqd = 24, | 142 | .max_no_of_hqd = 24, |
143 | .doorbell_size = 4, | ||
139 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 144 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
140 | .event_interrupt_class = &event_interrupt_class_cik, | 145 | .event_interrupt_class = &event_interrupt_class_cik, |
141 | .num_of_watch_points = 4, | 146 | .num_of_watch_points = 4, |
@@ -149,6 +154,7 @@ static const struct kfd_device_info polaris10_vf_device_info = { | |||
149 | .asic_family = CHIP_POLARIS10, | 154 | .asic_family = CHIP_POLARIS10, |
150 | .max_pasid_bits = 16, | 155 | .max_pasid_bits = 16, |
151 | .max_no_of_hqd = 24, | 156 | .max_no_of_hqd = 24, |
157 | .doorbell_size = 4, | ||
152 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 158 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
153 | .event_interrupt_class = &event_interrupt_class_cik, | 159 | .event_interrupt_class = &event_interrupt_class_cik, |
154 | .num_of_watch_points = 4, | 160 | .num_of_watch_points = 4, |
@@ -162,6 +168,7 @@ static const struct kfd_device_info polaris11_device_info = { | |||
162 | .asic_family = CHIP_POLARIS11, | 168 | .asic_family = CHIP_POLARIS11, |
163 | .max_pasid_bits = 16, | 169 | .max_pasid_bits = 16, |
164 | .max_no_of_hqd = 24, | 170 | .max_no_of_hqd = 24, |
171 | .doorbell_size = 4, | ||
165 | .ih_ring_entry_size = 4 * sizeof(uint32_t), | 172 | .ih_ring_entry_size = 4 * sizeof(uint32_t), |
166 | .event_interrupt_class = &event_interrupt_class_cik, | 173 | .event_interrupt_class = &event_interrupt_class_cik, |
167 | .num_of_watch_points = 4, | 174 | .num_of_watch_points = 4, |
@@ -171,6 +178,34 @@ static const struct kfd_device_info polaris11_device_info = { | |||
171 | .needs_pci_atomics = true, | 178 | .needs_pci_atomics = true, |
172 | }; | 179 | }; |
173 | 180 | ||
181 | static const struct kfd_device_info vega10_device_info = { | ||
182 | .asic_family = CHIP_VEGA10, | ||
183 | .max_pasid_bits = 16, | ||
184 | .max_no_of_hqd = 24, | ||
185 | .doorbell_size = 8, | ||
186 | .ih_ring_entry_size = 8 * sizeof(uint32_t), | ||
187 | .event_interrupt_class = &event_interrupt_class_v9, | ||
188 | .num_of_watch_points = 4, | ||
189 | .mqd_size_aligned = MQD_SIZE_ALIGNED, | ||
190 | .supports_cwsr = true, | ||
191 | .needs_iommu_device = false, | ||
192 | .needs_pci_atomics = false, | ||
193 | }; | ||
194 | |||
195 | static const struct kfd_device_info vega10_vf_device_info = { | ||
196 | .asic_family = CHIP_VEGA10, | ||
197 | .max_pasid_bits = 16, | ||
198 | .max_no_of_hqd = 24, | ||
199 | .doorbell_size = 8, | ||
200 | .ih_ring_entry_size = 8 * sizeof(uint32_t), | ||
201 | .event_interrupt_class = &event_interrupt_class_v9, | ||
202 | .num_of_watch_points = 4, | ||
203 | .mqd_size_aligned = MQD_SIZE_ALIGNED, | ||
204 | .supports_cwsr = true, | ||
205 | .needs_iommu_device = false, | ||
206 | .needs_pci_atomics = false, | ||
207 | }; | ||
208 | |||
174 | 209 | ||
175 | struct kfd_deviceid { | 210 | struct kfd_deviceid { |
176 | unsigned short did; | 211 | unsigned short did; |
@@ -250,6 +285,15 @@ static const struct kfd_deviceid supported_devices[] = { | |||
250 | { 0x67EB, &polaris11_device_info }, /* Polaris11 */ | 285 | { 0x67EB, &polaris11_device_info }, /* Polaris11 */ |
251 | { 0x67EF, &polaris11_device_info }, /* Polaris11 */ | 286 | { 0x67EF, &polaris11_device_info }, /* Polaris11 */ |
252 | { 0x67FF, &polaris11_device_info }, /* Polaris11 */ | 287 | { 0x67FF, &polaris11_device_info }, /* Polaris11 */ |
288 | { 0x6860, &vega10_device_info }, /* Vega10 */ | ||
289 | { 0x6861, &vega10_device_info }, /* Vega10 */ | ||
290 | { 0x6862, &vega10_device_info }, /* Vega10 */ | ||
291 | { 0x6863, &vega10_device_info }, /* Vega10 */ | ||
292 | { 0x6864, &vega10_device_info }, /* Vega10 */ | ||
293 | { 0x6867, &vega10_device_info }, /* Vega10 */ | ||
294 | { 0x6868, &vega10_device_info }, /* Vega10 */ | ||
295 | { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ | ||
296 | { 0x687F, &vega10_device_info }, /* Vega10 */ | ||
253 | }; | 297 | }; |
254 | 298 | ||
255 | static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, | 299 | static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, |
@@ -279,7 +323,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, | |||
279 | struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) | 323 | struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) |
280 | { | 324 | { |
281 | struct kfd_dev *kfd; | 325 | struct kfd_dev *kfd; |
282 | 326 | int ret; | |
283 | const struct kfd_device_info *device_info = | 327 | const struct kfd_device_info *device_info = |
284 | lookup_device_info(pdev->device); | 328 | lookup_device_info(pdev->device); |
285 | 329 | ||
@@ -288,19 +332,18 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, | |||
288 | return NULL; | 332 | return NULL; |
289 | } | 333 | } |
290 | 334 | ||
291 | if (device_info->needs_pci_atomics) { | 335 | /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. |
292 | /* Allow BIF to recode atomics to PCIe 3.0 | 336 | * 32 and 64-bit requests are possible and must be |
293 | * AtomicOps. 32 and 64-bit requests are possible and | 337 | * supported. |
294 | * must be supported. | 338 | */ |
295 | */ | 339 | ret = pci_enable_atomic_ops_to_root(pdev, |
296 | if (pci_enable_atomic_ops_to_root(pdev, | 340 | PCI_EXP_DEVCAP2_ATOMIC_COMP32 | |
297 | PCI_EXP_DEVCAP2_ATOMIC_COMP32 | | 341 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); |
298 | PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) { | 342 | if (device_info->needs_pci_atomics && ret < 0) { |
299 | dev_info(kfd_device, | 343 | dev_info(kfd_device, |
300 | "skipped device %x:%x, PCI rejects atomics", | 344 | "skipped device %x:%x, PCI rejects atomics\n", |
301 | pdev->vendor, pdev->device); | 345 | pdev->vendor, pdev->device); |
302 | return NULL; | 346 | return NULL; |
303 | } | ||
304 | } | 347 | } |
305 | 348 | ||
306 | kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); | 349 | kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); |
@@ -323,10 +366,16 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, | |||
323 | static void kfd_cwsr_init(struct kfd_dev *kfd) | 366 | static void kfd_cwsr_init(struct kfd_dev *kfd) |
324 | { | 367 | { |
325 | if (cwsr_enable && kfd->device_info->supports_cwsr) { | 368 | if (cwsr_enable && kfd->device_info->supports_cwsr) { |
326 | BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); | 369 | if (kfd->device_info->asic_family < CHIP_VEGA10) { |
370 | BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); | ||
371 | kfd->cwsr_isa = cwsr_trap_gfx8_hex; | ||
372 | kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); | ||
373 | } else { | ||
374 | BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); | ||
375 | kfd->cwsr_isa = cwsr_trap_gfx9_hex; | ||
376 | kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); | ||
377 | } | ||
327 | 378 | ||
328 | kfd->cwsr_isa = cwsr_trap_gfx8_hex; | ||
329 | kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); | ||
330 | kfd->cwsr_enabled = true; | 379 | kfd->cwsr_enabled = true; |
331 | } | 380 | } |
332 | } | 381 | } |
@@ -541,6 +590,44 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) | |||
541 | spin_unlock(&kfd->interrupt_lock); | 590 | spin_unlock(&kfd->interrupt_lock); |
542 | } | 591 | } |
543 | 592 | ||
593 | int kgd2kfd_quiesce_mm(struct mm_struct *mm) | ||
594 | { | ||
595 | struct kfd_process *p; | ||
596 | int r; | ||
597 | |||
598 | /* Because we are called from arbitrary context (workqueue) as opposed | ||
599 | * to process context, kfd_process could attempt to exit while we are | ||
600 | * running so the lookup function increments the process ref count. | ||
601 | */ | ||
602 | p = kfd_lookup_process_by_mm(mm); | ||
603 | if (!p) | ||
604 | return -ESRCH; | ||
605 | |||
606 | r = kfd_process_evict_queues(p); | ||
607 | |||
608 | kfd_unref_process(p); | ||
609 | return r; | ||
610 | } | ||
611 | |||
612 | int kgd2kfd_resume_mm(struct mm_struct *mm) | ||
613 | { | ||
614 | struct kfd_process *p; | ||
615 | int r; | ||
616 | |||
617 | /* Because we are called from arbitrary context (workqueue) as opposed | ||
618 | * to process context, kfd_process could attempt to exit while we are | ||
619 | * running so the lookup function increments the process ref count. | ||
620 | */ | ||
621 | p = kfd_lookup_process_by_mm(mm); | ||
622 | if (!p) | ||
623 | return -ESRCH; | ||
624 | |||
625 | r = kfd_process_restore_queues(p); | ||
626 | |||
627 | kfd_unref_process(p); | ||
628 | return r; | ||
629 | } | ||
630 | |||
544 | /** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will | 631 | /** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will |
545 | * prepare for safe eviction of KFD BOs that belong to the specified | 632 | * prepare for safe eviction of KFD BOs that belong to the specified |
546 | * process. | 633 | * process. |
@@ -652,7 +739,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, | |||
652 | if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) | 739 | if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) |
653 | return -ENOMEM; | 740 | return -ENOMEM; |
654 | 741 | ||
655 | *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); | 742 | *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); |
656 | if ((*mem_obj) == NULL) | 743 | if ((*mem_obj) == NULL) |
657 | return -ENOMEM; | 744 | return -ENOMEM; |
658 | 745 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index d55d29d31da4..668ad07ebe1f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | |||
@@ -110,6 +110,57 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, | |||
110 | qpd->sh_mem_bases); | 110 | qpd->sh_mem_bases); |
111 | } | 111 | } |
112 | 112 | ||
113 | static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) | ||
114 | { | ||
115 | struct kfd_dev *dev = qpd->dqm->dev; | ||
116 | |||
117 | if (!KFD_IS_SOC15(dev->device_info->asic_family)) { | ||
118 | /* On pre-SOC15 chips we need to use the queue ID to | ||
119 | * preserve the user mode ABI. | ||
120 | */ | ||
121 | q->doorbell_id = q->properties.queue_id; | ||
122 | } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { | ||
123 | /* For SDMA queues on SOC15, use static doorbell | ||
124 | * assignments based on the engine and queue. | ||
125 | */ | ||
126 | q->doorbell_id = dev->shared_resources.sdma_doorbell | ||
127 | [q->properties.sdma_engine_id] | ||
128 | [q->properties.sdma_queue_id]; | ||
129 | } else { | ||
130 | /* For CP queues on SOC15 reserve a free doorbell ID */ | ||
131 | unsigned int found; | ||
132 | |||
133 | found = find_first_zero_bit(qpd->doorbell_bitmap, | ||
134 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); | ||
135 | if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { | ||
136 | pr_debug("No doorbells available"); | ||
137 | return -EBUSY; | ||
138 | } | ||
139 | set_bit(found, qpd->doorbell_bitmap); | ||
140 | q->doorbell_id = found; | ||
141 | } | ||
142 | |||
143 | q->properties.doorbell_off = | ||
144 | kfd_doorbell_id_to_offset(dev, q->process, | ||
145 | q->doorbell_id); | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | static void deallocate_doorbell(struct qcm_process_device *qpd, | ||
151 | struct queue *q) | ||
152 | { | ||
153 | unsigned int old; | ||
154 | struct kfd_dev *dev = qpd->dqm->dev; | ||
155 | |||
156 | if (!KFD_IS_SOC15(dev->device_info->asic_family) || | ||
157 | q->properties.type == KFD_QUEUE_TYPE_SDMA) | ||
158 | return; | ||
159 | |||
160 | old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); | ||
161 | WARN_ON(!old); | ||
162 | } | ||
163 | |||
113 | static int allocate_vmid(struct device_queue_manager *dqm, | 164 | static int allocate_vmid(struct device_queue_manager *dqm, |
114 | struct qcm_process_device *qpd, | 165 | struct qcm_process_device *qpd, |
115 | struct queue *q) | 166 | struct queue *q) |
@@ -145,15 +196,19 @@ static int allocate_vmid(struct device_queue_manager *dqm, | |||
145 | static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, | 196 | static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, |
146 | struct qcm_process_device *qpd) | 197 | struct qcm_process_device *qpd) |
147 | { | 198 | { |
148 | uint32_t len; | 199 | const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf; |
200 | int ret; | ||
149 | 201 | ||
150 | if (!qpd->ib_kaddr) | 202 | if (!qpd->ib_kaddr) |
151 | return -ENOMEM; | 203 | return -ENOMEM; |
152 | 204 | ||
153 | len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); | 205 | ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); |
206 | if (ret) | ||
207 | return ret; | ||
154 | 208 | ||
155 | return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, | 209 | return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, |
156 | qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); | 210 | qpd->ib_base, (uint32_t *)qpd->ib_kaddr, |
211 | pmf->release_mem_size / sizeof(uint32_t)); | ||
157 | } | 212 | } |
158 | 213 | ||
159 | static void deallocate_vmid(struct device_queue_manager *dqm, | 214 | static void deallocate_vmid(struct device_queue_manager *dqm, |
@@ -301,10 +356,14 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, | |||
301 | if (retval) | 356 | if (retval) |
302 | return retval; | 357 | return retval; |
303 | 358 | ||
359 | retval = allocate_doorbell(qpd, q); | ||
360 | if (retval) | ||
361 | goto out_deallocate_hqd; | ||
362 | |||
304 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, | 363 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, |
305 | &q->gart_mqd_addr, &q->properties); | 364 | &q->gart_mqd_addr, &q->properties); |
306 | if (retval) | 365 | if (retval) |
307 | goto out_deallocate_hqd; | 366 | goto out_deallocate_doorbell; |
308 | 367 | ||
309 | pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", | 368 | pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", |
310 | q->pipe, q->queue); | 369 | q->pipe, q->queue); |
@@ -324,6 +383,8 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, | |||
324 | 383 | ||
325 | out_uninit_mqd: | 384 | out_uninit_mqd: |
326 | mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); | 385 | mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); |
386 | out_deallocate_doorbell: | ||
387 | deallocate_doorbell(qpd, q); | ||
327 | out_deallocate_hqd: | 388 | out_deallocate_hqd: |
328 | deallocate_hqd(dqm, q); | 389 | deallocate_hqd(dqm, q); |
329 | 390 | ||
@@ -357,6 +418,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, | |||
357 | } | 418 | } |
358 | dqm->total_queue_count--; | 419 | dqm->total_queue_count--; |
359 | 420 | ||
421 | deallocate_doorbell(qpd, q); | ||
422 | |||
360 | retval = mqd->destroy_mqd(mqd, q->mqd, | 423 | retval = mqd->destroy_mqd(mqd, q->mqd, |
361 | KFD_PREEMPT_TYPE_WAVEFRONT_RESET, | 424 | KFD_PREEMPT_TYPE_WAVEFRONT_RESET, |
362 | KFD_UNMAP_LATENCY_MS, | 425 | KFD_UNMAP_LATENCY_MS, |
@@ -861,6 +924,10 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, | |||
861 | q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; | 924 | q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; |
862 | q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; | 925 | q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; |
863 | 926 | ||
927 | retval = allocate_doorbell(qpd, q); | ||
928 | if (retval) | ||
929 | goto out_deallocate_sdma_queue; | ||
930 | |||
864 | pr_debug("SDMA id is: %d\n", q->sdma_id); | 931 | pr_debug("SDMA id is: %d\n", q->sdma_id); |
865 | pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); | 932 | pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); |
866 | pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); | 933 | pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); |
@@ -869,7 +936,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, | |||
869 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, | 936 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, |
870 | &q->gart_mqd_addr, &q->properties); | 937 | &q->gart_mqd_addr, &q->properties); |
871 | if (retval) | 938 | if (retval) |
872 | goto out_deallocate_sdma_queue; | 939 | goto out_deallocate_doorbell; |
873 | 940 | ||
874 | retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); | 941 | retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); |
875 | if (retval) | 942 | if (retval) |
@@ -879,6 +946,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, | |||
879 | 946 | ||
880 | out_uninit_mqd: | 947 | out_uninit_mqd: |
881 | mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); | 948 | mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); |
949 | out_deallocate_doorbell: | ||
950 | deallocate_doorbell(qpd, q); | ||
882 | out_deallocate_sdma_queue: | 951 | out_deallocate_sdma_queue: |
883 | deallocate_sdma_queue(dqm, q->sdma_id); | 952 | deallocate_sdma_queue(dqm, q->sdma_id); |
884 | 953 | ||
@@ -1070,12 +1139,17 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, | |||
1070 | q->properties.sdma_engine_id = | 1139 | q->properties.sdma_engine_id = |
1071 | q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; | 1140 | q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; |
1072 | } | 1141 | } |
1142 | |||
1143 | retval = allocate_doorbell(qpd, q); | ||
1144 | if (retval) | ||
1145 | goto out_deallocate_sdma_queue; | ||
1146 | |||
1073 | mqd = dqm->ops.get_mqd_manager(dqm, | 1147 | mqd = dqm->ops.get_mqd_manager(dqm, |
1074 | get_mqd_type_from_queue_type(q->properties.type)); | 1148 | get_mqd_type_from_queue_type(q->properties.type)); |
1075 | 1149 | ||
1076 | if (!mqd) { | 1150 | if (!mqd) { |
1077 | retval = -ENOMEM; | 1151 | retval = -ENOMEM; |
1078 | goto out_deallocate_sdma_queue; | 1152 | goto out_deallocate_doorbell; |
1079 | } | 1153 | } |
1080 | /* | 1154 | /* |
1081 | * Eviction state logic: we only mark active queues as evicted | 1155 | * Eviction state logic: we only mark active queues as evicted |
@@ -1093,7 +1167,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, | |||
1093 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, | 1167 | retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, |
1094 | &q->gart_mqd_addr, &q->properties); | 1168 | &q->gart_mqd_addr, &q->properties); |
1095 | if (retval) | 1169 | if (retval) |
1096 | goto out_deallocate_sdma_queue; | 1170 | goto out_deallocate_doorbell; |
1097 | 1171 | ||
1098 | list_add(&q->list, &qpd->queues_list); | 1172 | list_add(&q->list, &qpd->queues_list); |
1099 | qpd->queue_count++; | 1173 | qpd->queue_count++; |
@@ -1117,6 +1191,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, | |||
1117 | mutex_unlock(&dqm->lock); | 1191 | mutex_unlock(&dqm->lock); |
1118 | return retval; | 1192 | return retval; |
1119 | 1193 | ||
1194 | out_deallocate_doorbell: | ||
1195 | deallocate_doorbell(qpd, q); | ||
1120 | out_deallocate_sdma_queue: | 1196 | out_deallocate_sdma_queue: |
1121 | if (q->properties.type == KFD_QUEUE_TYPE_SDMA) | 1197 | if (q->properties.type == KFD_QUEUE_TYPE_SDMA) |
1122 | deallocate_sdma_queue(dqm, q->sdma_id); | 1198 | deallocate_sdma_queue(dqm, q->sdma_id); |
@@ -1257,6 +1333,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, | |||
1257 | goto failed; | 1333 | goto failed; |
1258 | } | 1334 | } |
1259 | 1335 | ||
1336 | deallocate_doorbell(qpd, q); | ||
1337 | |||
1260 | if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { | 1338 | if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { |
1261 | dqm->sdma_queue_count--; | 1339 | dqm->sdma_queue_count--; |
1262 | deallocate_sdma_queue(dqm, q->sdma_id); | 1340 | deallocate_sdma_queue(dqm, q->sdma_id); |
@@ -1308,7 +1386,10 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, | |||
1308 | void __user *alternate_aperture_base, | 1386 | void __user *alternate_aperture_base, |
1309 | uint64_t alternate_aperture_size) | 1387 | uint64_t alternate_aperture_size) |
1310 | { | 1388 | { |
1311 | bool retval; | 1389 | bool retval = true; |
1390 | |||
1391 | if (!dqm->asic_ops.set_cache_memory_policy) | ||
1392 | return retval; | ||
1312 | 1393 | ||
1313 | mutex_lock(&dqm->lock); | 1394 | mutex_lock(&dqm->lock); |
1314 | 1395 | ||
@@ -1577,6 +1658,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) | |||
1577 | case CHIP_POLARIS11: | 1658 | case CHIP_POLARIS11: |
1578 | device_queue_manager_init_vi_tonga(&dqm->asic_ops); | 1659 | device_queue_manager_init_vi_tonga(&dqm->asic_ops); |
1579 | break; | 1660 | break; |
1661 | |||
1662 | case CHIP_VEGA10: | ||
1663 | case CHIP_RAVEN: | ||
1664 | device_queue_manager_init_v9(&dqm->asic_ops); | ||
1665 | break; | ||
1580 | default: | 1666 | default: |
1581 | WARN(1, "Unexpected ASIC family %u", | 1667 | WARN(1, "Unexpected ASIC family %u", |
1582 | dev->device_info->asic_family); | 1668 | dev->device_info->asic_family); |
@@ -1627,6 +1713,18 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) | |||
1627 | int pipe, queue; | 1713 | int pipe, queue; |
1628 | int r = 0; | 1714 | int r = 0; |
1629 | 1715 | ||
1716 | r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, | ||
1717 | KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); | ||
1718 | if (!r) { | ||
1719 | seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", | ||
1720 | KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, | ||
1721 | KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), | ||
1722 | KFD_CIK_HIQ_QUEUE); | ||
1723 | seq_reg_dump(m, dump, n_regs); | ||
1724 | |||
1725 | kfree(dump); | ||
1726 | } | ||
1727 | |||
1630 | for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { | 1728 | for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { |
1631 | int pipe_offset = pipe * get_queues_per_pipe(dqm); | 1729 | int pipe_offset = pipe * get_queues_per_pipe(dqm); |
1632 | 1730 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 412beff3281d..59a6b1956932 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | |||
@@ -200,6 +200,8 @@ void device_queue_manager_init_vi( | |||
200 | struct device_queue_manager_asic_ops *asic_ops); | 200 | struct device_queue_manager_asic_ops *asic_ops); |
201 | void device_queue_manager_init_vi_tonga( | 201 | void device_queue_manager_init_vi_tonga( |
202 | struct device_queue_manager_asic_ops *asic_ops); | 202 | struct device_queue_manager_asic_ops *asic_ops); |
203 | void device_queue_manager_init_v9( | ||
204 | struct device_queue_manager_asic_ops *asic_ops); | ||
203 | void program_sh_mem_settings(struct device_queue_manager *dqm, | 205 | void program_sh_mem_settings(struct device_queue_manager *dqm, |
204 | struct qcm_process_device *qpd); | 206 | struct qcm_process_device *qpd); |
205 | unsigned int get_queues_num(struct device_queue_manager *dqm); | 207 | unsigned int get_queues_num(struct device_queue_manager *dqm); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c new file mode 100644 index 000000000000..79e5bcf6367c --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c | |||
@@ -0,0 +1,84 @@ | |||
1 | /* | ||
2 | * Copyright 2016-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include "kfd_device_queue_manager.h" | ||
25 | #include "vega10_enum.h" | ||
26 | #include "gc/gc_9_0_offset.h" | ||
27 | #include "gc/gc_9_0_sh_mask.h" | ||
28 | #include "sdma0/sdma0_4_0_sh_mask.h" | ||
29 | |||
30 | static int update_qpd_v9(struct device_queue_manager *dqm, | ||
31 | struct qcm_process_device *qpd); | ||
32 | static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, | ||
33 | struct qcm_process_device *qpd); | ||
34 | |||
35 | void device_queue_manager_init_v9( | ||
36 | struct device_queue_manager_asic_ops *asic_ops) | ||
37 | { | ||
38 | asic_ops->update_qpd = update_qpd_v9; | ||
39 | asic_ops->init_sdma_vm = init_sdma_vm_v9; | ||
40 | } | ||
41 | |||
42 | static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) | ||
43 | { | ||
44 | uint32_t shared_base = pdd->lds_base >> 48; | ||
45 | uint32_t private_base = pdd->scratch_base >> 48; | ||
46 | |||
47 | return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | | ||
48 | private_base; | ||
49 | } | ||
50 | |||
51 | static int update_qpd_v9(struct device_queue_manager *dqm, | ||
52 | struct qcm_process_device *qpd) | ||
53 | { | ||
54 | struct kfd_process_device *pdd; | ||
55 | |||
56 | pdd = qpd_to_pdd(qpd); | ||
57 | |||
58 | /* check if sh_mem_config register already configured */ | ||
59 | if (qpd->sh_mem_config == 0) { | ||
60 | qpd->sh_mem_config = | ||
61 | SH_MEM_ALIGNMENT_MODE_UNALIGNED << | ||
62 | SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; | ||
63 | if (vega10_noretry && | ||
64 | !dqm->dev->device_info->needs_iommu_device) | ||
65 | qpd->sh_mem_config |= | ||
66 | 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; | ||
67 | |||
68 | qpd->sh_mem_ape1_limit = 0; | ||
69 | qpd->sh_mem_ape1_base = 0; | ||
70 | } | ||
71 | |||
72 | qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); | ||
73 | |||
74 | pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); | ||
75 | |||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, | ||
80 | struct qcm_process_device *qpd) | ||
81 | { | ||
82 | /* Not needed on SDMAv4 any more */ | ||
83 | q->properties.sdma_vm_addr = 0; | ||
84 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index ebb4da14e3df..c3744d89352c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | |||
@@ -33,7 +33,6 @@ | |||
33 | 33 | ||
34 | static DEFINE_IDA(doorbell_ida); | 34 | static DEFINE_IDA(doorbell_ida); |
35 | static unsigned int max_doorbell_slices; | 35 | static unsigned int max_doorbell_slices; |
36 | #define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 | ||
37 | 36 | ||
38 | /* | 37 | /* |
39 | * Each device exposes a doorbell aperture, a PCI MMIO aperture that | 38 | * Each device exposes a doorbell aperture, a PCI MMIO aperture that |
@@ -50,9 +49,9 @@ static unsigned int max_doorbell_slices; | |||
50 | */ | 49 | */ |
51 | 50 | ||
52 | /* # of doorbell bytes allocated for each process. */ | 51 | /* # of doorbell bytes allocated for each process. */ |
53 | static inline size_t doorbell_process_allocation(void) | 52 | size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) |
54 | { | 53 | { |
55 | return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * | 54 | return roundup(kfd->device_info->doorbell_size * |
56 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, | 55 | KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, |
57 | PAGE_SIZE); | 56 | PAGE_SIZE); |
58 | } | 57 | } |
@@ -72,16 +71,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) | |||
72 | 71 | ||
73 | doorbell_start_offset = | 72 | doorbell_start_offset = |
74 | roundup(kfd->shared_resources.doorbell_start_offset, | 73 | roundup(kfd->shared_resources.doorbell_start_offset, |
75 | doorbell_process_allocation()); | 74 | kfd_doorbell_process_slice(kfd)); |
76 | 75 | ||
77 | doorbell_aperture_size = | 76 | doorbell_aperture_size = |
78 | rounddown(kfd->shared_resources.doorbell_aperture_size, | 77 | rounddown(kfd->shared_resources.doorbell_aperture_size, |
79 | doorbell_process_allocation()); | 78 | kfd_doorbell_process_slice(kfd)); |
80 | 79 | ||
81 | if (doorbell_aperture_size > doorbell_start_offset) | 80 | if (doorbell_aperture_size > doorbell_start_offset) |
82 | doorbell_process_limit = | 81 | doorbell_process_limit = |
83 | (doorbell_aperture_size - doorbell_start_offset) / | 82 | (doorbell_aperture_size - doorbell_start_offset) / |
84 | doorbell_process_allocation(); | 83 | kfd_doorbell_process_slice(kfd); |
85 | else | 84 | else |
86 | return -ENOSPC; | 85 | return -ENOSPC; |
87 | 86 | ||
@@ -95,7 +94,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) | |||
95 | kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); | 94 | kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); |
96 | 95 | ||
97 | kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, | 96 | kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, |
98 | doorbell_process_allocation()); | 97 | kfd_doorbell_process_slice(kfd)); |
99 | 98 | ||
100 | if (!kfd->doorbell_kernel_ptr) | 99 | if (!kfd->doorbell_kernel_ptr) |
101 | return -ENOMEM; | 100 | return -ENOMEM; |
@@ -127,21 +126,16 @@ void kfd_doorbell_fini(struct kfd_dev *kfd) | |||
127 | iounmap(kfd->doorbell_kernel_ptr); | 126 | iounmap(kfd->doorbell_kernel_ptr); |
128 | } | 127 | } |
129 | 128 | ||
130 | int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) | 129 | int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, |
130 | struct vm_area_struct *vma) | ||
131 | { | 131 | { |
132 | phys_addr_t address; | 132 | phys_addr_t address; |
133 | struct kfd_dev *dev; | ||
134 | 133 | ||
135 | /* | 134 | /* |
136 | * For simplicitly we only allow mapping of the entire doorbell | 135 | * For simplicitly we only allow mapping of the entire doorbell |
137 | * allocation of a single device & process. | 136 | * allocation of a single device & process. |
138 | */ | 137 | */ |
139 | if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) | 138 | if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) |
140 | return -EINVAL; | ||
141 | |||
142 | /* Find kfd device according to gpu id */ | ||
143 | dev = kfd_device_by_id(vma->vm_pgoff); | ||
144 | if (!dev) | ||
145 | return -EINVAL; | 139 | return -EINVAL; |
146 | 140 | ||
147 | /* Calculate physical address of doorbell */ | 141 | /* Calculate physical address of doorbell */ |
@@ -158,19 +152,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) | |||
158 | " vm_flags == 0x%04lX\n" | 152 | " vm_flags == 0x%04lX\n" |
159 | " size == 0x%04lX\n", | 153 | " size == 0x%04lX\n", |
160 | (unsigned long long) vma->vm_start, address, vma->vm_flags, | 154 | (unsigned long long) vma->vm_start, address, vma->vm_flags, |
161 | doorbell_process_allocation()); | 155 | kfd_doorbell_process_slice(dev)); |
162 | 156 | ||
163 | 157 | ||
164 | return io_remap_pfn_range(vma, | 158 | return io_remap_pfn_range(vma, |
165 | vma->vm_start, | 159 | vma->vm_start, |
166 | address >> PAGE_SHIFT, | 160 | address >> PAGE_SHIFT, |
167 | doorbell_process_allocation(), | 161 | kfd_doorbell_process_slice(dev), |
168 | vma->vm_page_prot); | 162 | vma->vm_page_prot); |
169 | } | 163 | } |
170 | 164 | ||
171 | 165 | ||
172 | /* get kernel iomem pointer for a doorbell */ | 166 | /* get kernel iomem pointer for a doorbell */ |
173 | u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | 167 | void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, |
174 | unsigned int *doorbell_off) | 168 | unsigned int *doorbell_off) |
175 | { | 169 | { |
176 | u32 inx; | 170 | u32 inx; |
@@ -185,6 +179,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | |||
185 | if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) | 179 | if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) |
186 | return NULL; | 180 | return NULL; |
187 | 181 | ||
182 | inx *= kfd->device_info->doorbell_size / sizeof(u32); | ||
183 | |||
188 | /* | 184 | /* |
189 | * Calculating the kernel doorbell offset using the first | 185 | * Calculating the kernel doorbell offset using the first |
190 | * doorbell page. | 186 | * doorbell page. |
@@ -210,7 +206,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) | |||
210 | mutex_unlock(&kfd->doorbell_mutex); | 206 | mutex_unlock(&kfd->doorbell_mutex); |
211 | } | 207 | } |
212 | 208 | ||
213 | inline void write_kernel_doorbell(u32 __iomem *db, u32 value) | 209 | void write_kernel_doorbell(void __iomem *db, u32 value) |
214 | { | 210 | { |
215 | if (db) { | 211 | if (db) { |
216 | writel(value, db); | 212 | writel(value, db); |
@@ -218,30 +214,37 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value) | |||
218 | } | 214 | } |
219 | } | 215 | } |
220 | 216 | ||
221 | /* | 217 | void write_kernel_doorbell64(void __iomem *db, u64 value) |
222 | * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 | 218 | { |
223 | * to doorbells with the process's doorbell page | 219 | if (db) { |
224 | */ | 220 | WARN(((unsigned long)db & 7) != 0, |
225 | unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, | 221 | "Unaligned 64-bit doorbell"); |
222 | writeq(value, (u64 __iomem *)db); | ||
223 | pr_debug("writing %llu to doorbell address %p\n", value, db); | ||
224 | } | ||
225 | } | ||
226 | |||
227 | unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, | ||
226 | struct kfd_process *process, | 228 | struct kfd_process *process, |
227 | unsigned int queue_id) | 229 | unsigned int doorbell_id) |
228 | { | 230 | { |
229 | /* | 231 | /* |
230 | * doorbell_id_offset accounts for doorbells taken by KGD. | 232 | * doorbell_id_offset accounts for doorbells taken by KGD. |
231 | * index * doorbell_process_allocation/sizeof(u32) adjusts to | 233 | * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to |
232 | * the process's doorbells. | 234 | * the process's doorbells. The offset returned is in dword |
235 | * units regardless of the ASIC-dependent doorbell size. | ||
233 | */ | 236 | */ |
234 | return kfd->doorbell_id_offset + | 237 | return kfd->doorbell_id_offset + |
235 | process->doorbell_index | 238 | process->doorbell_index |
236 | * doorbell_process_allocation() / sizeof(u32) + | 239 | * kfd_doorbell_process_slice(kfd) / sizeof(u32) + |
237 | queue_id; | 240 | doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); |
238 | } | 241 | } |
239 | 242 | ||
240 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd) | 243 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd) |
241 | { | 244 | { |
242 | uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - | 245 | uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - |
243 | kfd->shared_resources.doorbell_start_offset) / | 246 | kfd->shared_resources.doorbell_start_offset) / |
244 | doorbell_process_allocation() + 1; | 247 | kfd_doorbell_process_slice(kfd) + 1; |
245 | 248 | ||
246 | return num_of_elems; | 249 | return num_of_elems; |
247 | 250 | ||
@@ -251,7 +254,7 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, | |||
251 | struct kfd_process *process) | 254 | struct kfd_process *process) |
252 | { | 255 | { |
253 | return dev->doorbell_base + | 256 | return dev->doorbell_base + |
254 | process->doorbell_index * doorbell_process_allocation(); | 257 | process->doorbell_index * kfd_doorbell_process_slice(dev); |
255 | } | 258 | } |
256 | 259 | ||
257 | int kfd_alloc_process_doorbells(struct kfd_process *process) | 260 | int kfd_alloc_process_doorbells(struct kfd_process *process) |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 4890a90f1e44..5562e94e786a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c | |||
@@ -345,7 +345,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, | |||
345 | case KFD_EVENT_TYPE_DEBUG: | 345 | case KFD_EVENT_TYPE_DEBUG: |
346 | ret = create_signal_event(devkfd, p, ev); | 346 | ret = create_signal_event(devkfd, p, ev); |
347 | if (!ret) { | 347 | if (!ret) { |
348 | *event_page_offset = KFD_MMAP_EVENTS_MASK; | 348 | *event_page_offset = KFD_MMAP_TYPE_EVENTS; |
349 | *event_page_offset <<= PAGE_SHIFT; | 349 | *event_page_offset <<= PAGE_SHIFT; |
350 | *event_slot_index = ev->event_id; | 350 | *event_slot_index = ev->event_id; |
351 | } | 351 | } |
@@ -496,7 +496,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, | |||
496 | pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", | 496 | pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", |
497 | partial_id, valid_id_bits); | 497 | partial_id, valid_id_bits); |
498 | 498 | ||
499 | if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { | 499 | if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) { |
500 | /* With relatively few events, it's faster to | 500 | /* With relatively few events, it's faster to |
501 | * iterate over the event IDR | 501 | * iterate over the event IDR |
502 | */ | 502 | */ |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 66852de410c8..97d5423c5673 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | |||
@@ -275,23 +275,35 @@ | |||
275 | * for FLAT_* / S_LOAD operations. | 275 | * for FLAT_* / S_LOAD operations. |
276 | */ | 276 | */ |
277 | 277 | ||
278 | #define MAKE_GPUVM_APP_BASE(gpu_num) \ | 278 | #define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ |
279 | (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) | 279 | (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) |
280 | 280 | ||
281 | #define MAKE_GPUVM_APP_LIMIT(base, size) \ | 281 | #define MAKE_GPUVM_APP_LIMIT(base, size) \ |
282 | (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) | 282 | (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) |
283 | 283 | ||
284 | #define MAKE_SCRATCH_APP_BASE() \ | 284 | #define MAKE_SCRATCH_APP_BASE_VI() \ |
285 | (((uint64_t)(0x1UL) << 61) + 0x100000000L) | 285 | (((uint64_t)(0x1UL) << 61) + 0x100000000L) |
286 | 286 | ||
287 | #define MAKE_SCRATCH_APP_LIMIT(base) \ | 287 | #define MAKE_SCRATCH_APP_LIMIT(base) \ |
288 | (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) | 288 | (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) |
289 | 289 | ||
290 | #define MAKE_LDS_APP_BASE() \ | 290 | #define MAKE_LDS_APP_BASE_VI() \ |
291 | (((uint64_t)(0x1UL) << 61) + 0x0) | 291 | (((uint64_t)(0x1UL) << 61) + 0x0) |
292 | #define MAKE_LDS_APP_LIMIT(base) \ | 292 | #define MAKE_LDS_APP_LIMIT(base) \ |
293 | (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) | 293 | (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) |
294 | 294 | ||
295 | /* On GFXv9 the LDS and scratch apertures are programmed independently | ||
296 | * using the high 16 bits of the 64-bit virtual address. They must be | ||
297 | * in the hole, which will be the case as long as the high 16 bits are | ||
298 | * not 0. | ||
299 | * | ||
300 | * The aperture sizes are still 4GB implicitly. | ||
301 | * | ||
302 | * A GPUVM aperture is not applicable on GFXv9. | ||
303 | */ | ||
304 | #define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) | ||
305 | #define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) | ||
306 | |||
295 | /* User mode manages most of the SVM aperture address space. The low | 307 | /* User mode manages most of the SVM aperture address space. The low |
296 | * 16MB are reserved for kernel use (CWSR trap handler and kernel IB | 308 | * 16MB are reserved for kernel use (CWSR trap handler and kernel IB |
297 | * for now). | 309 | * for now). |
@@ -300,6 +312,55 @@ | |||
300 | #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) | 312 | #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) |
301 | #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) | 313 | #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) |
302 | 314 | ||
315 | static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) | ||
316 | { | ||
317 | /* | ||
318 | * node id couldn't be 0 - the three MSB bits of | ||
319 | * aperture shoudn't be 0 | ||
320 | */ | ||
321 | pdd->lds_base = MAKE_LDS_APP_BASE_VI(); | ||
322 | pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); | ||
323 | |||
324 | if (!pdd->dev->device_info->needs_iommu_device) { | ||
325 | /* dGPUs: SVM aperture starting at 0 | ||
326 | * with small reserved space for kernel. | ||
327 | * Set them to CANONICAL addresses. | ||
328 | */ | ||
329 | pdd->gpuvm_base = SVM_USER_BASE; | ||
330 | pdd->gpuvm_limit = | ||
331 | pdd->dev->shared_resources.gpuvm_size - 1; | ||
332 | } else { | ||
333 | /* set them to non CANONICAL addresses, and no SVM is | ||
334 | * allocated. | ||
335 | */ | ||
336 | pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); | ||
337 | pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, | ||
338 | pdd->dev->shared_resources.gpuvm_size); | ||
339 | } | ||
340 | |||
341 | pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); | ||
342 | pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); | ||
343 | } | ||
344 | |||
345 | static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) | ||
346 | { | ||
347 | pdd->lds_base = MAKE_LDS_APP_BASE_V9(); | ||
348 | pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); | ||
349 | |||
350 | /* Raven needs SVM to support graphic handle, etc. Leave the small | ||
351 | * reserved space before SVM on Raven as well, even though we don't | ||
352 | * have to. | ||
353 | * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they | ||
354 | * are used in Thunk to reserve SVM. | ||
355 | */ | ||
356 | pdd->gpuvm_base = SVM_USER_BASE; | ||
357 | pdd->gpuvm_limit = | ||
358 | pdd->dev->shared_resources.gpuvm_size - 1; | ||
359 | |||
360 | pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); | ||
361 | pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); | ||
362 | } | ||
363 | |||
303 | int kfd_init_apertures(struct kfd_process *process) | 364 | int kfd_init_apertures(struct kfd_process *process) |
304 | { | 365 | { |
305 | uint8_t id = 0; | 366 | uint8_t id = 0; |
@@ -307,9 +368,7 @@ int kfd_init_apertures(struct kfd_process *process) | |||
307 | struct kfd_process_device *pdd; | 368 | struct kfd_process_device *pdd; |
308 | 369 | ||
309 | /*Iterating over all devices*/ | 370 | /*Iterating over all devices*/ |
310 | while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && | 371 | while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { |
311 | id < NUM_OF_SUPPORTED_GPUS) { | ||
312 | |||
313 | if (!dev) { | 372 | if (!dev) { |
314 | id++; /* Skip non GPU devices */ | 373 | id++; /* Skip non GPU devices */ |
315 | continue; | 374 | continue; |
@@ -318,7 +377,7 @@ int kfd_init_apertures(struct kfd_process *process) | |||
318 | pdd = kfd_create_process_device_data(dev, process); | 377 | pdd = kfd_create_process_device_data(dev, process); |
319 | if (!pdd) { | 378 | if (!pdd) { |
320 | pr_err("Failed to create process device data\n"); | 379 | pr_err("Failed to create process device data\n"); |
321 | return -1; | 380 | return -ENOMEM; |
322 | } | 381 | } |
323 | /* | 382 | /* |
324 | * For 64 bit process apertures will be statically reserved in | 383 | * For 64 bit process apertures will be statically reserved in |
@@ -330,32 +389,30 @@ int kfd_init_apertures(struct kfd_process *process) | |||
330 | pdd->gpuvm_base = pdd->gpuvm_limit = 0; | 389 | pdd->gpuvm_base = pdd->gpuvm_limit = 0; |
331 | pdd->scratch_base = pdd->scratch_limit = 0; | 390 | pdd->scratch_base = pdd->scratch_limit = 0; |
332 | } else { | 391 | } else { |
333 | /* Same LDS and scratch apertures can be used | 392 | switch (dev->device_info->asic_family) { |
334 | * on all GPUs. This allows using more dGPUs | 393 | case CHIP_KAVERI: |
335 | * than placement options for apertures. | 394 | case CHIP_HAWAII: |
336 | */ | 395 | case CHIP_CARRIZO: |
337 | pdd->lds_base = MAKE_LDS_APP_BASE(); | 396 | case CHIP_TONGA: |
338 | pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); | 397 | case CHIP_FIJI: |
339 | 398 | case CHIP_POLARIS10: | |
340 | pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); | 399 | case CHIP_POLARIS11: |
341 | pdd->scratch_limit = | 400 | kfd_init_apertures_vi(pdd, id); |
342 | MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); | 401 | break; |
402 | case CHIP_VEGA10: | ||
403 | case CHIP_RAVEN: | ||
404 | kfd_init_apertures_v9(pdd, id); | ||
405 | break; | ||
406 | default: | ||
407 | WARN(1, "Unexpected ASIC family %u", | ||
408 | dev->device_info->asic_family); | ||
409 | return -EINVAL; | ||
410 | } | ||
343 | 411 | ||
344 | if (dev->device_info->needs_iommu_device) { | 412 | if (!dev->device_info->needs_iommu_device) { |
345 | /* APUs: GPUVM aperture in | 413 | /* dGPUs: the reserved space for kernel |
346 | * non-canonical address space | 414 | * before SVM |
347 | */ | ||
348 | pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); | ||
349 | pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( | ||
350 | pdd->gpuvm_base, | ||
351 | dev->shared_resources.gpuvm_size); | ||
352 | } else { | ||
353 | /* dGPUs: SVM aperture starting at 0 | ||
354 | * with small reserved space for kernel | ||
355 | */ | 415 | */ |
356 | pdd->gpuvm_base = SVM_USER_BASE; | ||
357 | pdd->gpuvm_limit = | ||
358 | dev->shared_resources.gpuvm_size - 1; | ||
359 | pdd->qpd.cwsr_base = SVM_CWSR_BASE; | 416 | pdd->qpd.cwsr_base = SVM_CWSR_BASE; |
360 | pdd->qpd.ib_base = SVM_IB_BASE; | 417 | pdd->qpd.ib_base = SVM_IB_BASE; |
361 | } | 418 | } |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c new file mode 100644 index 000000000000..37029baa3346 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | |||
@@ -0,0 +1,92 @@ | |||
1 | /* | ||
2 | * Copyright 2016-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #include "kfd_priv.h" | ||
24 | #include "kfd_events.h" | ||
25 | #include "soc15_int.h" | ||
26 | |||
27 | |||
28 | static bool event_interrupt_isr_v9(struct kfd_dev *dev, | ||
29 | const uint32_t *ih_ring_entry) | ||
30 | { | ||
31 | uint16_t source_id, client_id, pasid, vmid; | ||
32 | const uint32_t *data = ih_ring_entry; | ||
33 | |||
34 | /* Only handle interrupts from KFD VMIDs */ | ||
35 | vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); | ||
36 | if (vmid < dev->vm_info.first_vmid_kfd || | ||
37 | vmid > dev->vm_info.last_vmid_kfd) | ||
38 | return 0; | ||
39 | |||
40 | /* If there is no valid PASID, it's likely a firmware bug */ | ||
41 | pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); | ||
42 | if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) | ||
43 | return 0; | ||
44 | |||
45 | source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); | ||
46 | client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); | ||
47 | |||
48 | pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", | ||
49 | client_id, source_id, pasid); | ||
50 | pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", | ||
51 | data[0], data[1], data[2], data[3], | ||
52 | data[4], data[5], data[6], data[7]); | ||
53 | |||
54 | /* Interrupt types we care about: various signals and faults. | ||
55 | * They will be forwarded to a work queue (see below). | ||
56 | */ | ||
57 | return source_id == SOC15_INTSRC_CP_END_OF_PIPE || | ||
58 | source_id == SOC15_INTSRC_SDMA_TRAP || | ||
59 | source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || | ||
60 | source_id == SOC15_INTSRC_CP_BAD_OPCODE; | ||
61 | } | ||
62 | |||
63 | static void event_interrupt_wq_v9(struct kfd_dev *dev, | ||
64 | const uint32_t *ih_ring_entry) | ||
65 | { | ||
66 | uint16_t source_id, client_id, pasid, vmid; | ||
67 | uint32_t context_id; | ||
68 | |||
69 | source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); | ||
70 | client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); | ||
71 | pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); | ||
72 | vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); | ||
73 | context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); | ||
74 | |||
75 | if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) | ||
76 | kfd_signal_event_interrupt(pasid, context_id, 32); | ||
77 | else if (source_id == SOC15_INTSRC_SDMA_TRAP) | ||
78 | kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); | ||
79 | else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) | ||
80 | kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); | ||
81 | else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) | ||
82 | kfd_signal_hw_exception_event(pasid); | ||
83 | else if (client_id == SOC15_IH_CLIENTID_VMC || | ||
84 | client_id == SOC15_IH_CLIENTID_UTCL2) { | ||
85 | /* TODO */ | ||
86 | } | ||
87 | } | ||
88 | |||
89 | const struct kfd_event_interrupt_class event_interrupt_class_v9 = { | ||
90 | .interrupt_isr = event_interrupt_isr_v9, | ||
91 | .interrupt_wq = event_interrupt_wq_v9, | ||
92 | }; | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 035c351f47c5..db6d9336b80d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | |||
@@ -139,10 +139,12 @@ static void interrupt_wq(struct work_struct *work) | |||
139 | { | 139 | { |
140 | struct kfd_dev *dev = container_of(work, struct kfd_dev, | 140 | struct kfd_dev *dev = container_of(work, struct kfd_dev, |
141 | interrupt_work); | 141 | interrupt_work); |
142 | uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; | ||
142 | 143 | ||
143 | uint32_t ih_ring_entry[DIV_ROUND_UP( | 144 | if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) { |
144 | dev->device_info->ih_ring_entry_size, | 145 | dev_err_once(kfd_chardev(), "Ring entry too small\n"); |
145 | sizeof(uint32_t))]; | 146 | return; |
147 | } | ||
146 | 148 | ||
147 | while (dequeue_ih_ring_entry(dev, ih_ring_entry)) | 149 | while (dequeue_ih_ring_entry(dev, ih_ring_entry)) |
148 | dev->device_info->event_interrupt_class->interrupt_wq(dev, | 150 | dev->device_info->event_interrupt_class->interrupt_wq(dev, |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 69f496485331..476951d8c91c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | |||
@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, | |||
99 | kq->rptr_kernel = kq->rptr_mem->cpu_ptr; | 99 | kq->rptr_kernel = kq->rptr_mem->cpu_ptr; |
100 | kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; | 100 | kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; |
101 | 101 | ||
102 | retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), | 102 | retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, |
103 | &kq->wptr_mem); | 103 | &kq->wptr_mem); |
104 | 104 | ||
105 | if (retval != 0) | 105 | if (retval != 0) |
@@ -208,6 +208,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, | |||
208 | size_t available_size; | 208 | size_t available_size; |
209 | size_t queue_size_dwords; | 209 | size_t queue_size_dwords; |
210 | uint32_t wptr, rptr; | 210 | uint32_t wptr, rptr; |
211 | uint64_t wptr64; | ||
211 | unsigned int *queue_address; | 212 | unsigned int *queue_address; |
212 | 213 | ||
213 | /* When rptr == wptr, the buffer is empty. | 214 | /* When rptr == wptr, the buffer is empty. |
@@ -216,7 +217,8 @@ static int acquire_packet_buffer(struct kernel_queue *kq, | |||
216 | * the opposite. So we can only use up to queue_size_dwords - 1 dwords. | 217 | * the opposite. So we can only use up to queue_size_dwords - 1 dwords. |
217 | */ | 218 | */ |
218 | rptr = *kq->rptr_kernel; | 219 | rptr = *kq->rptr_kernel; |
219 | wptr = *kq->wptr_kernel; | 220 | wptr = kq->pending_wptr; |
221 | wptr64 = kq->pending_wptr64; | ||
220 | queue_address = (unsigned int *)kq->pq_kernel_addr; | 222 | queue_address = (unsigned int *)kq->pq_kernel_addr; |
221 | queue_size_dwords = kq->queue->properties.queue_size / 4; | 223 | queue_size_dwords = kq->queue->properties.queue_size / 4; |
222 | 224 | ||
@@ -232,29 +234,33 @@ static int acquire_packet_buffer(struct kernel_queue *kq, | |||
232 | * make sure calling functions know | 234 | * make sure calling functions know |
233 | * acquire_packet_buffer() failed | 235 | * acquire_packet_buffer() failed |
234 | */ | 236 | */ |
235 | *buffer_ptr = NULL; | 237 | goto err_no_space; |
236 | return -ENOMEM; | ||
237 | } | 238 | } |
238 | 239 | ||
239 | if (wptr + packet_size_in_dwords >= queue_size_dwords) { | 240 | if (wptr + packet_size_in_dwords >= queue_size_dwords) { |
240 | /* make sure after rolling back to position 0, there is | 241 | /* make sure after rolling back to position 0, there is |
241 | * still enough space. | 242 | * still enough space. |
242 | */ | 243 | */ |
243 | if (packet_size_in_dwords >= rptr) { | 244 | if (packet_size_in_dwords >= rptr) |
244 | *buffer_ptr = NULL; | 245 | goto err_no_space; |
245 | return -ENOMEM; | 246 | |
246 | } | ||
247 | /* fill nops, roll back and start at position 0 */ | 247 | /* fill nops, roll back and start at position 0 */ |
248 | while (wptr > 0) { | 248 | while (wptr > 0) { |
249 | queue_address[wptr] = kq->nop_packet; | 249 | queue_address[wptr] = kq->nop_packet; |
250 | wptr = (wptr + 1) % queue_size_dwords; | 250 | wptr = (wptr + 1) % queue_size_dwords; |
251 | wptr64++; | ||
251 | } | 252 | } |
252 | } | 253 | } |
253 | 254 | ||
254 | *buffer_ptr = &queue_address[wptr]; | 255 | *buffer_ptr = &queue_address[wptr]; |
255 | kq->pending_wptr = wptr + packet_size_in_dwords; | 256 | kq->pending_wptr = wptr + packet_size_in_dwords; |
257 | kq->pending_wptr64 = wptr64 + packet_size_in_dwords; | ||
256 | 258 | ||
257 | return 0; | 259 | return 0; |
260 | |||
261 | err_no_space: | ||
262 | *buffer_ptr = NULL; | ||
263 | return -ENOMEM; | ||
258 | } | 264 | } |
259 | 265 | ||
260 | static void submit_packet(struct kernel_queue *kq) | 266 | static void submit_packet(struct kernel_queue *kq) |
@@ -270,14 +276,18 @@ static void submit_packet(struct kernel_queue *kq) | |||
270 | pr_debug("\n"); | 276 | pr_debug("\n"); |
271 | #endif | 277 | #endif |
272 | 278 | ||
273 | *kq->wptr_kernel = kq->pending_wptr; | 279 | kq->ops_asic_specific.submit_packet(kq); |
274 | write_kernel_doorbell(kq->queue->properties.doorbell_ptr, | ||
275 | kq->pending_wptr); | ||
276 | } | 280 | } |
277 | 281 | ||
278 | static void rollback_packet(struct kernel_queue *kq) | 282 | static void rollback_packet(struct kernel_queue *kq) |
279 | { | 283 | { |
280 | kq->pending_wptr = *kq->queue->properties.write_ptr; | 284 | if (kq->dev->device_info->doorbell_size == 8) { |
285 | kq->pending_wptr64 = *kq->wptr64_kernel; | ||
286 | kq->pending_wptr = *kq->wptr_kernel % | ||
287 | (kq->queue->properties.queue_size / 4); | ||
288 | } else { | ||
289 | kq->pending_wptr = *kq->wptr_kernel; | ||
290 | } | ||
281 | } | 291 | } |
282 | 292 | ||
283 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, | 293 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, |
@@ -308,6 +318,11 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, | |||
308 | case CHIP_HAWAII: | 318 | case CHIP_HAWAII: |
309 | kernel_queue_init_cik(&kq->ops_asic_specific); | 319 | kernel_queue_init_cik(&kq->ops_asic_specific); |
310 | break; | 320 | break; |
321 | |||
322 | case CHIP_VEGA10: | ||
323 | case CHIP_RAVEN: | ||
324 | kernel_queue_init_v9(&kq->ops_asic_specific); | ||
325 | break; | ||
311 | default: | 326 | default: |
312 | WARN(1, "Unexpected ASIC family %u", | 327 | WARN(1, "Unexpected ASIC family %u", |
313 | dev->device_info->asic_family); | 328 | dev->device_info->asic_family); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index 594053136ee4..97aff2041a5d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | |||
@@ -72,6 +72,7 @@ struct kernel_queue { | |||
72 | struct kfd_dev *dev; | 72 | struct kfd_dev *dev; |
73 | struct mqd_manager *mqd; | 73 | struct mqd_manager *mqd; |
74 | struct queue *queue; | 74 | struct queue *queue; |
75 | uint64_t pending_wptr64; | ||
75 | uint32_t pending_wptr; | 76 | uint32_t pending_wptr; |
76 | unsigned int nop_packet; | 77 | unsigned int nop_packet; |
77 | 78 | ||
@@ -79,7 +80,10 @@ struct kernel_queue { | |||
79 | uint32_t *rptr_kernel; | 80 | uint32_t *rptr_kernel; |
80 | uint64_t rptr_gpu_addr; | 81 | uint64_t rptr_gpu_addr; |
81 | struct kfd_mem_obj *wptr_mem; | 82 | struct kfd_mem_obj *wptr_mem; |
82 | uint32_t *wptr_kernel; | 83 | union { |
84 | uint64_t *wptr64_kernel; | ||
85 | uint32_t *wptr_kernel; | ||
86 | }; | ||
83 | uint64_t wptr_gpu_addr; | 87 | uint64_t wptr_gpu_addr; |
84 | struct kfd_mem_obj *pq; | 88 | struct kfd_mem_obj *pq; |
85 | uint64_t pq_gpu_addr; | 89 | uint64_t pq_gpu_addr; |
@@ -97,5 +101,6 @@ struct kernel_queue { | |||
97 | 101 | ||
98 | void kernel_queue_init_cik(struct kernel_queue_ops *ops); | 102 | void kernel_queue_init_cik(struct kernel_queue_ops *ops); |
99 | void kernel_queue_init_vi(struct kernel_queue_ops *ops); | 103 | void kernel_queue_init_vi(struct kernel_queue_ops *ops); |
104 | void kernel_queue_init_v9(struct kernel_queue_ops *ops); | ||
100 | 105 | ||
101 | #endif /* KFD_KERNEL_QUEUE_H_ */ | 106 | #endif /* KFD_KERNEL_QUEUE_H_ */ |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c index a90eb440b1fb..19e54acb4125 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | |||
@@ -26,11 +26,13 @@ | |||
26 | static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, | 26 | static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, |
27 | enum kfd_queue_type type, unsigned int queue_size); | 27 | enum kfd_queue_type type, unsigned int queue_size); |
28 | static void uninitialize_cik(struct kernel_queue *kq); | 28 | static void uninitialize_cik(struct kernel_queue *kq); |
29 | static void submit_packet_cik(struct kernel_queue *kq); | ||
29 | 30 | ||
30 | void kernel_queue_init_cik(struct kernel_queue_ops *ops) | 31 | void kernel_queue_init_cik(struct kernel_queue_ops *ops) |
31 | { | 32 | { |
32 | ops->initialize = initialize_cik; | 33 | ops->initialize = initialize_cik; |
33 | ops->uninitialize = uninitialize_cik; | 34 | ops->uninitialize = uninitialize_cik; |
35 | ops->submit_packet = submit_packet_cik; | ||
34 | } | 36 | } |
35 | 37 | ||
36 | static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, | 38 | static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, |
@@ -42,3 +44,10 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, | |||
42 | static void uninitialize_cik(struct kernel_queue *kq) | 44 | static void uninitialize_cik(struct kernel_queue *kq) |
43 | { | 45 | { |
44 | } | 46 | } |
47 | |||
48 | static void submit_packet_cik(struct kernel_queue *kq) | ||
49 | { | ||
50 | *kq->wptr_kernel = kq->pending_wptr; | ||
51 | write_kernel_doorbell(kq->queue->properties.doorbell_ptr, | ||
52 | kq->pending_wptr); | ||
53 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c new file mode 100644 index 000000000000..684a3bf07efd --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | |||
@@ -0,0 +1,340 @@ | |||
1 | /* | ||
2 | * Copyright 2016-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include "kfd_kernel_queue.h" | ||
25 | #include "kfd_device_queue_manager.h" | ||
26 | #include "kfd_pm4_headers_ai.h" | ||
27 | #include "kfd_pm4_opcodes.h" | ||
28 | |||
29 | static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, | ||
30 | enum kfd_queue_type type, unsigned int queue_size); | ||
31 | static void uninitialize_v9(struct kernel_queue *kq); | ||
32 | static void submit_packet_v9(struct kernel_queue *kq); | ||
33 | |||
34 | void kernel_queue_init_v9(struct kernel_queue_ops *ops) | ||
35 | { | ||
36 | ops->initialize = initialize_v9; | ||
37 | ops->uninitialize = uninitialize_v9; | ||
38 | ops->submit_packet = submit_packet_v9; | ||
39 | } | ||
40 | |||
41 | static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, | ||
42 | enum kfd_queue_type type, unsigned int queue_size) | ||
43 | { | ||
44 | int retval; | ||
45 | |||
46 | retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); | ||
47 | if (retval) | ||
48 | return false; | ||
49 | |||
50 | kq->eop_gpu_addr = kq->eop_mem->gpu_addr; | ||
51 | kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; | ||
52 | |||
53 | memset(kq->eop_kernel_addr, 0, PAGE_SIZE); | ||
54 | |||
55 | return true; | ||
56 | } | ||
57 | |||
58 | static void uninitialize_v9(struct kernel_queue *kq) | ||
59 | { | ||
60 | kfd_gtt_sa_free(kq->dev, kq->eop_mem); | ||
61 | } | ||
62 | |||
63 | static void submit_packet_v9(struct kernel_queue *kq) | ||
64 | { | ||
65 | *kq->wptr64_kernel = kq->pending_wptr64; | ||
66 | write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, | ||
67 | kq->pending_wptr64); | ||
68 | } | ||
69 | |||
70 | static int pm_map_process_v9(struct packet_manager *pm, | ||
71 | uint32_t *buffer, struct qcm_process_device *qpd) | ||
72 | { | ||
73 | struct pm4_mes_map_process *packet; | ||
74 | uint64_t vm_page_table_base_addr = | ||
75 | (uint64_t)(qpd->page_table_base) << 12; | ||
76 | |||
77 | packet = (struct pm4_mes_map_process *)buffer; | ||
78 | memset(buffer, 0, sizeof(struct pm4_mes_map_process)); | ||
79 | |||
80 | packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, | ||
81 | sizeof(struct pm4_mes_map_process)); | ||
82 | packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; | ||
83 | packet->bitfields2.process_quantum = 1; | ||
84 | packet->bitfields2.pasid = qpd->pqm->process->pasid; | ||
85 | packet->bitfields14.gds_size = qpd->gds_size; | ||
86 | packet->bitfields14.num_gws = qpd->num_gws; | ||
87 | packet->bitfields14.num_oac = qpd->num_oac; | ||
88 | packet->bitfields14.sdma_enable = 1; | ||
89 | packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; | ||
90 | |||
91 | packet->sh_mem_config = qpd->sh_mem_config; | ||
92 | packet->sh_mem_bases = qpd->sh_mem_bases; | ||
93 | packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); | ||
94 | packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); | ||
95 | packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); | ||
96 | packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); | ||
97 | |||
98 | packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); | ||
99 | packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); | ||
100 | |||
101 | packet->vm_context_page_table_base_addr_lo32 = | ||
102 | lower_32_bits(vm_page_table_base_addr); | ||
103 | packet->vm_context_page_table_base_addr_hi32 = | ||
104 | upper_32_bits(vm_page_table_base_addr); | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, | ||
110 | uint64_t ib, size_t ib_size_in_dwords, bool chain) | ||
111 | { | ||
112 | struct pm4_mes_runlist *packet; | ||
113 | |||
114 | int concurrent_proc_cnt = 0; | ||
115 | struct kfd_dev *kfd = pm->dqm->dev; | ||
116 | |||
117 | /* Determine the number of processes to map together to HW: | ||
118 | * it can not exceed the number of VMIDs available to the | ||
119 | * scheduler, and it is determined by the smaller of the number | ||
120 | * of processes in the runlist and kfd module parameter | ||
121 | * hws_max_conc_proc. | ||
122 | * Note: the arbitration between the number of VMIDs and | ||
123 | * hws_max_conc_proc has been done in | ||
124 | * kgd2kfd_device_init(). | ||
125 | */ | ||
126 | concurrent_proc_cnt = min(pm->dqm->processes_count, | ||
127 | kfd->max_proc_per_quantum); | ||
128 | |||
129 | packet = (struct pm4_mes_runlist *)buffer; | ||
130 | |||
131 | memset(buffer, 0, sizeof(struct pm4_mes_runlist)); | ||
132 | packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, | ||
133 | sizeof(struct pm4_mes_runlist)); | ||
134 | |||
135 | packet->bitfields4.ib_size = ib_size_in_dwords; | ||
136 | packet->bitfields4.chain = chain ? 1 : 0; | ||
137 | packet->bitfields4.offload_polling = 0; | ||
138 | packet->bitfields4.valid = 1; | ||
139 | packet->bitfields4.process_cnt = concurrent_proc_cnt; | ||
140 | packet->ordinal2 = lower_32_bits(ib); | ||
141 | packet->ib_base_hi = upper_32_bits(ib); | ||
142 | |||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, | ||
147 | struct queue *q, bool is_static) | ||
148 | { | ||
149 | struct pm4_mes_map_queues *packet; | ||
150 | bool use_static = is_static; | ||
151 | |||
152 | packet = (struct pm4_mes_map_queues *)buffer; | ||
153 | memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); | ||
154 | |||
155 | packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, | ||
156 | sizeof(struct pm4_mes_map_queues)); | ||
157 | packet->bitfields2.alloc_format = | ||
158 | alloc_format__mes_map_queues__one_per_pipe_vi; | ||
159 | packet->bitfields2.num_queues = 1; | ||
160 | packet->bitfields2.queue_sel = | ||
161 | queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; | ||
162 | |||
163 | packet->bitfields2.engine_sel = | ||
164 | engine_sel__mes_map_queues__compute_vi; | ||
165 | packet->bitfields2.queue_type = | ||
166 | queue_type__mes_map_queues__normal_compute_vi; | ||
167 | |||
168 | switch (q->properties.type) { | ||
169 | case KFD_QUEUE_TYPE_COMPUTE: | ||
170 | if (use_static) | ||
171 | packet->bitfields2.queue_type = | ||
172 | queue_type__mes_map_queues__normal_latency_static_queue_vi; | ||
173 | break; | ||
174 | case KFD_QUEUE_TYPE_DIQ: | ||
175 | packet->bitfields2.queue_type = | ||
176 | queue_type__mes_map_queues__debug_interface_queue_vi; | ||
177 | break; | ||
178 | case KFD_QUEUE_TYPE_SDMA: | ||
179 | packet->bitfields2.engine_sel = q->properties.sdma_engine_id + | ||
180 | engine_sel__mes_map_queues__sdma0_vi; | ||
181 | use_static = false; /* no static queues under SDMA */ | ||
182 | break; | ||
183 | default: | ||
184 | WARN(1, "queue type %d", q->properties.type); | ||
185 | return -EINVAL; | ||
186 | } | ||
187 | packet->bitfields3.doorbell_offset = | ||
188 | q->properties.doorbell_off; | ||
189 | |||
190 | packet->mqd_addr_lo = | ||
191 | lower_32_bits(q->gart_mqd_addr); | ||
192 | |||
193 | packet->mqd_addr_hi = | ||
194 | upper_32_bits(q->gart_mqd_addr); | ||
195 | |||
196 | packet->wptr_addr_lo = | ||
197 | lower_32_bits((uint64_t)q->properties.write_ptr); | ||
198 | |||
199 | packet->wptr_addr_hi = | ||
200 | upper_32_bits((uint64_t)q->properties.write_ptr); | ||
201 | |||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, | ||
206 | enum kfd_queue_type type, | ||
207 | enum kfd_unmap_queues_filter filter, | ||
208 | uint32_t filter_param, bool reset, | ||
209 | unsigned int sdma_engine) | ||
210 | { | ||
211 | struct pm4_mes_unmap_queues *packet; | ||
212 | |||
213 | packet = (struct pm4_mes_unmap_queues *)buffer; | ||
214 | memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); | ||
215 | |||
216 | packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, | ||
217 | sizeof(struct pm4_mes_unmap_queues)); | ||
218 | switch (type) { | ||
219 | case KFD_QUEUE_TYPE_COMPUTE: | ||
220 | case KFD_QUEUE_TYPE_DIQ: | ||
221 | packet->bitfields2.engine_sel = | ||
222 | engine_sel__mes_unmap_queues__compute; | ||
223 | break; | ||
224 | case KFD_QUEUE_TYPE_SDMA: | ||
225 | packet->bitfields2.engine_sel = | ||
226 | engine_sel__mes_unmap_queues__sdma0 + sdma_engine; | ||
227 | break; | ||
228 | default: | ||
229 | WARN(1, "queue type %d", type); | ||
230 | return -EINVAL; | ||
231 | } | ||
232 | |||
233 | if (reset) | ||
234 | packet->bitfields2.action = | ||
235 | action__mes_unmap_queues__reset_queues; | ||
236 | else | ||
237 | packet->bitfields2.action = | ||
238 | action__mes_unmap_queues__preempt_queues; | ||
239 | |||
240 | switch (filter) { | ||
241 | case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: | ||
242 | packet->bitfields2.queue_sel = | ||
243 | queue_sel__mes_unmap_queues__perform_request_on_specified_queues; | ||
244 | packet->bitfields2.num_queues = 1; | ||
245 | packet->bitfields3b.doorbell_offset0 = filter_param; | ||
246 | break; | ||
247 | case KFD_UNMAP_QUEUES_FILTER_BY_PASID: | ||
248 | packet->bitfields2.queue_sel = | ||
249 | queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; | ||
250 | packet->bitfields3a.pasid = filter_param; | ||
251 | break; | ||
252 | case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: | ||
253 | packet->bitfields2.queue_sel = | ||
254 | queue_sel__mes_unmap_queues__unmap_all_queues; | ||
255 | break; | ||
256 | case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: | ||
257 | /* in this case, we do not preempt static queues */ | ||
258 | packet->bitfields2.queue_sel = | ||
259 | queue_sel__mes_unmap_queues__unmap_all_non_static_queues; | ||
260 | break; | ||
261 | default: | ||
262 | WARN(1, "filter %d", filter); | ||
263 | return -EINVAL; | ||
264 | } | ||
265 | |||
266 | return 0; | ||
267 | |||
268 | } | ||
269 | |||
270 | static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, | ||
271 | uint64_t fence_address, uint32_t fence_value) | ||
272 | { | ||
273 | struct pm4_mes_query_status *packet; | ||
274 | |||
275 | packet = (struct pm4_mes_query_status *)buffer; | ||
276 | memset(buffer, 0, sizeof(struct pm4_mes_query_status)); | ||
277 | |||
278 | |||
279 | packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, | ||
280 | sizeof(struct pm4_mes_query_status)); | ||
281 | |||
282 | packet->bitfields2.context_id = 0; | ||
283 | packet->bitfields2.interrupt_sel = | ||
284 | interrupt_sel__mes_query_status__completion_status; | ||
285 | packet->bitfields2.command = | ||
286 | command__mes_query_status__fence_only_after_write_ack; | ||
287 | |||
288 | packet->addr_hi = upper_32_bits((uint64_t)fence_address); | ||
289 | packet->addr_lo = lower_32_bits((uint64_t)fence_address); | ||
290 | packet->data_hi = upper_32_bits((uint64_t)fence_value); | ||
291 | packet->data_lo = lower_32_bits((uint64_t)fence_value); | ||
292 | |||
293 | return 0; | ||
294 | } | ||
295 | |||
296 | |||
297 | static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) | ||
298 | { | ||
299 | struct pm4_mec_release_mem *packet; | ||
300 | |||
301 | packet = (struct pm4_mec_release_mem *)buffer; | ||
302 | memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); | ||
303 | |||
304 | packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, | ||
305 | sizeof(struct pm4_mec_release_mem)); | ||
306 | |||
307 | packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; | ||
308 | packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; | ||
309 | packet->bitfields2.tcl1_action_ena = 1; | ||
310 | packet->bitfields2.tc_action_ena = 1; | ||
311 | packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; | ||
312 | |||
313 | packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; | ||
314 | packet->bitfields3.int_sel = | ||
315 | int_sel__mec_release_mem__send_interrupt_after_write_confirm; | ||
316 | |||
317 | packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; | ||
318 | packet->address_hi = upper_32_bits(gpu_addr); | ||
319 | |||
320 | packet->data_lo = 0; | ||
321 | |||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | const struct packet_manager_funcs kfd_v9_pm_funcs = { | ||
326 | .map_process = pm_map_process_v9, | ||
327 | .runlist = pm_runlist_v9, | ||
328 | .set_resources = pm_set_resources_vi, | ||
329 | .map_queues = pm_map_queues_v9, | ||
330 | .unmap_queues = pm_unmap_queues_v9, | ||
331 | .query_status = pm_query_status_v9, | ||
332 | .release_mem = pm_release_mem_v9, | ||
333 | .map_process_size = sizeof(struct pm4_mes_map_process), | ||
334 | .runlist_size = sizeof(struct pm4_mes_runlist), | ||
335 | .set_resources_size = sizeof(struct pm4_mes_set_resources), | ||
336 | .map_queues_size = sizeof(struct pm4_mes_map_queues), | ||
337 | .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), | ||
338 | .query_status_size = sizeof(struct pm4_mes_query_status), | ||
339 | .release_mem_size = sizeof(struct pm4_mec_release_mem) | ||
340 | }; | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index f1d48281e322..bf20c6d32ef3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | |||
@@ -22,15 +22,20 @@ | |||
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include "kfd_kernel_queue.h" | 24 | #include "kfd_kernel_queue.h" |
25 | #include "kfd_device_queue_manager.h" | ||
26 | #include "kfd_pm4_headers_vi.h" | ||
27 | #include "kfd_pm4_opcodes.h" | ||
25 | 28 | ||
26 | static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, | 29 | static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, |
27 | enum kfd_queue_type type, unsigned int queue_size); | 30 | enum kfd_queue_type type, unsigned int queue_size); |
28 | static void uninitialize_vi(struct kernel_queue *kq); | 31 | static void uninitialize_vi(struct kernel_queue *kq); |
32 | static void submit_packet_vi(struct kernel_queue *kq); | ||
29 | 33 | ||
30 | void kernel_queue_init_vi(struct kernel_queue_ops *ops) | 34 | void kernel_queue_init_vi(struct kernel_queue_ops *ops) |
31 | { | 35 | { |
32 | ops->initialize = initialize_vi; | 36 | ops->initialize = initialize_vi; |
33 | ops->uninitialize = uninitialize_vi; | 37 | ops->uninitialize = uninitialize_vi; |
38 | ops->submit_packet = submit_packet_vi; | ||
34 | } | 39 | } |
35 | 40 | ||
36 | static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, | 41 | static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, |
@@ -54,3 +59,317 @@ static void uninitialize_vi(struct kernel_queue *kq) | |||
54 | { | 59 | { |
55 | kfd_gtt_sa_free(kq->dev, kq->eop_mem); | 60 | kfd_gtt_sa_free(kq->dev, kq->eop_mem); |
56 | } | 61 | } |
62 | |||
63 | static void submit_packet_vi(struct kernel_queue *kq) | ||
64 | { | ||
65 | *kq->wptr_kernel = kq->pending_wptr; | ||
66 | write_kernel_doorbell(kq->queue->properties.doorbell_ptr, | ||
67 | kq->pending_wptr); | ||
68 | } | ||
69 | |||
70 | unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) | ||
71 | { | ||
72 | union PM4_MES_TYPE_3_HEADER header; | ||
73 | |||
74 | header.u32All = 0; | ||
75 | header.opcode = opcode; | ||
76 | header.count = packet_size / 4 - 2; | ||
77 | header.type = PM4_TYPE_3; | ||
78 | |||
79 | return header.u32All; | ||
80 | } | ||
81 | |||
82 | static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, | ||
83 | struct qcm_process_device *qpd) | ||
84 | { | ||
85 | struct pm4_mes_map_process *packet; | ||
86 | |||
87 | packet = (struct pm4_mes_map_process *)buffer; | ||
88 | |||
89 | memset(buffer, 0, sizeof(struct pm4_mes_map_process)); | ||
90 | |||
91 | packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, | ||
92 | sizeof(struct pm4_mes_map_process)); | ||
93 | packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; | ||
94 | packet->bitfields2.process_quantum = 1; | ||
95 | packet->bitfields2.pasid = qpd->pqm->process->pasid; | ||
96 | packet->bitfields3.page_table_base = qpd->page_table_base; | ||
97 | packet->bitfields10.gds_size = qpd->gds_size; | ||
98 | packet->bitfields10.num_gws = qpd->num_gws; | ||
99 | packet->bitfields10.num_oac = qpd->num_oac; | ||
100 | packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; | ||
101 | |||
102 | packet->sh_mem_config = qpd->sh_mem_config; | ||
103 | packet->sh_mem_bases = qpd->sh_mem_bases; | ||
104 | packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; | ||
105 | packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; | ||
106 | |||
107 | packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; | ||
108 | |||
109 | packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); | ||
110 | packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); | ||
111 | |||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, | ||
116 | uint64_t ib, size_t ib_size_in_dwords, bool chain) | ||
117 | { | ||
118 | struct pm4_mes_runlist *packet; | ||
119 | int concurrent_proc_cnt = 0; | ||
120 | struct kfd_dev *kfd = pm->dqm->dev; | ||
121 | |||
122 | if (WARN_ON(!ib)) | ||
123 | return -EFAULT; | ||
124 | |||
125 | /* Determine the number of processes to map together to HW: | ||
126 | * it can not exceed the number of VMIDs available to the | ||
127 | * scheduler, and it is determined by the smaller of the number | ||
128 | * of processes in the runlist and kfd module parameter | ||
129 | * hws_max_conc_proc. | ||
130 | * Note: the arbitration between the number of VMIDs and | ||
131 | * hws_max_conc_proc has been done in | ||
132 | * kgd2kfd_device_init(). | ||
133 | */ | ||
134 | concurrent_proc_cnt = min(pm->dqm->processes_count, | ||
135 | kfd->max_proc_per_quantum); | ||
136 | |||
137 | packet = (struct pm4_mes_runlist *)buffer; | ||
138 | |||
139 | memset(buffer, 0, sizeof(struct pm4_mes_runlist)); | ||
140 | packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, | ||
141 | sizeof(struct pm4_mes_runlist)); | ||
142 | |||
143 | packet->bitfields4.ib_size = ib_size_in_dwords; | ||
144 | packet->bitfields4.chain = chain ? 1 : 0; | ||
145 | packet->bitfields4.offload_polling = 0; | ||
146 | packet->bitfields4.valid = 1; | ||
147 | packet->bitfields4.process_cnt = concurrent_proc_cnt; | ||
148 | packet->ordinal2 = lower_32_bits(ib); | ||
149 | packet->bitfields3.ib_base_hi = upper_32_bits(ib); | ||
150 | |||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, | ||
155 | struct scheduling_resources *res) | ||
156 | { | ||
157 | struct pm4_mes_set_resources *packet; | ||
158 | |||
159 | packet = (struct pm4_mes_set_resources *)buffer; | ||
160 | memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); | ||
161 | |||
162 | packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, | ||
163 | sizeof(struct pm4_mes_set_resources)); | ||
164 | |||
165 | packet->bitfields2.queue_type = | ||
166 | queue_type__mes_set_resources__hsa_interface_queue_hiq; | ||
167 | packet->bitfields2.vmid_mask = res->vmid_mask; | ||
168 | packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; | ||
169 | packet->bitfields7.oac_mask = res->oac_mask; | ||
170 | packet->bitfields8.gds_heap_base = res->gds_heap_base; | ||
171 | packet->bitfields8.gds_heap_size = res->gds_heap_size; | ||
172 | |||
173 | packet->gws_mask_lo = lower_32_bits(res->gws_mask); | ||
174 | packet->gws_mask_hi = upper_32_bits(res->gws_mask); | ||
175 | |||
176 | packet->queue_mask_lo = lower_32_bits(res->queue_mask); | ||
177 | packet->queue_mask_hi = upper_32_bits(res->queue_mask); | ||
178 | |||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, | ||
183 | struct queue *q, bool is_static) | ||
184 | { | ||
185 | struct pm4_mes_map_queues *packet; | ||
186 | bool use_static = is_static; | ||
187 | |||
188 | packet = (struct pm4_mes_map_queues *)buffer; | ||
189 | memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); | ||
190 | |||
191 | packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, | ||
192 | sizeof(struct pm4_mes_map_queues)); | ||
193 | packet->bitfields2.alloc_format = | ||
194 | alloc_format__mes_map_queues__one_per_pipe_vi; | ||
195 | packet->bitfields2.num_queues = 1; | ||
196 | packet->bitfields2.queue_sel = | ||
197 | queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; | ||
198 | |||
199 | packet->bitfields2.engine_sel = | ||
200 | engine_sel__mes_map_queues__compute_vi; | ||
201 | packet->bitfields2.queue_type = | ||
202 | queue_type__mes_map_queues__normal_compute_vi; | ||
203 | |||
204 | switch (q->properties.type) { | ||
205 | case KFD_QUEUE_TYPE_COMPUTE: | ||
206 | if (use_static) | ||
207 | packet->bitfields2.queue_type = | ||
208 | queue_type__mes_map_queues__normal_latency_static_queue_vi; | ||
209 | break; | ||
210 | case KFD_QUEUE_TYPE_DIQ: | ||
211 | packet->bitfields2.queue_type = | ||
212 | queue_type__mes_map_queues__debug_interface_queue_vi; | ||
213 | break; | ||
214 | case KFD_QUEUE_TYPE_SDMA: | ||
215 | packet->bitfields2.engine_sel = q->properties.sdma_engine_id + | ||
216 | engine_sel__mes_map_queues__sdma0_vi; | ||
217 | use_static = false; /* no static queues under SDMA */ | ||
218 | break; | ||
219 | default: | ||
220 | WARN(1, "queue type %d", q->properties.type); | ||
221 | return -EINVAL; | ||
222 | } | ||
223 | packet->bitfields3.doorbell_offset = | ||
224 | q->properties.doorbell_off; | ||
225 | |||
226 | packet->mqd_addr_lo = | ||
227 | lower_32_bits(q->gart_mqd_addr); | ||
228 | |||
229 | packet->mqd_addr_hi = | ||
230 | upper_32_bits(q->gart_mqd_addr); | ||
231 | |||
232 | packet->wptr_addr_lo = | ||
233 | lower_32_bits((uint64_t)q->properties.write_ptr); | ||
234 | |||
235 | packet->wptr_addr_hi = | ||
236 | upper_32_bits((uint64_t)q->properties.write_ptr); | ||
237 | |||
238 | return 0; | ||
239 | } | ||
240 | |||
241 | static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, | ||
242 | enum kfd_queue_type type, | ||
243 | enum kfd_unmap_queues_filter filter, | ||
244 | uint32_t filter_param, bool reset, | ||
245 | unsigned int sdma_engine) | ||
246 | { | ||
247 | struct pm4_mes_unmap_queues *packet; | ||
248 | |||
249 | packet = (struct pm4_mes_unmap_queues *)buffer; | ||
250 | memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); | ||
251 | |||
252 | packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, | ||
253 | sizeof(struct pm4_mes_unmap_queues)); | ||
254 | switch (type) { | ||
255 | case KFD_QUEUE_TYPE_COMPUTE: | ||
256 | case KFD_QUEUE_TYPE_DIQ: | ||
257 | packet->bitfields2.engine_sel = | ||
258 | engine_sel__mes_unmap_queues__compute; | ||
259 | break; | ||
260 | case KFD_QUEUE_TYPE_SDMA: | ||
261 | packet->bitfields2.engine_sel = | ||
262 | engine_sel__mes_unmap_queues__sdma0 + sdma_engine; | ||
263 | break; | ||
264 | default: | ||
265 | WARN(1, "queue type %d", type); | ||
266 | return -EINVAL; | ||
267 | } | ||
268 | |||
269 | if (reset) | ||
270 | packet->bitfields2.action = | ||
271 | action__mes_unmap_queues__reset_queues; | ||
272 | else | ||
273 | packet->bitfields2.action = | ||
274 | action__mes_unmap_queues__preempt_queues; | ||
275 | |||
276 | switch (filter) { | ||
277 | case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: | ||
278 | packet->bitfields2.queue_sel = | ||
279 | queue_sel__mes_unmap_queues__perform_request_on_specified_queues; | ||
280 | packet->bitfields2.num_queues = 1; | ||
281 | packet->bitfields3b.doorbell_offset0 = filter_param; | ||
282 | break; | ||
283 | case KFD_UNMAP_QUEUES_FILTER_BY_PASID: | ||
284 | packet->bitfields2.queue_sel = | ||
285 | queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; | ||
286 | packet->bitfields3a.pasid = filter_param; | ||
287 | break; | ||
288 | case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: | ||
289 | packet->bitfields2.queue_sel = | ||
290 | queue_sel__mes_unmap_queues__unmap_all_queues; | ||
291 | break; | ||
292 | case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: | ||
293 | /* in this case, we do not preempt static queues */ | ||
294 | packet->bitfields2.queue_sel = | ||
295 | queue_sel__mes_unmap_queues__unmap_all_non_static_queues; | ||
296 | break; | ||
297 | default: | ||
298 | WARN(1, "filter %d", filter); | ||
299 | return -EINVAL; | ||
300 | } | ||
301 | |||
302 | return 0; | ||
303 | |||
304 | } | ||
305 | |||
306 | static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, | ||
307 | uint64_t fence_address, uint32_t fence_value) | ||
308 | { | ||
309 | struct pm4_mes_query_status *packet; | ||
310 | |||
311 | packet = (struct pm4_mes_query_status *)buffer; | ||
312 | memset(buffer, 0, sizeof(struct pm4_mes_query_status)); | ||
313 | |||
314 | packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, | ||
315 | sizeof(struct pm4_mes_query_status)); | ||
316 | |||
317 | packet->bitfields2.context_id = 0; | ||
318 | packet->bitfields2.interrupt_sel = | ||
319 | interrupt_sel__mes_query_status__completion_status; | ||
320 | packet->bitfields2.command = | ||
321 | command__mes_query_status__fence_only_after_write_ack; | ||
322 | |||
323 | packet->addr_hi = upper_32_bits((uint64_t)fence_address); | ||
324 | packet->addr_lo = lower_32_bits((uint64_t)fence_address); | ||
325 | packet->data_hi = upper_32_bits((uint64_t)fence_value); | ||
326 | packet->data_lo = lower_32_bits((uint64_t)fence_value); | ||
327 | |||
328 | return 0; | ||
329 | } | ||
330 | |||
/*
 * pm_release_mem_vi - build a MEC RELEASE_MEM packet for VI.
 * @gpu_addr: GPU address the end-of-pipe event writes its (zero) data to
 * @buffer: caller-provided packet buffer, at least release_mem_size bytes
 *
 * Emits a cache-flush-and-invalidate timestamp event at end of pipe,
 * writing a 32-bit zero to @gpu_addr and raising an interrupt after the
 * write is confirmed. Always returns 0.
 *
 * NOTE(review): unlike the other pmf callbacks this one takes no
 * packet_manager argument — presumably matching the release_mem vtable
 * signature declared elsewhere; confirm against kfd_priv.h.
 */
static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
{
	struct pm4_mec_release_mem *packet;

	packet = (struct pm4_mec_release_mem *)buffer;
	memset(buffer, 0, sizeof(*packet));

	packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
						 sizeof(*packet));

	/* End-of-pipe timestamp event that also flushes/invalidates caches */
	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
	packet->bitfields2.tcl1_action_ena = 1;
	packet->bitfields2.tc_action_ena = 1;
	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
	packet->bitfields2.atc = 0;

	/* Write the low 32 bits of data and interrupt once confirmed */
	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
	packet->bitfields3.int_sel =
		int_sel___release_mem__send_interrupt_after_write_confirm;

	/* Destination must be dword-aligned; the field stores addr >> 2 */
	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
	packet->address_hi = upper_32_bits(gpu_addr);

	packet->data_lo = 0;

	return 0;
}
359 | |||
/* Packet-manager function table for VI-family (GFXv8) ASICs: PM4 packet
 * builders plus the per-packet sizes used for runlist IB sizing.
 */
const struct packet_manager_funcs kfd_vi_pm_funcs = {
	.map_process = pm_map_process_vi,
	.runlist = pm_runlist_vi,
	.set_resources = pm_set_resources_vi,
	.map_queues = pm_map_queues_vi,
	.unmap_queues = pm_unmap_queues_vi,
	.query_status = pm_query_status_vi,
	.release_mem = pm_release_mem_vi,
	.map_process_size = sizeof(struct pm4_mes_map_process),
	.runlist_size = sizeof(struct pm4_mes_runlist),
	.set_resources_size = sizeof(struct pm4_mes_set_resources),
	.map_queues_size = sizeof(struct pm4_mes_map_queues),
	.unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
	.query_status_size = sizeof(struct pm4_mes_query_status),
	.release_mem_size = sizeof(struct pm4_mec_release_mem)
};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index e0c07d24d251..76bf2dc8aec4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c | |||
@@ -43,6 +43,8 @@ static const struct kgd2kfd_calls kgd2kfd = { | |||
43 | .interrupt = kgd2kfd_interrupt, | 43 | .interrupt = kgd2kfd_interrupt, |
44 | .suspend = kgd2kfd_suspend, | 44 | .suspend = kgd2kfd_suspend, |
45 | .resume = kgd2kfd_resume, | 45 | .resume = kgd2kfd_resume, |
46 | .quiesce_mm = kgd2kfd_quiesce_mm, | ||
47 | .resume_mm = kgd2kfd_resume_mm, | ||
46 | .schedule_evict_and_restore_process = | 48 | .schedule_evict_and_restore_process = |
47 | kgd2kfd_schedule_evict_and_restore_process, | 49 | kgd2kfd_schedule_evict_and_restore_process, |
48 | }; | 50 | }; |
@@ -81,6 +83,11 @@ module_param(ignore_crat, int, 0444); | |||
81 | MODULE_PARM_DESC(ignore_crat, | 83 | MODULE_PARM_DESC(ignore_crat, |
82 | "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); | 84 | "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); |
83 | 85 | ||
86 | int vega10_noretry; | ||
87 | module_param_named(noretry, vega10_noretry, int, 0644); | ||
88 | MODULE_PARM_DESC(noretry, | ||
89 | "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); | ||
90 | |||
84 | static int amdkfd_init_completed; | 91 | static int amdkfd_init_completed; |
85 | 92 | ||
86 | int kgd2kfd_init(unsigned int interface_version, | 93 | int kgd2kfd_init(unsigned int interface_version, |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index ee7061e1c466..4b8eb506642b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | |||
@@ -38,6 +38,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, | |||
38 | case CHIP_POLARIS10: | 38 | case CHIP_POLARIS10: |
39 | case CHIP_POLARIS11: | 39 | case CHIP_POLARIS11: |
40 | return mqd_manager_init_vi_tonga(type, dev); | 40 | return mqd_manager_init_vi_tonga(type, dev); |
41 | case CHIP_VEGA10: | ||
42 | case CHIP_RAVEN: | ||
43 | return mqd_manager_init_v9(type, dev); | ||
41 | default: | 44 | default: |
42 | WARN(1, "Unexpected ASIC family %u", | 45 | WARN(1, "Unexpected ASIC family %u", |
43 | dev->device_info->asic_family); | 46 | dev->device_info->asic_family); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index c00c325ed3c9..06eaa218eba6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | |||
@@ -79,10 +79,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, | |||
79 | m->cp_mqd_base_addr_lo = lower_32_bits(addr); | 79 | m->cp_mqd_base_addr_lo = lower_32_bits(addr); |
80 | m->cp_mqd_base_addr_hi = upper_32_bits(addr); | 80 | m->cp_mqd_base_addr_hi = upper_32_bits(addr); |
81 | 81 | ||
82 | m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; | ||
83 | /* Although WinKFD writes this, I suspect it should not be necessary */ | ||
84 | m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; | ||
85 | |||
86 | m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | | 82 | m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | |
87 | QUANTUM_DURATION(10); | 83 | QUANTUM_DURATION(10); |
88 | 84 | ||
@@ -412,7 +408,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, | |||
412 | if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) | 408 | if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) |
413 | return NULL; | 409 | return NULL; |
414 | 410 | ||
415 | mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); | 411 | mqd = kzalloc(sizeof(*mqd), GFP_NOIO); |
416 | if (!mqd) | 412 | if (!mqd) |
417 | return NULL; | 413 | return NULL; |
418 | 414 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c new file mode 100644 index 000000000000..684054ff02cd --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | |||
@@ -0,0 +1,443 @@ | |||
1 | /* | ||
2 | * Copyright 2016-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #include <linux/printk.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include "kfd_priv.h" | ||
28 | #include "kfd_mqd_manager.h" | ||
29 | #include "v9_structs.h" | ||
30 | #include "gc/gc_9_0_offset.h" | ||
31 | #include "gc/gc_9_0_sh_mask.h" | ||
32 | #include "sdma0/sdma0_4_0_sh_mask.h" | ||
33 | |||
/* Reinterpret the opaque MQD pointer as a GFXv9 compute MQD. */
static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}
38 | |||
/* Reinterpret the opaque MQD pointer as a GFXv9 SDMA MQD. */
static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}
43 | |||
44 | static int init_mqd(struct mqd_manager *mm, void **mqd, | ||
45 | struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, | ||
46 | struct queue_properties *q) | ||
47 | { | ||
48 | int retval; | ||
49 | uint64_t addr; | ||
50 | struct v9_mqd *m; | ||
51 | struct kfd_dev *kfd = mm->dev; | ||
52 | |||
53 | /* From V9, for CWSR, the control stack is located on the next page | ||
54 | * boundary after the mqd, we will use the gtt allocation function | ||
55 | * instead of sub-allocation function. | ||
56 | */ | ||
57 | if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { | ||
58 | *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); | ||
59 | if (!*mqd_mem_obj) | ||
60 | return -ENOMEM; | ||
61 | retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, | ||
62 | ALIGN(q->ctl_stack_size, PAGE_SIZE) + | ||
63 | ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), | ||
64 | &((*mqd_mem_obj)->gtt_mem), | ||
65 | &((*mqd_mem_obj)->gpu_addr), | ||
66 | (void *)&((*mqd_mem_obj)->cpu_ptr)); | ||
67 | } else | ||
68 | retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), | ||
69 | mqd_mem_obj); | ||
70 | if (retval != 0) | ||
71 | return -ENOMEM; | ||
72 | |||
73 | m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; | ||
74 | addr = (*mqd_mem_obj)->gpu_addr; | ||
75 | |||
76 | memset(m, 0, sizeof(struct v9_mqd)); | ||
77 | |||
78 | m->header = 0xC0310800; | ||
79 | m->compute_pipelinestat_enable = 1; | ||
80 | m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; | ||
81 | m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; | ||
82 | m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; | ||
83 | m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; | ||
84 | |||
85 | m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | | ||
86 | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; | ||
87 | |||
88 | m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; | ||
89 | |||
90 | m->cp_mqd_base_addr_lo = lower_32_bits(addr); | ||
91 | m->cp_mqd_base_addr_hi = upper_32_bits(addr); | ||
92 | |||
93 | m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | | ||
94 | 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | | ||
95 | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; | ||
96 | |||
97 | m->cp_hqd_pipe_priority = 1; | ||
98 | m->cp_hqd_queue_priority = 15; | ||
99 | |||
100 | if (q->format == KFD_QUEUE_FORMAT_AQL) { | ||
101 | m->cp_hqd_aql_control = | ||
102 | 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; | ||
103 | } | ||
104 | |||
105 | if (q->tba_addr) { | ||
106 | m->compute_pgm_rsrc2 |= | ||
107 | (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); | ||
108 | } | ||
109 | |||
110 | if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { | ||
111 | m->cp_hqd_persistent_state |= | ||
112 | (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); | ||
113 | m->cp_hqd_ctx_save_base_addr_lo = | ||
114 | lower_32_bits(q->ctx_save_restore_area_address); | ||
115 | m->cp_hqd_ctx_save_base_addr_hi = | ||
116 | upper_32_bits(q->ctx_save_restore_area_address); | ||
117 | m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; | ||
118 | m->cp_hqd_cntl_stack_size = q->ctl_stack_size; | ||
119 | m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; | ||
120 | m->cp_hqd_wg_state_offset = q->ctl_stack_size; | ||
121 | } | ||
122 | |||
123 | *mqd = m; | ||
124 | if (gart_addr) | ||
125 | *gart_addr = addr; | ||
126 | retval = mm->update_mqd(mm, m, q); | ||
127 | |||
128 | return retval; | ||
129 | } | ||
130 | |||
131 | static int load_mqd(struct mqd_manager *mm, void *mqd, | ||
132 | uint32_t pipe_id, uint32_t queue_id, | ||
133 | struct queue_properties *p, struct mm_struct *mms) | ||
134 | { | ||
135 | /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ | ||
136 | uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); | ||
137 | |||
138 | return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, | ||
139 | (uint32_t __user *)p->write_ptr, | ||
140 | wptr_shift, 0, mms); | ||
141 | } | ||
142 | |||
/*
 * update_mqd - program a GFXv9 compute MQD from queue properties.
 * @mm: MQD manager (used only for the cwsr_enabled check)
 * @mqd: CPU pointer to the MQD to update
 * @q: queue properties; q->is_active is recomputed as a side effect
 *
 * Always returns 0.
 */
static int update_mqd(struct mqd_manager *mm, void *mqd,
			struct queue_properties *q)
{
	struct v9_mqd *m;

	m = get_mqd(mqd);

	/* Ring size is encoded as log2(dwords) - 1 in the low bits */
	m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
	m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
	pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);

	/* Ring base is a 256-byte-aligned address stored as addr >> 8 */
	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);

	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
	m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
	m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);

	m->cp_hqd_pq_doorbell_control =
		q->doorbell_off <<
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
	pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
			m->cp_hqd_pq_doorbell_control);

	/* Disable IB execution from user queues */
	m->cp_hqd_ib_control =
		3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
		1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;

	/*
	 * HW does not clamp this field correctly. Maximum EOP queue size
	 * is constrained by per-SE EOP done signal count, which is 8-bit.
	 * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
	 * more than (EOP entry count - 1) so a queue size of 0x800 dwords
	 * is safe, giving a maximum field value of 0xA.
	 */
	m->cp_hqd_eop_control = min(0xA,
		order_base_2(q->eop_ring_buffer_size / 4) - 1);
	m->cp_hqd_eop_base_addr_lo =
			lower_32_bits(q->eop_ring_buffer_address >> 8);
	m->cp_hqd_eop_base_addr_hi =
			upper_32_bits(q->eop_ring_buffer_address >> 8);

	m->cp_hqd_iq_timer = 0;

	m->cp_hqd_vmid = q->vmid;

	if (q->format == KFD_QUEUE_FORMAT_AQL) {
		/* AQL: CP tracks the ring itself; clamp wptr, flag full */
		m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
				2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
				1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
				1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
		m->cp_hqd_pq_doorbell_control |= 1 <<
			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
	}
	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
		m->cp_hqd_ctx_save_control = 0;

	/* A queue is schedulable only when sized, addressed, weighted and
	 * not evicted.
	 */
	q->is_active = (q->queue_size > 0 &&
			q->queue_address != 0 &&
			q->queue_percent > 0 &&
			!q->is_evicted);

	return 0;
}
208 | |||
209 | |||
210 | static int destroy_mqd(struct mqd_manager *mm, void *mqd, | ||
211 | enum kfd_preempt_type type, | ||
212 | unsigned int timeout, uint32_t pipe_id, | ||
213 | uint32_t queue_id) | ||
214 | { | ||
215 | return mm->dev->kfd2kgd->hqd_destroy | ||
216 | (mm->dev->kgd, mqd, type, timeout, | ||
217 | pipe_id, queue_id); | ||
218 | } | ||
219 | |||
220 | static void uninit_mqd(struct mqd_manager *mm, void *mqd, | ||
221 | struct kfd_mem_obj *mqd_mem_obj) | ||
222 | { | ||
223 | struct kfd_dev *kfd = mm->dev; | ||
224 | |||
225 | if (mqd_mem_obj->gtt_mem) { | ||
226 | kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); | ||
227 | kfree(mqd_mem_obj); | ||
228 | } else { | ||
229 | kfd_gtt_sa_free(mm->dev, mqd_mem_obj); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | static bool is_occupied(struct mqd_manager *mm, void *mqd, | ||
234 | uint64_t queue_address, uint32_t pipe_id, | ||
235 | uint32_t queue_id) | ||
236 | { | ||
237 | return mm->dev->kfd2kgd->hqd_is_occupied( | ||
238 | mm->dev->kgd, queue_address, | ||
239 | pipe_id, queue_id); | ||
240 | } | ||
241 | |||
242 | static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, | ||
243 | struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, | ||
244 | struct queue_properties *q) | ||
245 | { | ||
246 | struct v9_mqd *m; | ||
247 | int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); | ||
248 | |||
249 | if (retval != 0) | ||
250 | return retval; | ||
251 | |||
252 | m = get_mqd(*mqd); | ||
253 | |||
254 | m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | | ||
255 | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; | ||
256 | |||
257 | return retval; | ||
258 | } | ||
259 | |||
/*
 * update_mqd_hiq - program the HIQ MQD from queue properties.
 *
 * The HIQ uses the same MQD programming as a regular compute queue, and
 * update_mqd() already sets cp_hqd_vmid from q->vmid. The original body
 * re-assigned cp_hqd_vmid afterwards, which the in-source TODO correctly
 * flagged as redundant; the redundant assignment is removed.
 */
static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
			struct queue_properties *q)
{
	return update_mqd(mm, mqd, q);
}
274 | |||
275 | static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, | ||
276 | struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, | ||
277 | struct queue_properties *q) | ||
278 | { | ||
279 | int retval; | ||
280 | struct v9_sdma_mqd *m; | ||
281 | |||
282 | |||
283 | retval = kfd_gtt_sa_allocate(mm->dev, | ||
284 | sizeof(struct v9_sdma_mqd), | ||
285 | mqd_mem_obj); | ||
286 | |||
287 | if (retval != 0) | ||
288 | return -ENOMEM; | ||
289 | |||
290 | m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; | ||
291 | |||
292 | memset(m, 0, sizeof(struct v9_sdma_mqd)); | ||
293 | |||
294 | *mqd = m; | ||
295 | if (gart_addr) | ||
296 | *gart_addr = (*mqd_mem_obj)->gpu_addr; | ||
297 | |||
298 | retval = mm->update_mqd(mm, m, q); | ||
299 | |||
300 | return retval; | ||
301 | } | ||
302 | |||
/* SDMA MQDs always come from the GTT sub-allocator; return the chunk. */
static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
			struct kfd_mem_obj *mqd_mem_obj)
{
	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
308 | |||
/* Hand the SDMA MQD to the KGD layer; pipe/queue ids and the queue
 * properties other than the write pointer are not used for SDMA.
 */
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
		uint32_t pipe_id, uint32_t queue_id,
		struct queue_properties *p, struct mm_struct *mms)
{
	return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
		(uint32_t __user *)p->write_ptr,
		mms);
}
317 | |||
318 | #define SDMA_RLC_DUMMY_DEFAULT 0xf | ||
319 | |||
/*
 * update_mqd_sdma - program a GFXv9 SDMA RLC MQD from queue properties.
 * @mm: MQD manager (unused here beyond the vtable contract)
 * @mqd: CPU pointer to the SDMA MQD
 * @q: queue properties; q->is_active is recomputed as a side effect
 *
 * Always returns 0.
 */
static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
		struct queue_properties *q)
{
	struct v9_sdma_mqd *m;

	m = get_sdma_mqd(mqd);
	/* RB size is log2(dwords); enable rptr writeback with timer 6 */
	m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
		<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
		q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
		1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
		6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;

	/* Ring base is 256-byte aligned, stored as addr >> 8 */
	m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
	m->sdmax_rlcx_doorbell_offset =
		q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;

	m->sdma_engine_id = q->sdma_engine_id;
	m->sdma_queue_id = q->sdma_queue_id;
	m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;

	/* Same schedulability rule as compute queues */
	q->is_active = (q->queue_size > 0 &&
			q->queue_address != 0 &&
			q->queue_percent > 0 &&
			!q->is_evicted);

	return 0;
}
350 | |||
/*
 * destroy_mqd_sdma - tear down an SDMA queue.
 *
 * The preempt type is ignored because there is only one way to preempt
 * an SDMA queue; pipe_id and queue_id are likewise unused for SDMA.
 */
static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
		enum kfd_preempt_type type,
		unsigned int timeout, uint32_t pipe_id,
		uint32_t queue_id)
{
	return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
}
362 | |||
363 | static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, | ||
364 | uint64_t queue_address, uint32_t pipe_id, | ||
365 | uint32_t queue_id) | ||
366 | { | ||
367 | return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); | ||
368 | } | ||
369 | |||
370 | #if defined(CONFIG_DEBUG_FS) | ||
371 | |||
372 | static int debugfs_show_mqd(struct seq_file *m, void *data) | ||
373 | { | ||
374 | seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, | ||
375 | data, sizeof(struct v9_mqd), false); | ||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) | ||
380 | { | ||
381 | seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, | ||
382 | data, sizeof(struct v9_sdma_mqd), false); | ||
383 | return 0; | ||
384 | } | ||
385 | |||
386 | #endif | ||
387 | |||
388 | struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, | ||
389 | struct kfd_dev *dev) | ||
390 | { | ||
391 | struct mqd_manager *mqd; | ||
392 | |||
393 | if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) | ||
394 | return NULL; | ||
395 | |||
396 | mqd = kzalloc(sizeof(*mqd), GFP_NOIO); | ||
397 | if (!mqd) | ||
398 | return NULL; | ||
399 | |||
400 | mqd->dev = dev; | ||
401 | |||
402 | switch (type) { | ||
403 | case KFD_MQD_TYPE_CP: | ||
404 | case KFD_MQD_TYPE_COMPUTE: | ||
405 | mqd->init_mqd = init_mqd; | ||
406 | mqd->uninit_mqd = uninit_mqd; | ||
407 | mqd->load_mqd = load_mqd; | ||
408 | mqd->update_mqd = update_mqd; | ||
409 | mqd->destroy_mqd = destroy_mqd; | ||
410 | mqd->is_occupied = is_occupied; | ||
411 | #if defined(CONFIG_DEBUG_FS) | ||
412 | mqd->debugfs_show_mqd = debugfs_show_mqd; | ||
413 | #endif | ||
414 | break; | ||
415 | case KFD_MQD_TYPE_HIQ: | ||
416 | mqd->init_mqd = init_mqd_hiq; | ||
417 | mqd->uninit_mqd = uninit_mqd; | ||
418 | mqd->load_mqd = load_mqd; | ||
419 | mqd->update_mqd = update_mqd_hiq; | ||
420 | mqd->destroy_mqd = destroy_mqd; | ||
421 | mqd->is_occupied = is_occupied; | ||
422 | #if defined(CONFIG_DEBUG_FS) | ||
423 | mqd->debugfs_show_mqd = debugfs_show_mqd; | ||
424 | #endif | ||
425 | break; | ||
426 | case KFD_MQD_TYPE_SDMA: | ||
427 | mqd->init_mqd = init_mqd_sdma; | ||
428 | mqd->uninit_mqd = uninit_mqd_sdma; | ||
429 | mqd->load_mqd = load_mqd_sdma; | ||
430 | mqd->update_mqd = update_mqd_sdma; | ||
431 | mqd->destroy_mqd = destroy_mqd_sdma; | ||
432 | mqd->is_occupied = is_occupied_sdma; | ||
433 | #if defined(CONFIG_DEBUG_FS) | ||
434 | mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; | ||
435 | #endif | ||
436 | break; | ||
437 | default: | ||
438 | kfree(mqd); | ||
439 | return NULL; | ||
440 | } | ||
441 | |||
442 | return mqd; | ||
443 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 89e4242e43e7..481307b8b4db 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | |||
@@ -394,7 +394,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, | |||
394 | if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) | 394 | if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) |
395 | return NULL; | 395 | return NULL; |
396 | 396 | ||
397 | mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); | 397 | mqd = kzalloc(sizeof(*mqd), GFP_NOIO); |
398 | if (!mqd) | 398 | if (!mqd) |
399 | return NULL; | 399 | return NULL; |
400 | 400 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 89ba4c670ec5..c317feb43f69 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | |||
@@ -26,8 +26,6 @@ | |||
26 | #include "kfd_device_queue_manager.h" | 26 | #include "kfd_device_queue_manager.h" |
27 | #include "kfd_kernel_queue.h" | 27 | #include "kfd_kernel_queue.h" |
28 | #include "kfd_priv.h" | 28 | #include "kfd_priv.h" |
29 | #include "kfd_pm4_headers_vi.h" | ||
30 | #include "kfd_pm4_opcodes.h" | ||
31 | 29 | ||
32 | static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, | 30 | static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, |
33 | unsigned int buffer_size_bytes) | 31 | unsigned int buffer_size_bytes) |
@@ -39,18 +37,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, | |||
39 | *wptr = temp; | 37 | *wptr = temp; |
40 | } | 38 | } |
41 | 39 | ||
/* Build a PM4 type-3 packet header for @opcode; the count field encodes
 * the packet body length as (dwords - 2) per the PM4 format.
 */
static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
{
	union PM4_MES_TYPE_3_HEADER header;

	header.u32All = 0;
	header.opcode = opcode;
	header.count = packet_size / 4 - 2;
	header.type = PM4_TYPE_3;

	return header.u32All;
}
53 | |||
54 | static void pm_calc_rlib_size(struct packet_manager *pm, | 40 | static void pm_calc_rlib_size(struct packet_manager *pm, |
55 | unsigned int *rlib_size, | 41 | unsigned int *rlib_size, |
56 | bool *over_subscription) | 42 | bool *over_subscription) |
@@ -80,9 +66,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm, | |||
80 | pr_debug("Over subscribed runlist\n"); | 66 | pr_debug("Over subscribed runlist\n"); |
81 | } | 67 | } |
82 | 68 | ||
83 | map_queue_size = sizeof(struct pm4_mes_map_queues); | 69 | map_queue_size = pm->pmf->map_queues_size; |
84 | /* calculate run list ib allocation size */ | 70 | /* calculate run list ib allocation size */ |
85 | *rlib_size = process_count * sizeof(struct pm4_mes_map_process) + | 71 | *rlib_size = process_count * pm->pmf->map_process_size + |
86 | queue_count * map_queue_size; | 72 | queue_count * map_queue_size; |
87 | 73 | ||
88 | /* | 74 | /* |
@@ -90,7 +76,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, | |||
90 | * when over subscription | 76 | * when over subscription |
91 | */ | 77 | */ |
92 | if (*over_subscription) | 78 | if (*over_subscription) |
93 | *rlib_size += sizeof(struct pm4_mes_runlist); | 79 | *rlib_size += pm->pmf->runlist_size; |
94 | 80 | ||
95 | pr_debug("runlist ib size %d\n", *rlib_size); | 81 | pr_debug("runlist ib size %d\n", *rlib_size); |
96 | } | 82 | } |
@@ -108,12 +94,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, | |||
108 | 94 | ||
109 | pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); | 95 | pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); |
110 | 96 | ||
97 | mutex_lock(&pm->lock); | ||
98 | |||
111 | retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, | 99 | retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, |
112 | &pm->ib_buffer_obj); | 100 | &pm->ib_buffer_obj); |
113 | 101 | ||
114 | if (retval) { | 102 | if (retval) { |
115 | pr_err("Failed to allocate runlist IB\n"); | 103 | pr_err("Failed to allocate runlist IB\n"); |
116 | return retval; | 104 | goto out; |
117 | } | 105 | } |
118 | 106 | ||
119 | *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; | 107 | *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; |
@@ -121,138 +109,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, | |||
121 | 109 | ||
122 | memset(*rl_buffer, 0, *rl_buffer_size); | 110 | memset(*rl_buffer, 0, *rl_buffer_size); |
123 | pm->allocated = true; | 111 | pm->allocated = true; |
124 | return retval; | ||
125 | } | ||
126 | |||
127 | static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, | ||
128 | uint64_t ib, size_t ib_size_in_dwords, bool chain) | ||
129 | { | ||
130 | struct pm4_mes_runlist *packet; | ||
131 | int concurrent_proc_cnt = 0; | ||
132 | struct kfd_dev *kfd = pm->dqm->dev; | ||
133 | |||
134 | if (WARN_ON(!ib)) | ||
135 | return -EFAULT; | ||
136 | |||
137 | /* Determine the number of processes to map together to HW: | ||
138 | * it can not exceed the number of VMIDs available to the | ||
139 | * scheduler, and it is determined by the smaller of the number | ||
140 | * of processes in the runlist and kfd module parameter | ||
141 | * hws_max_conc_proc. | ||
142 | * Note: the arbitration between the number of VMIDs and | ||
143 | * hws_max_conc_proc has been done in | ||
144 | * kgd2kfd_device_init(). | ||
145 | */ | ||
146 | concurrent_proc_cnt = min(pm->dqm->processes_count, | ||
147 | kfd->max_proc_per_quantum); | ||
148 | |||
149 | packet = (struct pm4_mes_runlist *)buffer; | ||
150 | |||
151 | memset(buffer, 0, sizeof(struct pm4_mes_runlist)); | ||
152 | packet->header.u32All = build_pm4_header(IT_RUN_LIST, | ||
153 | sizeof(struct pm4_mes_runlist)); | ||
154 | |||
155 | packet->bitfields4.ib_size = ib_size_in_dwords; | ||
156 | packet->bitfields4.chain = chain ? 1 : 0; | ||
157 | packet->bitfields4.offload_polling = 0; | ||
158 | packet->bitfields4.valid = 1; | ||
159 | packet->bitfields4.process_cnt = concurrent_proc_cnt; | ||
160 | packet->ordinal2 = lower_32_bits(ib); | ||
161 | packet->bitfields3.ib_base_hi = upper_32_bits(ib); | ||
162 | |||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, | ||
167 | struct qcm_process_device *qpd) | ||
168 | { | ||
169 | struct pm4_mes_map_process *packet; | ||
170 | |||
171 | packet = (struct pm4_mes_map_process *)buffer; | ||
172 | 112 | ||
173 | memset(buffer, 0, sizeof(struct pm4_mes_map_process)); | 113 | out: |
174 | 114 | mutex_unlock(&pm->lock); | |
175 | packet->header.u32All = build_pm4_header(IT_MAP_PROCESS, | 115 | return retval; |
176 | sizeof(struct pm4_mes_map_process)); | ||
177 | packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; | ||
178 | packet->bitfields2.process_quantum = 1; | ||
179 | packet->bitfields2.pasid = qpd->pqm->process->pasid; | ||
180 | packet->bitfields3.page_table_base = qpd->page_table_base; | ||
181 | packet->bitfields10.gds_size = qpd->gds_size; | ||
182 | packet->bitfields10.num_gws = qpd->num_gws; | ||
183 | packet->bitfields10.num_oac = qpd->num_oac; | ||
184 | packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; | ||
185 | |||
186 | packet->sh_mem_config = qpd->sh_mem_config; | ||
187 | packet->sh_mem_bases = qpd->sh_mem_bases; | ||
188 | packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; | ||
189 | packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; | ||
190 | |||
191 | packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; | ||
192 | |||
193 | packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); | ||
194 | packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); | ||
195 | |||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, | ||
200 | struct queue *q, bool is_static) | ||
201 | { | ||
202 | struct pm4_mes_map_queues *packet; | ||
203 | bool use_static = is_static; | ||
204 | |||
205 | packet = (struct pm4_mes_map_queues *)buffer; | ||
206 | memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); | ||
207 | |||
208 | packet->header.u32All = build_pm4_header(IT_MAP_QUEUES, | ||
209 | sizeof(struct pm4_mes_map_queues)); | ||
210 | packet->bitfields2.alloc_format = | ||
211 | alloc_format__mes_map_queues__one_per_pipe_vi; | ||
212 | packet->bitfields2.num_queues = 1; | ||
213 | packet->bitfields2.queue_sel = | ||
214 | queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; | ||
215 | |||
216 | packet->bitfields2.engine_sel = | ||
217 | engine_sel__mes_map_queues__compute_vi; | ||
218 | packet->bitfields2.queue_type = | ||
219 | queue_type__mes_map_queues__normal_compute_vi; | ||
220 | |||
221 | switch (q->properties.type) { | ||
222 | case KFD_QUEUE_TYPE_COMPUTE: | ||
223 | if (use_static) | ||
224 | packet->bitfields2.queue_type = | ||
225 | queue_type__mes_map_queues__normal_latency_static_queue_vi; | ||
226 | break; | ||
227 | case KFD_QUEUE_TYPE_DIQ: | ||
228 | packet->bitfields2.queue_type = | ||
229 | queue_type__mes_map_queues__debug_interface_queue_vi; | ||
230 | break; | ||
231 | case KFD_QUEUE_TYPE_SDMA: | ||
232 | packet->bitfields2.engine_sel = q->properties.sdma_engine_id + | ||
233 | engine_sel__mes_map_queues__sdma0_vi; | ||
234 | use_static = false; /* no static queues under SDMA */ | ||
235 | break; | ||
236 | default: | ||
237 | WARN(1, "queue type %d", q->properties.type); | ||
238 | return -EINVAL; | ||
239 | } | ||
240 | packet->bitfields3.doorbell_offset = | ||
241 | q->properties.doorbell_off; | ||
242 | |||
243 | packet->mqd_addr_lo = | ||
244 | lower_32_bits(q->gart_mqd_addr); | ||
245 | |||
246 | packet->mqd_addr_hi = | ||
247 | upper_32_bits(q->gart_mqd_addr); | ||
248 | |||
249 | packet->wptr_addr_lo = | ||
250 | lower_32_bits((uint64_t)q->properties.write_ptr); | ||
251 | |||
252 | packet->wptr_addr_hi = | ||
253 | upper_32_bits((uint64_t)q->properties.write_ptr); | ||
254 | |||
255 | return 0; | ||
256 | } | 116 | } |
257 | 117 | ||
258 | static int pm_create_runlist_ib(struct packet_manager *pm, | 118 | static int pm_create_runlist_ib(struct packet_manager *pm, |
@@ -292,12 +152,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
292 | return -ENOMEM; | 152 | return -ENOMEM; |
293 | } | 153 | } |
294 | 154 | ||
295 | retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); | 155 | retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); |
296 | if (retval) | 156 | if (retval) |
297 | return retval; | 157 | return retval; |
298 | 158 | ||
299 | proccesses_mapped++; | 159 | proccesses_mapped++; |
300 | inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process), | 160 | inc_wptr(&rl_wptr, pm->pmf->map_process_size, |
301 | alloc_size_bytes); | 161 | alloc_size_bytes); |
302 | 162 | ||
303 | list_for_each_entry(kq, &qpd->priv_queue_list, list) { | 163 | list_for_each_entry(kq, &qpd->priv_queue_list, list) { |
@@ -307,7 +167,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
307 | pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", | 167 | pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", |
308 | kq->queue->queue, qpd->is_debug); | 168 | kq->queue->queue, qpd->is_debug); |
309 | 169 | ||
310 | retval = pm_create_map_queue(pm, | 170 | retval = pm->pmf->map_queues(pm, |
311 | &rl_buffer[rl_wptr], | 171 | &rl_buffer[rl_wptr], |
312 | kq->queue, | 172 | kq->queue, |
313 | qpd->is_debug); | 173 | qpd->is_debug); |
@@ -315,7 +175,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
315 | return retval; | 175 | return retval; |
316 | 176 | ||
317 | inc_wptr(&rl_wptr, | 177 | inc_wptr(&rl_wptr, |
318 | sizeof(struct pm4_mes_map_queues), | 178 | pm->pmf->map_queues_size, |
319 | alloc_size_bytes); | 179 | alloc_size_bytes); |
320 | } | 180 | } |
321 | 181 | ||
@@ -326,7 +186,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
326 | pr_debug("static_queue, mapping user queue %d, is debug status %d\n", | 186 | pr_debug("static_queue, mapping user queue %d, is debug status %d\n", |
327 | q->queue, qpd->is_debug); | 187 | q->queue, qpd->is_debug); |
328 | 188 | ||
329 | retval = pm_create_map_queue(pm, | 189 | retval = pm->pmf->map_queues(pm, |
330 | &rl_buffer[rl_wptr], | 190 | &rl_buffer[rl_wptr], |
331 | q, | 191 | q, |
332 | qpd->is_debug); | 192 | qpd->is_debug); |
@@ -335,7 +195,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
335 | return retval; | 195 | return retval; |
336 | 196 | ||
337 | inc_wptr(&rl_wptr, | 197 | inc_wptr(&rl_wptr, |
338 | sizeof(struct pm4_mes_map_queues), | 198 | pm->pmf->map_queues_size, |
339 | alloc_size_bytes); | 199 | alloc_size_bytes); |
340 | } | 200 | } |
341 | } | 201 | } |
@@ -343,7 +203,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
343 | pr_debug("Finished map process and queues to runlist\n"); | 203 | pr_debug("Finished map process and queues to runlist\n"); |
344 | 204 | ||
345 | if (is_over_subscription) | 205 | if (is_over_subscription) |
346 | retval = pm_create_runlist(pm, &rl_buffer[rl_wptr], | 206 | retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], |
347 | *rl_gpu_addr, | 207 | *rl_gpu_addr, |
348 | alloc_size_bytes / sizeof(uint32_t), | 208 | alloc_size_bytes / sizeof(uint32_t), |
349 | true); | 209 | true); |
@@ -355,45 +215,29 @@ static int pm_create_runlist_ib(struct packet_manager *pm, | |||
355 | return retval; | 215 | return retval; |
356 | } | 216 | } |
357 | 217 | ||
358 | /* pm_create_release_mem - Create a RELEASE_MEM packet and return the size | ||
359 | * of this packet | ||
360 | * @gpu_addr - GPU address of the packet. It's a virtual address. | ||
361 | * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer | ||
362 | * Return - length of the packet | ||
363 | */ | ||
364 | uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer) | ||
365 | { | ||
366 | struct pm4_mec_release_mem *packet; | ||
367 | |||
368 | WARN_ON(!buffer); | ||
369 | |||
370 | packet = (struct pm4_mec_release_mem *)buffer; | ||
371 | memset(buffer, 0, sizeof(*packet)); | ||
372 | |||
373 | packet->header.u32All = build_pm4_header(IT_RELEASE_MEM, | ||
374 | sizeof(*packet)); | ||
375 | |||
376 | packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; | ||
377 | packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; | ||
378 | packet->bitfields2.tcl1_action_ena = 1; | ||
379 | packet->bitfields2.tc_action_ena = 1; | ||
380 | packet->bitfields2.cache_policy = cache_policy___release_mem__lru; | ||
381 | packet->bitfields2.atc = 0; | ||
382 | |||
383 | packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; | ||
384 | packet->bitfields3.int_sel = | ||
385 | int_sel___release_mem__send_interrupt_after_write_confirm; | ||
386 | |||
387 | packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; | ||
388 | packet->address_hi = upper_32_bits(gpu_addr); | ||
389 | |||
390 | packet->data_lo = 0; | ||
391 | |||
392 | return sizeof(*packet) / sizeof(unsigned int); | ||
393 | } | ||
394 | |||
395 | int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) | 218 | int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) |
396 | { | 219 | { |
220 | switch (dqm->dev->device_info->asic_family) { | ||
221 | case CHIP_KAVERI: | ||
222 | case CHIP_HAWAII: | ||
223 | /* PM4 packet structures on CIK are the same as on VI */ | ||
224 | case CHIP_CARRIZO: | ||
225 | case CHIP_TONGA: | ||
226 | case CHIP_FIJI: | ||
227 | case CHIP_POLARIS10: | ||
228 | case CHIP_POLARIS11: | ||
229 | pm->pmf = &kfd_vi_pm_funcs; | ||
230 | break; | ||
231 | case CHIP_VEGA10: | ||
232 | case CHIP_RAVEN: | ||
233 | pm->pmf = &kfd_v9_pm_funcs; | ||
234 | break; | ||
235 | default: | ||
236 | WARN(1, "Unexpected ASIC family %u", | ||
237 | dqm->dev->device_info->asic_family); | ||
238 | return -EINVAL; | ||
239 | } | ||
240 | |||
397 | pm->dqm = dqm; | 241 | pm->dqm = dqm; |
398 | mutex_init(&pm->lock); | 242 | mutex_init(&pm->lock); |
399 | pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); | 243 | pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); |
@@ -415,38 +259,25 @@ void pm_uninit(struct packet_manager *pm) | |||
415 | int pm_send_set_resources(struct packet_manager *pm, | 259 | int pm_send_set_resources(struct packet_manager *pm, |
416 | struct scheduling_resources *res) | 260 | struct scheduling_resources *res) |
417 | { | 261 | { |
418 | struct pm4_mes_set_resources *packet; | 262 | uint32_t *buffer, size; |
419 | int retval = 0; | 263 | int retval = 0; |
420 | 264 | ||
265 | size = pm->pmf->set_resources_size; | ||
421 | mutex_lock(&pm->lock); | 266 | mutex_lock(&pm->lock); |
422 | pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, | 267 | pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, |
423 | sizeof(*packet) / sizeof(uint32_t), | 268 | size / sizeof(uint32_t), |
424 | (unsigned int **)&packet); | 269 | (unsigned int **)&buffer); |
425 | if (!packet) { | 270 | if (!buffer) { |
426 | pr_err("Failed to allocate buffer on kernel queue\n"); | 271 | pr_err("Failed to allocate buffer on kernel queue\n"); |
427 | retval = -ENOMEM; | 272 | retval = -ENOMEM; |
428 | goto out; | 273 | goto out; |
429 | } | 274 | } |
430 | 275 | ||
431 | memset(packet, 0, sizeof(struct pm4_mes_set_resources)); | 276 | retval = pm->pmf->set_resources(pm, buffer, res); |
432 | packet->header.u32All = build_pm4_header(IT_SET_RESOURCES, | 277 | if (!retval) |
433 | sizeof(struct pm4_mes_set_resources)); | 278 | pm->priv_queue->ops.submit_packet(pm->priv_queue); |
434 | 279 | else | |
435 | packet->bitfields2.queue_type = | 280 | pm->priv_queue->ops.rollback_packet(pm->priv_queue); |
436 | queue_type__mes_set_resources__hsa_interface_queue_hiq; | ||
437 | packet->bitfields2.vmid_mask = res->vmid_mask; | ||
438 | packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; | ||
439 | packet->bitfields7.oac_mask = res->oac_mask; | ||
440 | packet->bitfields8.gds_heap_base = res->gds_heap_base; | ||
441 | packet->bitfields8.gds_heap_size = res->gds_heap_size; | ||
442 | |||
443 | packet->gws_mask_lo = lower_32_bits(res->gws_mask); | ||
444 | packet->gws_mask_hi = upper_32_bits(res->gws_mask); | ||
445 | |||
446 | packet->queue_mask_lo = lower_32_bits(res->queue_mask); | ||
447 | packet->queue_mask_hi = upper_32_bits(res->queue_mask); | ||
448 | |||
449 | pm->priv_queue->ops.submit_packet(pm->priv_queue); | ||
450 | 281 | ||
451 | out: | 282 | out: |
452 | mutex_unlock(&pm->lock); | 283 | mutex_unlock(&pm->lock); |
@@ -468,7 +299,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) | |||
468 | 299 | ||
469 | pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); | 300 | pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); |
470 | 301 | ||
471 | packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t); | 302 | packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t); |
472 | mutex_lock(&pm->lock); | 303 | mutex_lock(&pm->lock); |
473 | 304 | ||
474 | retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, | 305 | retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, |
@@ -476,7 +307,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) | |||
476 | if (retval) | 307 | if (retval) |
477 | goto fail_acquire_packet_buffer; | 308 | goto fail_acquire_packet_buffer; |
478 | 309 | ||
479 | retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, | 310 | retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, |
480 | rl_ib_size / sizeof(uint32_t), false); | 311 | rl_ib_size / sizeof(uint32_t), false); |
481 | if (retval) | 312 | if (retval) |
482 | goto fail_create_runlist; | 313 | goto fail_create_runlist; |
@@ -499,37 +330,29 @@ fail_create_runlist_ib: | |||
499 | int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, | 330 | int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, |
500 | uint32_t fence_value) | 331 | uint32_t fence_value) |
501 | { | 332 | { |
502 | int retval; | 333 | uint32_t *buffer, size; |
503 | struct pm4_mes_query_status *packet; | 334 | int retval = 0; |
504 | 335 | ||
505 | if (WARN_ON(!fence_address)) | 336 | if (WARN_ON(!fence_address)) |
506 | return -EFAULT; | 337 | return -EFAULT; |
507 | 338 | ||
339 | size = pm->pmf->query_status_size; | ||
508 | mutex_lock(&pm->lock); | 340 | mutex_lock(&pm->lock); |
509 | retval = pm->priv_queue->ops.acquire_packet_buffer( | 341 | pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, |
510 | pm->priv_queue, | 342 | size / sizeof(uint32_t), (unsigned int **)&buffer); |
511 | sizeof(struct pm4_mes_query_status) / sizeof(uint32_t), | 343 | if (!buffer) { |
512 | (unsigned int **)&packet); | 344 | pr_err("Failed to allocate buffer on kernel queue\n"); |
513 | if (retval) | 345 | retval = -ENOMEM; |
514 | goto fail_acquire_packet_buffer; | 346 | goto out; |
515 | 347 | } | |
516 | packet->header.u32All = build_pm4_header(IT_QUERY_STATUS, | ||
517 | sizeof(struct pm4_mes_query_status)); | ||
518 | |||
519 | packet->bitfields2.context_id = 0; | ||
520 | packet->bitfields2.interrupt_sel = | ||
521 | interrupt_sel__mes_query_status__completion_status; | ||
522 | packet->bitfields2.command = | ||
523 | command__mes_query_status__fence_only_after_write_ack; | ||
524 | |||
525 | packet->addr_hi = upper_32_bits((uint64_t)fence_address); | ||
526 | packet->addr_lo = lower_32_bits((uint64_t)fence_address); | ||
527 | packet->data_hi = upper_32_bits((uint64_t)fence_value); | ||
528 | packet->data_lo = lower_32_bits((uint64_t)fence_value); | ||
529 | 348 | ||
530 | pm->priv_queue->ops.submit_packet(pm->priv_queue); | 349 | retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); |
350 | if (!retval) | ||
351 | pm->priv_queue->ops.submit_packet(pm->priv_queue); | ||
352 | else | ||
353 | pm->priv_queue->ops.rollback_packet(pm->priv_queue); | ||
531 | 354 | ||
532 | fail_acquire_packet_buffer: | 355 | out: |
533 | mutex_unlock(&pm->lock); | 356 | mutex_unlock(&pm->lock); |
534 | return retval; | 357 | return retval; |
535 | } | 358 | } |
@@ -539,82 +362,27 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, | |||
539 | uint32_t filter_param, bool reset, | 362 | uint32_t filter_param, bool reset, |
540 | unsigned int sdma_engine) | 363 | unsigned int sdma_engine) |
541 | { | 364 | { |
542 | int retval; | 365 | uint32_t *buffer, size; |
543 | uint32_t *buffer; | 366 | int retval = 0; |
544 | struct pm4_mes_unmap_queues *packet; | ||
545 | 367 | ||
368 | size = pm->pmf->unmap_queues_size; | ||
546 | mutex_lock(&pm->lock); | 369 | mutex_lock(&pm->lock); |
547 | retval = pm->priv_queue->ops.acquire_packet_buffer( | 370 | pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, |
548 | pm->priv_queue, | 371 | size / sizeof(uint32_t), (unsigned int **)&buffer); |
549 | sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t), | 372 | if (!buffer) { |
550 | &buffer); | 373 | pr_err("Failed to allocate buffer on kernel queue\n"); |
551 | if (retval) | 374 | retval = -ENOMEM; |
552 | goto err_acquire_packet_buffer; | 375 | goto out; |
553 | |||
554 | packet = (struct pm4_mes_unmap_queues *)buffer; | ||
555 | memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); | ||
556 | pr_debug("static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", | ||
557 | filter, reset, type); | ||
558 | packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES, | ||
559 | sizeof(struct pm4_mes_unmap_queues)); | ||
560 | switch (type) { | ||
561 | case KFD_QUEUE_TYPE_COMPUTE: | ||
562 | case KFD_QUEUE_TYPE_DIQ: | ||
563 | packet->bitfields2.engine_sel = | ||
564 | engine_sel__mes_unmap_queues__compute; | ||
565 | break; | ||
566 | case KFD_QUEUE_TYPE_SDMA: | ||
567 | packet->bitfields2.engine_sel = | ||
568 | engine_sel__mes_unmap_queues__sdma0 + sdma_engine; | ||
569 | break; | ||
570 | default: | ||
571 | WARN(1, "queue type %d", type); | ||
572 | retval = -EINVAL; | ||
573 | goto err_invalid; | ||
574 | } | 376 | } |
575 | 377 | ||
576 | if (reset) | 378 | retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, |
577 | packet->bitfields2.action = | 379 | reset, sdma_engine); |
578 | action__mes_unmap_queues__reset_queues; | 380 | if (!retval) |
381 | pm->priv_queue->ops.submit_packet(pm->priv_queue); | ||
579 | else | 382 | else |
580 | packet->bitfields2.action = | 383 | pm->priv_queue->ops.rollback_packet(pm->priv_queue); |
581 | action__mes_unmap_queues__preempt_queues; | ||
582 | |||
583 | switch (filter) { | ||
584 | case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: | ||
585 | packet->bitfields2.queue_sel = | ||
586 | queue_sel__mes_unmap_queues__perform_request_on_specified_queues; | ||
587 | packet->bitfields2.num_queues = 1; | ||
588 | packet->bitfields3b.doorbell_offset0 = filter_param; | ||
589 | break; | ||
590 | case KFD_UNMAP_QUEUES_FILTER_BY_PASID: | ||
591 | packet->bitfields2.queue_sel = | ||
592 | queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; | ||
593 | packet->bitfields3a.pasid = filter_param; | ||
594 | break; | ||
595 | case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: | ||
596 | packet->bitfields2.queue_sel = | ||
597 | queue_sel__mes_unmap_queues__unmap_all_queues; | ||
598 | break; | ||
599 | case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: | ||
600 | /* in this case, we do not preempt static queues */ | ||
601 | packet->bitfields2.queue_sel = | ||
602 | queue_sel__mes_unmap_queues__unmap_all_non_static_queues; | ||
603 | break; | ||
604 | default: | ||
605 | WARN(1, "filter %d", filter); | ||
606 | retval = -EINVAL; | ||
607 | goto err_invalid; | ||
608 | } | ||
609 | 384 | ||
610 | pm->priv_queue->ops.submit_packet(pm->priv_queue); | 385 | out: |
611 | |||
612 | mutex_unlock(&pm->lock); | ||
613 | return 0; | ||
614 | |||
615 | err_invalid: | ||
616 | pm->priv_queue->ops.rollback_packet(pm->priv_queue); | ||
617 | err_acquire_packet_buffer: | ||
618 | mutex_unlock(&pm->lock); | 386 | mutex_unlock(&pm->lock); |
619 | return retval; | 387 | return retval; |
620 | } | 388 | } |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h new file mode 100644 index 000000000000..f2bcf5c092ea --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | |||
@@ -0,0 +1,583 @@ | |||
1 | /* | ||
2 | * Copyright 2016 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | #ifndef F32_MES_PM4_PACKETS_H | ||
25 | #define F32_MES_PM4_PACKETS_H | ||
26 | |||
27 | #ifndef PM4_MES_HEADER_DEFINED | ||
28 | #define PM4_MES_HEADER_DEFINED | ||
29 | union PM4_MES_TYPE_3_HEADER { | ||
30 | struct { | ||
31 | uint32_t reserved1 : 8; /* < reserved */ | ||
32 | uint32_t opcode : 8; /* < IT opcode */ | ||
33 | uint32_t count : 14;/* < number of DWORDs - 1 in the | ||
34 | * information body. | ||
35 | */ | ||
36 | uint32_t type : 2; /* < packet identifier. | ||
37 | * It should be 3 for type 3 packets | ||
38 | */ | ||
39 | }; | ||
40 | uint32_t u32All; | ||
41 | }; | ||
42 | #endif /* PM4_MES_HEADER_DEFINED */ | ||
43 | |||
44 | /*--------------------MES_SET_RESOURCES--------------------*/ | ||
45 | |||
46 | #ifndef PM4_MES_SET_RESOURCES_DEFINED | ||
47 | #define PM4_MES_SET_RESOURCES_DEFINED | ||
48 | enum mes_set_resources_queue_type_enum { | ||
49 | queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, | ||
50 | queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, | ||
51 | queue_type__mes_set_resources__hsa_debug_interface_queue = 4 | ||
52 | }; | ||
53 | |||
54 | |||
55 | struct pm4_mes_set_resources { | ||
56 | union { | ||
57 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
58 | uint32_t ordinal1; | ||
59 | }; | ||
60 | |||
61 | union { | ||
62 | struct { | ||
63 | uint32_t vmid_mask:16; | ||
64 | uint32_t unmap_latency:8; | ||
65 | uint32_t reserved1:5; | ||
66 | enum mes_set_resources_queue_type_enum queue_type:3; | ||
67 | } bitfields2; | ||
68 | uint32_t ordinal2; | ||
69 | }; | ||
70 | |||
71 | uint32_t queue_mask_lo; | ||
72 | uint32_t queue_mask_hi; | ||
73 | uint32_t gws_mask_lo; | ||
74 | uint32_t gws_mask_hi; | ||
75 | |||
76 | union { | ||
77 | struct { | ||
78 | uint32_t oac_mask:16; | ||
79 | uint32_t reserved2:16; | ||
80 | } bitfields7; | ||
81 | uint32_t ordinal7; | ||
82 | }; | ||
83 | |||
84 | union { | ||
85 | struct { | ||
86 | uint32_t gds_heap_base:6; | ||
87 | uint32_t reserved3:5; | ||
88 | uint32_t gds_heap_size:6; | ||
89 | uint32_t reserved4:15; | ||
90 | } bitfields8; | ||
91 | uint32_t ordinal8; | ||
92 | }; | ||
93 | |||
94 | }; | ||
95 | #endif | ||
96 | |||
97 | /*--------------------MES_RUN_LIST--------------------*/ | ||
98 | |||
99 | #ifndef PM4_MES_RUN_LIST_DEFINED | ||
100 | #define PM4_MES_RUN_LIST_DEFINED | ||
101 | |||
102 | struct pm4_mes_runlist { | ||
103 | union { | ||
104 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
105 | uint32_t ordinal1; | ||
106 | }; | ||
107 | |||
108 | union { | ||
109 | struct { | ||
110 | uint32_t reserved1:2; | ||
111 | uint32_t ib_base_lo:30; | ||
112 | } bitfields2; | ||
113 | uint32_t ordinal2; | ||
114 | }; | ||
115 | |||
116 | uint32_t ib_base_hi; | ||
117 | |||
118 | union { | ||
119 | struct { | ||
120 | uint32_t ib_size:20; | ||
121 | uint32_t chain:1; | ||
122 | uint32_t offload_polling:1; | ||
123 | uint32_t reserved2:1; | ||
124 | uint32_t valid:1; | ||
125 | uint32_t process_cnt:4; | ||
126 | uint32_t reserved3:4; | ||
127 | } bitfields4; | ||
128 | uint32_t ordinal4; | ||
129 | }; | ||
130 | |||
131 | }; | ||
132 | #endif | ||
133 | |||
134 | /*--------------------MES_MAP_PROCESS--------------------*/ | ||
135 | |||
136 | #ifndef PM4_MES_MAP_PROCESS_DEFINED | ||
137 | #define PM4_MES_MAP_PROCESS_DEFINED | ||
138 | |||
139 | struct pm4_mes_map_process { | ||
140 | union { | ||
141 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
142 | uint32_t ordinal1; | ||
143 | }; | ||
144 | |||
145 | union { | ||
146 | struct { | ||
147 | uint32_t pasid:16; | ||
148 | uint32_t reserved1:8; | ||
149 | uint32_t diq_enable:1; | ||
150 | uint32_t process_quantum:7; | ||
151 | } bitfields2; | ||
152 | uint32_t ordinal2; | ||
153 | }; | ||
154 | |||
155 | uint32_t vm_context_page_table_base_addr_lo32; | ||
156 | |||
157 | uint32_t vm_context_page_table_base_addr_hi32; | ||
158 | |||
159 | uint32_t sh_mem_bases; | ||
160 | |||
161 | uint32_t sh_mem_config; | ||
162 | |||
163 | uint32_t sq_shader_tba_lo; | ||
164 | |||
165 | uint32_t sq_shader_tba_hi; | ||
166 | |||
167 | uint32_t sq_shader_tma_lo; | ||
168 | |||
169 | uint32_t sq_shader_tma_hi; | ||
170 | |||
171 | uint32_t reserved6; | ||
172 | |||
173 | uint32_t gds_addr_lo; | ||
174 | |||
175 | uint32_t gds_addr_hi; | ||
176 | |||
177 | union { | ||
178 | struct { | ||
179 | uint32_t num_gws:6; | ||
180 | uint32_t reserved7:1; | ||
181 | uint32_t sdma_enable:1; | ||
182 | uint32_t num_oac:4; | ||
183 | uint32_t reserved8:4; | ||
184 | uint32_t gds_size:6; | ||
185 | uint32_t num_queues:10; | ||
186 | } bitfields14; | ||
187 | uint32_t ordinal14; | ||
188 | }; | ||
189 | |||
190 | uint32_t completion_signal_lo; | ||
191 | |||
192 | uint32_t completion_signal_hi; | ||
193 | |||
194 | }; | ||
195 | |||
196 | #endif | ||
197 | |||
198 | /*--------------------MES_MAP_PROCESS_VM--------------------*/ | ||
199 | |||
200 | #ifndef PM4_MES_MAP_PROCESS_VM_DEFINED | ||
201 | #define PM4_MES_MAP_PROCESS_VM_DEFINED | ||
202 | |||
203 | struct PM4_MES_MAP_PROCESS_VM { | ||
204 | union { | ||
205 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
206 | uint32_t ordinal1; | ||
207 | }; | ||
208 | |||
209 | uint32_t reserved1; | ||
210 | |||
211 | uint32_t vm_context_cntl; | ||
212 | |||
213 | uint32_t reserved2; | ||
214 | |||
215 | uint32_t vm_context_page_table_end_addr_lo32; | ||
216 | |||
217 | uint32_t vm_context_page_table_end_addr_hi32; | ||
218 | |||
219 | uint32_t vm_context_page_table_start_addr_lo32; | ||
220 | |||
221 | uint32_t vm_context_page_table_start_addr_hi32; | ||
222 | |||
223 | uint32_t reserved3; | ||
224 | |||
225 | uint32_t reserved4; | ||
226 | |||
227 | uint32_t reserved5; | ||
228 | |||
229 | uint32_t reserved6; | ||
230 | |||
231 | uint32_t reserved7; | ||
232 | |||
233 | uint32_t reserved8; | ||
234 | |||
235 | uint32_t completion_signal_lo32; | ||
236 | |||
237 | uint32_t completion_signal_hi32; | ||
238 | |||
239 | }; | ||
240 | #endif | ||
241 | |||
242 | /*--------------------MES_MAP_QUEUES--------------------*/ | ||
243 | |||
244 | #ifndef PM4_MES_MAP_QUEUES_VI_DEFINED | ||
245 | #define PM4_MES_MAP_QUEUES_VI_DEFINED | ||
246 | enum mes_map_queues_queue_sel_enum { | ||
247 | queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, | ||
248 | queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 | ||
249 | }; | ||
250 | |||
251 | enum mes_map_queues_queue_type_enum { | ||
252 | queue_type__mes_map_queues__normal_compute_vi = 0, | ||
253 | queue_type__mes_map_queues__debug_interface_queue_vi = 1, | ||
254 | queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, | ||
255 | queue_type__mes_map_queues__low_latency_static_queue_vi = 3 | ||
256 | }; | ||
257 | |||
258 | enum mes_map_queues_alloc_format_enum { | ||
259 | alloc_format__mes_map_queues__one_per_pipe_vi = 0, | ||
260 | alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 | ||
261 | }; | ||
262 | |||
263 | enum mes_map_queues_engine_sel_enum { | ||
264 | engine_sel__mes_map_queues__compute_vi = 0, | ||
265 | engine_sel__mes_map_queues__sdma0_vi = 2, | ||
266 | engine_sel__mes_map_queues__sdma1_vi = 3 | ||
267 | }; | ||
268 | |||
269 | |||
270 | struct pm4_mes_map_queues { | ||
271 | union { | ||
272 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
273 | uint32_t ordinal1; | ||
274 | }; | ||
275 | |||
276 | union { | ||
277 | struct { | ||
278 | uint32_t reserved1:4; | ||
279 | enum mes_map_queues_queue_sel_enum queue_sel:2; | ||
280 | uint32_t reserved2:15; | ||
281 | enum mes_map_queues_queue_type_enum queue_type:3; | ||
282 | enum mes_map_queues_alloc_format_enum alloc_format:2; | ||
283 | enum mes_map_queues_engine_sel_enum engine_sel:3; | ||
284 | uint32_t num_queues:3; | ||
285 | } bitfields2; | ||
286 | uint32_t ordinal2; | ||
287 | }; | ||
288 | |||
289 | union { | ||
290 | struct { | ||
291 | uint32_t reserved3:1; | ||
292 | uint32_t check_disable:1; | ||
293 | uint32_t doorbell_offset:26; | ||
294 | uint32_t reserved4:4; | ||
295 | } bitfields3; | ||
296 | uint32_t ordinal3; | ||
297 | }; | ||
298 | |||
299 | uint32_t mqd_addr_lo; | ||
300 | uint32_t mqd_addr_hi; | ||
301 | uint32_t wptr_addr_lo; | ||
302 | uint32_t wptr_addr_hi; | ||
303 | }; | ||
304 | #endif | ||
305 | |||
306 | /*--------------------MES_QUERY_STATUS--------------------*/ | ||
307 | |||
308 | #ifndef PM4_MES_QUERY_STATUS_DEFINED | ||
309 | #define PM4_MES_QUERY_STATUS_DEFINED | ||
310 | enum mes_query_status_interrupt_sel_enum { | ||
311 | interrupt_sel__mes_query_status__completion_status = 0, | ||
312 | interrupt_sel__mes_query_status__process_status = 1, | ||
313 | interrupt_sel__mes_query_status__queue_status = 2 | ||
314 | }; | ||
315 | |||
316 | enum mes_query_status_command_enum { | ||
317 | command__mes_query_status__interrupt_only = 0, | ||
318 | command__mes_query_status__fence_only_immediate = 1, | ||
319 | command__mes_query_status__fence_only_after_write_ack = 2, | ||
320 | command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 | ||
321 | }; | ||
322 | |||
323 | enum mes_query_status_engine_sel_enum { | ||
324 | engine_sel__mes_query_status__compute = 0, | ||
325 | engine_sel__mes_query_status__sdma0_queue = 2, | ||
326 | engine_sel__mes_query_status__sdma1_queue = 3 | ||
327 | }; | ||
328 | |||
329 | struct pm4_mes_query_status { | ||
330 | union { | ||
331 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
332 | uint32_t ordinal1; | ||
333 | }; | ||
334 | |||
335 | union { | ||
336 | struct { | ||
337 | uint32_t context_id:28; | ||
338 | enum mes_query_status_interrupt_sel_enum interrupt_sel:2; | ||
339 | enum mes_query_status_command_enum command:2; | ||
340 | } bitfields2; | ||
341 | uint32_t ordinal2; | ||
342 | }; | ||
343 | |||
344 | union { | ||
345 | struct { | ||
346 | uint32_t pasid:16; | ||
347 | uint32_t reserved1:16; | ||
348 | } bitfields3a; | ||
349 | struct { | ||
350 | uint32_t reserved2:2; | ||
351 | uint32_t doorbell_offset:26; | ||
352 | enum mes_query_status_engine_sel_enum engine_sel:3; | ||
353 | uint32_t reserved3:1; | ||
354 | } bitfields3b; | ||
355 | uint32_t ordinal3; | ||
356 | }; | ||
357 | |||
358 | uint32_t addr_lo; | ||
359 | uint32_t addr_hi; | ||
360 | uint32_t data_lo; | ||
361 | uint32_t data_hi; | ||
362 | }; | ||
363 | #endif | ||
364 | |||
365 | /*--------------------MES_UNMAP_QUEUES--------------------*/ | ||
366 | |||
367 | #ifndef PM4_MES_UNMAP_QUEUES_DEFINED | ||
368 | #define PM4_MES_UNMAP_QUEUES_DEFINED | ||
369 | enum mes_unmap_queues_action_enum { | ||
370 | action__mes_unmap_queues__preempt_queues = 0, | ||
371 | action__mes_unmap_queues__reset_queues = 1, | ||
372 | action__mes_unmap_queues__disable_process_queues = 2, | ||
373 | action__mes_unmap_queues__reserved = 3 | ||
374 | }; | ||
375 | |||
376 | enum mes_unmap_queues_queue_sel_enum { | ||
377 | queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, | ||
378 | queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, | ||
379 | queue_sel__mes_unmap_queues__unmap_all_queues = 2, | ||
380 | queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 | ||
381 | }; | ||
382 | |||
383 | enum mes_unmap_queues_engine_sel_enum { | ||
384 | engine_sel__mes_unmap_queues__compute = 0, | ||
385 | engine_sel__mes_unmap_queues__sdma0 = 2, | ||
386 | engine_sel__mes_unmap_queues__sdmal = 3 | ||
387 | }; | ||
388 | |||
389 | struct pm4_mes_unmap_queues { | ||
390 | union { | ||
391 | union PM4_MES_TYPE_3_HEADER header; /* header */ | ||
392 | uint32_t ordinal1; | ||
393 | }; | ||
394 | |||
395 | union { | ||
396 | struct { | ||
397 | enum mes_unmap_queues_action_enum action:2; | ||
398 | uint32_t reserved1:2; | ||
399 | enum mes_unmap_queues_queue_sel_enum queue_sel:2; | ||
400 | uint32_t reserved2:20; | ||
401 | enum mes_unmap_queues_engine_sel_enum engine_sel:3; | ||
402 | uint32_t num_queues:3; | ||
403 | } bitfields2; | ||
404 | uint32_t ordinal2; | ||
405 | }; | ||
406 | |||
407 | union { | ||
408 | struct { | ||
409 | uint32_t pasid:16; | ||
410 | uint32_t reserved3:16; | ||
411 | } bitfields3a; | ||
412 | struct { | ||
413 | uint32_t reserved4:2; | ||
414 | uint32_t doorbell_offset0:26; | ||
415 | int32_t reserved5:4; | ||
416 | } bitfields3b; | ||
417 | uint32_t ordinal3; | ||
418 | }; | ||
419 | |||
420 | union { | ||
421 | struct { | ||
422 | uint32_t reserved6:2; | ||
423 | uint32_t doorbell_offset1:26; | ||
424 | uint32_t reserved7:4; | ||
425 | } bitfields4; | ||
426 | uint32_t ordinal4; | ||
427 | }; | ||
428 | |||
429 | union { | ||
430 | struct { | ||
431 | uint32_t reserved8:2; | ||
432 | uint32_t doorbell_offset2:26; | ||
433 | uint32_t reserved9:4; | ||
434 | } bitfields5; | ||
435 | uint32_t ordinal5; | ||
436 | }; | ||
437 | |||
438 | union { | ||
439 | struct { | ||
440 | uint32_t reserved10:2; | ||
441 | uint32_t doorbell_offset3:26; | ||
442 | uint32_t reserved11:4; | ||
443 | } bitfields6; | ||
444 | uint32_t ordinal6; | ||
445 | }; | ||
446 | }; | ||
447 | #endif | ||
448 | |||
449 | #ifndef PM4_MEC_RELEASE_MEM_DEFINED | ||
450 | #define PM4_MEC_RELEASE_MEM_DEFINED | ||
451 | |||
452 | enum mec_release_mem_event_index_enum { | ||
453 | event_index__mec_release_mem__end_of_pipe = 5, | ||
454 | event_index__mec_release_mem__shader_done = 6 | ||
455 | }; | ||
456 | |||
457 | enum mec_release_mem_cache_policy_enum { | ||
458 | cache_policy__mec_release_mem__lru = 0, | ||
459 | cache_policy__mec_release_mem__stream = 1 | ||
460 | }; | ||
461 | |||
462 | enum mec_release_mem_pq_exe_status_enum { | ||
463 | pq_exe_status__mec_release_mem__default = 0, | ||
464 | pq_exe_status__mec_release_mem__phase_update = 1 | ||
465 | }; | ||
466 | |||
467 | enum mec_release_mem_dst_sel_enum { | ||
468 | dst_sel__mec_release_mem__memory_controller = 0, | ||
469 | dst_sel__mec_release_mem__tc_l2 = 1, | ||
470 | dst_sel__mec_release_mem__queue_write_pointer_register = 2, | ||
471 | dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 | ||
472 | }; | ||
473 | |||
474 | enum mec_release_mem_int_sel_enum { | ||
475 | int_sel__mec_release_mem__none = 0, | ||
476 | int_sel__mec_release_mem__send_interrupt_only = 1, | ||
477 | int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, | ||
478 | int_sel__mec_release_mem__send_data_after_write_confirm = 3, | ||
479 | int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, | ||
480 | int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, | ||
481 | int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 | ||
482 | }; | ||
483 | |||
484 | enum mec_release_mem_data_sel_enum { | ||
485 | data_sel__mec_release_mem__none = 0, | ||
486 | data_sel__mec_release_mem__send_32_bit_low = 1, | ||
487 | data_sel__mec_release_mem__send_64_bit_data = 2, | ||
488 | data_sel__mec_release_mem__send_gpu_clock_counter = 3, | ||
489 | data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, | ||
490 | data_sel__mec_release_mem__store_gds_data_to_memory = 5 | ||
491 | }; | ||
492 | |||
493 | struct pm4_mec_release_mem { | ||
494 | union { | ||
495 | union PM4_MES_TYPE_3_HEADER header; /*header */ | ||
496 | unsigned int ordinal1; | ||
497 | }; | ||
498 | |||
499 | union { | ||
500 | struct { | ||
501 | unsigned int event_type:6; | ||
502 | unsigned int reserved1:2; | ||
503 | enum mec_release_mem_event_index_enum event_index:4; | ||
504 | unsigned int tcl1_vol_action_ena:1; | ||
505 | unsigned int tc_vol_action_ena:1; | ||
506 | unsigned int reserved2:1; | ||
507 | unsigned int tc_wb_action_ena:1; | ||
508 | unsigned int tcl1_action_ena:1; | ||
509 | unsigned int tc_action_ena:1; | ||
510 | uint32_t reserved3:1; | ||
511 | uint32_t tc_nc_action_ena:1; | ||
512 | uint32_t tc_wc_action_ena:1; | ||
513 | uint32_t tc_md_action_ena:1; | ||
514 | uint32_t reserved4:3; | ||
515 | enum mec_release_mem_cache_policy_enum cache_policy:2; | ||
516 | uint32_t reserved5:2; | ||
517 | enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; | ||
518 | uint32_t reserved6:2; | ||
519 | } bitfields2; | ||
520 | unsigned int ordinal2; | ||
521 | }; | ||
522 | |||
523 | union { | ||
524 | struct { | ||
525 | uint32_t reserved7:16; | ||
526 | enum mec_release_mem_dst_sel_enum dst_sel:2; | ||
527 | uint32_t reserved8:6; | ||
528 | enum mec_release_mem_int_sel_enum int_sel:3; | ||
529 | uint32_t reserved9:2; | ||
530 | enum mec_release_mem_data_sel_enum data_sel:3; | ||
531 | } bitfields3; | ||
532 | unsigned int ordinal3; | ||
533 | }; | ||
534 | |||
535 | union { | ||
536 | struct { | ||
537 | uint32_t reserved10:2; | ||
538 | unsigned int address_lo_32b:30; | ||
539 | } bitfields4; | ||
540 | struct { | ||
541 | uint32_t reserved11:3; | ||
542 | uint32_t address_lo_64b:29; | ||
543 | } bitfields4b; | ||
544 | uint32_t reserved12; | ||
545 | unsigned int ordinal4; | ||
546 | }; | ||
547 | |||
548 | union { | ||
549 | uint32_t address_hi; | ||
550 | uint32_t reserved13; | ||
551 | uint32_t ordinal5; | ||
552 | }; | ||
553 | |||
554 | union { | ||
555 | uint32_t data_lo; | ||
556 | uint32_t cmp_data_lo; | ||
557 | struct { | ||
558 | uint32_t dw_offset:16; | ||
559 | uint32_t num_dwords:16; | ||
560 | } bitfields6c; | ||
561 | uint32_t reserved14; | ||
562 | uint32_t ordinal6; | ||
563 | }; | ||
564 | |||
565 | union { | ||
566 | uint32_t data_hi; | ||
567 | uint32_t cmp_data_hi; | ||
568 | uint32_t reserved15; | ||
569 | uint32_t reserved16; | ||
570 | uint32_t ordinal7; | ||
571 | }; | ||
572 | |||
573 | uint32_t int_ctxid; | ||
574 | |||
575 | }; | ||
576 | |||
577 | #endif | ||
578 | |||
579 | enum { | ||
580 | CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 | ||
581 | }; | ||
582 | #endif | ||
583 | |||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 96a9cc0f02c9..5e3990bb4c4b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h | |||
@@ -39,11 +39,37 @@ | |||
39 | 39 | ||
40 | #include "amd_shared.h" | 40 | #include "amd_shared.h" |
41 | 41 | ||
42 | #define KFD_MAX_RING_ENTRY_SIZE 8 | ||
43 | |||
42 | #define KFD_SYSFS_FILE_MODE 0444 | 44 | #define KFD_SYSFS_FILE_MODE 0444 |
43 | 45 | ||
44 | #define KFD_MMAP_DOORBELL_MASK 0x8000000000000ull | 46 | /* GPU ID hash width in bits */ |
45 | #define KFD_MMAP_EVENTS_MASK 0x4000000000000ull | 47 | #define KFD_GPU_ID_HASH_WIDTH 16 |
46 | #define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000ull | 48 | |
49 | /* Use upper bits of mmap offset to store KFD driver specific information. | ||
50 | * BITS[63:62] - Encode MMAP type | ||
51 | * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to | ||
52 | * BITS[45:0] - MMAP offset value | ||
53 | * | ||
54 | * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these | ||
55 | * defines are w.r.t to PAGE_SIZE | ||
56 | */ | ||
57 | #define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) | ||
58 | #define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) | ||
59 | #define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) | ||
60 | #define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) | ||
61 | #define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) | ||
62 | |||
63 | #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) | ||
64 | #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ | ||
65 | << KFD_MMAP_GPU_ID_SHIFT) | ||
66 | #define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ | ||
67 | & KFD_MMAP_GPU_ID_MASK) | ||
68 | #define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ | ||
69 | >> KFD_MMAP_GPU_ID_SHIFT) | ||
70 | |||
71 | #define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT) | ||
72 | #define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) | ||
47 | 73 | ||
48 | /* | 74 | /* |
49 | * When working with cp scheduler we should assign the HIQ manually or via | 75 | * When working with cp scheduler we should assign the HIQ manually or via |
@@ -55,9 +81,6 @@ | |||
55 | #define KFD_CIK_HIQ_PIPE 4 | 81 | #define KFD_CIK_HIQ_PIPE 4 |
56 | #define KFD_CIK_HIQ_QUEUE 0 | 82 | #define KFD_CIK_HIQ_QUEUE 0 |
57 | 83 | ||
58 | /* GPU ID hash width in bits */ | ||
59 | #define KFD_GPU_ID_HASH_WIDTH 16 | ||
60 | |||
61 | /* Macro for allocating structures */ | 84 | /* Macro for allocating structures */ |
62 | #define kfd_alloc_struct(ptr_to_struct) \ | 85 | #define kfd_alloc_struct(ptr_to_struct) \ |
63 | ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) | 86 | ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) |
@@ -116,6 +139,11 @@ extern int debug_largebar; | |||
116 | */ | 139 | */ |
117 | extern int ignore_crat; | 140 | extern int ignore_crat; |
118 | 141 | ||
142 | /* | ||
143 | * Set sh_mem_config.retry_disable on Vega10 | ||
144 | */ | ||
145 | extern int vega10_noretry; | ||
146 | |||
119 | /** | 147 | /** |
120 | * enum kfd_sched_policy | 148 | * enum kfd_sched_policy |
121 | * | 149 | * |
@@ -148,6 +176,8 @@ enum cache_policy { | |||
148 | cache_policy_noncoherent | 176 | cache_policy_noncoherent |
149 | }; | 177 | }; |
150 | 178 | ||
179 | #define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) | ||
180 | |||
151 | struct kfd_event_interrupt_class { | 181 | struct kfd_event_interrupt_class { |
152 | bool (*interrupt_isr)(struct kfd_dev *dev, | 182 | bool (*interrupt_isr)(struct kfd_dev *dev, |
153 | const uint32_t *ih_ring_entry); | 183 | const uint32_t *ih_ring_entry); |
@@ -160,6 +190,7 @@ struct kfd_device_info { | |||
160 | const struct kfd_event_interrupt_class *event_interrupt_class; | 190 | const struct kfd_event_interrupt_class *event_interrupt_class; |
161 | unsigned int max_pasid_bits; | 191 | unsigned int max_pasid_bits; |
162 | unsigned int max_no_of_hqd; | 192 | unsigned int max_no_of_hqd; |
193 | unsigned int doorbell_size; | ||
163 | size_t ih_ring_entry_size; | 194 | size_t ih_ring_entry_size; |
164 | uint8_t num_of_watch_points; | 195 | uint8_t num_of_watch_points; |
165 | uint16_t mqd_size_aligned; | 196 | uint16_t mqd_size_aligned; |
@@ -173,6 +204,7 @@ struct kfd_mem_obj { | |||
173 | uint32_t range_end; | 204 | uint32_t range_end; |
174 | uint64_t gpu_addr; | 205 | uint64_t gpu_addr; |
175 | uint32_t *cpu_ptr; | 206 | uint32_t *cpu_ptr; |
207 | void *gtt_mem; | ||
176 | }; | 208 | }; |
177 | 209 | ||
178 | struct kfd_vmid_info { | 210 | struct kfd_vmid_info { |
@@ -364,7 +396,7 @@ struct queue_properties { | |||
364 | uint32_t queue_percent; | 396 | uint32_t queue_percent; |
365 | uint32_t *read_ptr; | 397 | uint32_t *read_ptr; |
366 | uint32_t *write_ptr; | 398 | uint32_t *write_ptr; |
367 | uint32_t __iomem *doorbell_ptr; | 399 | void __iomem *doorbell_ptr; |
368 | uint32_t doorbell_off; | 400 | uint32_t doorbell_off; |
369 | bool is_interop; | 401 | bool is_interop; |
370 | bool is_evicted; | 402 | bool is_evicted; |
@@ -427,6 +459,7 @@ struct queue { | |||
427 | uint32_t queue; | 459 | uint32_t queue; |
428 | 460 | ||
429 | unsigned int sdma_id; | 461 | unsigned int sdma_id; |
462 | unsigned int doorbell_id; | ||
430 | 463 | ||
431 | struct kfd_process *process; | 464 | struct kfd_process *process; |
432 | struct kfd_dev *device; | 465 | struct kfd_dev *device; |
@@ -501,6 +534,9 @@ struct qcm_process_device { | |||
501 | /* IB memory */ | 534 | /* IB memory */ |
502 | uint64_t ib_base; | 535 | uint64_t ib_base; |
503 | void *ib_kaddr; | 536 | void *ib_kaddr; |
537 | |||
538 | /* doorbell resources per process per device */ | ||
539 | unsigned long *doorbell_bitmap; | ||
504 | }; | 540 | }; |
505 | 541 | ||
506 | /* KFD Memory Eviction */ | 542 | /* KFD Memory Eviction */ |
@@ -512,6 +548,8 @@ struct qcm_process_device { | |||
512 | /* Approx. time before evicting the process again */ | 548 | /* Approx. time before evicting the process again */ |
513 | #define PROCESS_ACTIVE_TIME_MS 10 | 549 | #define PROCESS_ACTIVE_TIME_MS 10 |
514 | 550 | ||
551 | int kgd2kfd_quiesce_mm(struct mm_struct *mm); | ||
552 | int kgd2kfd_resume_mm(struct mm_struct *mm); | ||
515 | int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, | 553 | int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, |
516 | struct dma_fence *fence); | 554 | struct dma_fence *fence); |
517 | 555 | ||
@@ -681,6 +719,8 @@ struct kfd_process *kfd_get_process(const struct task_struct *); | |||
681 | struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); | 719 | struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); |
682 | struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); | 720 | struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); |
683 | void kfd_unref_process(struct kfd_process *p); | 721 | void kfd_unref_process(struct kfd_process *p); |
722 | int kfd_process_evict_queues(struct kfd_process *p); | ||
723 | int kfd_process_restore_queues(struct kfd_process *p); | ||
684 | void kfd_suspend_all_processes(void); | 724 | void kfd_suspend_all_processes(void); |
685 | int kfd_resume_all_processes(void); | 725 | int kfd_resume_all_processes(void); |
686 | 726 | ||
@@ -693,7 +733,7 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, | |||
693 | struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, | 733 | struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, |
694 | struct kfd_process *p); | 734 | struct kfd_process *p); |
695 | 735 | ||
696 | int kfd_reserved_mem_mmap(struct kfd_process *process, | 736 | int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, |
697 | struct vm_area_struct *vma); | 737 | struct vm_area_struct *vma); |
698 | 738 | ||
699 | /* KFD process API for creating and translating handles */ | 739 | /* KFD process API for creating and translating handles */ |
@@ -721,17 +761,20 @@ unsigned int kfd_pasid_alloc(void); | |||
721 | void kfd_pasid_free(unsigned int pasid); | 761 | void kfd_pasid_free(unsigned int pasid); |
722 | 762 | ||
723 | /* Doorbells */ | 763 | /* Doorbells */ |
764 | size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); | ||
724 | int kfd_doorbell_init(struct kfd_dev *kfd); | 765 | int kfd_doorbell_init(struct kfd_dev *kfd); |
725 | void kfd_doorbell_fini(struct kfd_dev *kfd); | 766 | void kfd_doorbell_fini(struct kfd_dev *kfd); |
726 | int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); | 767 | int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, |
727 | u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | 768 | struct vm_area_struct *vma); |
769 | void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, | ||
728 | unsigned int *doorbell_off); | 770 | unsigned int *doorbell_off); |
729 | void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); | 771 | void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); |
730 | u32 read_kernel_doorbell(u32 __iomem *db); | 772 | u32 read_kernel_doorbell(u32 __iomem *db); |
731 | void write_kernel_doorbell(u32 __iomem *db, u32 value); | 773 | void write_kernel_doorbell(void __iomem *db, u32 value); |
732 | unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, | 774 | void write_kernel_doorbell64(void __iomem *db, u64 value); |
775 | unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, | ||
733 | struct kfd_process *process, | 776 | struct kfd_process *process, |
734 | unsigned int queue_id); | 777 | unsigned int doorbell_id); |
735 | phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, | 778 | phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, |
736 | struct kfd_process *process); | 779 | struct kfd_process *process); |
737 | int kfd_alloc_process_doorbells(struct kfd_process *process); | 780 | int kfd_alloc_process_doorbells(struct kfd_process *process); |
@@ -788,6 +831,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, | |||
788 | struct kfd_dev *dev); | 831 | struct kfd_dev *dev); |
789 | struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, | 832 | struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, |
790 | struct kfd_dev *dev); | 833 | struct kfd_dev *dev); |
834 | struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, | ||
835 | struct kfd_dev *dev); | ||
791 | struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); | 836 | struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); |
792 | void device_queue_manager_uninit(struct device_queue_manager *dqm); | 837 | void device_queue_manager_uninit(struct device_queue_manager *dqm); |
793 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, | 838 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, |
@@ -832,8 +877,42 @@ struct packet_manager { | |||
832 | bool allocated; | 877 | bool allocated; |
833 | struct kfd_mem_obj *ib_buffer_obj; | 878 | struct kfd_mem_obj *ib_buffer_obj; |
834 | unsigned int ib_size_bytes; | 879 | unsigned int ib_size_bytes; |
880 | |||
881 | const struct packet_manager_funcs *pmf; | ||
882 | }; | ||
883 | |||
884 | struct packet_manager_funcs { | ||
885 | /* Support ASIC-specific packet formats for PM4 packets */ | ||
886 | int (*map_process)(struct packet_manager *pm, uint32_t *buffer, | ||
887 | struct qcm_process_device *qpd); | ||
888 | int (*runlist)(struct packet_manager *pm, uint32_t *buffer, | ||
889 | uint64_t ib, size_t ib_size_in_dwords, bool chain); | ||
890 | int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, | ||
891 | struct scheduling_resources *res); | ||
892 | int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, | ||
893 | struct queue *q, bool is_static); | ||
894 | int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, | ||
895 | enum kfd_queue_type type, | ||
896 | enum kfd_unmap_queues_filter mode, | ||
897 | uint32_t filter_param, bool reset, | ||
898 | unsigned int sdma_engine); | ||
899 | int (*query_status)(struct packet_manager *pm, uint32_t *buffer, | ||
900 | uint64_t fence_address, uint32_t fence_value); | ||
901 | int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); | ||
902 | |||
903 | /* Packet sizes */ | ||
904 | int map_process_size; | ||
905 | int runlist_size; | ||
906 | int set_resources_size; | ||
907 | int map_queues_size; | ||
908 | int unmap_queues_size; | ||
909 | int query_status_size; | ||
910 | int release_mem_size; | ||
835 | }; | 911 | }; |
836 | 912 | ||
913 | extern const struct packet_manager_funcs kfd_vi_pm_funcs; | ||
914 | extern const struct packet_manager_funcs kfd_v9_pm_funcs; | ||
915 | |||
837 | int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); | 916 | int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); |
838 | void pm_uninit(struct packet_manager *pm); | 917 | void pm_uninit(struct packet_manager *pm); |
839 | int pm_send_set_resources(struct packet_manager *pm, | 918 | int pm_send_set_resources(struct packet_manager *pm, |
@@ -849,12 +928,17 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, | |||
849 | 928 | ||
850 | void pm_release_ib(struct packet_manager *pm); | 929 | void pm_release_ib(struct packet_manager *pm); |
851 | 930 | ||
852 | uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer); | 931 | /* Following PM funcs can be shared among VI and AI */ |
932 | unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); | ||
933 | int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, | ||
934 | struct scheduling_resources *res); | ||
853 | 935 | ||
854 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd); | 936 | uint64_t kfd_get_number_elems(struct kfd_dev *kfd); |
855 | 937 | ||
856 | /* Events */ | 938 | /* Events */ |
857 | extern const struct kfd_event_interrupt_class event_interrupt_class_cik; | 939 | extern const struct kfd_event_interrupt_class event_interrupt_class_cik; |
940 | extern const struct kfd_event_interrupt_class event_interrupt_class_v9; | ||
941 | |||
858 | extern const struct kfd_device_global_init_class device_global_init_class_cik; | 942 | extern const struct kfd_device_global_init_class device_global_init_class_cik; |
859 | 943 | ||
860 | void kfd_event_init_process(struct kfd_process *p); | 944 | void kfd_event_init_process(struct kfd_process *p); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 1711ad0642f7..1d80b4f7c681 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c | |||
@@ -332,6 +332,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) | |||
332 | free_pages((unsigned long)pdd->qpd.cwsr_kaddr, | 332 | free_pages((unsigned long)pdd->qpd.cwsr_kaddr, |
333 | get_order(KFD_CWSR_TBA_TMA_SIZE)); | 333 | get_order(KFD_CWSR_TBA_TMA_SIZE)); |
334 | 334 | ||
335 | kfree(pdd->qpd.doorbell_bitmap); | ||
335 | idr_destroy(&pdd->alloc_idr); | 336 | idr_destroy(&pdd->alloc_idr); |
336 | 337 | ||
337 | kfree(pdd); | 338 | kfree(pdd); |
@@ -451,7 +452,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) | |||
451 | if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) | 452 | if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) |
452 | continue; | 453 | continue; |
453 | 454 | ||
454 | offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; | 455 | offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id)) |
456 | << PAGE_SHIFT; | ||
455 | qpd->tba_addr = (int64_t)vm_mmap(filep, 0, | 457 | qpd->tba_addr = (int64_t)vm_mmap(filep, 0, |
456 | KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, | 458 | KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, |
457 | MAP_SHARED, offset); | 459 | MAP_SHARED, offset); |
@@ -585,6 +587,31 @@ err_alloc_process: | |||
585 | return ERR_PTR(err); | 587 | return ERR_PTR(err); |
586 | } | 588 | } |
587 | 589 | ||
590 | static int init_doorbell_bitmap(struct qcm_process_device *qpd, | ||
591 | struct kfd_dev *dev) | ||
592 | { | ||
593 | unsigned int i; | ||
594 | |||
595 | if (!KFD_IS_SOC15(dev->device_info->asic_family)) | ||
596 | return 0; | ||
597 | |||
598 | qpd->doorbell_bitmap = | ||
599 | kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, | ||
600 | BITS_PER_BYTE), GFP_KERNEL); | ||
601 | if (!qpd->doorbell_bitmap) | ||
602 | return -ENOMEM; | ||
603 | |||
604 | /* Mask out any reserved doorbells */ | ||
605 | for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) | ||
606 | if ((dev->shared_resources.reserved_doorbell_mask & i) == | ||
607 | dev->shared_resources.reserved_doorbell_val) { | ||
608 | set_bit(i, qpd->doorbell_bitmap); | ||
609 | pr_debug("reserved doorbell 0x%03x\n", i); | ||
610 | } | ||
611 | |||
612 | return 0; | ||
613 | } | ||
614 | |||
588 | struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, | 615 | struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, |
589 | struct kfd_process *p) | 616 | struct kfd_process *p) |
590 | { | 617 | { |
@@ -606,6 +633,12 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, | |||
606 | if (!pdd) | 633 | if (!pdd) |
607 | return NULL; | 634 | return NULL; |
608 | 635 | ||
636 | if (init_doorbell_bitmap(&pdd->qpd, dev)) { | ||
637 | pr_err("Failed to init doorbell for process\n"); | ||
638 | kfree(pdd); | ||
639 | return NULL; | ||
640 | } | ||
641 | |||
609 | pdd->dev = dev; | 642 | pdd->dev = dev; |
610 | INIT_LIST_HEAD(&pdd->qpd.queues_list); | 643 | INIT_LIST_HEAD(&pdd->qpd.queues_list); |
611 | INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); | 644 | INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); |
@@ -808,7 +841,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) | |||
808 | * Eviction is reference-counted per process-device. This means multiple | 841 | * Eviction is reference-counted per process-device. This means multiple |
809 | * evictions from different sources can be nested safely. | 842 | * evictions from different sources can be nested safely. |
810 | */ | 843 | */ |
811 | static int process_evict_queues(struct kfd_process *p) | 844 | int kfd_process_evict_queues(struct kfd_process *p) |
812 | { | 845 | { |
813 | struct kfd_process_device *pdd; | 846 | struct kfd_process_device *pdd; |
814 | int r = 0; | 847 | int r = 0; |
@@ -844,7 +877,7 @@ fail: | |||
844 | } | 877 | } |
845 | 878 | ||
846 | /* process_restore_queues - Restore all user queues of a process */ | 879 | /* process_restore_queues - Restore all user queues of a process */ |
847 | static int process_restore_queues(struct kfd_process *p) | 880 | int kfd_process_restore_queues(struct kfd_process *p) |
848 | { | 881 | { |
849 | struct kfd_process_device *pdd; | 882 | struct kfd_process_device *pdd; |
850 | int r, ret = 0; | 883 | int r, ret = 0; |
@@ -886,7 +919,7 @@ static void evict_process_worker(struct work_struct *work) | |||
886 | flush_delayed_work(&p->restore_work); | 919 | flush_delayed_work(&p->restore_work); |
887 | 920 | ||
888 | pr_debug("Started evicting pasid %d\n", p->pasid); | 921 | pr_debug("Started evicting pasid %d\n", p->pasid); |
889 | ret = process_evict_queues(p); | 922 | ret = kfd_process_evict_queues(p); |
890 | if (!ret) { | 923 | if (!ret) { |
891 | dma_fence_signal(p->ef); | 924 | dma_fence_signal(p->ef); |
892 | dma_fence_put(p->ef); | 925 | dma_fence_put(p->ef); |
@@ -946,7 +979,7 @@ static void restore_process_worker(struct work_struct *work) | |||
946 | return; | 979 | return; |
947 | } | 980 | } |
948 | 981 | ||
949 | ret = process_restore_queues(p); | 982 | ret = kfd_process_restore_queues(p); |
950 | if (!ret) | 983 | if (!ret) |
951 | pr_debug("Finished restoring pasid %d\n", p->pasid); | 984 | pr_debug("Finished restoring pasid %d\n", p->pasid); |
952 | else | 985 | else |
@@ -963,7 +996,7 @@ void kfd_suspend_all_processes(void) | |||
963 | cancel_delayed_work_sync(&p->eviction_work); | 996 | cancel_delayed_work_sync(&p->eviction_work); |
964 | cancel_delayed_work_sync(&p->restore_work); | 997 | cancel_delayed_work_sync(&p->restore_work); |
965 | 998 | ||
966 | if (process_evict_queues(p)) | 999 | if (kfd_process_evict_queues(p)) |
967 | pr_err("Failed to suspend process %d\n", p->pasid); | 1000 | pr_err("Failed to suspend process %d\n", p->pasid); |
968 | dma_fence_signal(p->ef); | 1001 | dma_fence_signal(p->ef); |
969 | dma_fence_put(p->ef); | 1002 | dma_fence_put(p->ef); |
@@ -989,15 +1022,12 @@ int kfd_resume_all_processes(void) | |||
989 | return ret; | 1022 | return ret; |
990 | } | 1023 | } |
991 | 1024 | ||
992 | int kfd_reserved_mem_mmap(struct kfd_process *process, | 1025 | int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, |
993 | struct vm_area_struct *vma) | 1026 | struct vm_area_struct *vma) |
994 | { | 1027 | { |
995 | struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); | ||
996 | struct kfd_process_device *pdd; | 1028 | struct kfd_process_device *pdd; |
997 | struct qcm_process_device *qpd; | 1029 | struct qcm_process_device *qpd; |
998 | 1030 | ||
999 | if (!dev) | ||
1000 | return -EINVAL; | ||
1001 | if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { | 1031 | if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { |
1002 | pr_err("Incorrect CWSR mapping size.\n"); | 1032 | pr_err("Incorrect CWSR mapping size.\n"); |
1003 | return -EINVAL; | 1033 | return -EINVAL; |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 7817e327ea6d..d65ce0436b31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | |||
@@ -119,9 +119,6 @@ static int create_cp_queue(struct process_queue_manager *pqm, | |||
119 | /* Doorbell initialized in user space*/ | 119 | /* Doorbell initialized in user space*/ |
120 | q_properties->doorbell_ptr = NULL; | 120 | q_properties->doorbell_ptr = NULL; |
121 | 121 | ||
122 | q_properties->doorbell_off = | ||
123 | kfd_queue_id_to_doorbell(dev, pqm->process, qid); | ||
124 | |||
125 | /* let DQM handle it*/ | 122 | /* let DQM handle it*/ |
126 | q_properties->vmid = 0; | 123 | q_properties->vmid = 0; |
127 | q_properties->queue_id = qid; | 124 | q_properties->queue_id = qid; |
@@ -244,10 +241,20 @@ int pqm_create_queue(struct process_queue_manager *pqm, | |||
244 | } | 241 | } |
245 | 242 | ||
246 | if (retval != 0) { | 243 | if (retval != 0) { |
247 | pr_err("DQM create queue failed\n"); | 244 | pr_err("Pasid %d DQM create queue %d failed. ret %d\n", |
245 | pqm->process->pasid, type, retval); | ||
248 | goto err_create_queue; | 246 | goto err_create_queue; |
249 | } | 247 | } |
250 | 248 | ||
249 | if (q) | ||
250 | /* Return the doorbell offset within the doorbell page | ||
251 | * to the caller so it can be passed up to user mode | ||
252 | * (in bytes). | ||
253 | */ | ||
254 | properties->doorbell_off = | ||
255 | (q->properties.doorbell_off * sizeof(uint32_t)) & | ||
256 | (kfd_doorbell_process_slice(dev) - 1); | ||
257 | |||
251 | pr_debug("PQM After DQM create queue\n"); | 258 | pr_debug("PQM After DQM create queue\n"); |
252 | 259 | ||
253 | list_add(&pqn->process_queue_list, &pqm->queues); | 260 | list_add(&pqn->process_queue_list, &pqm->queues); |
@@ -313,8 +320,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) | |||
313 | dqm = pqn->q->device->dqm; | 320 | dqm = pqn->q->device->dqm; |
314 | retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); | 321 | retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); |
315 | if (retval) { | 322 | if (retval) { |
316 | pr_debug("Destroy queue failed, returned %d\n", retval); | 323 | pr_err("Pasid %d destroy queue %d failed, ret %d\n", |
317 | goto err_destroy_queue; | 324 | pqm->process->pasid, |
325 | pqn->q->properties.queue_id, retval); | ||
326 | if (retval != -ETIME) | ||
327 | goto err_destroy_queue; | ||
318 | } | 328 | } |
319 | uninit_queue(pqn->q); | 329 | uninit_queue(pqn->q); |
320 | } | 330 | } |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index a5315d4f1c95..6dcd621e5b71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c | |||
@@ -36,8 +36,8 @@ void print_queue_properties(struct queue_properties *q) | |||
36 | pr_debug("Queue Address: 0x%llX\n", q->queue_address); | 36 | pr_debug("Queue Address: 0x%llX\n", q->queue_address); |
37 | pr_debug("Queue Id: %u\n", q->queue_id); | 37 | pr_debug("Queue Id: %u\n", q->queue_id); |
38 | pr_debug("Queue Process Vmid: %u\n", q->vmid); | 38 | pr_debug("Queue Process Vmid: %u\n", q->vmid); |
39 | pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); | 39 | pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr); |
40 | pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); | 40 | pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr); |
41 | pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); | 41 | pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); |
42 | pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); | 42 | pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); |
43 | } | 43 | } |
@@ -53,8 +53,8 @@ void print_queue(struct queue *q) | |||
53 | pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); | 53 | pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); |
54 | pr_debug("Queue Id: %u\n", q->properties.queue_id); | 54 | pr_debug("Queue Id: %u\n", q->properties.queue_id); |
55 | pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); | 55 | pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); |
56 | pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); | 56 | pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr); |
57 | pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); | 57 | pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr); |
58 | pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); | 58 | pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); |
59 | pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); | 59 | pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); |
60 | pr_debug("Queue MQD Address: 0x%p\n", q->mqd); | 60 | pr_debug("Queue MQD Address: 0x%p\n", q->mqd); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index ac28abc94e57..bc95d4dfee2e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c | |||
@@ -1239,6 +1239,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu) | |||
1239 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & | 1239 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & |
1240 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); | 1240 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); |
1241 | break; | 1241 | break; |
1242 | case CHIP_VEGA10: | ||
1243 | case CHIP_RAVEN: | ||
1244 | dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << | ||
1245 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & | ||
1246 | HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); | ||
1247 | break; | ||
1242 | default: | 1248 | default: |
1243 | WARN(1, "Unexpected ASIC family %u", | 1249 | WARN(1, "Unexpected ASIC family %u", |
1244 | dev->gpu->device_info->asic_family); | 1250 | dev->gpu->device_info->asic_family); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index eb54cfcaf039..7d9c3f948dff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h | |||
@@ -45,6 +45,7 @@ | |||
45 | 45 | ||
46 | #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 | 46 | #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 |
47 | #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 | 47 | #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 |
48 | #define HSA_CAP_DOORBELL_TYPE_2_0 0x2 | ||
48 | #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 | 49 | #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 |
49 | 50 | ||
50 | struct kfd_node_properties { | 51 | struct kfd_node_properties { |
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h new file mode 100644 index 000000000000..0bc0b25cb410 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h | |||
@@ -0,0 +1,47 @@ | |||
1 | /* | ||
2 | * Copyright 2016-2018 Advanced Micro Devices, Inc. | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining a | ||
5 | * copy of this software and associated documentation files (the "Software"), | ||
6 | * to deal in the Software without restriction, including without limitation | ||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
8 | * and/or sell copies of the Software, and to permit persons to whom the | ||
9 | * Software is furnished to do so, subject to the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be included in | ||
12 | * all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
20 | * OTHER DEALINGS IN THE SOFTWARE. | ||
21 | */ | ||
22 | |||
23 | #ifndef HSA_SOC15_INT_H_INCLUDED | ||
24 | #define HSA_SOC15_INT_H_INCLUDED | ||
25 | |||
26 | #include "soc15_ih_clientid.h" | ||
27 | |||
28 | #define SOC15_INTSRC_CP_END_OF_PIPE 181 | ||
29 | #define SOC15_INTSRC_CP_BAD_OPCODE 183 | ||
30 | #define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 | ||
31 | #define SOC15_INTSRC_VMC_FAULT 0 | ||
32 | #define SOC15_INTSRC_SDMA_TRAP 224 | ||
33 | |||
34 | |||
35 | #define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) | ||
36 | #define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) | ||
37 | #define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) | ||
38 | #define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) | ||
39 | #define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) | ||
40 | #define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) | ||
41 | #define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) | ||
42 | #define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) | ||
43 | #define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) | ||
44 | #define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) | ||
45 | |||
46 | #endif | ||
47 | |||
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 237289a72bb7..5733fbee07f7 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h | |||
@@ -100,6 +100,21 @@ struct kgd2kfd_shared_resources { | |||
100 | /* Bit n == 1 means Queue n is available for KFD */ | 100 | /* Bit n == 1 means Queue n is available for KFD */ |
101 | DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); | 101 | DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); |
102 | 102 | ||
103 | /* Doorbell assignments (SOC15 and later chips only). Only | ||
104 | * specific doorbells are routed to each SDMA engine. Others | ||
105 | * are routed to IH and VCN. They are not usable by the CP. | ||
106 | * | ||
107 | * Any doorbell number D that satisfies the following condition | ||
108 | * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val | ||
109 | * | ||
110 | * KFD currently uses 1024 (= 0x3ff) doorbells per process. If | ||
111 | * doorbells 0x0f0-0x0f7 and 0x2f-0x2f7 are reserved, that means | ||
112 | * mask would be set to 0x1f8 and val set to 0x0f0. | ||
113 | */ | ||
114 | unsigned int sdma_doorbell[2][2]; | ||
115 | unsigned int reserved_doorbell_mask; | ||
116 | unsigned int reserved_doorbell_val; | ||
117 | |||
103 | /* Base address of doorbell aperture. */ | 118 | /* Base address of doorbell aperture. */ |
104 | phys_addr_t doorbell_physical_address; | 119 | phys_addr_t doorbell_physical_address; |
105 | 120 | ||
@@ -173,8 +188,6 @@ struct tile_config { | |||
173 | * @set_pasid_vmid_mapping: Exposes pasid/vmid pair to the H/W for no cp | 188 | * @set_pasid_vmid_mapping: Exposes pasid/vmid pair to the H/W for no cp |
174 | * scheduling mode. Only used for no cp scheduling mode. | 189 | * scheduling mode. Only used for no cp scheduling mode. |
175 | * | 190 | * |
176 | * @init_pipeline: Initialized the compute pipelines. | ||
177 | * | ||
178 | * @hqd_load: Loads the mqd structure to a H/W hqd slot. used only for no cp | 191 | * @hqd_load: Loads the mqd structure to a H/W hqd slot. used only for no cp |
179 | * sceduling mode. | 192 | * sceduling mode. |
180 | * | 193 | * |
@@ -274,9 +287,6 @@ struct kfd2kgd_calls { | |||
274 | int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, | 287 | int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, |
275 | unsigned int vmid); | 288 | unsigned int vmid); |
276 | 289 | ||
277 | int (*init_pipeline)(struct kgd_dev *kgd, uint32_t pipe_id, | ||
278 | uint32_t hpd_size, uint64_t hpd_gpu_addr); | ||
279 | |||
280 | int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); | 290 | int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); |
281 | 291 | ||
282 | int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, | 292 | int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
@@ -382,6 +392,10 @@ struct kfd2kgd_calls { | |||
382 | * | 392 | * |
383 | * @resume: Notifies amdkfd about a resume action done to a kgd device | 393 | * @resume: Notifies amdkfd about a resume action done to a kgd device |
384 | * | 394 | * |
395 | * @quiesce_mm: Quiesce all user queue access to specified MM address space | ||
396 | * | ||
397 | * @resume_mm: Resume user queue access to specified MM address space | ||
398 | * | ||
385 | * @schedule_evict_and_restore_process: Schedules work queue that will prepare | 399 | * @schedule_evict_and_restore_process: Schedules work queue that will prepare |
386 | * for safe eviction of KFD BOs that belong to the specified process. | 400 | * for safe eviction of KFD BOs that belong to the specified process. |
387 | * | 401 | * |
@@ -399,6 +413,8 @@ struct kgd2kfd_calls { | |||
399 | void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); | 413 | void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); |
400 | void (*suspend)(struct kfd_dev *kfd); | 414 | void (*suspend)(struct kfd_dev *kfd); |
401 | int (*resume)(struct kfd_dev *kfd); | 415 | int (*resume)(struct kfd_dev *kfd); |
416 | int (*quiesce_mm)(struct mm_struct *mm); | ||
417 | int (*resume_mm)(struct mm_struct *mm); | ||
402 | int (*schedule_evict_and_restore_process)(struct mm_struct *mm, | 418 | int (*schedule_evict_and_restore_process)(struct mm_struct *mm, |
403 | struct dma_fence *fence); | 419 | struct dma_fence *fence); |
404 | }; | 420 | }; |
diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h index 2fb25abaf7c8..ceaf4932258d 100644 --- a/drivers/gpu/drm/amd/include/v9_structs.h +++ b/drivers/gpu/drm/amd/include/v9_structs.h | |||
@@ -29,10 +29,10 @@ struct v9_sdma_mqd { | |||
29 | uint32_t sdmax_rlcx_rb_base; | 29 | uint32_t sdmax_rlcx_rb_base; |
30 | uint32_t sdmax_rlcx_rb_base_hi; | 30 | uint32_t sdmax_rlcx_rb_base_hi; |
31 | uint32_t sdmax_rlcx_rb_rptr; | 31 | uint32_t sdmax_rlcx_rb_rptr; |
32 | uint32_t sdmax_rlcx_rb_rptr_hi; | ||
32 | uint32_t sdmax_rlcx_rb_wptr; | 33 | uint32_t sdmax_rlcx_rb_wptr; |
34 | uint32_t sdmax_rlcx_rb_wptr_hi; | ||
33 | uint32_t sdmax_rlcx_rb_wptr_poll_cntl; | 35 | uint32_t sdmax_rlcx_rb_wptr_poll_cntl; |
34 | uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; | ||
35 | uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; | ||
36 | uint32_t sdmax_rlcx_rb_rptr_addr_hi; | 36 | uint32_t sdmax_rlcx_rb_rptr_addr_hi; |
37 | uint32_t sdmax_rlcx_rb_rptr_addr_lo; | 37 | uint32_t sdmax_rlcx_rb_rptr_addr_lo; |
38 | uint32_t sdmax_rlcx_ib_cntl; | 38 | uint32_t sdmax_rlcx_ib_cntl; |
@@ -44,29 +44,29 @@ struct v9_sdma_mqd { | |||
44 | uint32_t sdmax_rlcx_skip_cntl; | 44 | uint32_t sdmax_rlcx_skip_cntl; |
45 | uint32_t sdmax_rlcx_context_status; | 45 | uint32_t sdmax_rlcx_context_status; |
46 | uint32_t sdmax_rlcx_doorbell; | 46 | uint32_t sdmax_rlcx_doorbell; |
47 | uint32_t sdmax_rlcx_virtual_addr; | 47 | uint32_t sdmax_rlcx_status; |
48 | uint32_t sdmax_rlcx_ape1_cntl; | ||
49 | uint32_t sdmax_rlcx_doorbell_log; | 48 | uint32_t sdmax_rlcx_doorbell_log; |
50 | uint32_t reserved_22; | 49 | uint32_t sdmax_rlcx_watermark; |
51 | uint32_t reserved_23; | 50 | uint32_t sdmax_rlcx_doorbell_offset; |
52 | uint32_t reserved_24; | 51 | uint32_t sdmax_rlcx_csa_addr_lo; |
53 | uint32_t reserved_25; | 52 | uint32_t sdmax_rlcx_csa_addr_hi; |
54 | uint32_t reserved_26; | 53 | uint32_t sdmax_rlcx_ib_sub_remain; |
55 | uint32_t reserved_27; | 54 | uint32_t sdmax_rlcx_preempt; |
56 | uint32_t reserved_28; | 55 | uint32_t sdmax_rlcx_dummy_reg; |
57 | uint32_t reserved_29; | 56 | uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; |
58 | uint32_t reserved_30; | 57 | uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; |
59 | uint32_t reserved_31; | 58 | uint32_t sdmax_rlcx_rb_aql_cntl; |
60 | uint32_t reserved_32; | 59 | uint32_t sdmax_rlcx_minor_ptr_update; |
61 | uint32_t reserved_33; | 60 | uint32_t sdmax_rlcx_midcmd_data0; |
62 | uint32_t reserved_34; | 61 | uint32_t sdmax_rlcx_midcmd_data1; |
63 | uint32_t reserved_35; | 62 | uint32_t sdmax_rlcx_midcmd_data2; |
64 | uint32_t reserved_36; | 63 | uint32_t sdmax_rlcx_midcmd_data3; |
65 | uint32_t reserved_37; | 64 | uint32_t sdmax_rlcx_midcmd_data4; |
66 | uint32_t reserved_38; | 65 | uint32_t sdmax_rlcx_midcmd_data5; |
67 | uint32_t reserved_39; | 66 | uint32_t sdmax_rlcx_midcmd_data6; |
68 | uint32_t reserved_40; | 67 | uint32_t sdmax_rlcx_midcmd_data7; |
69 | uint32_t reserved_41; | 68 | uint32_t sdmax_rlcx_midcmd_data8; |
69 | uint32_t sdmax_rlcx_midcmd_cntl; | ||
70 | uint32_t reserved_42; | 70 | uint32_t reserved_42; |
71 | uint32_t reserved_43; | 71 | uint32_t reserved_43; |
72 | uint32_t reserved_44; | 72 | uint32_t reserved_44; |