author    Dave Airlie <airlied@redhat.com>    2018-05-15 01:59:10 -0400
committer Dave Airlie <airlied@redhat.com>    2018-05-15 02:06:08 -0400
commit    c76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb
tree      1aeeb74795b2951952aa443f7104d6c090c58141
parent    444ac87becd8a2ff76f9e4194dd98da4f5d5586d
parent    af47b390273f1068bdb1d01263a81948c4e2f97a

Merge tag 'drm-amdkfd-next-2018-05-14' of git://people.freedesktop.org/~gabbayo/linux into drm-next

This is amdkfd pull for 4.18. The major new features are:

- Add support for GFXv9 dGPUs (VEGA)
- Add support for userptr memory mapping

In addition, there are a couple of small fixes and improvements, such as:

- Fix lock handling
- Fix rollback packet in kernel kfd_queue
- Optimize kfd signal handling
- Fix CP hang in APU

Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180514070126.GA1827@odedg-x270
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/Makefile | 3
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 26
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 1043
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 572
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 111
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h | 11
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 38
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/soc15d.h | 5
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/Makefile | 10
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 20
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 560
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 274
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1214
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 52
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 11
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 131
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 114
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 2
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 84
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 65
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 119
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 92
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 8
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 39
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 7
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 9
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 340
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 319
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_module.c | 7
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 3
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 6
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 443
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 2
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 392
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 112
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 50
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 22
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 8
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 6
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/soc15_int.h | 47
-rw-r--r--  drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 26
-rw-r--r--  drivers/gpu/drm/amd/include/v9_structs.h | 48
52 files changed, 6222 insertions(+), 858 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 8daa96a99eac..ac1215a5561e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -767,12 +767,14 @@ F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
 F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
 F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
 F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
 F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
 F:	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
 F:	drivers/gpu/drm/amd/amdkfd/
 F:	drivers/gpu/drm/amd/include/cik_structs.h
 F:	drivers/gpu/drm/amd/include/kgd_kfd_interface.h
 F:	drivers/gpu/drm/amd/include/vi_structs.h
+F:	drivers/gpu/drm/amd/include/v9_structs.h
 F:	include/uapi/linux/kfd_ioctl.h
 
 AMD SEATTLE DEVICE TREE SUPPORT
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 2ca2b5154d52..f3002020df6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -130,7 +130,8 @@ amdgpu-y += \
 	amdgpu_amdkfd.o \
 	amdgpu_amdkfd_fence.o \
 	amdgpu_amdkfd_gpuvm.o \
-	amdgpu_amdkfd_gfx_v8.o
+	amdgpu_amdkfd_gfx_v8.o \
+	amdgpu_amdkfd_gfx_v9.o
 
 # add cgs
 amdgpu-y += amdgpu_cgs.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 4d36203ffb11..cd0e8f192e6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -92,6 +92,10 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
 	case CHIP_POLARIS11:
 		kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
 		break;
+	case CHIP_VEGA10:
+	case CHIP_RAVEN:
+		kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
+		break;
 	default:
 		dev_dbg(adev->dev, "kfd not supported on this ASIC\n");
 		return;
@@ -175,6 +179,28 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
 				&gpu_resources.doorbell_physical_address,
 				&gpu_resources.doorbell_aperture_size,
 				&gpu_resources.doorbell_start_offset);
+		if (adev->asic_type >= CHIP_VEGA10) {
+			/* On SOC15 the BIF is involved in routing
+			 * doorbells using the low 12 bits of the
+			 * address. Communicate the assignments to
+			 * KFD. KFD uses two doorbell pages per
+			 * process in case of 64-bit doorbells so we
+			 * can use each doorbell assignment twice.
+			 */
+			gpu_resources.sdma_doorbell[0][0] =
+				AMDGPU_DOORBELL64_sDMA_ENGINE0;
+			gpu_resources.sdma_doorbell[0][1] =
+				AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200;
+			gpu_resources.sdma_doorbell[1][0] =
+				AMDGPU_DOORBELL64_sDMA_ENGINE1;
+			gpu_resources.sdma_doorbell[1][1] =
+				AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200;
+			/* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for
+			 * SDMA, IH and VCN. So don't use them for the CP.
+			 */
+			gpu_resources.reserved_doorbell_mask = 0x1f0;
+			gpu_resources.reserved_doorbell_val = 0x0f0;
+		}
 
 		kgd2kfd->device_init(adev->kfd, &gpu_resources);
 	}
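
The doorbell assignments communicated above are easier to follow with concrete numbers. The sketch below is illustrative only and not part of the patch: doorbell_is_reserved() is a made-up helper, and the enum values assumed for AMDGPU_DOORBELL64_sDMA_ENGINE0/1 (0xF0/0xF2) come from amdgpu.h and should be checked against the tree in use.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed values of the amdgpu 64-bit doorbell enum (illustrative). */
#define AMDGPU_DOORBELL64_sDMA_ENGINE0 0xF0
#define AMDGPU_DOORBELL64_sDMA_ENGINE1 0xF2

/* A doorbell index is reserved for SDMA/IH/VCN when its low bits,
 * masked with 0x1f0, equal 0x0f0 (i.e. ranges 0x0f0-0x0ff and
 * 0x2f0-0x2ff), matching reserved_doorbell_mask/val above. */
static bool doorbell_is_reserved(uint32_t index)
{
	return (index & 0x1f0) == 0x0f0;
}

int main(void)
{
	/* Two doorbell pages per process: each BIF assignment is used
	 * twice, 0x200 apart, exactly as the patch communicates via
	 * gpu_resources.sdma_doorbell[engine][page]. */
	uint32_t sdma_doorbell[2][2] = {
		{ AMDGPU_DOORBELL64_sDMA_ENGINE0,
		  AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200 },
		{ AMDGPU_DOORBELL64_sDMA_ENGINE1,
		  AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200 },
	};

	for (int e = 0; e < 2; e++)
		for (int p = 0; p < 2; p++)
			printf("SDMA%d page%d: 0x%03x reserved=%d\n", e, p,
			       sdma_doorbell[e][p],
			       doorbell_is_reserved(sdma_doorbell[e][p]));
	return 0;
}

Both pages of each engine land in a reserved range, which is why the CP must be kept out of those doorbell indices.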
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index c2c2bea731e0..12367a9951e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/mmu_context.h>
+#include <linux/workqueue.h>
 #include <kgd_kfd_interface.h>
 #include <drm/ttm/ttm_execbuf_util.h>
 #include "amdgpu_sync.h"
@@ -59,7 +60,9 @@ struct kgd_mem {
 
 	uint32_t mapping_flags;
 
+	atomic_t invalid;
 	struct amdkfd_process_info *process_info;
+	struct page **user_pages;
 
 	struct amdgpu_sync sync;
 
@@ -84,6 +87,9 @@ struct amdkfd_process_info {
 	struct list_head vm_list_head;
 	/* List head for all KFD BOs that belong to a KFD process. */
 	struct list_head kfd_bo_list;
+	/* List of userptr BOs that are valid or invalid */
+	struct list_head userptr_valid_list;
+	struct list_head userptr_inval_list;
 	/* Lock to protect kfd_bo_list */
 	struct mutex lock;
 
@@ -91,6 +97,11 @@ struct amdkfd_process_info {
 	unsigned int n_vms;
 	/* Eviction Fence */
 	struct amdgpu_amdkfd_fence *eviction_fence;
+
+	/* MMU-notifier related fields */
+	atomic_t evicted_bos;
+	struct delayed_work restore_userptr_work;
+	struct pid *pid;
 };
 
 int amdgpu_amdkfd_init(void);
@@ -104,12 +115,14 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
 void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
 void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev);
 
+int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm);
 int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
 				uint32_t vmid, uint64_t gpu_addr,
 				uint32_t *ib_cmd, uint32_t ib_len);
 
 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
 
 bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
 
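
The new evicted_bos counter and restore_userptr_work fields suggest the intended hand-off between the MMU-notifier eviction path (amdgpu_amdkfd_evict_userptr) and the delayed restore worker. A minimal sketch of that flow follows, under stated assumptions: quiesce_user_queues() is a placeholder for the real KFD quiescing helper, and the 1 ms delay mirrors AMDGPU_USERPTR_RESTORE_DELAY_MS from the gpuvm changes further below.

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct process_info_sketch {
	atomic_t evicted_bos;
	struct delayed_work restore_userptr_work;
};

/* Placeholder for the real queue-quiescing helper. */
static void quiesce_user_queues(struct process_info_sketch *pi);

static void on_userptr_evicted(struct process_info_sketch *pi)
{
	/* The first eviction must stop the user mode queues; later
	 * evictions only bump the count so the restore worker knows
	 * more BOs went invalid in the meantime. */
	if (atomic_inc_return(&pi->evicted_bos) == 1)
		quiesce_user_queues(pi);

	/* Re-arm a short delayed work so that consecutive VM changes
	 * are batched into a single restore pass rather than one
	 * restore per notifier callback. */
	schedule_delayed_work(&pi->restore_userptr_work,
			      msecs_to_jiffies(1));
}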
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index ea54e53172b9..0ff36d45a597 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -98,8 +98,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 					unsigned int vmid);
 
-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-				uint32_t hpd_size, uint64_t hpd_gpu_addr);
 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr,
@@ -183,7 +181,6 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.free_pasid = amdgpu_pasid_free,
 	.program_sh_mem_settings = kgd_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
-	.init_pipeline = kgd_init_pipeline,
 	.init_interrupts = kgd_init_interrupts,
 	.hqd_load = kgd_hqd_load,
 	.hqd_sdma_load = kgd_hqd_sdma_load,
@@ -309,13 +306,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 	return 0;
 }
 
-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-				uint32_t hpd_size, uint64_t hpd_gpu_addr)
-{
-	/* amdgpu owns the per-pipe state */
-	return 0;
-}
-
 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index 89264c9a5e9f..6ef9762b4b00 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -57,8 +57,6 @@ static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
 			uint32_t sh_mem_bases);
 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 		unsigned int vmid);
-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-		uint32_t hpd_size, uint64_t hpd_gpu_addr);
 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr,
@@ -141,7 +139,6 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.free_pasid = amdgpu_pasid_free,
 	.program_sh_mem_settings = kgd_program_sh_mem_settings,
 	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
-	.init_pipeline = kgd_init_pipeline,
 	.init_interrupts = kgd_init_interrupts,
 	.hqd_load = kgd_hqd_load,
 	.hqd_sdma_load = kgd_hqd_sdma_load,
@@ -270,13 +267,6 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
 	return 0;
 }
 
-static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
-		uint32_t hpd_size, uint64_t hpd_gpu_addr)
-{
-	/* amdgpu owns the per-pipe state */
-	return 0;
-}
-
 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
new file mode 100644
index 000000000000..8f37991df61b
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright 2014-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "kfd2kgd: " fmt
+
+#include <linux/module.h>
+#include <linux/fdtable.h>
+#include <linux/uaccess.h>
+#include <linux/firmware.h>
+#include <drm/drmP.h>
+#include "amdgpu.h"
+#include "amdgpu_amdkfd.h"
+#include "amdgpu_ucode.h"
+#include "soc15_hw_ip.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
+#include "vega10_enum.h"
+#include "sdma0/sdma0_4_0_offset.h"
+#include "sdma0/sdma0_4_0_sh_mask.h"
+#include "sdma1/sdma1_4_0_offset.h"
+#include "sdma1/sdma1_4_0_sh_mask.h"
+#include "athub/athub_1_0_offset.h"
+#include "athub/athub_1_0_sh_mask.h"
+#include "oss/osssys_4_0_offset.h"
+#include "oss/osssys_4_0_sh_mask.h"
+#include "soc15_common.h"
+#include "v9_structs.h"
+#include "soc15.h"
+#include "soc15d.h"
+
+/* HACK: MMHUB and GC both have VM-related register with the same
+ * names but different offsets. Define the MMHUB register we need here
+ * with a prefix. A proper solution would be to move the functions
+ * programming these registers into gfx_v9_0.c and mmhub_v1_0.c
+ * respectively.
+ */
+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ			0x06f3
+#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX	0
+
+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK			0x0705
+#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX	0
+
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32	0x072b
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX	0
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32	0x072c
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX	0
+
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32	0x074b
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX	0
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32	0x074c
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX	0
+
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32	0x076b
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX	0
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32	0x076c
+#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX	0
+
+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32	0x0727
+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX	0
+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32	0x0728
+#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX	0
+
+#define V9_PIPE_PER_MEC		(4)
+#define V9_QUEUES_PER_PIPE_MEC	(8)
+
+enum hqd_dequeue_request_type {
+	NO_ACTION = 0,
+	DRAIN_PIPE,
+	RESET_WAVES
+};
+
+/*
+ * Register access functions
+ */
+
+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+		uint32_t sh_mem_config,
+		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
+		uint32_t sh_mem_bases);
+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+		unsigned int vmid);
+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+			uint32_t queue_id, uint32_t __user *wptr,
+			uint32_t wptr_shift, uint32_t wptr_mask,
+			struct mm_struct *mm);
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs);
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm);
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs);
+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+		uint32_t pipe_id, uint32_t queue_id);
+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
+				enum kfd_preempt_type reset_type,
+				unsigned int utimeout, uint32_t pipe_id,
+				uint32_t queue_id);
+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+				unsigned int utimeout);
+static int kgd_address_watch_disable(struct kgd_dev *kgd);
+static int kgd_address_watch_execute(struct kgd_dev *kgd,
+					unsigned int watch_point_id,
+					uint32_t cntl_val,
+					uint32_t addr_hi,
+					uint32_t addr_lo);
+static int kgd_wave_control_execute(struct kgd_dev *kgd,
+					uint32_t gfx_index_val,
+					uint32_t sq_cmd);
+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
+					unsigned int watch_point_id,
+					unsigned int reg_offset);
+
+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
+		uint8_t vmid);
+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+		uint8_t vmid);
+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
+		uint32_t page_table_base);
+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
+static void set_scratch_backing_va(struct kgd_dev *kgd,
+					uint64_t va, uint32_t vmid);
+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
+static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid);
+
+/* Because of REG_GET_FIELD() being used, we put this function in the
+ * asic specific file.
+ */
+static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
+		struct tile_config *config)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+
+	config->gb_addr_config = adev->gfx.config.gb_addr_config;
+
+	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
+	config->num_tile_configs =
+			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
+	config->macro_tile_config_ptr =
+			adev->gfx.config.macrotile_mode_array;
+	config->num_macro_tile_configs =
+			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
+
+	return 0;
+}
+
+static const struct kfd2kgd_calls kfd2kgd = {
+	.init_gtt_mem_allocation = alloc_gtt_mem,
+	.free_gtt_mem = free_gtt_mem,
+	.get_local_mem_info = get_local_mem_info,
+	.get_gpu_clock_counter = get_gpu_clock_counter,
+	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
+	.alloc_pasid = amdgpu_pasid_alloc,
+	.free_pasid = amdgpu_pasid_free,
+	.program_sh_mem_settings = kgd_program_sh_mem_settings,
+	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
+	.init_interrupts = kgd_init_interrupts,
+	.hqd_load = kgd_hqd_load,
+	.hqd_sdma_load = kgd_hqd_sdma_load,
+	.hqd_dump = kgd_hqd_dump,
+	.hqd_sdma_dump = kgd_hqd_sdma_dump,
+	.hqd_is_occupied = kgd_hqd_is_occupied,
+	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+	.hqd_destroy = kgd_hqd_destroy,
+	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
+	.address_watch_disable = kgd_address_watch_disable,
+	.address_watch_execute = kgd_address_watch_execute,
+	.wave_control_execute = kgd_wave_control_execute,
+	.address_watch_get_offset = kgd_address_watch_get_offset,
+	.get_atc_vmid_pasid_mapping_pasid =
+			get_atc_vmid_pasid_mapping_pasid,
+	.get_atc_vmid_pasid_mapping_valid =
+			get_atc_vmid_pasid_mapping_valid,
+	.get_fw_version = get_fw_version,
+	.set_scratch_backing_va = set_scratch_backing_va,
+	.get_tile_config = amdgpu_amdkfd_get_tile_config,
+	.get_cu_info = get_cu_info,
+	.get_vram_usage = amdgpu_amdkfd_get_vram_usage,
+	.create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
+	.acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm,
+	.destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
+	.get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
+	.set_vm_context_page_table_base = set_vm_context_page_table_base,
+	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
+	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
+	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
+	.unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
+	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
+	.map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
+	.restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
+	.invalidate_tlbs = invalidate_tlbs,
+	.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
+	.submit_ib = amdgpu_amdkfd_submit_ib,
+};
+
+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
+{
+	return (struct kfd2kgd_calls *)&kfd2kgd;
+}
+
+static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
+{
+	return (struct amdgpu_device *)kgd;
+}
+
+static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
+			uint32_t queue, uint32_t vmid)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+	mutex_lock(&adev->srbm_mutex);
+	soc15_grbm_select(adev, mec, pipe, queue, vmid);
+}
+
+static void unlock_srbm(struct kgd_dev *kgd)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+	soc15_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
+static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
+			uint32_t queue_id)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+	lock_srbm(kgd, mec, pipe, queue_id, 0);
+}
+
+static uint32_t get_queue_mask(struct amdgpu_device *adev,
+			       uint32_t pipe_id, uint32_t queue_id)
+{
+	unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe +
+			    queue_id) & 31;
+
+	return ((uint32_t)1) << bit;
+}
+
+static void release_queue(struct kgd_dev *kgd)
+{
+	unlock_srbm(kgd);
+}
+
+static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+					uint32_t sh_mem_config,
+					uint32_t sh_mem_ape1_base,
+					uint32_t sh_mem_ape1_limit,
+					uint32_t sh_mem_bases)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+	lock_srbm(kgd, 0, 0, 0, vmid);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
+	/* APE1 no longer exists on GFX9 */
+
+	unlock_srbm(kgd);
+}
+
+static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+					unsigned int vmid)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+
+	/*
+	 * We have to assume that there is no outstanding mapping.
+	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
+	 * a mapping is in progress or because a mapping finished
+	 * and the SW cleared it.
+	 * So the protocol is to always wait & clear.
+	 */
+	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
+			ATC_VMID0_PASID_MAPPING__VALID_MASK;
+
+	/*
+	 * need to do this twice, once for gfx and once for mmhub
+	 * for ATC add 16 to VMID for mmhub, for IH different registers.
+	 * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
+	 */
+
+	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
+	       pasid_mapping);
+
+	while (!(RREG32(SOC15_REG_OFFSET(
+				ATHUB, 0,
+				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
+		 (1U << vmid)))
+		cpu_relax();
+
+	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
+				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
+	       1U << vmid);
+
+	/* Mapping vmid to pasid also for IH block */
+	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
+	       pasid_mapping);
+
+	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
+	       pasid_mapping);
+
+	while (!(RREG32(SOC15_REG_OFFSET(
+				ATHUB, 0,
+				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
+		 (1U << (vmid + 16))))
+		cpu_relax();
+
+	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
+				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
+	       1U << (vmid + 16));
+
+	/* Mapping vmid to pasid also for IH block */
+	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
+	       pasid_mapping);
+	return 0;
+}
+
+/* TODO - RING0 form of field is obsolete, seems to date back to SI
+ * but still works
+ */
+
+static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t mec;
+	uint32_t pipe;
+
+	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+	lock_srbm(kgd, mec, pipe, 0, 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
+		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
+		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
+
+	unlock_srbm(kgd);
+
+	return 0;
+}
+
+static uint32_t get_sdma_base_addr(struct amdgpu_device *adev,
+				unsigned int engine_id,
+				unsigned int queue_id)
+{
+	uint32_t base[2] = {
+		SOC15_REG_OFFSET(SDMA0, 0,
+				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
+		SOC15_REG_OFFSET(SDMA1, 0,
+				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
+	};
+	uint32_t retval;
+
+	retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
+					       mmSDMA0_RLC0_RB_CNTL);
+
+	pr_debug("sdma base address: 0x%x\n", retval);
+
+	return retval;
+}
+
+static inline struct v9_mqd *get_mqd(void *mqd)
+{
+	return (struct v9_mqd *)mqd;
+}
+
+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+	return (struct v9_sdma_mqd *)mqd;
+}
+
+static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+			uint32_t queue_id, uint32_t __user *wptr,
+			uint32_t wptr_shift, uint32_t wptr_mask,
+			struct mm_struct *mm)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct v9_mqd *m;
+	uint32_t *mqd_hqd;
+	uint32_t reg, hqd_base, data;
+
+	m = get_mqd(mqd);
+
+	acquire_queue(kgd, pipe_id, queue_id);
+
+	/* HIQ is set during driver init period with vmid set to 0*/
+	if (m->cp_hqd_vmid == 0) {
+		uint32_t value, mec, pipe;
+
+		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
+		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
+
+		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
+			mec, pipe, queue_id);
+		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
+		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
+			((mec << 5) | (pipe << 3) | queue_id | 0x80));
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
+	}
+
+	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
+	mqd_hqd = &m->cp_mqd_base_addr_lo;
+	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
+
+	for (reg = hqd_base;
+	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
+		WREG32(reg, mqd_hqd[reg - hqd_base]);
+
+
+	/* Activate doorbell logic before triggering WPTR poll. */
+	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
+			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);
+
+	if (wptr) {
+		/* Don't read wptr with get_user because the user
+		 * context may not be accessible (if this function
+		 * runs in a work queue). Instead trigger a one-shot
+		 * polling read from memory in the CP. This assumes
+		 * that wptr is GPU-accessible in the queue's VMID via
+		 * ATC or SVM. WPTR==RPTR before starting the poll so
+		 * the CP starts fetching new commands from the right
+		 * place.
+		 *
+		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
+		 * tricky. Assume that the queue didn't overflow. The
+		 * number of valid bits in the 32-bit RPTR depends on
+		 * the queue size. The remaining bits are taken from
+		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
+		 * queue size.
+		 */
+		uint32_t queue_size =
+			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
+					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
+		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);
+
+		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
+			guessed_wptr += queue_size;
+		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
+		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
+
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
+		       lower_32_bits(guessed_wptr));
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
+		       upper_32_bits(guessed_wptr));
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
+		       lower_32_bits((uint64_t)wptr));
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
+		       upper_32_bits((uint64_t)wptr));
+		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
+		       get_queue_mask(adev, pipe_id, queue_id));
+	}
+
+	/* Start the EOP fetcher */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
+	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
+			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
+
+	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);
+
+	release_queue(kgd);
+
+	return 0;
+}
+
+static int kgd_hqd_dump(struct kgd_dev *kgd,
+			uint32_t pipe_id, uint32_t queue_id,
+			uint32_t (**dump)[2], uint32_t *n_regs)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t i = 0, reg;
+#define HQD_N_REGS 56
+#define DUMP_REG(addr) do {				\
+		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
+			break;				\
+		(*dump)[i][0] = (addr) << 2;		\
+		(*dump)[i++][1] = RREG32(addr);		\
+	} while (0)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	acquire_queue(kgd, pipe_id, queue_id);
+
+	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
+	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
+		DUMP_REG(reg);
+
+	release_queue(kgd);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
+			     uint32_t __user *wptr, struct mm_struct *mm)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct v9_sdma_mqd *m;
+	uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
+	unsigned long end_jiffies;
+	uint32_t data;
+	uint64_t data64;
+	uint64_t __user *wptr64 = (uint64_t __user *)wptr;
+
+	m = get_sdma_mqd(mqd);
+	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+					    m->sdma_queue_id);
+	sdmax_gfx_context_cntl = m->sdma_engine_id ?
+		SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
+		SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
+
+	end_jiffies = msecs_to_jiffies(2000) + jiffies;
+	while (true) {
+		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+			break;
+		if (time_after(jiffies, end_jiffies))
+			return -ETIME;
+		usleep_range(500, 1000);
+	}
+	data = RREG32(sdmax_gfx_context_cntl);
+	data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
+			     RESUME_CTX, 0);
+	WREG32(sdmax_gfx_context_cntl, data);
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
+	       m->sdmax_rlcx_doorbell_offset);
+
+	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
+			     ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
+				m->sdmax_rlcx_rb_rptr_hi);
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
+	if (read_user_wptr(mm, wptr64, data64)) {
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		       lower_32_bits(data64));
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
+		       upper_32_bits(data64));
+	} else {
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		       m->sdmax_rlcx_rb_rptr);
+		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
+		       m->sdmax_rlcx_rb_rptr_hi);
+	}
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+			m->sdmax_rlcx_rb_base_hi);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
+			m->sdmax_rlcx_rb_rptr_addr_lo);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+			m->sdmax_rlcx_rb_rptr_addr_hi);
+
+	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
+			     RB_ENABLE, 1);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+
+	return 0;
+}
+
+static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
+			     uint32_t engine_id, uint32_t queue_id,
+			     uint32_t (**dump)[2], uint32_t *n_regs)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id);
+	uint32_t i = 0, reg;
+#undef HQD_N_REGS
+#define HQD_N_REGS (19+6+7+10)
+
+	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
+	if (*dump == NULL)
+		return -ENOMEM;
+
+	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
+		DUMP_REG(sdma_base_addr + reg);
+	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
+		DUMP_REG(sdma_base_addr + reg);
+	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
+	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
+		DUMP_REG(sdma_base_addr + reg);
+	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
+	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
+		DUMP_REG(sdma_base_addr + reg);
+
+	WARN_ON_ONCE(i != HQD_N_REGS);
+	*n_regs = i;
+
+	return 0;
+}
+
+static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
+				uint32_t pipe_id, uint32_t queue_id)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t act;
+	bool retval = false;
+	uint32_t low, high;
+
+	acquire_queue(kgd, pipe_id, queue_id);
+	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
+	if (act) {
+		low = lower_32_bits(queue_address >> 8);
+		high = upper_32_bits(queue_address >> 8);
+
+		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
+		   high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
+			retval = true;
+	}
+	release_queue(kgd);
+	return retval;
+}
+
+static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct v9_sdma_mqd *m;
+	uint32_t sdma_base_addr;
+	uint32_t sdma_rlc_rb_cntl;
+
+	m = get_sdma_mqd(mqd);
+	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+					    m->sdma_queue_id);
+
+	sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
+
+	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
+		return true;
+
+	return false;
+}
+
+static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
+				enum kfd_preempt_type reset_type,
+				unsigned int utimeout, uint32_t pipe_id,
+				uint32_t queue_id)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	enum hqd_dequeue_request_type type;
+	unsigned long end_jiffies;
+	uint32_t temp;
+	struct v9_mqd *m = get_mqd(mqd);
+
+	acquire_queue(kgd, pipe_id, queue_id);
+
+	if (m->cp_hqd_vmid == 0)
+		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
+
+	switch (reset_type) {
+	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
+		type = DRAIN_PIPE;
+		break;
+	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
+		type = RESET_WAVES;
+		break;
+	default:
+		type = DRAIN_PIPE;
+		break;
+	}
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
+
+	end_jiffies = (utimeout * HZ / 1000) + jiffies;
+	while (true) {
+		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
+		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
+			break;
+		if (time_after(jiffies, end_jiffies)) {
+			pr_err("cp queue preemption time out.\n");
+			release_queue(kgd);
+			return -ETIME;
+		}
+		usleep_range(500, 1000);
+	}
+
+	release_queue(kgd);
+	return 0;
+}
+
+static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
+				unsigned int utimeout)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	struct v9_sdma_mqd *m;
+	uint32_t sdma_base_addr;
+	uint32_t temp;
+	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
+
+	m = get_sdma_mqd(mqd);
+	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+					    m->sdma_queue_id);
+
+	temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
+	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);
+
+	while (true) {
+		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
+			break;
+		if (time_after(jiffies, end_jiffies))
+			return -ETIME;
+		usleep_range(500, 1000);
+	}
+
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
+	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
+		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
+
+	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
+	m->sdmax_rlcx_rb_rptr_hi =
+		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);
+
+	return 0;
+}
+
+static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
+							uint8_t vmid)
+{
+	uint32_t reg;
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+
+	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
+		     + vmid);
+	return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
+}
+
+static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
+								uint8_t vmid)
+{
+	uint32_t reg;
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+
+	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
+		     + vmid);
+	return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
+}
+
+static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+	uint32_t req = (1 << vmid) |
+		(0 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* legacy */
+		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
+		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
+		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
+		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
+		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
+
+	mutex_lock(&adev->srbm_mutex);
+
+	/* Use legacy mode tlb invalidation.
+	 *
+	 * Currently on Raven the code below is broken for anything but
+	 * legacy mode due to a MMHUB power gating problem. A workaround
+	 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
+	 * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
+	 * bit.
+	 *
+	 * TODO 1: agree on the right set of invalidation registers for
+	 * KFD use. Use the last one for now. Invalidate both GC and
+	 * MMHUB.
+	 *
+	 * TODO 2: support range-based invalidation, requires kfg2kgd
+	 * interface change
+	 */
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
+				0xffffffff);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
+				0x0000001f);
+
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
+				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
+				0xffffffff);
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
+				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
+				0x0000001f);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
+
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
+				req);
+
+	while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
+					(1 << vmid)))
+		cpu_relax();
+
+	while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
+					mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
+					(1 << vmid)))
+		cpu_relax();
+
+	mutex_unlock(&adev->srbm_mutex);
+
+}
+
+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
+{
+	signed long r;
+	uint32_t seq;
+	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
+
+	spin_lock(&adev->gfx.kiq.ring_lock);
+	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/
+	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
+	amdgpu_ring_write(ring,
+			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
+			PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
+			PACKET3_INVALIDATE_TLBS_PASID(pasid) |
+			PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
+	amdgpu_fence_emit_polling(ring, &seq);
+	amdgpu_ring_commit(ring);
+	spin_unlock(&adev->gfx.kiq.ring_lock);
+
+	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+	if (r < 1) {
+		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+		return -ETIME;
+	}
+
+	return 0;
+}
+
+static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+	int vmid;
+	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
+
+	if (ring->ready)
+		return invalidate_tlbs_with_kiq(adev, pasid);
+
+	for (vmid = 0; vmid < 16; vmid++) {
+		if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
+			continue;
+		if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
+			if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
+				== pasid) {
+				write_vmid_invalidate_request(kgd, vmid);
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+
+	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
+		pr_err("non kfd vmid %d\n", vmid);
+		return 0;
+	}
+
+	write_vmid_invalidate_request(kgd, vmid);
+	return 0;
+}
+
+static int kgd_address_watch_disable(struct kgd_dev *kgd)
+{
+	return 0;
+}
+
+static int kgd_address_watch_execute(struct kgd_dev *kgd,
+					unsigned int watch_point_id,
+					uint32_t cntl_val,
+					uint32_t addr_hi,
+					uint32_t addr_lo)
+{
+	return 0;
+}
+
+static int kgd_wave_control_execute(struct kgd_dev *kgd,
+					uint32_t gfx_index_val,
+					uint32_t sq_cmd)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint32_t data = 0;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);
+
+	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
+		INSTANCE_BROADCAST_WRITES, 1);
+	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
+		SH_BROADCAST_WRITES, 1);
+	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
+		SE_BROADCAST_WRITES, 1);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return 0;
+}
+
+static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
+					unsigned int watch_point_id,
+					unsigned int reg_offset)
+{
+	return 0;
+}
+
+static void set_scratch_backing_va(struct kgd_dev *kgd,
+					uint64_t va, uint32_t vmid)
+{
+	/* No longer needed on GFXv9. The scratch base address is
+	 * passed to the shader by the CP. It's the user mode driver's
+	 * responsibility.
+	 */
+}
+
+/* FIXME: Does this need to be ASIC-specific code? */
+static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
+	const union amdgpu_firmware_header *hdr;
+
+	switch (type) {
+	case KGD_ENGINE_PFP:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
+		break;
+
+	case KGD_ENGINE_ME:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
+		break;
+
+	case KGD_ENGINE_CE:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
+		break;
+
+	case KGD_ENGINE_MEC1:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
+		break;
+
+	case KGD_ENGINE_MEC2:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
+		break;
+
+	case KGD_ENGINE_RLC:
+		hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
+		break;
+
+	case KGD_ENGINE_SDMA1:
+		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
+		break;
+
+	case KGD_ENGINE_SDMA2:
+		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
+		break;
+
+	default:
+		return 0;
+	}
+
+	if (hdr == NULL)
+		return 0;
+
+	/* Only 12 bit in use*/
+	return hdr->common.ucode_version;
+}
+
+static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
+		uint32_t page_table_base)
+{
+	struct amdgpu_device *adev = get_amdgpu_device(kgd);
+	uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
+		AMDGPU_PTE_VALID;
+
+	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
+		pr_err("trying to set page table base for wrong VMID %u\n",
+		       vmid);
+		return;
+	}
+
+	/* TODO: take advantage of per-process address space size. For
+	 * now, all processes share the same address space size, like
+	 * on GFX8 and older.
+	 */
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
+
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
+			lower_32_bits(adev->vm_manager.max_pfn - 1));
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
+			upper_32_bits(adev->vm_manager.max_pfn - 1));
+
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
+	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
+			lower_32_bits(adev->vm_manager.max_pfn - 1));
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
+			upper_32_bits(adev->vm_manager.max_pfn - 1));
+
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
+	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
+}
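
The 64-bit WPTR reconstruction in kgd_hqd_load() above is the trickiest arithmetic in this file, so here is the same logic lifted into a standalone, testable form. This is a sketch mirroring the patch's computation; guess_wptr() itself is not a driver function, and the field semantics are taken from the comment in kgd_hqd_load().

#include <stdint.h>
#include <stdio.h>

/* Rebuild a 64-bit write pointer from a 32-bit read pointer and the
 * saved 64-bit WPTR, exactly as kgd_hqd_load() does. queue_size must
 * be a power of two (2 << QUEUE_SIZE field of CP_HQD_PQ_CONTROL). */
static uint64_t guess_wptr(uint32_t rptr, uint32_t wptr_lo,
			   uint32_t wptr_hi, uint32_t queue_size)
{
	/* Low bits come from the RPTR: WPTR == RPTR at restart. */
	uint64_t guessed = rptr & (queue_size - 1);

	/* If the saved WPTR's low bits are behind the RPTR's, the
	 * WPTR wrapped past a queue-size boundary; add a queue size. */
	if ((wptr_lo & (queue_size - 1)) < guessed)
		guessed += queue_size;

	/* The remaining bits come from the saved 64-bit WPTR. */
	guessed += wptr_lo & ~(uint64_t)(queue_size - 1);
	guessed += (uint64_t)wptr_hi << 32;
	return guessed;
}

int main(void)
{
	/* 4 KiB ring, fully drained: RPTR and the saved WPTR share
	 * their low bits, so the guess returns the saved WPTR exactly.
	 * Prints 0x1ff8. */
	printf("0x%llx\n",
	       (unsigned long long)guess_wptr(0x1ff8, 0x1ff8, 0, 0x1000));
	return 0;
}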
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 1d6e1479da38..5296e24fd662 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -23,6 +23,7 @@
 #define pr_fmt(fmt) "kfd2kgd: " fmt
 
 #include <linux/list.h>
+#include <linux/sched/mm.h>
 #include <drm/drmP.h>
 #include "amdgpu_object.h"
 #include "amdgpu_vm.h"
@@ -33,10 +34,20 @@
  */
 #define VI_BO_SIZE_ALIGN (0x8000)
 
+/* BO flag to indicate a KFD userptr BO */
+#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
+
+/* Userptr restore delay, just long enough to allow consecutive VM
+ * changes to accumulate
+ */
+#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
+
 /* Impose limit on how much memory KFD can use */
 static struct {
 	uint64_t max_system_mem_limit;
+	uint64_t max_userptr_mem_limit;
 	int64_t system_mem_used;
+	int64_t userptr_mem_used;
 	spinlock_t mem_limit_lock;
 } kfd_mem_limit;
 
@@ -57,6 +68,7 @@ static const char * const domain_bit_to_string[] = {
 
 #define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
 
+static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
 
 
 static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
@@ -78,6 +90,7 @@ static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
 
 /* Set memory usage limits. Current, limits are
  * System (kernel) memory - 3/8th System RAM
+ * Userptr memory - 3/4th System RAM
  */
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
 {
@@ -90,8 +103,10 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
 
 	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
 	kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3);
-	pr_debug("Kernel memory limit %lluM\n",
-		(kfd_mem_limit.max_system_mem_limit >> 20));
+	kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 2);
+	pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n",
+		(kfd_mem_limit.max_system_mem_limit >> 20),
+		(kfd_mem_limit.max_userptr_mem_limit >> 20));
 }
 
 static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
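
The shift arithmetic above encodes the two fractions: (mem >> 1) - (mem >> 3) is 1/2 - 1/8 = 3/8 of system RAM for kernel memory, and mem - (mem >> 2) is 3/4 for userptrs. A quick standalone check of those values for a 16 GiB machine (illustrative only, not driver code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mem = 16ULL << 30;			/* 16 GiB of RAM */
	uint64_t system_limit = (mem >> 1) - (mem >> 3);	/* 3/8 */
	uint64_t userptr_limit = mem - (mem >> 2);		/* 3/4 */

	/* Prints: system limit 6144M, userptr limit 12288M */
	printf("system limit %lluM, userptr limit %lluM\n",
	       (unsigned long long)(system_limit >> 20),
	       (unsigned long long)(userptr_limit >> 20));
	return 0;
}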
@@ -111,6 +126,16 @@ static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev,
 			goto err_no_mem;
 		}
 		kfd_mem_limit.system_mem_used += (acc_size + size);
+	} else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
+		if ((kfd_mem_limit.system_mem_used + acc_size >
+			kfd_mem_limit.max_system_mem_limit) ||
+			(kfd_mem_limit.userptr_mem_used + (size + acc_size) >
+			kfd_mem_limit.max_userptr_mem_limit)) {
+			ret = -ENOMEM;
+			goto err_no_mem;
+		}
+		kfd_mem_limit.system_mem_used += acc_size;
+		kfd_mem_limit.userptr_mem_used += size;
 	}
 err_no_mem:
 	spin_unlock(&kfd_mem_limit.mem_limit_lock);
@@ -126,10 +151,16 @@ static void unreserve_system_mem_limit(struct amdgpu_device *adev,
 					sizeof(struct amdgpu_bo));
 
 	spin_lock(&kfd_mem_limit.mem_limit_lock);
-	if (domain == AMDGPU_GEM_DOMAIN_GTT)
+	if (domain == AMDGPU_GEM_DOMAIN_GTT) {
 		kfd_mem_limit.system_mem_used -= (acc_size + size);
+	} else if (domain == AMDGPU_GEM_DOMAIN_CPU) {
+		kfd_mem_limit.system_mem_used -= acc_size;
+		kfd_mem_limit.userptr_mem_used -= size;
+	}
 	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
 		  "kfd system memory accounting unbalanced");
+	WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
+		  "kfd userptr memory accounting unbalanced");
 
 	spin_unlock(&kfd_mem_limit.mem_limit_lock);
 }
@@ -138,12 +169,17 @@ void amdgpu_amdkfd_unreserve_system_memory_limit(struct amdgpu_bo *bo)
 {
 	spin_lock(&kfd_mem_limit.mem_limit_lock);
 
-	if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
+	if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
+		kfd_mem_limit.system_mem_used -= bo->tbo.acc_size;
+		kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo);
+	} else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) {
 		kfd_mem_limit.system_mem_used -=
 			(bo->tbo.acc_size + amdgpu_bo_size(bo));
 	}
 	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
 		  "kfd system memory accounting unbalanced");
+	WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0,
+		  "kfd userptr memory accounting unbalanced");
 
 	spin_unlock(&kfd_mem_limit.mem_limit_lock);
 }
@@ -506,7 +542,8 @@ static void remove_bo_from_vm(struct amdgpu_device *adev,
506} 542}
507 543
508static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, 544static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
509 struct amdkfd_process_info *process_info) 545 struct amdkfd_process_info *process_info,
546 bool userptr)
510{ 547{
511 struct ttm_validate_buffer *entry = &mem->validate_list; 548 struct ttm_validate_buffer *entry = &mem->validate_list;
512 struct amdgpu_bo *bo = mem->bo; 549 struct amdgpu_bo *bo = mem->bo;
@@ -515,10 +552,95 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
515 entry->shared = true; 552 entry->shared = true;
516 entry->bo = &bo->tbo; 553 entry->bo = &bo->tbo;
517 mutex_lock(&process_info->lock); 554 mutex_lock(&process_info->lock);
518 list_add_tail(&entry->head, &process_info->kfd_bo_list); 555 if (userptr)
556 list_add_tail(&entry->head, &process_info->userptr_valid_list);
557 else
558 list_add_tail(&entry->head, &process_info->kfd_bo_list);
519 mutex_unlock(&process_info->lock); 559 mutex_unlock(&process_info->lock);
520} 560}
521 561
562/* Initializes user pages. It registers the MMU notifier and validates
563 * the userptr BO in the GTT domain.
564 *
565 * The BO must already be on the userptr_valid_list. Otherwise an
566 * eviction and restore may happen that leaves the new BO unmapped
567 * with the user mode queues running.
568 *
569 * Takes the process_info->lock to protect against concurrent restore
570 * workers.
571 *
572 * Returns 0 for success, negative errno for errors.
573 */
574static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
575 uint64_t user_addr)
576{
577 struct amdkfd_process_info *process_info = mem->process_info;
578 struct amdgpu_bo *bo = mem->bo;
579 struct ttm_operation_ctx ctx = { true, false };
580 int ret = 0;
581
582 mutex_lock(&process_info->lock);
583
584 ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0);
585 if (ret) {
586 pr_err("%s: Failed to set userptr: %d\n", __func__, ret);
587 goto out;
588 }
589
590 ret = amdgpu_mn_register(bo, user_addr);
591 if (ret) {
592 pr_err("%s: Failed to register MMU notifier: %d\n",
593 __func__, ret);
594 goto out;
595 }
596
597 /* If no restore worker is running concurrently, user_pages
598 * should not be allocated
599 */
600 WARN(mem->user_pages, "Leaking user_pages array");
601
602 mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
603 sizeof(struct page *),
604 GFP_KERNEL | __GFP_ZERO);
605 if (!mem->user_pages) {
606 pr_err("%s: Failed to allocate pages array\n", __func__);
607 ret = -ENOMEM;
608 goto unregister_out;
609 }
610
611 ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages);
612 if (ret) {
613 pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
614 goto free_out;
615 }
616
617 amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages);
618
619 ret = amdgpu_bo_reserve(bo, true);
620 if (ret) {
621 pr_err("%s: Failed to reserve BO\n", __func__);
622 goto release_out;
623 }
624 amdgpu_ttm_placement_from_domain(bo, mem->domain);
625 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
626 if (ret)
627 pr_err("%s: failed to validate BO\n", __func__);
628 amdgpu_bo_unreserve(bo);
629
630release_out:
631 if (ret)
632 release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
633free_out:
634 kvfree(mem->user_pages);
635 mem->user_pages = NULL;
636unregister_out:
637 if (ret)
638 amdgpu_mn_unregister(bo);
639out:
640 mutex_unlock(&process_info->lock);
641 return ret;
642}
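init_user_pages() above uses the common kernel cleanup-label idiom: each failure jumps to the label that undoes only the steps already completed, and the if (ret) guards let the success path fall through the same labels without undoing anything. A toy sketch of that shape, with hypothetical step functions:

#include <stdio.h>

static int register_notifier(void) { return 0; }
static int pin_pages(void)         { return 0; }
static int validate(void)          { return 0; }
static void unpin_pages(void)         { puts("unpin pages"); }
static void unregister_notifier(void) { puts("unregister notifier"); }

static int init_resource(void)
{
	int ret;

	ret = register_notifier();
	if (ret)
		goto out;

	ret = pin_pages();
	if (ret)
		goto unpin_out;

	ret = validate();
	/* success (ret == 0) falls through the guarded labels below */

unpin_out:
	if (ret)
		unpin_pages();
	if (ret)
		unregister_notifier();
out:
	return ret;
}

int main(void) { return init_resource(); }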
643
522/* Reserving a BO and its page table BOs must happen atomically to 644/* Reserving a BO and its page table BOs must happen atomically to
523 * avoid deadlocks. Some operations update multiple VMs at once. Track 645 * avoid deadlocks. Some operations update multiple VMs at once. Track
524 * all the reservation info in a context structure. Optionally a sync 646 * all the reservation info in a context structure. Optionally a sync
@@ -748,7 +870,8 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
748} 870}
749 871
750static int map_bo_to_gpuvm(struct amdgpu_device *adev, 872static int map_bo_to_gpuvm(struct amdgpu_device *adev,
751 struct kfd_bo_va_list *entry, struct amdgpu_sync *sync) 873 struct kfd_bo_va_list *entry, struct amdgpu_sync *sync,
874 bool no_update_pte)
752{ 875{
753 int ret; 876 int ret;
754 877
@@ -762,6 +885,9 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
762 return ret; 885 return ret;
763 } 886 }
764 887
888 if (no_update_pte)
889 return 0;
890
765 ret = update_gpuvm_pte(adev, entry, sync); 891 ret = update_gpuvm_pte(adev, entry, sync);
766 if (ret) { 892 if (ret) {
767 pr_err("update_gpuvm_pte() failed\n"); 893 pr_err("update_gpuvm_pte() failed\n");
@@ -820,6 +946,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
820 mutex_init(&info->lock); 946 mutex_init(&info->lock);
821 INIT_LIST_HEAD(&info->vm_list_head); 947 INIT_LIST_HEAD(&info->vm_list_head);
822 INIT_LIST_HEAD(&info->kfd_bo_list); 948 INIT_LIST_HEAD(&info->kfd_bo_list);
949 INIT_LIST_HEAD(&info->userptr_valid_list);
950 INIT_LIST_HEAD(&info->userptr_inval_list);
823 951
824 info->eviction_fence = 952 info->eviction_fence =
825 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), 953 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
@@ -830,6 +958,11 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
830 goto create_evict_fence_fail; 958 goto create_evict_fence_fail;
831 } 959 }
832 960
961 info->pid = get_task_pid(current->group_leader, PIDTYPE_PID);
962 atomic_set(&info->evicted_bos, 0);
963 INIT_DELAYED_WORK(&info->restore_userptr_work,
964 amdgpu_amdkfd_restore_userptr_worker);
965
833 *process_info = info; 966 *process_info = info;
834 *ef = dma_fence_get(&info->eviction_fence->base); 967 *ef = dma_fence_get(&info->eviction_fence->base);
835 } 968 }
@@ -872,6 +1005,7 @@ reserve_pd_fail:
872 dma_fence_put(*ef); 1005 dma_fence_put(*ef);
873 *ef = NULL; 1006 *ef = NULL;
874 *process_info = NULL; 1007 *process_info = NULL;
1008 put_pid(info->pid);
875create_evict_fence_fail: 1009create_evict_fence_fail:
876 mutex_destroy(&info->lock); 1010 mutex_destroy(&info->lock);
877 kfree(info); 1011 kfree(info);
@@ -967,8 +1101,12 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
967 /* Release per-process resources when last compute VM is destroyed */ 1101 /* Release per-process resources when last compute VM is destroyed */
968 if (!process_info->n_vms) { 1102 if (!process_info->n_vms) {
969 WARN_ON(!list_empty(&process_info->kfd_bo_list)); 1103 WARN_ON(!list_empty(&process_info->kfd_bo_list));
1104 WARN_ON(!list_empty(&process_info->userptr_valid_list));
1105 WARN_ON(!list_empty(&process_info->userptr_inval_list));
970 1106
971 dma_fence_put(&process_info->eviction_fence->base); 1107 dma_fence_put(&process_info->eviction_fence->base);
1108 cancel_delayed_work_sync(&process_info->restore_userptr_work);
1109 put_pid(process_info->pid);
972 mutex_destroy(&process_info->lock); 1110 mutex_destroy(&process_info->lock);
973 kfree(process_info); 1111 kfree(process_info);
974 } 1112 }
@@ -1003,9 +1141,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
1003{ 1141{
1004 struct amdgpu_device *adev = get_amdgpu_device(kgd); 1142 struct amdgpu_device *adev = get_amdgpu_device(kgd);
1005 struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; 1143 struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
1144 uint64_t user_addr = 0;
1006 struct amdgpu_bo *bo; 1145 struct amdgpu_bo *bo;
1007 int byte_align; 1146 int byte_align;
1008 u32 alloc_domain; 1147 u32 domain, alloc_domain;
1009 u64 alloc_flags; 1148 u64 alloc_flags;
1010 uint32_t mapping_flags; 1149 uint32_t mapping_flags;
1011 int ret; 1150 int ret;
@@ -1014,14 +1153,21 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
1014 * Check on which domain to allocate BO 1153 * Check on which domain to allocate BO
1015 */ 1154 */
1016 if (flags & ALLOC_MEM_FLAGS_VRAM) { 1155 if (flags & ALLOC_MEM_FLAGS_VRAM) {
1017 alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; 1156 domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
1018 alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; 1157 alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED;
1019 alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? 1158 alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ?
1020 AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 1159 AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
1021 AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 1160 AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
1022 } else if (flags & ALLOC_MEM_FLAGS_GTT) { 1161 } else if (flags & ALLOC_MEM_FLAGS_GTT) {
1023 alloc_domain = AMDGPU_GEM_DOMAIN_GTT; 1162 domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
1163 alloc_flags = 0;
1164 } else if (flags & ALLOC_MEM_FLAGS_USERPTR) {
1165 domain = AMDGPU_GEM_DOMAIN_GTT;
1166 alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
1024 alloc_flags = 0; 1167 alloc_flags = 0;
1168 if (!offset || !*offset)
1169 return -EINVAL;
1170 user_addr = *offset;
1025 } else { 1171 } else {
1026 return -EINVAL; 1172 return -EINVAL;
1027 } 1173 }
@@ -1078,18 +1224,34 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
1078 } 1224 }
1079 bo->kfd_bo = *mem; 1225 bo->kfd_bo = *mem;
1080 (*mem)->bo = bo; 1226 (*mem)->bo = bo;
1227 if (user_addr)
1228 bo->flags |= AMDGPU_AMDKFD_USERPTR_BO;
1081 1229
1082 (*mem)->va = va; 1230 (*mem)->va = va;
1083 (*mem)->domain = alloc_domain; 1231 (*mem)->domain = domain;
1084 (*mem)->mapped_to_gpu_memory = 0; 1232 (*mem)->mapped_to_gpu_memory = 0;
1085 (*mem)->process_info = avm->process_info; 1233 (*mem)->process_info = avm->process_info;
1086 add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info); 1234 add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
1235
1236 if (user_addr) {
1237 ret = init_user_pages(*mem, current->mm, user_addr);
1238 if (ret) {
1239 mutex_lock(&avm->process_info->lock);
1240 list_del(&(*mem)->validate_list.head);
1241 mutex_unlock(&avm->process_info->lock);
1242 goto allocate_init_user_pages_failed;
1243 }
1244 }
1087 1245
1088 if (offset) 1246 if (offset)
1089 *offset = amdgpu_bo_mmap_offset(bo); 1247 *offset = amdgpu_bo_mmap_offset(bo);
1090 1248
1091 return 0; 1249 return 0;
1092 1250
1251allocate_init_user_pages_failed:
1252 amdgpu_bo_unref(&bo);
1253 /* Don't unreserve system mem limit twice */
1254 goto err_reserve_system_mem;
1093err_bo_create: 1255err_bo_create:
1094 unreserve_system_mem_limit(adev, size, alloc_domain); 1256 unreserve_system_mem_limit(adev, size, alloc_domain);
1095err_reserve_system_mem: 1257err_reserve_system_mem:
@@ -1122,12 +1284,24 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
1122 * be freed anyway 1284 * be freed anyway
1123 */ 1285 */
1124 1286
1287 /* No more MMU notifiers */
1288 amdgpu_mn_unregister(mem->bo);
1289
1125 /* Make sure restore workers don't access the BO any more */ 1290 /* Make sure restore workers don't access the BO any more */
1126 bo_list_entry = &mem->validate_list; 1291 bo_list_entry = &mem->validate_list;
1127 mutex_lock(&process_info->lock); 1292 mutex_lock(&process_info->lock);
1128 list_del(&bo_list_entry->head); 1293 list_del(&bo_list_entry->head);
1129 mutex_unlock(&process_info->lock); 1294 mutex_unlock(&process_info->lock);
1130 1295
1296 /* Free user pages if necessary */
1297 if (mem->user_pages) {
1298 pr_debug("%s: Freeing user_pages array\n", __func__);
1299 if (mem->user_pages[0])
1300 release_pages(mem->user_pages,
1301 mem->bo->tbo.ttm->num_pages);
1302 kvfree(mem->user_pages);
1303 }
1304
1131 ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); 1305 ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);
1132 if (unlikely(ret)) 1306 if (unlikely(ret))
1133 return ret; 1307 return ret;
@@ -1173,21 +1347,32 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
1173 struct kfd_bo_va_list *bo_va_entry = NULL; 1347 struct kfd_bo_va_list *bo_va_entry = NULL;
1174 struct kfd_bo_va_list *bo_va_entry_aql = NULL; 1348 struct kfd_bo_va_list *bo_va_entry_aql = NULL;
1175 unsigned long bo_size; 1349 unsigned long bo_size;
1176 1350 bool is_invalid_userptr = false;
1177 /* Make sure restore is not running concurrently.
1178 */
1179 mutex_lock(&mem->process_info->lock);
1180
1181 mutex_lock(&mem->lock);
1182 1351
1183 bo = mem->bo; 1352 bo = mem->bo;
1184
1185 if (!bo) { 1353 if (!bo) {
1186 pr_err("Invalid BO when mapping memory to GPU\n"); 1354 pr_err("Invalid BO when mapping memory to GPU\n");
1187 ret = -EINVAL; 1355 return -EINVAL;
1188 goto out;
1189 } 1356 }
1190 1357
1358 /* Make sure restore is not running concurrently. Since we
1359 * don't map invalid userptr BOs, we rely on the next restore
1360 * worker to do the mapping
1361 */
1362 mutex_lock(&mem->process_info->lock);
1363
1364 /* Lock mmap-sem. If we find an invalid userptr BO, we can be
1365 * sure that the MMU notifier is no longer running
1366 * concurrently and the queues are actually stopped
1367 */
1368 if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
1369 down_write(&current->mm->mmap_sem);
1370 is_invalid_userptr = atomic_read(&mem->invalid);
1371 up_write(&current->mm->mmap_sem);
1372 }
1373
1374 mutex_lock(&mem->lock);
1375
1191 domain = mem->domain; 1376 domain = mem->domain;
1192 bo_size = bo->tbo.mem.size; 1377 bo_size = bo->tbo.mem.size;
1193 1378
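The write-lock trick above works because the MMU notifier paths run with mmap_sem held, so briefly taking the write side guarantees no invalidation is in flight while the flag is sampled. A toy userspace model of that idea, with a pthread rwlock standing in for mmap_sem and hypothetical names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int invalid;

static bool sample_invalid_stably(void)
{
	bool is_invalid;

	pthread_rwlock_wrlock(&mmap_sem);	/* excludes concurrent notifiers */
	is_invalid = atomic_load(&invalid) != 0;
	pthread_rwlock_unlock(&mmap_sem);
	return is_invalid;
}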
@@ -1200,6 +1385,14 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
1200 if (unlikely(ret)) 1385 if (unlikely(ret))
1201 goto out; 1386 goto out;
1202 1387
1388 /* Userptr can be marked as "not invalid", but not actually be
1389 * validated yet (still in the system domain). In that case
1390 * the queues are still stopped and we can leave mapping for
1391 * the next restore worker
1392 */
1393 if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
1394 is_invalid_userptr = true;
1395
1203 if (check_if_add_bo_to_vm(avm, mem)) { 1396 if (check_if_add_bo_to_vm(avm, mem)) {
1204 ret = add_bo_to_vm(adev, mem, avm, false, 1397 ret = add_bo_to_vm(adev, mem, avm, false,
1205 &bo_va_entry); 1398 &bo_va_entry);
@@ -1217,7 +1410,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
1217 goto add_bo_to_vm_failed; 1410 goto add_bo_to_vm_failed;
1218 } 1411 }
1219 1412
1220 if (mem->mapped_to_gpu_memory == 0) { 1413 if (mem->mapped_to_gpu_memory == 0 &&
1414 !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
1221 /* Validate BO only once. The eviction fence gets added to BO 1415 /* Validate BO only once. The eviction fence gets added to BO
1222 * the first time it is mapped. Validate will wait for all 1416 * the first time it is mapped. Validate will wait for all
1223 * background evictions to complete. 1417 * background evictions to complete.
@@ -1235,7 +1429,8 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
1235 entry->va, entry->va + bo_size, 1429 entry->va, entry->va + bo_size,
1236 entry); 1430 entry);
1237 1431
1238 ret = map_bo_to_gpuvm(adev, entry, ctx.sync); 1432 ret = map_bo_to_gpuvm(adev, entry, ctx.sync,
1433 is_invalid_userptr);
1239 if (ret) { 1434 if (ret) {
1240 pr_err("Failed to map radeon bo to gpuvm\n"); 1435 pr_err("Failed to map radeon bo to gpuvm\n");
1241 goto map_bo_to_gpuvm_failed; 1436 goto map_bo_to_gpuvm_failed;
@@ -1418,6 +1613,337 @@ bo_reserve_failed:
1418 return ret; 1613 return ret;
1419} 1614}
1420 1615
1616/* Evict a userptr BO by stopping the queues if necessary
1617 *
1618 * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
1619 * cannot do any memory allocations, and cannot take any locks that
1620 * are held elsewhere while allocating memory. Therefore this is as
1621 * simple as possible, using atomic counters.
1622 *
1623 * It doesn't do anything to the BO itself. The real work happens in
1624 * restore, where we get updated page addresses. This function only
1625 * ensures that GPU access to the BO is stopped.
1626 */
1627int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
1628 struct mm_struct *mm)
1629{
1630 struct amdkfd_process_info *process_info = mem->process_info;
1631 int invalid, evicted_bos;
1632 int r = 0;
1633
1634 invalid = atomic_inc_return(&mem->invalid);
1635 evicted_bos = atomic_inc_return(&process_info->evicted_bos);
1636 if (evicted_bos == 1) {
1637 /* First eviction, stop the queues */
1638 r = kgd2kfd->quiesce_mm(mm);
1639 if (r)
1640 pr_err("Failed to quiesce KFD\n");
1641 schedule_delayed_work(&process_info->restore_userptr_work,
1642 msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
1643 }
1644
1645 return r;
1646}
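The counter gives a lock-free "first one does the work" gate: only the increment that moves evicted_bos from 0 to 1 quiesces the queues and schedules the restore worker. A sketch using C11 atomics in place of the kernel's atomic_inc_return(), names hypothetical:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int evicted_bos;

static void evict_one(void)
{
	/* atomic_fetch_add returns the old value; +1 mirrors inc_return */
	if (atomic_fetch_add(&evicted_bos, 1) + 1 == 1) {
		/* first eviction: quiesce queues, schedule restore work */
		puts("quiesce user mode queues");
	}
}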
1647
1648/* Update invalid userptr BOs
1649 *
1650 * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
1651 * userptr_inval_list and updates user pages for all BOs that have
1652 * been invalidated since their last update.
1653 */
1654static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
1655 struct mm_struct *mm)
1656{
1657 struct kgd_mem *mem, *tmp_mem;
1658 struct amdgpu_bo *bo;
1659 struct ttm_operation_ctx ctx = { false, false };
1660 int invalid, ret;
1661
1662 /* Move all invalidated BOs to the userptr_inval_list and
1663 * release their user pages by migration to the CPU domain
1664 */
1665 list_for_each_entry_safe(mem, tmp_mem,
1666 &process_info->userptr_valid_list,
1667 validate_list.head) {
1668 if (!atomic_read(&mem->invalid))
1669 continue; /* BO is still valid */
1670
1671 bo = mem->bo;
1672
1673 if (amdgpu_bo_reserve(bo, true))
1674 return -EAGAIN;
1675 amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
1676 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
1677 amdgpu_bo_unreserve(bo);
1678 if (ret) {
1679 pr_err("%s: Failed to invalidate userptr BO\n",
1680 __func__);
1681 return -EAGAIN;
1682 }
1683
1684 list_move_tail(&mem->validate_list.head,
1685 &process_info->userptr_inval_list);
1686 }
1687
1688 if (list_empty(&process_info->userptr_inval_list))
1689 return 0; /* All evicted userptr BOs were freed */
1690
1691 /* Go through userptr_inval_list and update any invalid user_pages */
1692 list_for_each_entry(mem, &process_info->userptr_inval_list,
1693 validate_list.head) {
1694 invalid = atomic_read(&mem->invalid);
1695 if (!invalid)
1696 /* BO hasn't been invalidated since the last
1697 * revalidation attempt. Keep its BO list.
1698 */
1699 continue;
1700
1701 bo = mem->bo;
1702
1703 if (!mem->user_pages) {
1704 mem->user_pages =
1705 kvmalloc_array(bo->tbo.ttm->num_pages,
1706 sizeof(struct page *),
1707 GFP_KERNEL | __GFP_ZERO);
1708 if (!mem->user_pages) {
1709 pr_err("%s: Failed to allocate pages array\n",
1710 __func__);
1711 return -ENOMEM;
1712 }
1713 } else if (mem->user_pages[0]) {
1714 release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
1715 }
1716
1717 /* Get updated user pages */
1718 ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
1719 mem->user_pages);
1720 if (ret) {
1721 mem->user_pages[0] = NULL;
1722 pr_info("%s: Failed to get user pages: %d\n",
1723 __func__, ret);
1724 /* Pretend it succeeded. It will fail later
1725 * with a VM fault if the GPU tries to access
1726 * it. Better than hanging indefinitely with
1727 * stalled user mode queues.
1728 */
1729 }
1730
1731 /* Mark the BO as valid unless it was invalidated
1732 * again concurrently
1733 */
1734 if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid)
1735 return -EAGAIN;
1736 }
1737
1738 return 0;
1739}
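The final cmpxchg is what makes the revalidation safe against races: mem->invalid is cleared only if it still holds the value sampled before the pages were updated, and any concurrent invalidation forces -EAGAIN. A minimal C11 sketch of that check:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int invalid;

static bool try_mark_valid(int sampled)
{
	int expected = sampled;

	/* fails if the BO was invalidated again in the meantime */
	return atomic_compare_exchange_strong(&invalid, &expected, 0);
}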
1740
1741/* Validate invalid userptr BOs
1742 *
1743 * Validates BOs on the userptr_inval_list, and moves them back to the
1744 * userptr_valid_list. Also updates GPUVM page tables with new page
1745 * addresses and waits for the page table updates to complete.
1746 */
1747static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
1748{
1749 struct amdgpu_bo_list_entry *pd_bo_list_entries;
1750 struct list_head resv_list, duplicates;
1751 struct ww_acquire_ctx ticket;
1752 struct amdgpu_sync sync;
1753
1754 struct amdgpu_vm *peer_vm;
1755 struct kgd_mem *mem, *tmp_mem;
1756 struct amdgpu_bo *bo;
1757 struct ttm_operation_ctx ctx = { false, false };
1758 int i, ret;
1759
1760 pd_bo_list_entries = kcalloc(process_info->n_vms,
1761 sizeof(struct amdgpu_bo_list_entry),
1762 GFP_KERNEL);
1763 if (!pd_bo_list_entries) {
1764 pr_err("%s: Failed to allocate PD BO list entries\n", __func__);
1765 return -ENOMEM;
1766 }
1767
1768 INIT_LIST_HEAD(&resv_list);
1769 INIT_LIST_HEAD(&duplicates);
1770
1771 /* Get all the page directory BOs that need to be reserved */
1772 i = 0;
1773 list_for_each_entry(peer_vm, &process_info->vm_list_head,
1774 vm_list_node)
1775 amdgpu_vm_get_pd_bo(peer_vm, &resv_list,
1776 &pd_bo_list_entries[i++]);
1777 /* Add the userptr_inval_list entries to resv_list */
1778 list_for_each_entry(mem, &process_info->userptr_inval_list,
1779 validate_list.head) {
1780 list_add_tail(&mem->resv_list.head, &resv_list);
1781 mem->resv_list.bo = mem->validate_list.bo;
1782 mem->resv_list.shared = mem->validate_list.shared;
1783 }
1784
1785 /* Reserve all BOs and page tables for validation */
1786 ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
1787 WARN(!list_empty(&duplicates), "Duplicates should be empty");
1788 if (ret)
1789 goto out;
1790
1791 amdgpu_sync_create(&sync);
1792
1793 /* Avoid triggering eviction fences when unmapping invalid
1794 * userptr BOs (waits for all fences, doesn't use
1795 * FENCE_OWNER_VM)
1796 */
1797 list_for_each_entry(peer_vm, &process_info->vm_list_head,
1798 vm_list_node)
1799 amdgpu_amdkfd_remove_eviction_fence(peer_vm->root.base.bo,
1800 process_info->eviction_fence,
1801 NULL, NULL);
1802
1803 ret = process_validate_vms(process_info);
1804 if (ret)
1805 goto unreserve_out;
1806
1807 /* Validate BOs and update GPUVM page tables */
1808 list_for_each_entry_safe(mem, tmp_mem,
1809 &process_info->userptr_inval_list,
1810 validate_list.head) {
1811 struct kfd_bo_va_list *bo_va_entry;
1812
1813 bo = mem->bo;
1814
1815 /* Copy pages array and validate the BO if we got user pages */
1816 if (mem->user_pages[0]) {
1817 amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
1818 mem->user_pages);
1819 amdgpu_ttm_placement_from_domain(bo, mem->domain);
1820 ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
1821 if (ret) {
1822 pr_err("%s: failed to validate BO\n", __func__);
1823 goto unreserve_out;
1824 }
1825 }
1826
1827 /* Validate succeeded, now the BO owns the pages, free
1828 * our copy of the pointer array. Put this BO back on
1829 * the userptr_valid_list. If we need to revalidate
1830 * it, we need to start from scratch.
1831 */
1832 kvfree(mem->user_pages);
1833 mem->user_pages = NULL;
1834 list_move_tail(&mem->validate_list.head,
1835 &process_info->userptr_valid_list);
1836
1837 /* Update mapping. If the BO was not validated
1838 * (because we couldn't get user pages), this will
1839 * clear the page table entries, which will result in
1840 * VM faults if the GPU tries to access the invalid
1841 * memory.
1842 */
1843 list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
1844 if (!bo_va_entry->is_mapped)
1845 continue;
1846
1847 ret = update_gpuvm_pte((struct amdgpu_device *)
1848 bo_va_entry->kgd_dev,
1849 bo_va_entry, &sync);
1850 if (ret) {
1851 pr_err("%s: update PTE failed\n", __func__);
1852 /* make sure this gets validated again */
1853 atomic_inc(&mem->invalid);
1854 goto unreserve_out;
1855 }
1856 }
1857 }
1858
1859 /* Update page directories */
1860 ret = process_update_pds(process_info, &sync);
1861
1862unreserve_out:
1863 list_for_each_entry(peer_vm, &process_info->vm_list_head,
1864 vm_list_node)
1865 amdgpu_bo_fence(peer_vm->root.base.bo,
1866 &process_info->eviction_fence->base, true);
1867 ttm_eu_backoff_reservation(&ticket, &resv_list);
1868 amdgpu_sync_wait(&sync, false);
1869 amdgpu_sync_free(&sync);
1870out:
1871 kfree(pd_bo_list_entries);
1872
1873 return ret;
1874}
1875
1876/* Worker callback to restore evicted userptr BOs
1877 *
1878 * Tries to update and validate all userptr BOs. If successful and no
1879 * concurrent evictions happened, the queues are restarted. Otherwise,
1880 * reschedule for another attempt later.
1881 */
1882static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
1883{
1884 struct delayed_work *dwork = to_delayed_work(work);
1885 struct amdkfd_process_info *process_info =
1886 container_of(dwork, struct amdkfd_process_info,
1887 restore_userptr_work);
1888 struct task_struct *usertask;
1889 struct mm_struct *mm;
1890 int evicted_bos;
1891
1892 evicted_bos = atomic_read(&process_info->evicted_bos);
1893 if (!evicted_bos)
1894 return;
1895
1896 /* Reference task and mm in case of concurrent process termination */
1897 usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
1898 if (!usertask)
1899 return;
1900 mm = get_task_mm(usertask);
1901 if (!mm) {
1902 put_task_struct(usertask);
1903 return;
1904 }
1905
1906 mutex_lock(&process_info->lock);
1907
1908 if (update_invalid_user_pages(process_info, mm))
1909 goto unlock_out;
1910 /* userptr_inval_list can be empty if all evicted userptr BOs
1911 * have been freed. In that case there is nothing to validate
1912 * and we can just restart the queues.
1913 */
1914 if (!list_empty(&process_info->userptr_inval_list)) {
1915 if (atomic_read(&process_info->evicted_bos) != evicted_bos)
1916 goto unlock_out; /* Concurrent eviction, try again */
1917
1918 if (validate_invalid_user_pages(process_info))
1919 goto unlock_out;
1920 }
 1921 /* Final check for concurrent eviction and atomic update. If
1922 * another eviction happens after successful update, it will
1923 * be a first eviction that calls quiesce_mm. The eviction
1924 * reference counting inside KFD will handle this case.
1925 */
1926 if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
1927 evicted_bos)
1928 goto unlock_out;
1929 evicted_bos = 0;
1930 if (kgd2kfd->resume_mm(mm)) {
1931 pr_err("%s: Failed to resume KFD\n", __func__);
1932 /* No recovery from this failure. Probably the CP is
1933 * hanging. No point trying again.
1934 */
1935 }
1936unlock_out:
1937 mutex_unlock(&process_info->lock);
1938 mmput(mm);
1939 put_task_struct(usertask);
1940
1941 /* If validation failed, reschedule another attempt */
1942 if (evicted_bos)
1943 schedule_delayed_work(&process_info->restore_userptr_work,
1944 msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
1945}
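The worker's exit handshake follows the same pattern at process scope: the queues are resumed only if evicted_bos is unchanged since it was sampled; otherwise the delayed work is rescheduled. A condensed sketch of that control flow, with hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int evicted_bos;

/* Returns true when it is safe to resume the queues; false means the
 * caller should reschedule another restore attempt.
 */
static bool restore_attempt(bool (*restore_bos)(void))
{
	int sampled = atomic_load(&evicted_bos);

	if (!sampled)
		return true;		/* nothing was evicted */
	if (!restore_bos())
		return false;		/* validation failed: retry later */
	/* losing this race means a concurrent eviction: retry later */
	return atomic_compare_exchange_strong(&evicted_bos, &sampled, 0);
}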
1946
1421/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given 1947/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
1422 * KFD process identified by process_info 1948 * KFD process identified by process_info
1423 * 1949 *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index dc34b50e6b29..8e66f3702b7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -536,7 +536,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
536 if (p->bo_list) { 536 if (p->bo_list) {
537 amdgpu_bo_list_get_list(p->bo_list, &p->validated); 537 amdgpu_bo_list_get_list(p->bo_list, &p->validated);
538 if (p->bo_list->first_userptr != p->bo_list->num_entries) 538 if (p->bo_list->first_userptr != p->bo_list->num_entries)
539 p->mn = amdgpu_mn_get(p->adev); 539 p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX);
540 } 540 }
541 541
542 INIT_LIST_HEAD(&duplicates); 542 INIT_LIST_HEAD(&duplicates);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index bd67f4cb8e6c..83e344fbb50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -36,12 +36,14 @@
36#include <drm/drm.h> 36#include <drm/drm.h>
37 37
38#include "amdgpu.h" 38#include "amdgpu.h"
39#include "amdgpu_amdkfd.h"
39 40
40struct amdgpu_mn { 41struct amdgpu_mn {
41 /* constant after initialisation */ 42 /* constant after initialisation */
42 struct amdgpu_device *adev; 43 struct amdgpu_device *adev;
43 struct mm_struct *mm; 44 struct mm_struct *mm;
44 struct mmu_notifier mn; 45 struct mmu_notifier mn;
46 enum amdgpu_mn_type type;
45 47
46 /* only used on destruction */ 48 /* only used on destruction */
47 struct work_struct work; 49 struct work_struct work;
@@ -185,7 +187,7 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
185} 187}
186 188
187/** 189/**
188 * amdgpu_mn_invalidate_range_start - callback to notify about mm change 190 * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change
189 * 191 *
190 * @mn: our notifier 192 * @mn: our notifier
191 * @mm: the mm this callback is about 193 * @mm: the mm this callback is about
@@ -195,10 +197,10 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
195 * We block for all BOs between start and end to be idle and 197 * We block for all BOs between start and end to be idle and
196 * unmap them by moving them into the system domain again. 198 * unmap them by moving them into the system domain again.
197 */ 199 */
198static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, 200static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
199 struct mm_struct *mm, 201 struct mm_struct *mm,
200 unsigned long start, 202 unsigned long start,
201 unsigned long end) 203 unsigned long end)
202{ 204{
203 struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); 205 struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
204 struct interval_tree_node *it; 206 struct interval_tree_node *it;
@@ -220,6 +222,49 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
220} 222}
221 223
222/** 224/**
225 * amdgpu_mn_invalidate_range_start_hsa - callback to notify about mm change
226 *
227 * @mn: our notifier
 228 * @mm: the mm this callback is about
229 * @start: start of updated range
230 * @end: end of updated range
231 *
232 * We temporarily evict all BOs between start and end. This
233 * necessitates evicting all user-mode queues of the process. The BOs
 234 * are restored in amdgpu_mn_invalidate_range_end_hsa.
235 */
236static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
237 struct mm_struct *mm,
238 unsigned long start,
239 unsigned long end)
240{
241 struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
242 struct interval_tree_node *it;
243
244 /* notification is exclusive, but interval is inclusive */
245 end -= 1;
246
247 amdgpu_mn_read_lock(rmn);
248
249 it = interval_tree_iter_first(&rmn->objects, start, end);
250 while (it) {
251 struct amdgpu_mn_node *node;
252 struct amdgpu_bo *bo;
253
254 node = container_of(it, struct amdgpu_mn_node, it);
255 it = interval_tree_iter_next(it, start, end);
256
257 list_for_each_entry(bo, &node->bos, mn_list) {
258 struct kgd_mem *mem = bo->kfd_bo;
259
260 if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
261 start, end))
262 amdgpu_amdkfd_evict_userptr(mem, mm);
263 }
264 }
265}
266
267/**
223 * amdgpu_mn_invalidate_range_end - callback to notify about mm change 268 * amdgpu_mn_invalidate_range_end - callback to notify about mm change
224 * 269 *
225 * @mn: our notifier 270 * @mn: our notifier
@@ -239,23 +284,39 @@ static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn,
239 amdgpu_mn_read_unlock(rmn); 284 amdgpu_mn_read_unlock(rmn);
240} 285}
241 286
242static const struct mmu_notifier_ops amdgpu_mn_ops = { 287static const struct mmu_notifier_ops amdgpu_mn_ops[] = {
243 .release = amdgpu_mn_release, 288 [AMDGPU_MN_TYPE_GFX] = {
244 .invalidate_range_start = amdgpu_mn_invalidate_range_start, 289 .release = amdgpu_mn_release,
245 .invalidate_range_end = amdgpu_mn_invalidate_range_end, 290 .invalidate_range_start = amdgpu_mn_invalidate_range_start_gfx,
291 .invalidate_range_end = amdgpu_mn_invalidate_range_end,
292 },
293 [AMDGPU_MN_TYPE_HSA] = {
294 .release = amdgpu_mn_release,
295 .invalidate_range_start = amdgpu_mn_invalidate_range_start_hsa,
296 .invalidate_range_end = amdgpu_mn_invalidate_range_end,
297 },
246}; 298};
247 299
300/* Low bits of any reasonable mm pointer will be unused due to struct
301 * alignment. Use these bits to make a unique key from the mm pointer
302 * and notifier type.
303 */
304#define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type))
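A small illustration of why this key is collision-free: any mm_struct pointer is at least word-aligned, so its low bits are zero and can absorb the two-valued type enum. Sketch with local stand-in names:

#include <assert.h>
#include <stdint.h>

enum mn_type { MN_TYPE_GFX, MN_TYPE_HSA };	/* local stand-ins */

static unsigned long mn_key(const void *mm, enum mn_type type)
{
	/* alignment guarantees the low bit of the pointer is free */
	assert(((uintptr_t)mm & 1) == 0);
	return (unsigned long)(uintptr_t)mm + (unsigned long)type;
}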
305
248/** 306/**
249 * amdgpu_mn_get - create notifier context 307 * amdgpu_mn_get - create notifier context
250 * 308 *
251 * @adev: amdgpu device pointer 309 * @adev: amdgpu device pointer
310 * @type: type of MMU notifier context
252 * 311 *
253 * Creates a notifier context for current->mm. 312 * Creates a notifier context for current->mm.
254 */ 313 */
255struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) 314struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
315 enum amdgpu_mn_type type)
256{ 316{
257 struct mm_struct *mm = current->mm; 317 struct mm_struct *mm = current->mm;
258 struct amdgpu_mn *rmn; 318 struct amdgpu_mn *rmn;
319 unsigned long key = AMDGPU_MN_KEY(mm, type);
259 int r; 320 int r;
260 321
261 mutex_lock(&adev->mn_lock); 322 mutex_lock(&adev->mn_lock);
@@ -264,8 +325,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
264 return ERR_PTR(-EINTR); 325 return ERR_PTR(-EINTR);
265 } 326 }
266 327
267 hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) 328 hash_for_each_possible(adev->mn_hash, rmn, node, key)
268 if (rmn->mm == mm) 329 if (AMDGPU_MN_KEY(rmn->mm, rmn->type) == key)
269 goto release_locks; 330 goto release_locks;
270 331
271 rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); 332 rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
@@ -276,8 +337,9 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
276 337
277 rmn->adev = adev; 338 rmn->adev = adev;
278 rmn->mm = mm; 339 rmn->mm = mm;
279 rmn->mn.ops = &amdgpu_mn_ops;
280 init_rwsem(&rmn->lock); 340 init_rwsem(&rmn->lock);
341 rmn->type = type;
342 rmn->mn.ops = &amdgpu_mn_ops[type];
281 rmn->objects = RB_ROOT_CACHED; 343 rmn->objects = RB_ROOT_CACHED;
282 mutex_init(&rmn->read_lock); 344 mutex_init(&rmn->read_lock);
283 atomic_set(&rmn->recursion, 0); 345 atomic_set(&rmn->recursion, 0);
@@ -286,7 +348,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
286 if (r) 348 if (r)
287 goto free_rmn; 349 goto free_rmn;
288 350
289 hash_add(adev->mn_hash, &rmn->node, (unsigned long)mm); 351 hash_add(adev->mn_hash, &rmn->node, AMDGPU_MN_KEY(mm, type));
290 352
291release_locks: 353release_locks:
292 up_write(&mm->mmap_sem); 354 up_write(&mm->mmap_sem);
@@ -315,15 +377,21 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
315{ 377{
316 unsigned long end = addr + amdgpu_bo_size(bo) - 1; 378 unsigned long end = addr + amdgpu_bo_size(bo) - 1;
317 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 379 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
380 enum amdgpu_mn_type type =
381 bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX;
318 struct amdgpu_mn *rmn; 382 struct amdgpu_mn *rmn;
319 struct amdgpu_mn_node *node = NULL; 383 struct amdgpu_mn_node *node = NULL, *new_node;
320 struct list_head bos; 384 struct list_head bos;
321 struct interval_tree_node *it; 385 struct interval_tree_node *it;
322 386
323 rmn = amdgpu_mn_get(adev); 387 rmn = amdgpu_mn_get(adev, type);
324 if (IS_ERR(rmn)) 388 if (IS_ERR(rmn))
325 return PTR_ERR(rmn); 389 return PTR_ERR(rmn);
326 390
391 new_node = kmalloc(sizeof(*new_node), GFP_KERNEL);
392 if (!new_node)
393 return -ENOMEM;
394
327 INIT_LIST_HEAD(&bos); 395 INIT_LIST_HEAD(&bos);
328 396
329 down_write(&rmn->lock); 397 down_write(&rmn->lock);
@@ -337,13 +405,10 @@ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
337 list_splice(&node->bos, &bos); 405 list_splice(&node->bos, &bos);
338 } 406 }
339 407
340 if (!node) { 408 if (!node)
341 node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); 409 node = new_node;
342 if (!node) { 410 else
343 up_write(&rmn->lock); 411 kfree(new_node);
344 return -ENOMEM;
345 }
346 }
347 412
348 bo->mn = rmn; 413 bo->mn = rmn;
349 414
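The restructuring above is the usual allocate-before-lock pattern: the node is allocated up front so the critical section never has to unwind on allocation failure, and the spare is freed when an overlapping node already exists. A toy userspace sketch, with a pthread rwlock in place of rmn->lock and a single pointer standing in for the interval-tree lookup:

#include <pthread.h>
#include <stdlib.h>

struct node { int placeholder; };

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static struct node *existing;	/* stand-in for the interval-tree lookup */

static int register_node(void)
{
	struct node *new_node = malloc(sizeof(*new_node));
	struct node *node;

	if (!new_node)
		return -1;	/* fail before taking the lock */

	pthread_rwlock_wrlock(&lock);
	node = existing;
	if (!node)
		node = new_node;	/* use the preallocated node */
	else
		free(new_node);		/* overlap found: spare unused */
	existing = node;
	pthread_rwlock_unlock(&lock);
	return 0;
}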
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
index d0095a3793b8..eb0f432f78fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
@@ -29,16 +29,23 @@
29 */ 29 */
30struct amdgpu_mn; 30struct amdgpu_mn;
31 31
32enum amdgpu_mn_type {
33 AMDGPU_MN_TYPE_GFX,
34 AMDGPU_MN_TYPE_HSA,
35};
36
32#if defined(CONFIG_MMU_NOTIFIER) 37#if defined(CONFIG_MMU_NOTIFIER)
33void amdgpu_mn_lock(struct amdgpu_mn *mn); 38void amdgpu_mn_lock(struct amdgpu_mn *mn);
34void amdgpu_mn_unlock(struct amdgpu_mn *mn); 39void amdgpu_mn_unlock(struct amdgpu_mn *mn);
35struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev); 40struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
41 enum amdgpu_mn_type type);
36int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); 42int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
37void amdgpu_mn_unregister(struct amdgpu_bo *bo); 43void amdgpu_mn_unregister(struct amdgpu_bo *bo);
38#else 44#else
39static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} 45static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {}
40static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} 46static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {}
41static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) 47static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
48 enum amdgpu_mn_type type)
42{ 49{
43 return NULL; 50 return NULL;
44} 51}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 205da3ff9cd0..c713d30cba86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -695,7 +695,7 @@ struct amdgpu_ttm_tt {
695 struct ttm_dma_tt ttm; 695 struct ttm_dma_tt ttm;
696 u64 offset; 696 u64 offset;
697 uint64_t userptr; 697 uint64_t userptr;
698 struct mm_struct *usermm; 698 struct task_struct *usertask;
699 uint32_t userflags; 699 uint32_t userflags;
700 spinlock_t guptasklock; 700 spinlock_t guptasklock;
701 struct list_head guptasks; 701 struct list_head guptasks;
@@ -706,14 +706,18 @@ struct amdgpu_ttm_tt {
706int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) 706int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
707{ 707{
708 struct amdgpu_ttm_tt *gtt = (void *)ttm; 708 struct amdgpu_ttm_tt *gtt = (void *)ttm;
709 struct mm_struct *mm = gtt->usertask->mm;
709 unsigned int flags = 0; 710 unsigned int flags = 0;
710 unsigned pinned = 0; 711 unsigned pinned = 0;
711 int r; 712 int r;
712 713
714 if (!mm) /* Happens during process shutdown */
715 return -ESRCH;
716
713 if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) 717 if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY))
714 flags |= FOLL_WRITE; 718 flags |= FOLL_WRITE;
715 719
716 down_read(&current->mm->mmap_sem); 720 down_read(&mm->mmap_sem);
717 721
718 if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { 722 if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) {
719 /* check that we only use anonymous memory 723 /* check that we only use anonymous memory
@@ -721,9 +725,9 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
721 unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; 725 unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
722 struct vm_area_struct *vma; 726 struct vm_area_struct *vma;
723 727
724 vma = find_vma(gtt->usermm, gtt->userptr); 728 vma = find_vma(mm, gtt->userptr);
725 if (!vma || vma->vm_file || vma->vm_end < end) { 729 if (!vma || vma->vm_file || vma->vm_end < end) {
726 up_read(&current->mm->mmap_sem); 730 up_read(&mm->mmap_sem);
727 return -EPERM; 731 return -EPERM;
728 } 732 }
729 } 733 }
@@ -739,7 +743,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
739 list_add(&guptask.list, &gtt->guptasks); 743 list_add(&guptask.list, &gtt->guptasks);
740 spin_unlock(&gtt->guptasklock); 744 spin_unlock(&gtt->guptasklock);
741 745
742 r = get_user_pages(userptr, num_pages, flags, p, NULL); 746 if (mm == current->mm)
747 r = get_user_pages(userptr, num_pages, flags, p, NULL);
748 else
749 r = get_user_pages_remote(gtt->usertask,
750 mm, userptr, num_pages,
751 flags, p, NULL, NULL);
743 752
744 spin_lock(&gtt->guptasklock); 753 spin_lock(&gtt->guptasklock);
745 list_del(&guptask.list); 754 list_del(&guptask.list);
@@ -752,12 +761,12 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
752 761
753 } while (pinned < ttm->num_pages); 762 } while (pinned < ttm->num_pages);
754 763
755 up_read(&current->mm->mmap_sem); 764 up_read(&mm->mmap_sem);
756 return 0; 765 return 0;
757 766
758release_pages: 767release_pages:
759 release_pages(pages, pinned); 768 release_pages(pages, pinned);
760 up_read(&current->mm->mmap_sem); 769 up_read(&mm->mmap_sem);
761 return r; 770 return r;
762} 771}
763 772
@@ -978,6 +987,9 @@ static void amdgpu_ttm_backend_destroy(struct ttm_tt *ttm)
978{ 987{
979 struct amdgpu_ttm_tt *gtt = (void *)ttm; 988 struct amdgpu_ttm_tt *gtt = (void *)ttm;
980 989
990 if (gtt->usertask)
991 put_task_struct(gtt->usertask);
992
981 ttm_dma_tt_fini(&gtt->ttm); 993 ttm_dma_tt_fini(&gtt->ttm);
982 kfree(gtt); 994 kfree(gtt);
983} 995}
@@ -1079,8 +1091,13 @@ int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
1079 return -EINVAL; 1091 return -EINVAL;
1080 1092
1081 gtt->userptr = addr; 1093 gtt->userptr = addr;
1082 gtt->usermm = current->mm;
1083 gtt->userflags = flags; 1094 gtt->userflags = flags;
1095
1096 if (gtt->usertask)
1097 put_task_struct(gtt->usertask);
1098 gtt->usertask = current->group_leader;
1099 get_task_struct(gtt->usertask);
1100
1084 spin_lock_init(&gtt->guptasklock); 1101 spin_lock_init(&gtt->guptasklock);
1085 INIT_LIST_HEAD(&gtt->guptasks); 1102 INIT_LIST_HEAD(&gtt->guptasks);
1086 atomic_set(&gtt->mmu_invalidations, 0); 1103 atomic_set(&gtt->mmu_invalidations, 0);
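Switching from a bare usermm pointer to a referenced usertask means the tracked task must be reference-counted across repeated set_userptr calls: the old reference is dropped before the new one is taken. A toy model of that swap with a hand-rolled refcount (the kernel uses get_task_struct()/put_task_struct(); names here are hypothetical):

#include <stdatomic.h>
#include <stdlib.h>

struct task { atomic_int refcount; };

static void get_task(struct task *t)
{
	atomic_fetch_add(&t->refcount, 1);
}

static void put_task(struct task *t)
{
	if (atomic_fetch_sub(&t->refcount, 1) == 1)
		free(t);	/* last reference dropped */
}

static struct task *tracked;	/* stand-in for gtt->usertask */

static void track(struct task *t)
{
	if (tracked)
		put_task(tracked);	/* drop the old reference first */
	tracked = t;
	get_task(tracked);		/* pin the new task */
}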
@@ -1096,7 +1113,10 @@ struct mm_struct *amdgpu_ttm_tt_get_usermm(struct ttm_tt *ttm)
1096 if (gtt == NULL) 1113 if (gtt == NULL)
1097 return NULL; 1114 return NULL;
1098 1115
1099 return gtt->usermm; 1116 if (gtt->usertask == NULL)
1117 return NULL;
1118
1119 return gtt->usertask->mm;
1100} 1120}
1101 1121
1102bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start, 1122bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9d39fd5b1822..e5962e61beb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4686,6 +4686,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
4686 4686
4687 cu_info->number = active_cu_number; 4687 cu_info->number = active_cu_number;
4688 cu_info->ao_cu_mask = ao_cu_mask; 4688 cu_info->ao_cu_mask = ao_cu_mask;
4689 cu_info->simd_per_cu = NUM_SIMD_PER_CU;
4689 4690
4690 return 0; 4691 return 0;
4691} 4692}
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 7f408f85fdb6..f22f7a88ce0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -268,6 +268,11 @@
268 * x=1: tmz_end 268 * x=1: tmz_end
269 */ 269 */
270 270
271#define PACKET3_INVALIDATE_TLBS 0x98
272# define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0)
273# define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4)
274# define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5)
275# define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29)
271#define PACKET3_SET_RESOURCES 0xA0 276#define PACKET3_SET_RESOURCES 0xA0
272/* 1. header 277/* 1. header
273 * 2. CONTROL 278 * 2. CONTROL
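The new PACKET3_INVALIDATE_TLBS fields compose into a single control dword by shifting. A standalone sketch of the packing; the field values are illustrative only and the macro names are local stand-ins for the definitions above:

#include <stdint.h>
#include <stdio.h>

#define INV_TLBS_DST_SEL(x)	((uint32_t)(x) << 0)
#define INV_TLBS_ALL_HUB(x)	((uint32_t)(x) << 4)
#define INV_TLBS_PASID(x)	((uint32_t)(x) << 5)
#define INV_TLBS_FLUSH_TYPE(x)	((uint32_t)(x) << 29)

int main(void)
{
	/* illustrative values only */
	uint32_t control = INV_TLBS_DST_SEL(1) |
			   INV_TLBS_ALL_HUB(1) |
			   INV_TLBS_PASID(0x123) |
			   INV_TLBS_FLUSH_TYPE(0);

	printf("INVALIDATE_TLBS control dword: 0x%08x\n", control);
	return 0;
}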
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 0d0242240c47..ffd096fffc1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -30,12 +30,14 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
30 kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ 30 kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
31 kfd_process.o kfd_queue.o kfd_mqd_manager.o \ 31 kfd_process.o kfd_queue.o kfd_mqd_manager.o \
32 kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ 32 kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
33 kfd_mqd_manager_v9.o \
33 kfd_kernel_queue.o kfd_kernel_queue_cik.o \ 34 kfd_kernel_queue.o kfd_kernel_queue_cik.o \
34 kfd_kernel_queue_vi.o kfd_packet_manager.o \ 35 kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
35 kfd_process_queue_manager.o kfd_device_queue_manager.o \ 36 kfd_packet_manager.o kfd_process_queue_manager.o \
36 kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ 37 kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
38 kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
37 kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ 39 kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
38 kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o 40 kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
39 41
40ifneq ($(CONFIG_AMD_IOMMU_V2),) 42ifneq ($(CONFIG_AMD_IOMMU_V2),)
41amdkfd-y += kfd_iommu.o 43amdkfd-y += kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 3d5ccb3755d4..49df6c791cfc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -27,18 +27,28 @@
27static bool cik_event_interrupt_isr(struct kfd_dev *dev, 27static bool cik_event_interrupt_isr(struct kfd_dev *dev,
28 const uint32_t *ih_ring_entry) 28 const uint32_t *ih_ring_entry)
29{ 29{
30 unsigned int pasid;
31 const struct cik_ih_ring_entry *ihre = 30 const struct cik_ih_ring_entry *ihre =
32 (const struct cik_ih_ring_entry *)ih_ring_entry; 31 (const struct cik_ih_ring_entry *)ih_ring_entry;
32 unsigned int vmid, pasid;
33
34 /* Only handle interrupts from KFD VMIDs */
35 vmid = (ihre->ring_id & 0x0000ff00) >> 8;
36 if (vmid < dev->vm_info.first_vmid_kfd ||
37 vmid > dev->vm_info.last_vmid_kfd)
38 return 0;
33 39
40 /* If there is no valid PASID, it's likely a firmware bug */
34 pasid = (ihre->ring_id & 0xffff0000) >> 16; 41 pasid = (ihre->ring_id & 0xffff0000) >> 16;
42 if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
43 return 0;
35 44
36 /* Do not process in ISR, just request it to be forwarded to WQ. */ 45 /* Interrupt types we care about: various signals and faults.
37 return (pasid != 0) && 46 * They will be forwarded to a work queue (see below).
38 (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || 47 */
48 return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
39 ihre->source_id == CIK_INTSRC_SDMA_TRAP || 49 ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
40 ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || 50 ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
41 ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); 51 ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
42} 52}
43 53
44static void cik_event_interrupt_wq(struct kfd_dev *dev, 54static void cik_event_interrupt_wq(struct kfd_dev *dev,
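The reworked ISR decodes the IH ring entry by bitfield: the VMID sits in bits 8-15 of ring_id and the PASID in bits 16-31. A standalone sketch of that filtering, with the VMID bounds passed in as stand-ins for dev->vm_info.first_vmid_kfd/last_vmid_kfd:

#include <stdbool.h>
#include <stdint.h>

static bool is_kfd_interrupt(uint32_t ring_id,
			     unsigned int first_vmid_kfd,
			     unsigned int last_vmid_kfd)
{
	unsigned int vmid  = (ring_id & 0x0000ff00) >> 8;
	unsigned int pasid = (ring_id & 0xffff0000) >> 16;

	/* only KFD-owned VMIDs; PASID 0 would indicate a firmware bug */
	return vmid >= first_vmid_kfd && vmid <= last_vmid_kfd && pasid != 0;
}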
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
index 48769d12dd7b..37ce6dd65391 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
@@ -33,7 +33,8 @@
33#define APE1_MTYPE(x) ((x) << 7) 33#define APE1_MTYPE(x) ((x) << 7)
34 34
35/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ 35/* valid for both DEFAULT_MTYPE and APE1_MTYPE */
36#define MTYPE_CACHED 0 36#define MTYPE_CACHED_NV 0
37#define MTYPE_CACHED 1
37#define MTYPE_NONCACHED 3 38#define MTYPE_NONCACHED 3
38 39
39#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) 40#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
new file mode 100644
index 000000000000..f68aef02fc1f
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -0,0 +1,560 @@
1/*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23static const uint32_t cwsr_trap_gfx8_hex[] = {
24 0xbf820001, 0xbf820125,
25 0xb8f4f802, 0x89748674,
26 0xb8f5f803, 0x8675ff75,
27 0x00000400, 0xbf850011,
28 0xc00a1e37, 0x00000000,
29 0xbf8c007f, 0x87777978,
30 0xbf840002, 0xb974f802,
31 0xbe801d78, 0xb8f5f803,
32 0x8675ff75, 0x000001ff,
33 0xbf850002, 0x80708470,
34 0x82718071, 0x8671ff71,
35 0x0000ffff, 0xb974f802,
36 0xbe801f70, 0xb8f5f803,
37 0x8675ff75, 0x00000100,
38 0xbf840006, 0xbefa0080,
39 0xb97a0203, 0x8671ff71,
40 0x0000ffff, 0x80f08870,
41 0x82f18071, 0xbefa0080,
42 0xb97a0283, 0xbef60068,
43 0xbef70069, 0xb8fa1c07,
44 0x8e7a9c7a, 0x87717a71,
45 0xb8fa03c7, 0x8e7a9b7a,
46 0x87717a71, 0xb8faf807,
47 0x867aff7a, 0x00007fff,
48 0xb97af807, 0xbef2007e,
49 0xbef3007f, 0xbefe0180,
50 0xbf900004, 0x877a8474,
51 0xb97af802, 0xbf8e0002,
52 0xbf88fffe, 0xbef8007e,
53 0x8679ff7f, 0x0000ffff,
54 0x8779ff79, 0x00040000,
55 0xbefa0080, 0xbefb00ff,
56 0x00807fac, 0x867aff7f,
57 0x08000000, 0x8f7a837a,
58 0x877b7a7b, 0x867aff7f,
59 0x70000000, 0x8f7a817a,
60 0x877b7a7b, 0xbeef007c,
61 0xbeee0080, 0xb8ee2a05,
62 0x806e816e, 0x8e6e8a6e,
63 0xb8fa1605, 0x807a817a,
64 0x8e7a867a, 0x806e7a6e,
65 0xbefa0084, 0xbefa00ff,
66 0x01000000, 0xbefe007c,
67 0xbefc006e, 0xc0611bfc,
68 0x0000007c, 0x806e846e,
69 0xbefc007e, 0xbefe007c,
70 0xbefc006e, 0xc0611c3c,
71 0x0000007c, 0x806e846e,
72 0xbefc007e, 0xbefe007c,
73 0xbefc006e, 0xc0611c7c,
74 0x0000007c, 0x806e846e,
75 0xbefc007e, 0xbefe007c,
76 0xbefc006e, 0xc0611cbc,
77 0x0000007c, 0x806e846e,
78 0xbefc007e, 0xbefe007c,
79 0xbefc006e, 0xc0611cfc,
80 0x0000007c, 0x806e846e,
81 0xbefc007e, 0xbefe007c,
82 0xbefc006e, 0xc0611d3c,
83 0x0000007c, 0x806e846e,
84 0xbefc007e, 0xb8f5f803,
85 0xbefe007c, 0xbefc006e,
86 0xc0611d7c, 0x0000007c,
87 0x806e846e, 0xbefc007e,
88 0xbefe007c, 0xbefc006e,
89 0xc0611dbc, 0x0000007c,
90 0x806e846e, 0xbefc007e,
91 0xbefe007c, 0xbefc006e,
92 0xc0611dfc, 0x0000007c,
93 0x806e846e, 0xbefc007e,
94 0xb8eff801, 0xbefe007c,
95 0xbefc006e, 0xc0611bfc,
96 0x0000007c, 0x806e846e,
97 0xbefc007e, 0xbefe007c,
98 0xbefc006e, 0xc0611b3c,
99 0x0000007c, 0x806e846e,
100 0xbefc007e, 0xbefe007c,
101 0xbefc006e, 0xc0611b7c,
102 0x0000007c, 0x806e846e,
103 0xbefc007e, 0x867aff7f,
104 0x04000000, 0xbef30080,
105 0x8773737a, 0xb8ee2a05,
106 0x806e816e, 0x8e6e8a6e,
107 0xb8f51605, 0x80758175,
108 0x8e758475, 0x8e7a8275,
109 0xbefa00ff, 0x01000000,
110 0xbef60178, 0x80786e78,
111 0x82798079, 0xbefc0080,
112 0xbe802b00, 0xbe822b02,
113 0xbe842b04, 0xbe862b06,
114 0xbe882b08, 0xbe8a2b0a,
115 0xbe8c2b0c, 0xbe8e2b0e,
116 0xc06b003c, 0x00000000,
117 0xc06b013c, 0x00000010,
118 0xc06b023c, 0x00000020,
119 0xc06b033c, 0x00000030,
120 0x8078c078, 0x82798079,
121 0x807c907c, 0xbf0a757c,
122 0xbf85ffeb, 0xbef80176,
123 0xbeee0080, 0xbefe00c1,
124 0xbeff00c1, 0xbefa00ff,
125 0x01000000, 0xe0724000,
126 0x6e1e0000, 0xe0724100,
127 0x6e1e0100, 0xe0724200,
128 0x6e1e0200, 0xe0724300,
129 0x6e1e0300, 0xbefe00c1,
130 0xbeff00c1, 0xb8f54306,
131 0x8675c175, 0xbf84002c,
132 0xbf8a0000, 0x867aff73,
133 0x04000000, 0xbf840028,
134 0x8e758675, 0x8e758275,
135 0xbefa0075, 0xb8ee2a05,
136 0x806e816e, 0x8e6e8a6e,
137 0xb8fa1605, 0x807a817a,
138 0x8e7a867a, 0x806e7a6e,
139 0x806eff6e, 0x00000080,
140 0xbefa00ff, 0x01000000,
141 0xbefc0080, 0xd28c0002,
142 0x000100c1, 0xd28d0003,
143 0x000204c1, 0xd1060002,
144 0x00011103, 0x7e0602ff,
145 0x00000200, 0xbefc00ff,
146 0x00010000, 0xbe80007b,
147 0x867bff7b, 0xff7fffff,
148 0x877bff7b, 0x00058000,
149 0xd8ec0000, 0x00000002,
150 0xbf8c007f, 0xe0765000,
151 0x6e1e0002, 0x32040702,
152 0xd0c9006a, 0x0000eb02,
153 0xbf87fff7, 0xbefb0000,
154 0xbeee00ff, 0x00000400,
155 0xbefe00c1, 0xbeff00c1,
156 0xb8f52a05, 0x80758175,
157 0x8e758275, 0x8e7a8875,
158 0xbefa00ff, 0x01000000,
159 0xbefc0084, 0xbf0a757c,
160 0xbf840015, 0xbf11017c,
161 0x8075ff75, 0x00001000,
162 0x7e000300, 0x7e020301,
163 0x7e040302, 0x7e060303,
164 0xe0724000, 0x6e1e0000,
165 0xe0724100, 0x6e1e0100,
166 0xe0724200, 0x6e1e0200,
167 0xe0724300, 0x6e1e0300,
168 0x807c847c, 0x806eff6e,
169 0x00000400, 0xbf0a757c,
170 0xbf85ffef, 0xbf9c0000,
171 0xbf8200ca, 0xbef8007e,
172 0x8679ff7f, 0x0000ffff,
173 0x8779ff79, 0x00040000,
174 0xbefa0080, 0xbefb00ff,
175 0x00807fac, 0x8676ff7f,
176 0x08000000, 0x8f768376,
177 0x877b767b, 0x8676ff7f,
178 0x70000000, 0x8f768176,
179 0x877b767b, 0x8676ff7f,
180 0x04000000, 0xbf84001e,
181 0xbefe00c1, 0xbeff00c1,
182 0xb8f34306, 0x8673c173,
183 0xbf840019, 0x8e738673,
184 0x8e738273, 0xbefa0073,
185 0xb8f22a05, 0x80728172,
186 0x8e728a72, 0xb8f61605,
187 0x80768176, 0x8e768676,
188 0x80727672, 0x8072ff72,
189 0x00000080, 0xbefa00ff,
190 0x01000000, 0xbefc0080,
191 0xe0510000, 0x721e0000,
192 0xe0510100, 0x721e0000,
193 0x807cff7c, 0x00000200,
194 0x8072ff72, 0x00000200,
195 0xbf0a737c, 0xbf85fff6,
196 0xbef20080, 0xbefe00c1,
197 0xbeff00c1, 0xb8f32a05,
198 0x80738173, 0x8e738273,
199 0x8e7a8873, 0xbefa00ff,
200 0x01000000, 0xbef60072,
201 0x8072ff72, 0x00000400,
202 0xbefc0084, 0xbf11087c,
203 0x8073ff73, 0x00008000,
204 0xe0524000, 0x721e0000,
205 0xe0524100, 0x721e0100,
206 0xe0524200, 0x721e0200,
207 0xe0524300, 0x721e0300,
208 0xbf8c0f70, 0x7e000300,
209 0x7e020301, 0x7e040302,
210 0x7e060303, 0x807c847c,
211 0x8072ff72, 0x00000400,
212 0xbf0a737c, 0xbf85ffee,
213 0xbf9c0000, 0xe0524000,
214 0x761e0000, 0xe0524100,
215 0x761e0100, 0xe0524200,
216 0x761e0200, 0xe0524300,
217 0x761e0300, 0xb8f22a05,
218 0x80728172, 0x8e728a72,
219 0xb8f61605, 0x80768176,
220 0x8e768676, 0x80727672,
221 0x80f2c072, 0xb8f31605,
222 0x80738173, 0x8e738473,
223 0x8e7a8273, 0xbefa00ff,
224 0x01000000, 0xbefc0073,
225 0xc031003c, 0x00000072,
226 0x80f2c072, 0xbf8c007f,
227 0x80fc907c, 0xbe802d00,
228 0xbe822d02, 0xbe842d04,
229 0xbe862d06, 0xbe882d08,
230 0xbe8a2d0a, 0xbe8c2d0c,
231 0xbe8e2d0e, 0xbf06807c,
232 0xbf84fff1, 0xb8f22a05,
233 0x80728172, 0x8e728a72,
234 0xb8f61605, 0x80768176,
235 0x8e768676, 0x80727672,
236 0xbefa0084, 0xbefa00ff,
237 0x01000000, 0xc0211cfc,
238 0x00000072, 0x80728472,
239 0xc0211c3c, 0x00000072,
240 0x80728472, 0xc0211c7c,
241 0x00000072, 0x80728472,
242 0xc0211bbc, 0x00000072,
243 0x80728472, 0xc0211bfc,
244 0x00000072, 0x80728472,
245 0xc0211d3c, 0x00000072,
246 0x80728472, 0xc0211d7c,
247 0x00000072, 0x80728472,
248 0xc0211a3c, 0x00000072,
249 0x80728472, 0xc0211a7c,
250 0x00000072, 0x80728472,
251 0xc0211dfc, 0x00000072,
252 0x80728472, 0xc0211b3c,
253 0x00000072, 0x80728472,
254 0xc0211b7c, 0x00000072,
255 0x80728472, 0xbf8c007f,
256 0xbefc0073, 0xbefe006e,
257 0xbeff006f, 0x867375ff,
258 0x000003ff, 0xb9734803,
259 0x867375ff, 0xfffff800,
260 0x8f738b73, 0xb973a2c3,
261 0xb977f801, 0x8673ff71,
262 0xf0000000, 0x8f739c73,
263 0x8e739073, 0xbef60080,
264 0x87767376, 0x8673ff71,
265 0x08000000, 0x8f739b73,
266 0x8e738f73, 0x87767376,
267 0x8673ff74, 0x00800000,
268 0x8f739773, 0xb976f807,
269 0x8671ff71, 0x0000ffff,
270 0x86fe7e7e, 0x86ea6a6a,
271 0xb974f802, 0xbf8a0000,
272 0x95807370, 0xbf810000,
273};
274
275
276static const uint32_t cwsr_trap_gfx9_hex[] = {
277 0xbf820001, 0xbf82015a,
278 0xb8f8f802, 0x89788678,
279 0xb8f1f803, 0x866eff71,
280 0x00000400, 0xbf850034,
281 0x866eff71, 0x00000800,
282 0xbf850003, 0x866eff71,
283 0x00000100, 0xbf840008,
284 0x866eff78, 0x00002000,
285 0xbf840001, 0xbf810000,
286 0x8778ff78, 0x00002000,
287 0x80ec886c, 0x82ed806d,
288 0xb8eef807, 0x866fff6e,
289 0x001f8000, 0x8e6f8b6f,
290 0x8977ff77, 0xfc000000,
291 0x87776f77, 0x896eff6e,
292 0x001f8000, 0xb96ef807,
293 0xb8f0f812, 0xb8f1f813,
294 0x8ef08870, 0xc0071bb8,
295 0x00000000, 0xbf8cc07f,
296 0xc0071c38, 0x00000008,
297 0xbf8cc07f, 0x86ee6e6e,
298 0xbf840001, 0xbe801d6e,
299 0xb8f1f803, 0x8671ff71,
300 0x000001ff, 0xbf850002,
301 0x806c846c, 0x826d806d,
302 0x866dff6d, 0x0000ffff,
303 0x8f6e8b77, 0x866eff6e,
304 0x001f8000, 0xb96ef807,
305 0x86fe7e7e, 0x86ea6a6a,
306 0xb978f802, 0xbe801f6c,
307 0x866dff6d, 0x0000ffff,
308 0xbef00080, 0xb9700283,
309 0xb8f02407, 0x8e709c70,
310 0x876d706d, 0xb8f003c7,
311 0x8e709b70, 0x876d706d,
312 0xb8f0f807, 0x8670ff70,
313 0x00007fff, 0xb970f807,
314 0xbeee007e, 0xbeef007f,
315 0xbefe0180, 0xbf900004,
316 0x87708478, 0xb970f802,
317 0xbf8e0002, 0xbf88fffe,
318 0xb8f02a05, 0x80708170,
319 0x8e708a70, 0xb8f11605,
320 0x80718171, 0x8e718671,
321 0x80707170, 0x80707e70,
322 0x8271807f, 0x8671ff71,
323 0x0000ffff, 0xc0471cb8,
324 0x00000040, 0xbf8cc07f,
325 0xc04b1d38, 0x00000048,
326 0xbf8cc07f, 0xc0431e78,
327 0x00000058, 0xbf8cc07f,
328 0xc0471eb8, 0x0000005c,
329 0xbf8cc07f, 0xbef4007e,
330 0x8675ff7f, 0x0000ffff,
331 0x8775ff75, 0x00040000,
332 0xbef60080, 0xbef700ff,
333 0x00807fac, 0x8670ff7f,
334 0x08000000, 0x8f708370,
335 0x87777077, 0x8670ff7f,
336 0x70000000, 0x8f708170,
337 0x87777077, 0xbefb007c,
338 0xbefa0080, 0xb8fa2a05,
339 0x807a817a, 0x8e7a8a7a,
340 0xb8f01605, 0x80708170,
341 0x8e708670, 0x807a707a,
342 0xbef60084, 0xbef600ff,
343 0x01000000, 0xbefe007c,
344 0xbefc007a, 0xc0611efa,
345 0x0000007c, 0xbf8cc07f,
346 0x807a847a, 0xbefc007e,
347 0xbefe007c, 0xbefc007a,
348 0xc0611b3a, 0x0000007c,
349 0xbf8cc07f, 0x807a847a,
350 0xbefc007e, 0xbefe007c,
351 0xbefc007a, 0xc0611b7a,
352 0x0000007c, 0xbf8cc07f,
353 0x807a847a, 0xbefc007e,
354 0xbefe007c, 0xbefc007a,
355 0xc0611bba, 0x0000007c,
356 0xbf8cc07f, 0x807a847a,
357 0xbefc007e, 0xbefe007c,
358 0xbefc007a, 0xc0611bfa,
359 0x0000007c, 0xbf8cc07f,
360 0x807a847a, 0xbefc007e,
361 0xbefe007c, 0xbefc007a,
362 0xc0611e3a, 0x0000007c,
363 0xbf8cc07f, 0x807a847a,
364 0xbefc007e, 0xb8f1f803,
365 0xbefe007c, 0xbefc007a,
366 0xc0611c7a, 0x0000007c,
367 0xbf8cc07f, 0x807a847a,
368 0xbefc007e, 0xbefe007c,
369 0xbefc007a, 0xc0611a3a,
370 0x0000007c, 0xbf8cc07f,
371 0x807a847a, 0xbefc007e,
372 0xbefe007c, 0xbefc007a,
373 0xc0611a7a, 0x0000007c,
374 0xbf8cc07f, 0x807a847a,
375 0xbefc007e, 0xb8fbf801,
376 0xbefe007c, 0xbefc007a,
377 0xc0611efa, 0x0000007c,
378 0xbf8cc07f, 0x807a847a,
379 0xbefc007e, 0x8670ff7f,
380 0x04000000, 0xbeef0080,
381 0x876f6f70, 0xb8fa2a05,
382 0x807a817a, 0x8e7a8a7a,
383 0xb8f11605, 0x80718171,
384 0x8e718471, 0x8e768271,
385 0xbef600ff, 0x01000000,
386 0xbef20174, 0x80747a74,
387 0x82758075, 0xbefc0080,
388 0xbf800000, 0xbe802b00,
389 0xbe822b02, 0xbe842b04,
390 0xbe862b06, 0xbe882b08,
391 0xbe8a2b0a, 0xbe8c2b0c,
392 0xbe8e2b0e, 0xc06b003a,
393 0x00000000, 0xbf8cc07f,
394 0xc06b013a, 0x00000010,
395 0xbf8cc07f, 0xc06b023a,
396 0x00000020, 0xbf8cc07f,
397 0xc06b033a, 0x00000030,
398 0xbf8cc07f, 0x8074c074,
399 0x82758075, 0x807c907c,
400 0xbf0a717c, 0xbf85ffe7,
401 0xbef40172, 0xbefa0080,
402 0xbefe00c1, 0xbeff00c1,
403 0xbee80080, 0xbee90080,
404 0xbef600ff, 0x01000000,
405 0xe0724000, 0x7a1d0000,
406 0xe0724100, 0x7a1d0100,
407 0xe0724200, 0x7a1d0200,
408 0xe0724300, 0x7a1d0300,
409 0xbefe00c1, 0xbeff00c1,
410 0xb8f14306, 0x8671c171,
411 0xbf84002c, 0xbf8a0000,
412 0x8670ff6f, 0x04000000,
413 0xbf840028, 0x8e718671,
414 0x8e718271, 0xbef60071,
415 0xb8fa2a05, 0x807a817a,
416 0x8e7a8a7a, 0xb8f01605,
417 0x80708170, 0x8e708670,
418 0x807a707a, 0x807aff7a,
419 0x00000080, 0xbef600ff,
420 0x01000000, 0xbefc0080,
421 0xd28c0002, 0x000100c1,
422 0xd28d0003, 0x000204c1,
423 0xd1060002, 0x00011103,
424 0x7e0602ff, 0x00000200,
425 0xbefc00ff, 0x00010000,
426 0xbe800077, 0x8677ff77,
427 0xff7fffff, 0x8777ff77,
428 0x00058000, 0xd8ec0000,
429 0x00000002, 0xbf8cc07f,
430 0xe0765000, 0x7a1d0002,
431 0x68040702, 0xd0c9006a,
432 0x0000e302, 0xbf87fff7,
433 0xbef70000, 0xbefa00ff,
434 0x00000400, 0xbefe00c1,
435 0xbeff00c1, 0xb8f12a05,
436 0x80718171, 0x8e718271,
437 0x8e768871, 0xbef600ff,
438 0x01000000, 0xbefc0084,
439 0xbf0a717c, 0xbf840015,
440 0xbf11017c, 0x8071ff71,
441 0x00001000, 0x7e000300,
442 0x7e020301, 0x7e040302,
443 0x7e060303, 0xe0724000,
444 0x7a1d0000, 0xe0724100,
445 0x7a1d0100, 0xe0724200,
446 0x7a1d0200, 0xe0724300,
447 0x7a1d0300, 0x807c847c,
448 0x807aff7a, 0x00000400,
449 0xbf0a717c, 0xbf85ffef,
450 0xbf9c0000, 0xbf8200d9,
451 0xbef4007e, 0x8675ff7f,
452 0x0000ffff, 0x8775ff75,
453 0x00040000, 0xbef60080,
454 0xbef700ff, 0x00807fac,
455 0x866eff7f, 0x08000000,
456 0x8f6e836e, 0x87776e77,
457 0x866eff7f, 0x70000000,
458 0x8f6e816e, 0x87776e77,
459 0x866eff7f, 0x04000000,
460 0xbf84001e, 0xbefe00c1,
461 0xbeff00c1, 0xb8ef4306,
462 0x866fc16f, 0xbf840019,
463 0x8e6f866f, 0x8e6f826f,
464 0xbef6006f, 0xb8f82a05,
465 0x80788178, 0x8e788a78,
466 0xb8ee1605, 0x806e816e,
467 0x8e6e866e, 0x80786e78,
468 0x8078ff78, 0x00000080,
469 0xbef600ff, 0x01000000,
470 0xbefc0080, 0xe0510000,
471 0x781d0000, 0xe0510100,
472 0x781d0000, 0x807cff7c,
473 0x00000200, 0x8078ff78,
474 0x00000200, 0xbf0a6f7c,
475 0xbf85fff6, 0xbef80080,
476 0xbefe00c1, 0xbeff00c1,
477 0xb8ef2a05, 0x806f816f,
478 0x8e6f826f, 0x8e76886f,
479 0xbef600ff, 0x01000000,
480 0xbeee0078, 0x8078ff78,
481 0x00000400, 0xbefc0084,
482 0xbf11087c, 0x806fff6f,
483 0x00008000, 0xe0524000,
484 0x781d0000, 0xe0524100,
485 0x781d0100, 0xe0524200,
486 0x781d0200, 0xe0524300,
487 0x781d0300, 0xbf8c0f70,
488 0x7e000300, 0x7e020301,
489 0x7e040302, 0x7e060303,
490 0x807c847c, 0x8078ff78,
491 0x00000400, 0xbf0a6f7c,
492 0xbf85ffee, 0xbf9c0000,
493 0xe0524000, 0x6e1d0000,
494 0xe0524100, 0x6e1d0100,
495 0xe0524200, 0x6e1d0200,
496 0xe0524300, 0x6e1d0300,
497 0xb8f82a05, 0x80788178,
498 0x8e788a78, 0xb8ee1605,
499 0x806e816e, 0x8e6e866e,
500 0x80786e78, 0x80f8c078,
501 0xb8ef1605, 0x806f816f,
502 0x8e6f846f, 0x8e76826f,
503 0xbef600ff, 0x01000000,
504 0xbefc006f, 0xc031003a,
505 0x00000078, 0x80f8c078,
506 0xbf8cc07f, 0x80fc907c,
507 0xbf800000, 0xbe802d00,
508 0xbe822d02, 0xbe842d04,
509 0xbe862d06, 0xbe882d08,
510 0xbe8a2d0a, 0xbe8c2d0c,
511 0xbe8e2d0e, 0xbf06807c,
512 0xbf84fff0, 0xb8f82a05,
513 0x80788178, 0x8e788a78,
514 0xb8ee1605, 0x806e816e,
515 0x8e6e866e, 0x80786e78,
516 0xbef60084, 0xbef600ff,
517 0x01000000, 0xc0211bfa,
518 0x00000078, 0x80788478,
519 0xc0211b3a, 0x00000078,
520 0x80788478, 0xc0211b7a,
521 0x00000078, 0x80788478,
522 0xc0211eba, 0x00000078,
523 0x80788478, 0xc0211efa,
524 0x00000078, 0x80788478,
525 0xc0211c3a, 0x00000078,
526 0x80788478, 0xc0211c7a,
527 0x00000078, 0x80788478,
528 0xc0211a3a, 0x00000078,
529 0x80788478, 0xc0211a7a,
530 0x00000078, 0x80788478,
531 0xc0211cfa, 0x00000078,
532 0x80788478, 0xbf8cc07f,
533 0xbefc006f, 0xbefe007a,
534 0xbeff007b, 0x866f71ff,
535 0x000003ff, 0xb96f4803,
536 0x866f71ff, 0xfffff800,
537 0x8f6f8b6f, 0xb96fa2c3,
538 0xb973f801, 0xb8ee2a05,
539 0x806e816e, 0x8e6e8a6e,
540 0xb8ef1605, 0x806f816f,
541 0x8e6f866f, 0x806e6f6e,
542 0x806e746e, 0x826f8075,
543 0x866fff6f, 0x0000ffff,
544 0xc0071cb7, 0x00000040,
545 0xc00b1d37, 0x00000048,
546 0xc0031e77, 0x00000058,
547 0xc0071eb7, 0x0000005c,
548 0xbf8cc07f, 0x866fff6d,
549 0xf0000000, 0x8f6f9c6f,
550 0x8e6f906f, 0xbeee0080,
551 0x876e6f6e, 0x866fff6d,
552 0x08000000, 0x8f6f9b6f,
553 0x8e6f8f6f, 0x876e6f6e,
554 0x866fff70, 0x00800000,
555 0x8f6f976f, 0xb96ef807,
556 0x866dff6d, 0x0000ffff,
557 0x86fe7e7e, 0x86ea6a6a,
558 0xb970f802, 0xbf8a0000,
559 0x95806f6c, 0xbf810000,
560};
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
index 997a383dcb8b..a2a04bb64096 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
@@ -20,9 +20,12 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#if 0
-HW (VI) source code for CWSR trap handler
-#Version 18 + multiple trap handler
+/* To compile this assembly code:
+ * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
+ */
+
+/* HW (VI) source code for CWSR trap handler */
+/* Version 18 + multiple trap handler */
 
 // this performance-optimal version was originally from Seven Xu at SRDC
 
@@ -98,6 +101,7 @@ var SWIZZLE_EN = 0 //whether we use swi
 /**************************************************************************/
 var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
 
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
@@ -149,7 +153,7 @@ var s_save_spi_init_lo = exec_lo
 var s_save_spi_init_hi = exec_hi
 
     //tba_lo and tba_hi need to be saved/restored
-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
 var s_save_pc_hi = ttmp1
 var s_save_exec_lo = ttmp2
 var s_save_exec_hi = ttmp3
@@ -319,6 +323,10 @@ end
     s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
     end
 
+    // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
+    s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
+    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
+
   L_SLEEP:
     s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
 
@@ -1007,8 +1015,6 @@ end
 
     s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
 
-    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
-
     //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
     if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
     s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
@@ -1044,6 +1050,7 @@ end
     s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
     s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
 
+    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
     s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
     s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
     s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
@@ -1127,258 +1134,3 @@ end
 function get_hwreg_size_bytes
     return 128 //HWREG size 128 bytes
 end
1130
1131
1132#endif
1133
1134static const uint32_t cwsr_trap_gfx8_hex[] = {
1135 0xbf820001, 0xbf820123,
1136 0xb8f4f802, 0x89748674,
1137 0xb8f5f803, 0x8675ff75,
1138 0x00000400, 0xbf850011,
1139 0xc00a1e37, 0x00000000,
1140 0xbf8c007f, 0x87777978,
1141 0xbf840002, 0xb974f802,
1142 0xbe801d78, 0xb8f5f803,
1143 0x8675ff75, 0x000001ff,
1144 0xbf850002, 0x80708470,
1145 0x82718071, 0x8671ff71,
1146 0x0000ffff, 0xb974f802,
1147 0xbe801f70, 0xb8f5f803,
1148 0x8675ff75, 0x00000100,
1149 0xbf840006, 0xbefa0080,
1150 0xb97a0203, 0x8671ff71,
1151 0x0000ffff, 0x80f08870,
1152 0x82f18071, 0xbefa0080,
1153 0xb97a0283, 0xbef60068,
1154 0xbef70069, 0xb8fa1c07,
1155 0x8e7a9c7a, 0x87717a71,
1156 0xb8fa03c7, 0x8e7a9b7a,
1157 0x87717a71, 0xb8faf807,
1158 0x867aff7a, 0x00007fff,
1159 0xb97af807, 0xbef2007e,
1160 0xbef3007f, 0xbefe0180,
1161 0xbf900004, 0xbf8e0002,
1162 0xbf88fffe, 0xbef8007e,
1163 0x8679ff7f, 0x0000ffff,
1164 0x8779ff79, 0x00040000,
1165 0xbefa0080, 0xbefb00ff,
1166 0x00807fac, 0x867aff7f,
1167 0x08000000, 0x8f7a837a,
1168 0x877b7a7b, 0x867aff7f,
1169 0x70000000, 0x8f7a817a,
1170 0x877b7a7b, 0xbeef007c,
1171 0xbeee0080, 0xb8ee2a05,
1172 0x806e816e, 0x8e6e8a6e,
1173 0xb8fa1605, 0x807a817a,
1174 0x8e7a867a, 0x806e7a6e,
1175 0xbefa0084, 0xbefa00ff,
1176 0x01000000, 0xbefe007c,
1177 0xbefc006e, 0xc0611bfc,
1178 0x0000007c, 0x806e846e,
1179 0xbefc007e, 0xbefe007c,
1180 0xbefc006e, 0xc0611c3c,
1181 0x0000007c, 0x806e846e,
1182 0xbefc007e, 0xbefe007c,
1183 0xbefc006e, 0xc0611c7c,
1184 0x0000007c, 0x806e846e,
1185 0xbefc007e, 0xbefe007c,
1186 0xbefc006e, 0xc0611cbc,
1187 0x0000007c, 0x806e846e,
1188 0xbefc007e, 0xbefe007c,
1189 0xbefc006e, 0xc0611cfc,
1190 0x0000007c, 0x806e846e,
1191 0xbefc007e, 0xbefe007c,
1192 0xbefc006e, 0xc0611d3c,
1193 0x0000007c, 0x806e846e,
1194 0xbefc007e, 0xb8f5f803,
1195 0xbefe007c, 0xbefc006e,
1196 0xc0611d7c, 0x0000007c,
1197 0x806e846e, 0xbefc007e,
1198 0xbefe007c, 0xbefc006e,
1199 0xc0611dbc, 0x0000007c,
1200 0x806e846e, 0xbefc007e,
1201 0xbefe007c, 0xbefc006e,
1202 0xc0611dfc, 0x0000007c,
1203 0x806e846e, 0xbefc007e,
1204 0xb8eff801, 0xbefe007c,
1205 0xbefc006e, 0xc0611bfc,
1206 0x0000007c, 0x806e846e,
1207 0xbefc007e, 0xbefe007c,
1208 0xbefc006e, 0xc0611b3c,
1209 0x0000007c, 0x806e846e,
1210 0xbefc007e, 0xbefe007c,
1211 0xbefc006e, 0xc0611b7c,
1212 0x0000007c, 0x806e846e,
1213 0xbefc007e, 0x867aff7f,
1214 0x04000000, 0xbef30080,
1215 0x8773737a, 0xb8ee2a05,
1216 0x806e816e, 0x8e6e8a6e,
1217 0xb8f51605, 0x80758175,
1218 0x8e758475, 0x8e7a8275,
1219 0xbefa00ff, 0x01000000,
1220 0xbef60178, 0x80786e78,
1221 0x82798079, 0xbefc0080,
1222 0xbe802b00, 0xbe822b02,
1223 0xbe842b04, 0xbe862b06,
1224 0xbe882b08, 0xbe8a2b0a,
1225 0xbe8c2b0c, 0xbe8e2b0e,
1226 0xc06b003c, 0x00000000,
1227 0xc06b013c, 0x00000010,
1228 0xc06b023c, 0x00000020,
1229 0xc06b033c, 0x00000030,
1230 0x8078c078, 0x82798079,
1231 0x807c907c, 0xbf0a757c,
1232 0xbf85ffeb, 0xbef80176,
1233 0xbeee0080, 0xbefe00c1,
1234 0xbeff00c1, 0xbefa00ff,
1235 0x01000000, 0xe0724000,
1236 0x6e1e0000, 0xe0724100,
1237 0x6e1e0100, 0xe0724200,
1238 0x6e1e0200, 0xe0724300,
1239 0x6e1e0300, 0xbefe00c1,
1240 0xbeff00c1, 0xb8f54306,
1241 0x8675c175, 0xbf84002c,
1242 0xbf8a0000, 0x867aff73,
1243 0x04000000, 0xbf840028,
1244 0x8e758675, 0x8e758275,
1245 0xbefa0075, 0xb8ee2a05,
1246 0x806e816e, 0x8e6e8a6e,
1247 0xb8fa1605, 0x807a817a,
1248 0x8e7a867a, 0x806e7a6e,
1249 0x806eff6e, 0x00000080,
1250 0xbefa00ff, 0x01000000,
1251 0xbefc0080, 0xd28c0002,
1252 0x000100c1, 0xd28d0003,
1253 0x000204c1, 0xd1060002,
1254 0x00011103, 0x7e0602ff,
1255 0x00000200, 0xbefc00ff,
1256 0x00010000, 0xbe80007b,
1257 0x867bff7b, 0xff7fffff,
1258 0x877bff7b, 0x00058000,
1259 0xd8ec0000, 0x00000002,
1260 0xbf8c007f, 0xe0765000,
1261 0x6e1e0002, 0x32040702,
1262 0xd0c9006a, 0x0000eb02,
1263 0xbf87fff7, 0xbefb0000,
1264 0xbeee00ff, 0x00000400,
1265 0xbefe00c1, 0xbeff00c1,
1266 0xb8f52a05, 0x80758175,
1267 0x8e758275, 0x8e7a8875,
1268 0xbefa00ff, 0x01000000,
1269 0xbefc0084, 0xbf0a757c,
1270 0xbf840015, 0xbf11017c,
1271 0x8075ff75, 0x00001000,
1272 0x7e000300, 0x7e020301,
1273 0x7e040302, 0x7e060303,
1274 0xe0724000, 0x6e1e0000,
1275 0xe0724100, 0x6e1e0100,
1276 0xe0724200, 0x6e1e0200,
1277 0xe0724300, 0x6e1e0300,
1278 0x807c847c, 0x806eff6e,
1279 0x00000400, 0xbf0a757c,
1280 0xbf85ffef, 0xbf9c0000,
1281 0xbf8200ca, 0xbef8007e,
1282 0x8679ff7f, 0x0000ffff,
1283 0x8779ff79, 0x00040000,
1284 0xbefa0080, 0xbefb00ff,
1285 0x00807fac, 0x8676ff7f,
1286 0x08000000, 0x8f768376,
1287 0x877b767b, 0x8676ff7f,
1288 0x70000000, 0x8f768176,
1289 0x877b767b, 0x8676ff7f,
1290 0x04000000, 0xbf84001e,
1291 0xbefe00c1, 0xbeff00c1,
1292 0xb8f34306, 0x8673c173,
1293 0xbf840019, 0x8e738673,
1294 0x8e738273, 0xbefa0073,
1295 0xb8f22a05, 0x80728172,
1296 0x8e728a72, 0xb8f61605,
1297 0x80768176, 0x8e768676,
1298 0x80727672, 0x8072ff72,
1299 0x00000080, 0xbefa00ff,
1300 0x01000000, 0xbefc0080,
1301 0xe0510000, 0x721e0000,
1302 0xe0510100, 0x721e0000,
1303 0x807cff7c, 0x00000200,
1304 0x8072ff72, 0x00000200,
1305 0xbf0a737c, 0xbf85fff6,
1306 0xbef20080, 0xbefe00c1,
1307 0xbeff00c1, 0xb8f32a05,
1308 0x80738173, 0x8e738273,
1309 0x8e7a8873, 0xbefa00ff,
1310 0x01000000, 0xbef60072,
1311 0x8072ff72, 0x00000400,
1312 0xbefc0084, 0xbf11087c,
1313 0x8073ff73, 0x00008000,
1314 0xe0524000, 0x721e0000,
1315 0xe0524100, 0x721e0100,
1316 0xe0524200, 0x721e0200,
1317 0xe0524300, 0x721e0300,
1318 0xbf8c0f70, 0x7e000300,
1319 0x7e020301, 0x7e040302,
1320 0x7e060303, 0x807c847c,
1321 0x8072ff72, 0x00000400,
1322 0xbf0a737c, 0xbf85ffee,
1323 0xbf9c0000, 0xe0524000,
1324 0x761e0000, 0xe0524100,
1325 0x761e0100, 0xe0524200,
1326 0x761e0200, 0xe0524300,
1327 0x761e0300, 0xb8f22a05,
1328 0x80728172, 0x8e728a72,
1329 0xb8f61605, 0x80768176,
1330 0x8e768676, 0x80727672,
1331 0x80f2c072, 0xb8f31605,
1332 0x80738173, 0x8e738473,
1333 0x8e7a8273, 0xbefa00ff,
1334 0x01000000, 0xbefc0073,
1335 0xc031003c, 0x00000072,
1336 0x80f2c072, 0xbf8c007f,
1337 0x80fc907c, 0xbe802d00,
1338 0xbe822d02, 0xbe842d04,
1339 0xbe862d06, 0xbe882d08,
1340 0xbe8a2d0a, 0xbe8c2d0c,
1341 0xbe8e2d0e, 0xbf06807c,
1342 0xbf84fff1, 0xb8f22a05,
1343 0x80728172, 0x8e728a72,
1344 0xb8f61605, 0x80768176,
1345 0x8e768676, 0x80727672,
1346 0xbefa0084, 0xbefa00ff,
1347 0x01000000, 0xc0211cfc,
1348 0x00000072, 0x80728472,
1349 0xc0211c3c, 0x00000072,
1350 0x80728472, 0xc0211c7c,
1351 0x00000072, 0x80728472,
1352 0xc0211bbc, 0x00000072,
1353 0x80728472, 0xc0211bfc,
1354 0x00000072, 0x80728472,
1355 0xc0211d3c, 0x00000072,
1356 0x80728472, 0xc0211d7c,
1357 0x00000072, 0x80728472,
1358 0xc0211a3c, 0x00000072,
1359 0x80728472, 0xc0211a7c,
1360 0x00000072, 0x80728472,
1361 0xc0211dfc, 0x00000072,
1362 0x80728472, 0xc0211b3c,
1363 0x00000072, 0x80728472,
1364 0xc0211b7c, 0x00000072,
1365 0x80728472, 0xbf8c007f,
1366 0x8671ff71, 0x0000ffff,
1367 0xbefc0073, 0xbefe006e,
1368 0xbeff006f, 0x867375ff,
1369 0x000003ff, 0xb9734803,
1370 0x867375ff, 0xfffff800,
1371 0x8f738b73, 0xb973a2c3,
1372 0xb977f801, 0x8673ff71,
1373 0xf0000000, 0x8f739c73,
1374 0x8e739073, 0xbef60080,
1375 0x87767376, 0x8673ff71,
1376 0x08000000, 0x8f739b73,
1377 0x8e738f73, 0x87767376,
1378 0x8673ff74, 0x00800000,
1379 0x8f739773, 0xb976f807,
1380 0x86fe7e7e, 0x86ea6a6a,
1381 0xb974f802, 0xbf8a0000,
1382 0x95807370, 0xbf810000,
1383};
1384
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
new file mode 100644
index 000000000000..998be96be736
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -0,0 +1,1214 @@
1/*
2 * Copyright 2016 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23/* To compile this assembly code:
24 * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex
25 */
26
27/* HW (GFX9) source code for CWSR trap handler */
28/* Version 18 + multiple trap handler */
29
30// this performance-optimal version was originally from Seven Xu at SRDC
31
32// Revision #18 --...
33/* Rev History
34** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped-already fixed by PV)
35** #4. SR Memory Layout:
36** 1. VGPR-SGPR-HWREG-{LDS}
37** 2. tba_hi.bits.26 - reconfigured as the first-wave-in-tg bit, used to defer the LDS save for a threadgroup.. performance concern..
38** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
39** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
40** #7. Update: 1. don't barrier if noLDS
41** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
42** 2. Fix SQ issue by s_sleep 2
43** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
44** 2. optimize s_buffer save by burst 16sgprs...
45** #10. Update 1. Optimize restore sgpr by burst 16 sgprs.
46** #11. Update 1. Add 2 more timestamp for debug version
47** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
48** #13. Integ 1. Always use MUBUF for PV trap shader...
49** #14. Update 1. s_buffer_store soft clause...
50** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
51** #16. Update 1. PERF - UNROLL LDS_DMA got 2500cycle save in IP tree
52** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
53** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
54** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
55** 2. FUNC - Handle non-CWSR traps
56*/
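// A worked example of the SR memory layout above (illustrative numbers only,
// assuming the fixed 128-byte HWREG block returned by get_hwreg_size_bytes):
// a wave with 64 VGPRs and 112 SGPRs saves
//   VGPRs at offset 0x0000: 64 regs * 64 lanes * 4 bytes = 16384 bytes
//   SGPRs at offset 0x4000: 112 regs * 4 bytes = 448 bytes
//   HWREGs at offset 0x41C0: 128 bytes (ttmps live from offset 0x40 within)
//   LDS at offset 0x4240, written only by the first wave in the threadgroup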
57
58var G8SR_WDMEM_HWREG_OFFSET = 0
59var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
60
61// Keep these definitions the same as in the app shader; these 2 timestamps are part of the app shader... They should come before any save and after restore.
62
63var G8SR_DEBUG_TIMESTAMP = 0
64var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
65var s_g8sr_ts_save_s = s[34:35] // save start
66var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
67var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
68var s_g8sr_ts_save_d = s[40:41] // save end
69var s_g8sr_ts_restore_s = s[42:43] // restore start
70var s_g8sr_ts_restore_d = s[44:45] // restore end
71
72var G8SR_VGPR_SR_IN_DWX4 = 0
73var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
74var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
75
76
77/*************************************************************************/
78/* control on how to run the shader */
79/*************************************************************************/
80//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
81var EMU_RUN_HACK = 0
82var EMU_RUN_HACK_RESTORE_NORMAL = 0
83var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
84var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
85var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
86var SAVE_LDS = 1
87var WG_BASE_ADDR_LO = 0x9000a000
88var WG_BASE_ADDR_HI = 0x0
89var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
90var CTX_SAVE_CONTROL = 0x0
91var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
92var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
93var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
94var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
95var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
96var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
97
98/**************************************************************************/
99/* variables */
100/**************************************************************************/
101var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
102var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
103var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
104var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
105var SQ_WAVE_STATUS_HALT_MASK = 0x2000
106
107var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
108var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
109var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
110var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
111var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
112var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
113
114var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
115var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
116var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
117var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
118var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
119var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
120var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
121var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
122var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
123var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
124var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
125var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
126
127var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
128var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
129var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000
130var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
131
132var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
133var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
134
135var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data
136var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000
137
138/* Save */
139var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
140var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
141
142var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
143var S_SAVE_SPI_INIT_ATC_SHIFT = 27
144var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
145var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
146var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
147var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
148
149var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
150var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
151var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
152var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
153
154var s_save_spi_init_lo = exec_lo
155var s_save_spi_init_hi = exec_hi
156
157var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
158var s_save_pc_hi = ttmp1
159var s_save_exec_lo = ttmp2
160var s_save_exec_hi = ttmp3
161var s_save_tmp = ttmp4
162var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
163var s_save_xnack_mask_lo = ttmp6
164var s_save_xnack_mask_hi = ttmp7
165var s_save_buf_rsrc0 = ttmp8
166var s_save_buf_rsrc1 = ttmp9
167var s_save_buf_rsrc2 = ttmp10
168var s_save_buf_rsrc3 = ttmp11
169var s_save_status = ttmp12
170var s_save_mem_offset = ttmp14
171var s_save_alloc_size = s_save_trapsts //conflict
172var s_save_m0 = ttmp15
173var s_save_ttmps_lo = s_save_tmp //no conflict
174var s_save_ttmps_hi = s_save_trapsts //no conflict
175
176/* Restore */
177var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
178var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
179
180var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
181var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
182var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
183var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
184var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
185var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
186
187var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
188var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
189var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
190var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
191
192var s_restore_spi_init_lo = exec_lo
193var s_restore_spi_init_hi = exec_hi
194
195var s_restore_mem_offset = ttmp12
196var s_restore_alloc_size = ttmp3
197var s_restore_tmp = ttmp2
198var s_restore_mem_offset_save = s_restore_tmp //no conflict
199
200var s_restore_m0 = s_restore_alloc_size //no conflict
201
202var s_restore_mode = ttmp7
203
204var s_restore_pc_lo = ttmp0
205var s_restore_pc_hi = ttmp1
206var s_restore_exec_lo = ttmp14
207var s_restore_exec_hi = ttmp15
208var s_restore_status = ttmp4
209var s_restore_trapsts = ttmp5
210var s_restore_xnack_mask_lo = xnack_mask_lo
211var s_restore_xnack_mask_hi = xnack_mask_hi
212var s_restore_buf_rsrc0 = ttmp8
213var s_restore_buf_rsrc1 = ttmp9
214var s_restore_buf_rsrc2 = ttmp10
215var s_restore_buf_rsrc3 = ttmp11
216var s_restore_ttmps_lo = s_restore_tmp //no conflict
217var s_restore_ttmps_hi = s_restore_alloc_size //no conflict
218
219/**************************************************************************/
220/* trap handler entry points */
221/**************************************************************************/
222/* Shader Main*/
223
224shader main
225 asic(GFX9)
226 type(CS)
227
228
229 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
230 //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
231 s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
232 s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
233 s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
234 //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
235 s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
236 else
237 s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
238 end
239
240L_JUMP_TO_RESTORE:
241 s_branch L_RESTORE //restore
242
243L_SKIP_RESTORE:
244
245 s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
246 s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
247 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
248 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
249 s_cbranch_scc1 L_SAVE //this is the operation for save
250
251 // ********* Handle non-CWSR traps *******************
252if (!EMU_RUN_HACK)
253 // Illegal instruction is a non-maskable exception which blocks context save.
254 // Halt the wavefront and return from the trap.
255 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
256 s_cbranch_scc1 L_HALT_WAVE
257
258 // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
259 // Instead, halt the wavefront and return from the trap.
260 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
261 s_cbranch_scc0 L_FETCH_2ND_TRAP
262
263L_HALT_WAVE:
264 // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
265 // We cannot prevent further faults so just terminate the wavefront.
266 s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
267 s_cbranch_scc0 L_NOT_ALREADY_HALTED
268 s_endpgm
269L_NOT_ALREADY_HALTED:
270 s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
271
272 // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
273 // Rewind the PC to prevent this from occurring. The debugger compensates for this.
274 s_sub_u32 ttmp0, ttmp0, 0x8
275 s_subb_u32 ttmp1, ttmp1, 0x0
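    // {ttmp1, ttmp0} hold PC[47:0]; s_sub_u32 sets SCC on borrow and
    // s_subb_u32 consumes it, so the pair is one 64-bit PC = PC - 8.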
276
277L_FETCH_2ND_TRAP:
278 // Preserve and clear scalar XNACK state before issuing scalar reads.
279 // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
280 s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
281 s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
282 s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
283 s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
284 s_or_b32 ttmp11, ttmp11, ttmp3
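    // i.e. IB_STS[20:15] ({RCNT, FIRST_REPLAY}) is shifted left by
    // 26 - 15 = 11 bits and parked in ttmp11[31:26] until L_EXCP_CASE
    // shifts it back down and rewrites HW_REG_IB_STS.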
285
286 s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
287 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
288
289 // Read second-level TBA/TMA from first-level TMA and jump if available.
290 // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
291 // ttmp12 holds SQ_WAVE_STATUS
292 s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
293 s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
294 s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
295 s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
296 s_waitcnt lgkmcnt(0)
297 s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
298 s_waitcnt lgkmcnt(0)
299 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
300 s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
301 s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
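    // A sketch of the first-level TMA block consumed by the two loads above,
    // assuming the TMA registers hold address bits [47:8] (hence the shift):
    //   offset 0x0: uint64_t second-level TBA (trap handler entry address)
    //   offset 0x8: uint64_t second-level TMA (data pointer for that handler)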
302
303L_NO_NEXT_TRAP:
304 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
305 s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
306 s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
307 s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
308 s_addc_u32 ttmp1, ttmp1, 0
309L_EXCP_CASE:
310 s_and_b32 ttmp1, ttmp1, 0xFFFF
311
312 // Restore SQ_WAVE_IB_STS.
313 s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
314 s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
315 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
316
317 // Restore SQ_WAVE_STATUS.
318 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
319 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
320 s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status
321
322 s_rfe_b64 [ttmp0, ttmp1]
323end
324 // ********* End handling of non-CWSR traps *******************
325
326/**************************************************************************/
327/* save routine */
328/**************************************************************************/
329
330L_SAVE:
331
332if G8SR_DEBUG_TIMESTAMP
333 s_memrealtime s_g8sr_ts_save_s
334 s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
335end
336
337 s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
338
339 s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
340 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
341
342 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
343 s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
344 s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
345 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
346 s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
347 s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
348 s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
349 s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
350
351 s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
352
353 /* inform SPI the readiness and wait for SPI's go signal */
354 s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
355 s_mov_b32 s_save_exec_hi, exec_hi
356 s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
357
358if G8SR_DEBUG_TIMESTAMP
359 s_memrealtime s_g8sr_ts_sq_save_msg
360 s_waitcnt lgkmcnt(0)
361end
362
363 if (EMU_RUN_HACK)
364
365 else
366 s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
367 end
368
369 // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
370 s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
371 s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
372
373 L_SLEEP:
374 s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
375
376 if (EMU_RUN_HACK)
377
378 else
379 s_cbranch_execz L_SLEEP
380 end
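    // Handshake recap: EXEC was cleared above, MSG_SAVEWAVE asks SPI for
    // context-save space, and the loop exits once SPI writes the save-buffer
    // address into EXEC_LO/HI (aliased as s_save_spi_init_lo/hi), clearing EXECZ.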
381
382if G8SR_DEBUG_TIMESTAMP
383 s_memrealtime s_g8sr_ts_spi_wrexec
384 s_waitcnt lgkmcnt(0)
385end
386
387 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
388 //calculate wd_addr using absolute thread id
389 v_readlane_b32 s_save_tmp, v9, 0
390 s_lshr_b32 s_save_tmp, s_save_tmp, 6
391 s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
392 s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
393 s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
394 s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
395 else
396 end
397 if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
398 s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
399 s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
400 s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
401 else
402 end
403
404 // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
405 // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
406 get_vgpr_size_bytes(s_save_ttmps_lo)
407 get_sgpr_size_bytes(s_save_ttmps_hi)
408 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
409 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
410 s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
411 s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
412 s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
413 ack_sqc_store_workaround()
414 s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
415 ack_sqc_store_workaround()
416 s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
417 ack_sqc_store_workaround()
418 s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
419 ack_sqc_store_workaround()
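    // Resulting ttmp layout inside the HWREG block (offsets are relative to
    // [s_save_ttmps_lo, s_save_ttmps_hi] = save base + size(VGPR) + size(SGPR)):
    //   0x40: ttmp[6:7]   0x48: ttmp[8:11]   0x58: ttmp13   0x5C: ttmp[14:15]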
420
421 /* setup Resource Constants */
422 s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
423 s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
424 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
425 s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
426 s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
427 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
428 s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
429 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
430 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
431 s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
432 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
433
434 //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
435 s_mov_b32 s_save_m0, m0 //save M0
436
437 /* global mem offset */
438 s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
439
440
441
442
443 /* save HW registers */
444 //////////////////////////////
445
446 L_SAVE_HWREG:
447 // HWREG SR memory offset : size(VGPR)+size(SGPR)
448 get_vgpr_size_bytes(s_save_mem_offset)
449 get_sgpr_size_bytes(s_save_tmp)
450 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
451
452
453 s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
454 if (SWIZZLE_EN)
455 s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
456 else
457 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
458 end
459
460
461 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
462
463 if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
464 s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
465 s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
466 end
467
468 write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
469 write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
470 write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
471 write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
472 write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
473
474 //s_save_trapsts conflicts with s_save_alloc_size
475 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
476 write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
477
478 write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
479 write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
480
481 //using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2
482 s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
483 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
484
485
486
487 /* the first wave in the threadgroup */
488 s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
489 s_mov_b32 s_save_exec_hi, 0x0
490 s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
491
492
493 /* save SGPRs */
494 // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
495 //////////////////////////////
496
497 // SGPR SR memory offset : size(VGPR)
498 get_vgpr_size_bytes(s_save_mem_offset)
499 // TODO, change RSRC word to rearrange memory layout for SGPRS
500
501 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
502 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
503 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
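    // e.g. GPR_ALLOC.SGPR_SIZE = 6 gives (6 + 1) * 16 = 112 SGPRs to save,
    // i.e. 112 * 4 = 448 bytes when stored one dword per SGPR through SQC.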
504
505 if (SGPR_SAVE_USE_SQC)
506 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
507 else
508 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
509 end
510
511 if (SWIZZLE_EN)
512 s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
513 else
514 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
515 end
516
517
518 // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
519 //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
520 s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
521 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
522 s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
523
524 s_mov_b32 m0, 0x0 //SGPR initial index value =0
525 s_nop 0x0 //Manually inserted wait states
526 L_SAVE_SGPR_LOOP:
527 // SGPR is allocated in 16 SGPR granularity
528 s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
529 s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
530 s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
531 s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
532 s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
533 s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
534 s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
535 s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
536
537 write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
538 s_add_u32 m0, m0, 16 //next sgpr index
539 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
540 s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
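    // s_movrels adds M0 to the source register index: with m0 = 16 the first
    // s_movrels_b64 above copies s[16:17] into s[0:1], so each pass stages
    // the next 16 SGPRs into s[0:15] for write_16sgpr_to_mem to store.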
541 // restore s_save_buf_rsrc0,1
542 //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
543 s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
544
545
546
547
548 /* save first 4 VGPRs, then the LDS save can use them */
549 // each wave will alloc 4 vgprs at least...
550 /////////////////////////////////////////////////////////////////////////////////////
551
552 s_mov_b32 s_save_mem_offset, 0
553 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
554 s_mov_b32 exec_hi, 0xFFFFFFFF
555 s_mov_b32 xnack_mask_lo, 0x0
556 s_mov_b32 xnack_mask_hi, 0x0
557
558 if (SWIZZLE_EN)
559 s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
560 else
561 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
562 end
563
564
565 // VGPR Allocated in 4-GPR granularity
566
567if G8SR_VGPR_SR_IN_DWX4
568 // the const stride for DWx4 is 4*4 bytes
569 s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
570 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
571
572 buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
573
574 s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
575 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
576else
577 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
578 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
579 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
580 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
581end
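    // With ADD_TID_ENABLE set in the rsrc, each buffer_store_dword writes one
    // dword per lane (64 lanes * 4 bytes = 256 bytes), which is why v1..v3
    // land at offset:256, 256*2 and 256*3.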
582
583
584
585 /* save LDS */
586 //////////////////////////////
587
588 L_SAVE_LDS:
589
590 // Change EXEC to all threads...
591 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
592 s_mov_b32 exec_hi, 0xFFFFFFFF
593
594 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
595 s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
596 s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
597
598 s_barrier //LDS is used? wait for other waves in the same TG
599 s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
600 s_cbranch_scc0 L_SAVE_LDS_DONE
601
602 // first wave do LDS save;
603
604 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
605 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
606 s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
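    // e.g. LDS_ALLOC.LDS_SIZE = 8 (granularity is 64 dwords) gives
    // 8 * 64 dwords * 4 bytes = 2048 bytes of LDS to copy out.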
607
608 // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
609 //
610 get_vgpr_size_bytes(s_save_mem_offset)
611 get_sgpr_size_bytes(s_save_tmp)
612 s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
613 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
614
615
616 if (SWIZZLE_EN)
617 s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
618 else
619 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
620 end
621
622 s_mov_b32 m0, 0x0 //lds_offset initial value = 0
623
624
625var LDS_DMA_ENABLE = 0
626var UNROLL = 0
627if UNROLL==0 && LDS_DMA_ENABLE==1
628 s_mov_b32 s3, 256*2
629 s_nop 0
630 s_nop 0
631 s_nop 0
632 L_SAVE_LDS_LOOP:
633 //TODO: it looks like the 2 buffer_store/load clauses for save/restore will hurt performance???
634 if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
635 buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
636 buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
637 end
638
639 s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
640 s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
641 s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
642 s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
643
644elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
645 // store from highest LDS address to lowest
646 s_mov_b32 s3, 256*2
647 s_sub_u32 m0, s_save_alloc_size, s3
648 s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
649 s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-dword chunks...
650 s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
651 s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions
652 s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 // plus 3*4 bytes for the s_add/s_addc/s_setpc below
653 s_nop 0
654 s_nop 0
655 s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
656 s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
657 s_add_u32 s0, s0,s_save_alloc_size
658 s_addc_u32 s1, s1, 0
659 s_setpc_b64 s[0:1]
660
661
662 for var i =0; i< 128; i++
663 // be careful to make this a 64-byte-aligned address, which could improve performance...
664 buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
665 buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
666
667 if i!=127
668 s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2-DW inst to a 1-DW inst to improve performance, i.e. pack more LDS_DMA insts into one cacheline
669 s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
670 end
671 end
672
673else // BUFFER_STORE
674 v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
675 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
676 v_mul_i32_i24 v2, v3, 8 // tid*8
677 v_mov_b32 v3, 256*2
678 s_mov_b32 m0, 0x10000
679 s_mov_b32 s0, s_save_buf_rsrc3
680 s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
681 s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
682
683L_SAVE_LDS_LOOP_VECTOR:
684 ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
685 s_waitcnt lgkmcnt(0)
686 buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
687// s_waitcnt vmcnt(0)
688// v_add_u32 v2, vcc[0:1], v2, v3
689 v_add_u32 v2, v2, v3
690 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
691 s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
692
693 // restore rsrc3
694 s_mov_b32 s_save_buf_rsrc3, s0
695
696end
697
698L_SAVE_LDS_DONE:
699
700
701 /* save VGPRs - the rest of the VGPRs */
702 //////////////////////////////////////////////////////////////////////////////////////
703 L_SAVE_VGPR:
704 // VGPR SR memory offset: 0
705 // TODO rearrange the RSRC words to use swizzle for VGPR save...
706
707 s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
708 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
709 s_mov_b32 exec_hi, 0xFFFFFFFF
710
711 s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
712 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
713 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
714 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
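    // e.g. GPR_ALLOC.VGPR_SIZE = 15 gives (15 + 1) * 4 = 64 VGPRs; at one
    // 256-byte row per VGPR (64 lanes * 4 bytes) that is 16384 bytes of VGPR state.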
715 if (SWIZZLE_EN)
716 s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
717 else
718 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
719 end
720
721
722 // VGPR Allocated in 4-GPR granularity
723
724if G8SR_VGPR_SR_IN_DWX4
725 // the const stride for DWx4 is 4*4 bytes
726 s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
727 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
728
729 s_mov_b32 m0, 4 // skip first 4 VGPRs
730 s_cmp_lt_u32 m0, s_save_alloc_size
731 s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
732
733 s_set_gpr_idx_on m0, 0x1 // This will change M0
734 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
735L_SAVE_VGPR_LOOP:
736 v_mov_b32 v0, v0 // v0 = v[0+m0]
737 v_mov_b32 v1, v1
738 v_mov_b32 v2, v2
739 v_mov_b32 v3, v3
740
741
742 buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
743 s_add_u32 m0, m0, 4
744 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
745 s_cmp_lt_u32 m0, s_save_alloc_size
746 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
747 s_set_gpr_idx_off
748L_SAVE_VGPR_LOOP_END:
749
750 s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
751 s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
752else
753 // VGPR store using dw burst
754 s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (first 4 VGPRs were saved above)
755 s_cmp_lt_u32 m0, s_save_alloc_size
756 s_cbranch_scc0 L_SAVE_VGPR_END
757
758
759 s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
760 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
761
762 L_SAVE_VGPR_LOOP:
763 v_mov_b32 v0, v0 //v0 = v[0+m0]
764 v_mov_b32 v1, v1 //v1 = v[1+m0]
765 v_mov_b32 v2, v2 //v2 = v[2+m0]
766 v_mov_b32 v3, v3 //v3 = v[3+m0]
767
768 if(USE_MTBUF_INSTEAD_OF_MUBUF)
769 tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
770 else
771 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
772 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
773 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
774 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
775 end
776
777 s_add_u32 m0, m0, 4 //next vgpr index
778 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
779 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
780 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
781 s_set_gpr_idx_off
782end
783
784L_SAVE_VGPR_END:
785
786
787
788
789
790
791 /* S_PGM_END_SAVED */ //FIXME graphics ONLY
792 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
793 s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
794 s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
795 s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
796 s_rfe_b64 s_save_pc_lo //Return to the main shader program
797 else
798 end
799
800// Save Done timestamp
801if G8SR_DEBUG_TIMESTAMP
802 s_memrealtime s_g8sr_ts_save_d
803 // SGPR SR memory offset : size(VGPR)
804 get_vgpr_size_bytes(s_save_mem_offset)
805 s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
806 s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
807 // Need reset rsrc2??
808 s_mov_b32 m0, s_save_mem_offset
809 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
810 s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
811end
812
813
814 s_branch L_END_PGM
815
816
817
818/**************************************************************************/
819/* restore routine */
820/**************************************************************************/
821
822L_RESTORE:
823     /*      Setup Resource Constants    */
824 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
825 //calculate wd_addr using absolute thread id
826 v_readlane_b32 s_restore_tmp, v9, 0
827 s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
828 s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
829 s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
830 s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
831 s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
832 else
833 end
834
835if G8SR_DEBUG_TIMESTAMP
836 s_memrealtime s_g8sr_ts_restore_s
837 s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
838 // tma_lo/hi are sgprs 110 and 111, which are unused in the 112-SGPR allocation case...
839     s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
840     s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //back up the timestamp to ttmp0/1, since exec will be restored at the end..
841end
842
843
844
845 s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
846 s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
847 s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
848 s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
849 s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
850 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
851 s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
852 s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
853 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
854 s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
855 s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
856
857 /* global mem offset */
858// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
859
860 /* the first wave in the threadgroup */
861 s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
862 s_cbranch_scc0 L_RESTORE_VGPR
863
864 /* restore LDS */
865 //////////////////////////////
866 L_RESTORE_LDS:
867
868 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
869 s_mov_b32 exec_hi, 0xFFFFFFFF
870
871 s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
872 s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
873 s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
874 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
875 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
876 s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
877
878 // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
879 //
880 get_vgpr_size_bytes(s_restore_mem_offset)
881 get_sgpr_size_bytes(s_restore_tmp)
882 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
883 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
884
885
886 if (SWIZZLE_EN)
887 s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
888 else
889 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
890 end
891 s_mov_b32 m0, 0x0 //lds_offset initial value = 0
892
893 L_RESTORE_LDS_LOOP:
894 if (SAVE_LDS)
895 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
896 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
897 end
898 s_add_u32 m0, m0, 256*2 // 128 DW
899 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
900 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
901 s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
902
903
904 /* restore VGPRs */
905 //////////////////////////////
906 L_RESTORE_VGPR:
907 // VGPR SR memory offset : 0
908 s_mov_b32 s_restore_mem_offset, 0x0
909 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
910 s_mov_b32 exec_hi, 0xFFFFFFFF
911
912     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)        //vgpr_size
913 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
914 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
915 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
916 if (SWIZZLE_EN)
917 s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
918 else
919 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
920 end
921
922if G8SR_VGPR_SR_IN_DWX4
923 get_vgpr_size_bytes(s_restore_mem_offset)
924 s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
925
926 // the const stride for DWx4 is 4*4 bytes
927 s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
928 s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
929
930 s_mov_b32 m0, s_restore_alloc_size
931     s_set_gpr_idx_on m0, 0x8 // Note: this will change m0
932
933L_RESTORE_VGPR_LOOP:
934 buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
935 s_waitcnt vmcnt(0)
936 s_sub_u32 m0, m0, 4
937 v_mov_b32 v0, v0 // v[0+m0] = v0
938 v_mov_b32 v1, v1
939 v_mov_b32 v2, v2
940 v_mov_b32 v3, v3
941 s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
942 s_cmp_eq_u32 m0, 0x8000
943 s_cbranch_scc0 L_RESTORE_VGPR_LOOP
944 s_set_gpr_idx_off
945
946 s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
947     s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // restore const stride to 4 bytes
948
949else
950 // VGPR load using dw burst
951     s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset // restore starts at v4; v0-v3 are restored last
952 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
953     s_mov_b32       m0, 4                           //VGPR initial index value = 4
954 s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
955 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
956
957 L_RESTORE_VGPR_LOOP:
958 if(USE_MTBUF_INSTEAD_OF_MUBUF)
959 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
960 else
961 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
962 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
963 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
964 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
965 end
966 s_waitcnt vmcnt(0) //ensure data ready
967 v_mov_b32 v0, v0 //v[0+m0] = v0
968 v_mov_b32 v1, v1
969 v_mov_b32 v2, v2
970 v_mov_b32 v3, v3
971 s_add_u32 m0, m0, 4 //next vgpr index
972 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
973 s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
974 s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
975 s_set_gpr_idx_off
976                                                                     /* VGPR restore on v0-v3 */
977 if(USE_MTBUF_INSTEAD_OF_MUBUF)
978 tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
979 else
980 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
981 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
982 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
983 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
984 end
985
986end
987
988 /* restore SGPRs */
989 //////////////////////////////
990
991 // SGPR SR memory offset : size(VGPR)
992 get_vgpr_size_bytes(s_restore_mem_offset)
993 get_sgpr_size_bytes(s_restore_tmp)
994 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
995     s_sub_u32       s_restore_mem_offset, s_restore_mem_offset, 16*4        // restore SGPRs from S[n] down to S[0], in groups of 16
996 // TODO, change RSRC word to rearrange memory layout for SGPRS
997
998     s_getreg_b32    s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)                //sgpr_size
999 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
1000 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
1001
1002 if (SGPR_SAVE_USE_SQC)
1003 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
1004 else
1005 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
1006 end
1007 if (SWIZZLE_EN)
1008 s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
1009 else
1010 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
1011 end
1012
1013 s_mov_b32 m0, s_restore_alloc_size
1014
1015 L_RESTORE_SGPR_LOOP:
1016 read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
1017 s_waitcnt lgkmcnt(0) //ensure data ready
1018
1019 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
1020     s_nop 0 // wait state for the SALU M0 write -> s_movrel hazard
1021
1022 s_movreld_b64 s0, s0 //s[0+m0] = s0
1023 s_movreld_b64 s2, s2
1024 s_movreld_b64 s4, s4
1025 s_movreld_b64 s6, s6
1026 s_movreld_b64 s8, s8
1027 s_movreld_b64 s10, s10
1028 s_movreld_b64 s12, s12
1029 s_movreld_b64 s14, s14
1030
1031     s_cmp_eq_u32    m0, 0                           //scc = (m0 == 0) ? 1 : 0
1032     s_cbranch_scc0  L_RESTORE_SGPR_LOOP             //loop until all SGPR groups are restored
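    // In outline: each iteration loads 16 SGPRs into s0-s15 via
    // s_buffer_load_dwordx16, steps m0 down by 16, and the s_movreld_b64
    // sequence then scatters the pairs to s[m0+0..m0+15]; the walk runs
    // from S[n] down to S[0].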
1033
1034 /* restore HW registers */
1035 //////////////////////////////
1036 L_RESTORE_HWREG:
1037
1038
1039if G8SR_DEBUG_TIMESTAMP
1040 s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
1041 s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
1042end
1043
1044 // HWREG SR memory offset : size(VGPR)+size(SGPR)
1045 get_vgpr_size_bytes(s_restore_mem_offset)
1046 get_sgpr_size_bytes(s_restore_tmp)
1047 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
1048
1049
1050 s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
1051 if (SWIZZLE_EN)
1052 s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
1053 else
1054 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
1055 end
1056
1057 read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
1058 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
1059 read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1060 read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
1061 read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
1062 read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
1063 read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
1064 read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
1065 read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
1066 read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
1067
1068 s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
1069
1070     //for a normal save & restore, the saved PC points to the next instruction to execute, so no adjustment is needed; otherwise:
1071 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
1072 s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
1073 s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
1074 end
1075 if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
1076     s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save was hacked through s_trap, but restore is normal
1077 s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
1078 end
1079
1080 s_mov_b32 m0, s_restore_m0
1081 s_mov_b32 exec_lo, s_restore_exec_lo
1082 s_mov_b32 exec_hi, s_restore_exec_hi
1083
1084 s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
1085 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
1086 s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
1087 s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
1088 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
1089 //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
1090 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
1091
1092 // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
1093 // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
1094 get_vgpr_size_bytes(s_restore_ttmps_lo)
1095 get_sgpr_size_bytes(s_restore_ttmps_hi)
1096 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
1097 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
1098 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
1099 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
1100 s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
1101 s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
1102 s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
1103 s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
1104 s_waitcnt lgkmcnt(0)
1105
1106 //reuse s_restore_m0 as a temp register
1107 s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
1108 s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
1109 s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
1110 s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
1111 s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
1112 s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
1113 s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
1114 s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
1115 s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
1116 s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
1117 s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
1118 s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
1119
1120 s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
1121 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
1122 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
1123     s_setreg_b32    hwreg(HW_REG_STATUS),   s_restore_status        // SCC is included, and was changed by the preceding SALU instructions
1124
1125 s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
1126
1127if G8SR_DEBUG_TIMESTAMP
1128 s_memrealtime s_g8sr_ts_restore_d
1129 s_waitcnt lgkmcnt(0)
1130end
1131
1132// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
1133 s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
1134
1135
1136/**************************************************************************/
1137/* the END */
1138/**************************************************************************/
1139L_END_PGM:
1140 s_endpgm
1141
1142end
1143
1144
1145/**************************************************************************/
1146/* the helper functions */
1147/**************************************************************************/
1148
1149//Only used to save hwregs to memory
1150function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
1151 s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
1152 s_mov_b32 m0, s_mem_offset
1153 s_buffer_store_dword s, s_rsrc, m0 glc:1
1154 ack_sqc_store_workaround()
1155 s_add_u32 s_mem_offset, s_mem_offset, 4
1156 s_mov_b32 m0, exec_lo
1157end
1158
1159
1160// HWREGs are saved before SGPRs, so all HWREGs can be used.
1161function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
1162
1163 s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
1164 ack_sqc_store_workaround()
1165 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
1166 ack_sqc_store_workaround()
1167 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
1168 ack_sqc_store_workaround()
1169 s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
1170 ack_sqc_store_workaround()
1171 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
1172 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
1173end
1174
1175
1176function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
1177 s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
1178 s_add_u32 s_mem_offset, s_mem_offset, 4
1179end
1180
1181function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
1182 s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
1183 s_sub_u32 s_mem_offset, s_mem_offset, 4*16
1184end
1185
1186
1187
1188function get_lds_size_bytes(s_lds_size_byte)
1189 // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
1190 s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
1191     s_lshl_b32      s_lds_size_byte, s_lds_size_byte, 8                     //LDS size in bytes = lds_size * 64 DW * 4 bytes  // granularity 64DW
1192end
1193
1194function get_vgpr_size_bytes(s_vgpr_size_byte)
1195     s_getreg_b32    s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)    //vgpr_size
1196 s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
1197     s_lshl_b32      s_vgpr_size_byte, s_vgpr_size_byte, (2+8)       //VGPR save-area bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 bytes (non-zero value)   //FIXME for GFX, zero is possible
1198end
1199
1200function get_sgpr_size_bytes(s_sgpr_size_byte)
1201     s_getreg_b32    s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)    //sgpr_size
1202 s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
1203     s_lshl_b32      s_sgpr_size_byte, s_sgpr_size_byte, 6           //SGPR save-area bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes (non-zero value)
1204end
1205
1206function get_hwreg_size_bytes
1207 return 128 //HWREG size 128 bytes
1208end
1209
1210function ack_sqc_store_workaround
1211 if ACK_SQC_STORE
1212 s_waitcnt lgkmcnt(0)
1213 end
1214end
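
Taken together, the helpers above fix the per-wave save-area layout: VGPRs
first, then SGPRs, then 128 bytes of HWREGs, then LDS. A minimal sketch of
that arithmetic as a standalone C program (the sizes and names are
illustrative, not part of the kernel sources):

#include <stdio.h>

/* Save-area layout implied by get_vgpr_size_bytes()/get_sgpr_size_bytes()/
 * get_hwreg_size_bytes(): VGPRs | SGPRs | 128B HWREG | LDS.
 * vgpr_size/sgpr_size/lds_size are the raw HW_REG_GPR_ALLOC and
 * HW_REG_LDS_ALLOC fields the trap handler reads. */
static unsigned vgpr_bytes(unsigned vgpr_size)
{
	return (vgpr_size + 1) * 4 /* VGPRs */ * 64 /* lanes */ * 4 /* bytes */;
}

static unsigned sgpr_bytes(unsigned sgpr_size)
{
	return (sgpr_size + 1) * 16 /* SGPRs */ * 4 /* bytes */;
}

int main(void)
{
	unsigned vgpr_size = 15, sgpr_size = 6, lds_size = 8; /* example values */
	unsigned vgpr = vgpr_bytes(vgpr_size); /* 64 VGPRs -> 16384 bytes */
	unsigned sgpr = sgpr_bytes(sgpr_size); /* 112 SGPRs -> 448 bytes */

	printf("SGPRs @ %u, HWREG @ %u, LDS (%u bytes) @ %u\n",
	       vgpr, vgpr + sgpr, lds_size * 64 * 4, vgpr + sgpr + 128);
	return 0;
}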
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 59808a39ecf4..f64c5551cdba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -233,7 +233,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
233 pr_debug("Queue Size: 0x%llX, %u\n", 233 pr_debug("Queue Size: 0x%llX, %u\n",
234 q_properties->queue_size, args->ring_size); 234 q_properties->queue_size, args->ring_size);
235 235
236 pr_debug("Queue r/w Pointers: %p, %p\n", 236 pr_debug("Queue r/w Pointers: %px, %px\n",
237 q_properties->read_ptr, 237 q_properties->read_ptr,
238 q_properties->write_ptr); 238 q_properties->write_ptr);
239 239
@@ -292,8 +292,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
292 292
293 293
294 /* Return gpu_id as doorbell offset for mmap usage */ 294 /* Return gpu_id as doorbell offset for mmap usage */
295 args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); 295 args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL;
296 args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
296 args->doorbell_offset <<= PAGE_SHIFT; 297 args->doorbell_offset <<= PAGE_SHIFT;
298 if (KFD_IS_SOC15(dev->device_info->asic_family))
299 /* On SOC15 ASICs, doorbell allocation must be
300 * per-device, and independent from the per-process
301 * queue_id. Return the doorbell offset within the
302 * doorbell aperture to user mode.
303 */
304 args->doorbell_offset |= q_properties.doorbell_off;
297 305
298 mutex_unlock(&p->mutex); 306 mutex_unlock(&p->mutex);
299 307
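
A user-space sketch (not part of this patch) of how the returned offset is
meant to be consumed: the page-aligned bits select the mmap target, and on
SOC15 the low bits locate the doorbell inside the mapped slice.

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kfd_ioctl.h>

/* slice_size is the per-process doorbell allocation; error handling and
 * the queue setup around AMDKFD_IOC_CREATE_QUEUE are elided. */
volatile uint32_t *queue_doorbell(int kfd_fd,
				  struct kfd_ioctl_create_queue_args *args,
				  size_t slice_size)
{
	uint64_t page = (uint64_t)sysconf(_SC_PAGESIZE);
	uint8_t *db;

	if (ioctl(kfd_fd, AMDKFD_IOC_CREATE_QUEUE, args))
		return NULL;

	db = mmap(NULL, slice_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		  kfd_fd, args->doorbell_offset & ~(page - 1));
	if (db == MAP_FAILED)
		return NULL;

	/* the low bits are zero on pre-SOC15 ASICs */
	return (volatile uint32_t *)(db + (args->doorbell_offset & (page - 1)));
}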
@@ -1296,8 +1304,8 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
1296 return -EINVAL; 1304 return -EINVAL;
1297 } 1305 }
1298 1306
1299 devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), 1307 devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
1300 GFP_KERNEL); 1308 GFP_KERNEL);
1301 if (!devices_arr) 1309 if (!devices_arr)
1302 return -ENOMEM; 1310 return -ENOMEM;
1303 1311
@@ -1405,8 +1413,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
1405 return -EINVAL; 1413 return -EINVAL;
1406 } 1414 }
1407 1415
1408 devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), 1416 devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
1409 GFP_KERNEL); 1417 GFP_KERNEL);
1410 if (!devices_arr) 1418 if (!devices_arr)
1411 return -ENOMEM; 1419 return -ENOMEM;
1412 1420
@@ -1645,23 +1653,33 @@ err_i1:
1645static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) 1653static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
1646{ 1654{
1647 struct kfd_process *process; 1655 struct kfd_process *process;
1656 struct kfd_dev *dev = NULL;
1657 unsigned long vm_pgoff;
1658 unsigned int gpu_id;
1648 1659
1649 process = kfd_get_process(current); 1660 process = kfd_get_process(current);
1650 if (IS_ERR(process)) 1661 if (IS_ERR(process))
1651 return PTR_ERR(process); 1662 return PTR_ERR(process);
1652 1663
1653 if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == 1664 vm_pgoff = vma->vm_pgoff;
1654 KFD_MMAP_DOORBELL_MASK) { 1665 vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff);
1655 vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; 1666 gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff);
1656 return kfd_doorbell_mmap(process, vma); 1667 if (gpu_id)
1657 } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == 1668 dev = kfd_device_by_id(gpu_id);
1658 KFD_MMAP_EVENTS_MASK) { 1669
1659 vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; 1670 switch (vm_pgoff & KFD_MMAP_TYPE_MASK) {
1671 case KFD_MMAP_TYPE_DOORBELL:
1672 if (!dev)
1673 return -ENODEV;
1674 return kfd_doorbell_mmap(dev, process, vma);
1675
1676 case KFD_MMAP_TYPE_EVENTS:
1660 return kfd_event_mmap(process, vma); 1677 return kfd_event_mmap(process, vma);
1661 } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == 1678
1662 KFD_MMAP_RESERVED_MEM_MASK) { 1679 case KFD_MMAP_TYPE_RESERVED_MEM:
1663 vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; 1680 if (!dev)
1664 return kfd_reserved_mem_mmap(process, vma); 1681 return -ENODEV;
1682 return kfd_reserved_mem_mmap(dev, process, vma);
1665 } 1683 }
1666 1684
1667 return -EFAULT; 1685 return -EFAULT;
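
The KFD_MMAP_* helpers used above are defined in kfd_priv.h (changed
elsewhere in this series). A self-consistent sketch of the kind of layout
they imply; the field positions below are illustrative and may not match
the header exactly:

/* vm_pgoff carved into type | gpu_id | offset; the shifts are expressed
 * relative to PAGE_SHIFT because they operate on a page offset. */
#define KFD_MMAP_TYPE_SHIFT		(62 - PAGE_SHIFT)
#define KFD_MMAP_TYPE_MASK		(0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_DOORBELL		(0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_EVENTS		(0x2ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_RESERVED_MEM	(0x1ULL << KFD_MMAP_TYPE_SHIFT)

#define KFD_MMAP_GPU_ID_SHIFT		(46 - PAGE_SHIFT)
#define KFD_MMAP_GPU_ID_MASK		(0xFFFFULL << KFD_MMAP_GPU_ID_SHIFT)
#define KFD_MMAP_GPU_ID(gpu_id)		((((u64)gpu_id) << KFD_MMAP_GPU_ID_SHIFT) \
					 & KFD_MMAP_GPU_ID_MASK)
#define KFD_MMAP_GPU_ID_GET(offset)	(((offset) & KFD_MMAP_GPU_ID_MASK) \
					 >> KFD_MMAP_GPU_ID_SHIFT)

#define KFD_MMAP_OFFSET_VALUE_MASK	(~(KFD_MMAP_TYPE_MASK | KFD_MMAP_GPU_ID_MASK))
#define KFD_MMAP_OFFSET_VALUE_GET(offset) ((offset) & KFD_MMAP_OFFSET_VALUE_MASK)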
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 4f126ef6139b..296b3f230280 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -132,6 +132,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
132#define fiji_cache_info carrizo_cache_info 132#define fiji_cache_info carrizo_cache_info
133#define polaris10_cache_info carrizo_cache_info 133#define polaris10_cache_info carrizo_cache_info
134#define polaris11_cache_info carrizo_cache_info 134#define polaris11_cache_info carrizo_cache_info
135/* TODO - check & update Vega10 cache details */
136#define vega10_cache_info carrizo_cache_info
137#define raven_cache_info carrizo_cache_info
135 138
136static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, 139static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
137 struct crat_subtype_computeunit *cu) 140 struct crat_subtype_computeunit *cu)
@@ -603,6 +606,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
603 pcache_info = polaris11_cache_info; 606 pcache_info = polaris11_cache_info;
604 num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); 607 num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
605 break; 608 break;
609 case CHIP_VEGA10:
610 pcache_info = vega10_cache_info;
611 num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
612 break;
613 case CHIP_RAVEN:
614 pcache_info = raven_cache_info;
615 num_of_cache_types = ARRAY_SIZE(raven_cache_info);
616 break;
606 default: 617 default:
607 return -EINVAL; 618 return -EINVAL;
608 } 619 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 3346699960dd..7ee6cec2c060 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -20,16 +20,13 @@
20 * OTHER DEALINGS IN THE SOFTWARE. 20 * OTHER DEALINGS IN THE SOFTWARE.
21 */ 21 */
22 22
23#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
24#include <linux/amd-iommu.h>
25#endif
26#include <linux/bsearch.h> 23#include <linux/bsearch.h>
27#include <linux/pci.h> 24#include <linux/pci.h>
28#include <linux/slab.h> 25#include <linux/slab.h>
29#include "kfd_priv.h" 26#include "kfd_priv.h"
30#include "kfd_device_queue_manager.h" 27#include "kfd_device_queue_manager.h"
31#include "kfd_pm4_headers_vi.h" 28#include "kfd_pm4_headers_vi.h"
32#include "cwsr_trap_handler_gfx8.asm" 29#include "cwsr_trap_handler.h"
33#include "kfd_iommu.h" 30#include "kfd_iommu.h"
34 31
35#define MQD_SIZE_ALIGNED 768 32#define MQD_SIZE_ALIGNED 768
@@ -41,6 +38,7 @@ static const struct kfd_device_info kaveri_device_info = {
41 .max_pasid_bits = 16, 38 .max_pasid_bits = 16,
42 /* max num of queues for KV.TODO should be a dynamic value */ 39 /* max num of queues for KV.TODO should be a dynamic value */
43 .max_no_of_hqd = 24, 40 .max_no_of_hqd = 24,
41 .doorbell_size = 4,
44 .ih_ring_entry_size = 4 * sizeof(uint32_t), 42 .ih_ring_entry_size = 4 * sizeof(uint32_t),
45 .event_interrupt_class = &event_interrupt_class_cik, 43 .event_interrupt_class = &event_interrupt_class_cik,
46 .num_of_watch_points = 4, 44 .num_of_watch_points = 4,
@@ -55,6 +53,7 @@ static const struct kfd_device_info carrizo_device_info = {
55 .max_pasid_bits = 16, 53 .max_pasid_bits = 16,
56 /* max num of queues for CZ.TODO should be a dynamic value */ 54 /* max num of queues for CZ.TODO should be a dynamic value */
57 .max_no_of_hqd = 24, 55 .max_no_of_hqd = 24,
56 .doorbell_size = 4,
58 .ih_ring_entry_size = 4 * sizeof(uint32_t), 57 .ih_ring_entry_size = 4 * sizeof(uint32_t),
59 .event_interrupt_class = &event_interrupt_class_cik, 58 .event_interrupt_class = &event_interrupt_class_cik,
60 .num_of_watch_points = 4, 59 .num_of_watch_points = 4,
@@ -70,6 +69,7 @@ static const struct kfd_device_info hawaii_device_info = {
70 .max_pasid_bits = 16, 69 .max_pasid_bits = 16,
71 /* max num of queues for KV.TODO should be a dynamic value */ 70 /* max num of queues for KV.TODO should be a dynamic value */
72 .max_no_of_hqd = 24, 71 .max_no_of_hqd = 24,
72 .doorbell_size = 4,
73 .ih_ring_entry_size = 4 * sizeof(uint32_t), 73 .ih_ring_entry_size = 4 * sizeof(uint32_t),
74 .event_interrupt_class = &event_interrupt_class_cik, 74 .event_interrupt_class = &event_interrupt_class_cik,
75 .num_of_watch_points = 4, 75 .num_of_watch_points = 4,
@@ -83,6 +83,7 @@ static const struct kfd_device_info tonga_device_info = {
83 .asic_family = CHIP_TONGA, 83 .asic_family = CHIP_TONGA,
84 .max_pasid_bits = 16, 84 .max_pasid_bits = 16,
85 .max_no_of_hqd = 24, 85 .max_no_of_hqd = 24,
86 .doorbell_size = 4,
86 .ih_ring_entry_size = 4 * sizeof(uint32_t), 87 .ih_ring_entry_size = 4 * sizeof(uint32_t),
87 .event_interrupt_class = &event_interrupt_class_cik, 88 .event_interrupt_class = &event_interrupt_class_cik,
88 .num_of_watch_points = 4, 89 .num_of_watch_points = 4,
@@ -96,6 +97,7 @@ static const struct kfd_device_info tonga_vf_device_info = {
96 .asic_family = CHIP_TONGA, 97 .asic_family = CHIP_TONGA,
97 .max_pasid_bits = 16, 98 .max_pasid_bits = 16,
98 .max_no_of_hqd = 24, 99 .max_no_of_hqd = 24,
100 .doorbell_size = 4,
99 .ih_ring_entry_size = 4 * sizeof(uint32_t), 101 .ih_ring_entry_size = 4 * sizeof(uint32_t),
100 .event_interrupt_class = &event_interrupt_class_cik, 102 .event_interrupt_class = &event_interrupt_class_cik,
101 .num_of_watch_points = 4, 103 .num_of_watch_points = 4,
@@ -109,6 +111,7 @@ static const struct kfd_device_info fiji_device_info = {
109 .asic_family = CHIP_FIJI, 111 .asic_family = CHIP_FIJI,
110 .max_pasid_bits = 16, 112 .max_pasid_bits = 16,
111 .max_no_of_hqd = 24, 113 .max_no_of_hqd = 24,
114 .doorbell_size = 4,
112 .ih_ring_entry_size = 4 * sizeof(uint32_t), 115 .ih_ring_entry_size = 4 * sizeof(uint32_t),
113 .event_interrupt_class = &event_interrupt_class_cik, 116 .event_interrupt_class = &event_interrupt_class_cik,
114 .num_of_watch_points = 4, 117 .num_of_watch_points = 4,
@@ -122,6 +125,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
122 .asic_family = CHIP_FIJI, 125 .asic_family = CHIP_FIJI,
123 .max_pasid_bits = 16, 126 .max_pasid_bits = 16,
124 .max_no_of_hqd = 24, 127 .max_no_of_hqd = 24,
128 .doorbell_size = 4,
125 .ih_ring_entry_size = 4 * sizeof(uint32_t), 129 .ih_ring_entry_size = 4 * sizeof(uint32_t),
126 .event_interrupt_class = &event_interrupt_class_cik, 130 .event_interrupt_class = &event_interrupt_class_cik,
127 .num_of_watch_points = 4, 131 .num_of_watch_points = 4,
@@ -136,6 +140,7 @@ static const struct kfd_device_info polaris10_device_info = {
136 .asic_family = CHIP_POLARIS10, 140 .asic_family = CHIP_POLARIS10,
137 .max_pasid_bits = 16, 141 .max_pasid_bits = 16,
138 .max_no_of_hqd = 24, 142 .max_no_of_hqd = 24,
143 .doorbell_size = 4,
139 .ih_ring_entry_size = 4 * sizeof(uint32_t), 144 .ih_ring_entry_size = 4 * sizeof(uint32_t),
140 .event_interrupt_class = &event_interrupt_class_cik, 145 .event_interrupt_class = &event_interrupt_class_cik,
141 .num_of_watch_points = 4, 146 .num_of_watch_points = 4,
@@ -149,6 +154,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
149 .asic_family = CHIP_POLARIS10, 154 .asic_family = CHIP_POLARIS10,
150 .max_pasid_bits = 16, 155 .max_pasid_bits = 16,
151 .max_no_of_hqd = 24, 156 .max_no_of_hqd = 24,
157 .doorbell_size = 4,
152 .ih_ring_entry_size = 4 * sizeof(uint32_t), 158 .ih_ring_entry_size = 4 * sizeof(uint32_t),
153 .event_interrupt_class = &event_interrupt_class_cik, 159 .event_interrupt_class = &event_interrupt_class_cik,
154 .num_of_watch_points = 4, 160 .num_of_watch_points = 4,
@@ -162,6 +168,7 @@ static const struct kfd_device_info polaris11_device_info = {
162 .asic_family = CHIP_POLARIS11, 168 .asic_family = CHIP_POLARIS11,
163 .max_pasid_bits = 16, 169 .max_pasid_bits = 16,
164 .max_no_of_hqd = 24, 170 .max_no_of_hqd = 24,
171 .doorbell_size = 4,
165 .ih_ring_entry_size = 4 * sizeof(uint32_t), 172 .ih_ring_entry_size = 4 * sizeof(uint32_t),
166 .event_interrupt_class = &event_interrupt_class_cik, 173 .event_interrupt_class = &event_interrupt_class_cik,
167 .num_of_watch_points = 4, 174 .num_of_watch_points = 4,
@@ -171,6 +178,34 @@ static const struct kfd_device_info polaris11_device_info = {
171 .needs_pci_atomics = true, 178 .needs_pci_atomics = true,
172}; 179};
173 180
181static const struct kfd_device_info vega10_device_info = {
182 .asic_family = CHIP_VEGA10,
183 .max_pasid_bits = 16,
184 .max_no_of_hqd = 24,
185 .doorbell_size = 8,
186 .ih_ring_entry_size = 8 * sizeof(uint32_t),
187 .event_interrupt_class = &event_interrupt_class_v9,
188 .num_of_watch_points = 4,
189 .mqd_size_aligned = MQD_SIZE_ALIGNED,
190 .supports_cwsr = true,
191 .needs_iommu_device = false,
192 .needs_pci_atomics = false,
193};
194
195static const struct kfd_device_info vega10_vf_device_info = {
196 .asic_family = CHIP_VEGA10,
197 .max_pasid_bits = 16,
198 .max_no_of_hqd = 24,
199 .doorbell_size = 8,
200 .ih_ring_entry_size = 8 * sizeof(uint32_t),
201 .event_interrupt_class = &event_interrupt_class_v9,
202 .num_of_watch_points = 4,
203 .mqd_size_aligned = MQD_SIZE_ALIGNED,
204 .supports_cwsr = true,
205 .needs_iommu_device = false,
206 .needs_pci_atomics = false,
207};
208
174 209
175struct kfd_deviceid { 210struct kfd_deviceid {
176 unsigned short did; 211 unsigned short did;
@@ -250,6 +285,15 @@ static const struct kfd_deviceid supported_devices[] = {
250 { 0x67EB, &polaris11_device_info }, /* Polaris11 */ 285 { 0x67EB, &polaris11_device_info }, /* Polaris11 */
251 { 0x67EF, &polaris11_device_info }, /* Polaris11 */ 286 { 0x67EF, &polaris11_device_info }, /* Polaris11 */
252 { 0x67FF, &polaris11_device_info }, /* Polaris11 */ 287 { 0x67FF, &polaris11_device_info }, /* Polaris11 */
288 { 0x6860, &vega10_device_info }, /* Vega10 */
289 { 0x6861, &vega10_device_info }, /* Vega10 */
290 { 0x6862, &vega10_device_info }, /* Vega10 */
291 { 0x6863, &vega10_device_info }, /* Vega10 */
292 { 0x6864, &vega10_device_info }, /* Vega10 */
293 { 0x6867, &vega10_device_info }, /* Vega10 */
294 { 0x6868, &vega10_device_info }, /* Vega10 */
295 { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/
296 { 0x687F, &vega10_device_info }, /* Vega10 */
253}; 297};
254 298
255static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, 299static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -279,7 +323,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
279 struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) 323 struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
280{ 324{
281 struct kfd_dev *kfd; 325 struct kfd_dev *kfd;
282 326 int ret;
283 const struct kfd_device_info *device_info = 327 const struct kfd_device_info *device_info =
284 lookup_device_info(pdev->device); 328 lookup_device_info(pdev->device);
285 329
@@ -288,19 +332,18 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
288 return NULL; 332 return NULL;
289 } 333 }
290 334
291 if (device_info->needs_pci_atomics) { 335 /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
292 /* Allow BIF to recode atomics to PCIe 3.0 336 * 32 and 64-bit requests are possible and must be
293 * AtomicOps. 32 and 64-bit requests are possible and 337 * supported.
294 * must be supported. 338 */
295 */ 339 ret = pci_enable_atomic_ops_to_root(pdev,
296 if (pci_enable_atomic_ops_to_root(pdev, 340 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
297 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 341 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
298 PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) { 342 if (device_info->needs_pci_atomics && ret < 0) {
299 dev_info(kfd_device, 343 dev_info(kfd_device,
300 "skipped device %x:%x, PCI rejects atomics", 344 "skipped device %x:%x, PCI rejects atomics\n",
301 pdev->vendor, pdev->device); 345 pdev->vendor, pdev->device);
302 return NULL; 346 return NULL;
303 }
304 } 347 }
305 348
306 kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); 349 kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
@@ -323,10 +366,16 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
323static void kfd_cwsr_init(struct kfd_dev *kfd) 366static void kfd_cwsr_init(struct kfd_dev *kfd)
324{ 367{
325 if (cwsr_enable && kfd->device_info->supports_cwsr) { 368 if (cwsr_enable && kfd->device_info->supports_cwsr) {
326 BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); 369 if (kfd->device_info->asic_family < CHIP_VEGA10) {
370 BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
371 kfd->cwsr_isa = cwsr_trap_gfx8_hex;
372 kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
373 } else {
374 BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
375 kfd->cwsr_isa = cwsr_trap_gfx9_hex;
376 kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
377 }
327 378
328 kfd->cwsr_isa = cwsr_trap_gfx8_hex;
329 kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
330 kfd->cwsr_enabled = true; 379 kfd->cwsr_enabled = true;
331 } 380 }
332} 381}
@@ -541,6 +590,44 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
541 spin_unlock(&kfd->interrupt_lock); 590 spin_unlock(&kfd->interrupt_lock);
542} 591}
543 592
593int kgd2kfd_quiesce_mm(struct mm_struct *mm)
594{
595 struct kfd_process *p;
596 int r;
597
598 /* Because we are called from arbitrary context (workqueue) as opposed
599 * to process context, kfd_process could attempt to exit while we are
600	 * running, so the lookup function increments the process ref count.
601 */
602 p = kfd_lookup_process_by_mm(mm);
603 if (!p)
604 return -ESRCH;
605
606 r = kfd_process_evict_queues(p);
607
608 kfd_unref_process(p);
609 return r;
610}
611
612int kgd2kfd_resume_mm(struct mm_struct *mm)
613{
614 struct kfd_process *p;
615 int r;
616
617 /* Because we are called from arbitrary context (workqueue) as opposed
618 * to process context, kfd_process could attempt to exit while we are
619	 * running, so the lookup function increments the process ref count.
620 */
621 p = kfd_lookup_process_by_mm(mm);
622 if (!p)
623 return -ESRCH;
624
625 r = kfd_process_restore_queues(p);
626
627 kfd_unref_process(p);
628 return r;
629}
630
544/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will 631/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
545 * prepare for safe eviction of KFD BOs that belong to the specified 632 * prepare for safe eviction of KFD BOs that belong to the specified
546 * process. 633 * process.
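
The two entry points pair up around userptr invalidation. A rough sketch of
the intended calling pattern on the amdgpu side (simplified; the real hook
is the amdgpu MMU-notifier path for userptr BOs and does considerably more):

/* illustrative only, not the actual amdgpu_mn code */
static void example_invalidate_userptr(struct mm_struct *mm)
{
	/* stop the process's queues so no GPU access races the
	 * invalidation; safe to call from a workqueue */
	if (kgd2kfd_quiesce_mm(mm))
		return;

	/* ... invalidate/re-pin the userptr mappings here ... */

	kgd2kfd_resume_mm(mm);	/* restart the queues once mappings are valid */
}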
@@ -652,7 +739,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
652 if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) 739 if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
653 return -ENOMEM; 740 return -ENOMEM;
654 741
655 *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); 742 *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
656 if ((*mem_obj) == NULL) 743 if ((*mem_obj) == NULL)
657 return -ENOMEM; 744 return -ENOMEM;
658 745
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d55d29d31da4..668ad07ebe1f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -110,6 +110,57 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
110 qpd->sh_mem_bases); 110 qpd->sh_mem_bases);
111} 111}
112 112
113static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
114{
115 struct kfd_dev *dev = qpd->dqm->dev;
116
117 if (!KFD_IS_SOC15(dev->device_info->asic_family)) {
118 /* On pre-SOC15 chips we need to use the queue ID to
119 * preserve the user mode ABI.
120 */
121 q->doorbell_id = q->properties.queue_id;
122 } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
123 /* For SDMA queues on SOC15, use static doorbell
124 * assignments based on the engine and queue.
125 */
126 q->doorbell_id = dev->shared_resources.sdma_doorbell
127 [q->properties.sdma_engine_id]
128 [q->properties.sdma_queue_id];
129 } else {
130 /* For CP queues on SOC15 reserve a free doorbell ID */
131 unsigned int found;
132
133 found = find_first_zero_bit(qpd->doorbell_bitmap,
134 KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
135 if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
136 pr_debug("No doorbells available");
137 return -EBUSY;
138 }
139 set_bit(found, qpd->doorbell_bitmap);
140 q->doorbell_id = found;
141 }
142
143 q->properties.doorbell_off =
144 kfd_doorbell_id_to_offset(dev, q->process,
145 q->doorbell_id);
146
147 return 0;
148}
149
150static void deallocate_doorbell(struct qcm_process_device *qpd,
151 struct queue *q)
152{
153 unsigned int old;
154 struct kfd_dev *dev = qpd->dqm->dev;
155
156 if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
157 q->properties.type == KFD_QUEUE_TYPE_SDMA)
158 return;
159
160 old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
161 WARN_ON(!old);
162}
163
113static int allocate_vmid(struct device_queue_manager *dqm, 164static int allocate_vmid(struct device_queue_manager *dqm,
114 struct qcm_process_device *qpd, 165 struct qcm_process_device *qpd,
115 struct queue *q) 166 struct queue *q)
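
allocate_doorbell() leaves the offset computation to
kfd_doorbell_id_to_offset() in kfd_doorbell.c; a sketch of the arithmetic
it performs (see the tail of this patch; field names approximate):

/* returns a dword offset into the doorbell aperture: the KGD-reserved
 * prefix, plus the process's slice, plus the doorbell within the slice */
static unsigned int doorbell_id_to_offset_sketch(struct kfd_dev *kfd,
					struct kfd_process *process,
					unsigned int doorbell_id)
{
	return kfd->doorbell_id_offset +
	       process->doorbell_index *
			kfd_doorbell_process_slice(kfd) / sizeof(u32) +
	       doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
}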
@@ -145,15 +196,19 @@ static int allocate_vmid(struct device_queue_manager *dqm,
145static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, 196static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
146 struct qcm_process_device *qpd) 197 struct qcm_process_device *qpd)
147{ 198{
148 uint32_t len; 199 const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf;
200 int ret;
149 201
150 if (!qpd->ib_kaddr) 202 if (!qpd->ib_kaddr)
151 return -ENOMEM; 203 return -ENOMEM;
152 204
153 len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); 205 ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
206 if (ret)
207 return ret;
154 208
155 return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, 209 return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
156 qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); 210 qpd->ib_base, (uint32_t *)qpd->ib_kaddr,
211 pmf->release_mem_size / sizeof(uint32_t));
157} 212}
158 213
159static void deallocate_vmid(struct device_queue_manager *dqm, 214static void deallocate_vmid(struct device_queue_manager *dqm,
@@ -301,10 +356,14 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
301 if (retval) 356 if (retval)
302 return retval; 357 return retval;
303 358
359 retval = allocate_doorbell(qpd, q);
360 if (retval)
361 goto out_deallocate_hqd;
362
304 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 363 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
305 &q->gart_mqd_addr, &q->properties); 364 &q->gart_mqd_addr, &q->properties);
306 if (retval) 365 if (retval)
307 goto out_deallocate_hqd; 366 goto out_deallocate_doorbell;
308 367
309 pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", 368 pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
310 q->pipe, q->queue); 369 q->pipe, q->queue);
@@ -324,6 +383,8 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
324 383
325out_uninit_mqd: 384out_uninit_mqd:
326 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); 385 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
386out_deallocate_doorbell:
387 deallocate_doorbell(qpd, q);
327out_deallocate_hqd: 388out_deallocate_hqd:
328 deallocate_hqd(dqm, q); 389 deallocate_hqd(dqm, q);
329 390
@@ -357,6 +418,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
357 } 418 }
358 dqm->total_queue_count--; 419 dqm->total_queue_count--;
359 420
421 deallocate_doorbell(qpd, q);
422
360 retval = mqd->destroy_mqd(mqd, q->mqd, 423 retval = mqd->destroy_mqd(mqd, q->mqd,
361 KFD_PREEMPT_TYPE_WAVEFRONT_RESET, 424 KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
362 KFD_UNMAP_LATENCY_MS, 425 KFD_UNMAP_LATENCY_MS,
@@ -861,6 +924,10 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
861 q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; 924 q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
862 q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; 925 q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
863 926
927 retval = allocate_doorbell(qpd, q);
928 if (retval)
929 goto out_deallocate_sdma_queue;
930
864 pr_debug("SDMA id is: %d\n", q->sdma_id); 931 pr_debug("SDMA id is: %d\n", q->sdma_id);
865 pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); 932 pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
866 pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); 933 pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -869,7 +936,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
869 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 936 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
870 &q->gart_mqd_addr, &q->properties); 937 &q->gart_mqd_addr, &q->properties);
871 if (retval) 938 if (retval)
872 goto out_deallocate_sdma_queue; 939 goto out_deallocate_doorbell;
873 940
874 retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); 941 retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
875 if (retval) 942 if (retval)
@@ -879,6 +946,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
879 946
880out_uninit_mqd: 947out_uninit_mqd:
881 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); 948 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
949out_deallocate_doorbell:
950 deallocate_doorbell(qpd, q);
882out_deallocate_sdma_queue: 951out_deallocate_sdma_queue:
883 deallocate_sdma_queue(dqm, q->sdma_id); 952 deallocate_sdma_queue(dqm, q->sdma_id);
884 953
@@ -1070,12 +1139,17 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
1070 q->properties.sdma_engine_id = 1139 q->properties.sdma_engine_id =
1071 q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; 1140 q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
1072 } 1141 }
1142
1143 retval = allocate_doorbell(qpd, q);
1144 if (retval)
1145 goto out_deallocate_sdma_queue;
1146
1073 mqd = dqm->ops.get_mqd_manager(dqm, 1147 mqd = dqm->ops.get_mqd_manager(dqm,
1074 get_mqd_type_from_queue_type(q->properties.type)); 1148 get_mqd_type_from_queue_type(q->properties.type));
1075 1149
1076 if (!mqd) { 1150 if (!mqd) {
1077 retval = -ENOMEM; 1151 retval = -ENOMEM;
1078 goto out_deallocate_sdma_queue; 1152 goto out_deallocate_doorbell;
1079 } 1153 }
1080 /* 1154 /*
1081 * Eviction state logic: we only mark active queues as evicted 1155 * Eviction state logic: we only mark active queues as evicted
@@ -1093,7 +1167,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
1093 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 1167 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
1094 &q->gart_mqd_addr, &q->properties); 1168 &q->gart_mqd_addr, &q->properties);
1095 if (retval) 1169 if (retval)
1096 goto out_deallocate_sdma_queue; 1170 goto out_deallocate_doorbell;
1097 1171
1098 list_add(&q->list, &qpd->queues_list); 1172 list_add(&q->list, &qpd->queues_list);
1099 qpd->queue_count++; 1173 qpd->queue_count++;
@@ -1117,6 +1191,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
1117 mutex_unlock(&dqm->lock); 1191 mutex_unlock(&dqm->lock);
1118 return retval; 1192 return retval;
1119 1193
1194out_deallocate_doorbell:
1195 deallocate_doorbell(qpd, q);
1120out_deallocate_sdma_queue: 1196out_deallocate_sdma_queue:
1121 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) 1197 if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
1122 deallocate_sdma_queue(dqm, q->sdma_id); 1198 deallocate_sdma_queue(dqm, q->sdma_id);
@@ -1257,6 +1333,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
1257 goto failed; 1333 goto failed;
1258 } 1334 }
1259 1335
1336 deallocate_doorbell(qpd, q);
1337
1260 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { 1338 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
1261 dqm->sdma_queue_count--; 1339 dqm->sdma_queue_count--;
1262 deallocate_sdma_queue(dqm, q->sdma_id); 1340 deallocate_sdma_queue(dqm, q->sdma_id);
@@ -1308,7 +1386,10 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
1308 void __user *alternate_aperture_base, 1386 void __user *alternate_aperture_base,
1309 uint64_t alternate_aperture_size) 1387 uint64_t alternate_aperture_size)
1310{ 1388{
1311 bool retval; 1389 bool retval = true;
1390
1391 if (!dqm->asic_ops.set_cache_memory_policy)
1392 return retval;
1312 1393
1313 mutex_lock(&dqm->lock); 1394 mutex_lock(&dqm->lock);
1314 1395
@@ -1577,6 +1658,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
1577 case CHIP_POLARIS11: 1658 case CHIP_POLARIS11:
1578 device_queue_manager_init_vi_tonga(&dqm->asic_ops); 1659 device_queue_manager_init_vi_tonga(&dqm->asic_ops);
1579 break; 1660 break;
1661
1662 case CHIP_VEGA10:
1663 case CHIP_RAVEN:
1664 device_queue_manager_init_v9(&dqm->asic_ops);
1665 break;
1580 default: 1666 default:
1581 WARN(1, "Unexpected ASIC family %u", 1667 WARN(1, "Unexpected ASIC family %u",
1582 dev->device_info->asic_family); 1668 dev->device_info->asic_family);
@@ -1627,6 +1713,18 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
1627 int pipe, queue; 1713 int pipe, queue;
1628 int r = 0; 1714 int r = 0;
1629 1715
1716 r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
1717 KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs);
1718 if (!r) {
1719 seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n",
1720 KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
1721 KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
1722 KFD_CIK_HIQ_QUEUE);
1723 seq_reg_dump(m, dump, n_regs);
1724
1725 kfree(dump);
1726 }
1727
1630 for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { 1728 for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
1631 int pipe_offset = pipe * get_queues_per_pipe(dqm); 1729 int pipe_offset = pipe * get_queues_per_pipe(dqm);
1632 1730
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 412beff3281d..59a6b1956932 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -200,6 +200,8 @@ void device_queue_manager_init_vi(
200 struct device_queue_manager_asic_ops *asic_ops); 200 struct device_queue_manager_asic_ops *asic_ops);
201void device_queue_manager_init_vi_tonga( 201void device_queue_manager_init_vi_tonga(
202 struct device_queue_manager_asic_ops *asic_ops); 202 struct device_queue_manager_asic_ops *asic_ops);
203void device_queue_manager_init_v9(
204 struct device_queue_manager_asic_ops *asic_ops);
203void program_sh_mem_settings(struct device_queue_manager *dqm, 205void program_sh_mem_settings(struct device_queue_manager *dqm,
204 struct qcm_process_device *qpd); 206 struct qcm_process_device *qpd);
205unsigned int get_queues_num(struct device_queue_manager *dqm); 207unsigned int get_queues_num(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
new file mode 100644
index 000000000000..79e5bcf6367c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -0,0 +1,84 @@
1/*
2 * Copyright 2016-2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#include "kfd_device_queue_manager.h"
25#include "vega10_enum.h"
26#include "gc/gc_9_0_offset.h"
27#include "gc/gc_9_0_sh_mask.h"
28#include "sdma0/sdma0_4_0_sh_mask.h"
29
30static int update_qpd_v9(struct device_queue_manager *dqm,
31 struct qcm_process_device *qpd);
32static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
33 struct qcm_process_device *qpd);
34
35void device_queue_manager_init_v9(
36 struct device_queue_manager_asic_ops *asic_ops)
37{
38 asic_ops->update_qpd = update_qpd_v9;
39 asic_ops->init_sdma_vm = init_sdma_vm_v9;
40}
41
42static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
43{
44 uint32_t shared_base = pdd->lds_base >> 48;
45 uint32_t private_base = pdd->scratch_base >> 48;
46
47 return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
48 private_base;
49}
50
51static int update_qpd_v9(struct device_queue_manager *dqm,
52 struct qcm_process_device *qpd)
53{
54 struct kfd_process_device *pdd;
55
56 pdd = qpd_to_pdd(qpd);
57
58 /* check if sh_mem_config register already configured */
59 if (qpd->sh_mem_config == 0) {
60 qpd->sh_mem_config =
61 SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
62 SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
63 if (vega10_noretry &&
64 !dqm->dev->device_info->needs_iommu_device)
65 qpd->sh_mem_config |=
66 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
67
68 qpd->sh_mem_ape1_limit = 0;
69 qpd->sh_mem_ape1_base = 0;
70 }
71
72 qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
73
74 pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
75
76 return 0;
77}
78
79static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
80 struct qcm_process_device *qpd)
81{
82 /* Not needed on SDMAv4 any more */
83 q->properties.sdma_vm_addr = 0;
84}
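
compute_sh_mem_bases_64bit() packs bits [63:48] of the LDS and scratch
aperture bases into one 32-bit SH_MEM_BASES value. A small worked example
with made-up aperture addresses:

#include <stdint.h>
#include <stdio.h>

#define SH_MEM_BASES__SHARED_BASE__SHIFT 16

int main(void)
{
	uint64_t lds_base     = 0x0010000000000000ULL; /* hypothetical */
	uint64_t scratch_base = 0x0020000000000000ULL; /* hypothetical */

	uint32_t shared_base  = (uint32_t)(lds_base >> 48);     /* 0x10 */
	uint32_t private_base = (uint32_t)(scratch_base >> 48); /* 0x20 */

	printf("sh_mem_bases = 0x%08X\n", /* -> 0x00100020 */
	       (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | private_base);
	return 0;
}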
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index ebb4da14e3df..c3744d89352c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -33,7 +33,6 @@
33 33
34static DEFINE_IDA(doorbell_ida); 34static DEFINE_IDA(doorbell_ida);
35static unsigned int max_doorbell_slices; 35static unsigned int max_doorbell_slices;
36#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4
37 36
38/* 37/*
39 * Each device exposes a doorbell aperture, a PCI MMIO aperture that 38 * Each device exposes a doorbell aperture, a PCI MMIO aperture that
@@ -50,9 +49,9 @@ static unsigned int max_doorbell_slices;
50 */ 49 */
51 50
52/* # of doorbell bytes allocated for each process. */ 51/* # of doorbell bytes allocated for each process. */
53static inline size_t doorbell_process_allocation(void) 52size_t kfd_doorbell_process_slice(struct kfd_dev *kfd)
54{ 53{
55 return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * 54 return roundup(kfd->device_info->doorbell_size *
56 KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, 55 KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
57 PAGE_SIZE); 56 PAGE_SIZE);
58} 57}
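
A quick check of the new arithmetic, assuming
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS is 1024 and 4 KiB pages:

/* pre-SOC15: roundup(4 * 1024, 4096) = 4096 bytes, one page per process
 * SOC15:     roundup(8 * 1024, 4096) = 8192 bytes, two pages per process */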
@@ -72,16 +71,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
72 71
73 doorbell_start_offset = 72 doorbell_start_offset =
74 roundup(kfd->shared_resources.doorbell_start_offset, 73 roundup(kfd->shared_resources.doorbell_start_offset,
75 doorbell_process_allocation()); 74 kfd_doorbell_process_slice(kfd));
76 75
77 doorbell_aperture_size = 76 doorbell_aperture_size =
78 rounddown(kfd->shared_resources.doorbell_aperture_size, 77 rounddown(kfd->shared_resources.doorbell_aperture_size,
79 doorbell_process_allocation()); 78 kfd_doorbell_process_slice(kfd));
80 79
81 if (doorbell_aperture_size > doorbell_start_offset) 80 if (doorbell_aperture_size > doorbell_start_offset)
82 doorbell_process_limit = 81 doorbell_process_limit =
83 (doorbell_aperture_size - doorbell_start_offset) / 82 (doorbell_aperture_size - doorbell_start_offset) /
84 doorbell_process_allocation(); 83 kfd_doorbell_process_slice(kfd);
85 else 84 else
86 return -ENOSPC; 85 return -ENOSPC;
87 86
@@ -95,7 +94,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
95 kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); 94 kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32);
96 95
97 kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, 96 kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
98 doorbell_process_allocation()); 97 kfd_doorbell_process_slice(kfd));
99 98
100 if (!kfd->doorbell_kernel_ptr) 99 if (!kfd->doorbell_kernel_ptr)
101 return -ENOMEM; 100 return -ENOMEM;
@@ -127,21 +126,16 @@ void kfd_doorbell_fini(struct kfd_dev *kfd)
127 iounmap(kfd->doorbell_kernel_ptr); 126 iounmap(kfd->doorbell_kernel_ptr);
128} 127}
129 128
130int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) 129int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
130 struct vm_area_struct *vma)
131{ 131{
132 phys_addr_t address; 132 phys_addr_t address;
133 struct kfd_dev *dev;
134 133
135 /* 134 /*
 136	 * For simplicity we only allow mapping of the entire doorbell 135	 * For simplicity we only allow mapping of the entire doorbell
137 * allocation of a single device & process. 136 * allocation of a single device & process.
138 */ 137 */
139 if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) 138 if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev))
140 return -EINVAL;
141
142 /* Find kfd device according to gpu id */
143 dev = kfd_device_by_id(vma->vm_pgoff);
144 if (!dev)
145 return -EINVAL; 139 return -EINVAL;
146 140
147 /* Calculate physical address of doorbell */ 141 /* Calculate physical address of doorbell */
@@ -158,19 +152,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
158 " vm_flags == 0x%04lX\n" 152 " vm_flags == 0x%04lX\n"
159 " size == 0x%04lX\n", 153 " size == 0x%04lX\n",
160 (unsigned long long) vma->vm_start, address, vma->vm_flags, 154 (unsigned long long) vma->vm_start, address, vma->vm_flags,
161 doorbell_process_allocation()); 155 kfd_doorbell_process_slice(dev));
162 156
163 157
164 return io_remap_pfn_range(vma, 158 return io_remap_pfn_range(vma,
165 vma->vm_start, 159 vma->vm_start,
166 address >> PAGE_SHIFT, 160 address >> PAGE_SHIFT,
167 doorbell_process_allocation(), 161 kfd_doorbell_process_slice(dev),
168 vma->vm_page_prot); 162 vma->vm_page_prot);
169} 163}
170 164
171 165
172/* get kernel iomem pointer for a doorbell */ 166/* get kernel iomem pointer for a doorbell */
173u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 167void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
174 unsigned int *doorbell_off) 168 unsigned int *doorbell_off)
175{ 169{
176 u32 inx; 170 u32 inx;
@@ -185,6 +179,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
185 if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) 179 if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
186 return NULL; 180 return NULL;
187 181
182 inx *= kfd->device_info->doorbell_size / sizeof(u32);
183
188 /* 184 /*
189 * Calculating the kernel doorbell offset using the first 185 * Calculating the kernel doorbell offset using the first
190 * doorbell page. 186 * doorbell page.
@@ -210,7 +206,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
210 mutex_unlock(&kfd->doorbell_mutex); 206 mutex_unlock(&kfd->doorbell_mutex);
211} 207}
212 208
213inline void write_kernel_doorbell(u32 __iomem *db, u32 value) 209void write_kernel_doorbell(void __iomem *db, u32 value)
214{ 210{
215 if (db) { 211 if (db) {
216 writel(value, db); 212 writel(value, db);
@@ -218,30 +214,37 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
218 } 214 }
219} 215}
220 216
221/* 217void write_kernel_doorbell64(void __iomem *db, u64 value)
222 * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 218{
223 * to doorbells with the process's doorbell page 219 if (db) {
224 */ 220 WARN(((unsigned long)db & 7) != 0,
225unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, 221 "Unaligned 64-bit doorbell");
222 writeq(value, (u64 __iomem *)db);
223 pr_debug("writing %llu to doorbell address %p\n", value, db);
224 }
225}
226
227unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
226 struct kfd_process *process, 228 struct kfd_process *process,
227 unsigned int queue_id) 229 unsigned int doorbell_id)
228{ 230{
229 /* 231 /*
230 * doorbell_id_offset accounts for doorbells taken by KGD. 232 * doorbell_id_offset accounts for doorbells taken by KGD.
231 * index * doorbell_process_allocation/sizeof(u32) adjusts to 233 * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to
232 * the process's doorbells. 234 * the process's doorbells. The offset returned is in dword
235 * units regardless of the ASIC-dependent doorbell size.
233 */ 236 */
234 return kfd->doorbell_id_offset + 237 return kfd->doorbell_id_offset +
235 process->doorbell_index 238 process->doorbell_index
236 * doorbell_process_allocation() / sizeof(u32) + 239 * kfd_doorbell_process_slice(kfd) / sizeof(u32) +
237 queue_id; 240 doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
238} 241}
239 242
240uint64_t kfd_get_number_elems(struct kfd_dev *kfd) 243uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
241{ 244{
242 uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - 245 uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size -
243 kfd->shared_resources.doorbell_start_offset) / 246 kfd->shared_resources.doorbell_start_offset) /
244 doorbell_process_allocation() + 1; 247 kfd_doorbell_process_slice(kfd) + 1;
245 248
246 return num_of_elems; 249 return num_of_elems;
247 250
@@ -251,7 +254,7 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
251 struct kfd_process *process) 254 struct kfd_process *process)
252{ 255{
253 return dev->doorbell_base + 256 return dev->doorbell_base +
254 process->doorbell_index * doorbell_process_allocation(); 257 process->doorbell_index * kfd_doorbell_process_slice(dev);
255} 258}
256 259
257int kfd_alloc_process_doorbells(struct kfd_process *process) 260int kfd_alloc_process_doorbells(struct kfd_process *process)
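
The rework above replaces the fixed 4-byte doorbell with the per-ASIC device_info->doorbell_size (8 bytes on GFXv9), while kfd_doorbell_id_to_offset keeps returning dword units. A minimal sketch of the arithmetic; PAGE_SIZE and the per-process queue limit are illustrative stand-ins, not values quoted from this patch:

#include <stdio.h>
#include <stddef.h>

/* Illustrative constants; the driver reads these from device_info
 * and kfd_priv.h at runtime.
 */
#define PAGE_SIZE 4096
#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024

/* Mirrors kfd_doorbell_process_slice(): one page-aligned doorbell
 * slice per process, sized by the ASIC's doorbell width.
 */
static size_t doorbell_process_slice(size_t doorbell_size)
{
	size_t bytes = doorbell_size * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS;

	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
}

int main(void)
{
	size_t doorbell_size = 8;	/* GFXv9; 4 on older ASICs */

	/* Dword offset of doorbell_id 5 for process index 2, with a
	 * zero doorbell_id_offset, as in kfd_doorbell_id_to_offset().
	 */
	unsigned int off = 2 * doorbell_process_slice(doorbell_size) / 4
			 + 5 * doorbell_size / 4;

	printf("slice = %zu bytes, dword offset = %u\n",
	       doorbell_process_slice(doorbell_size), off);
	return 0;
}
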
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 4890a90f1e44..5562e94e786a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -345,7 +345,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
345 case KFD_EVENT_TYPE_DEBUG: 345 case KFD_EVENT_TYPE_DEBUG:
346 ret = create_signal_event(devkfd, p, ev); 346 ret = create_signal_event(devkfd, p, ev);
347 if (!ret) { 347 if (!ret) {
348 *event_page_offset = KFD_MMAP_EVENTS_MASK; 348 *event_page_offset = KFD_MMAP_TYPE_EVENTS;
349 *event_page_offset <<= PAGE_SHIFT; 349 *event_page_offset <<= PAGE_SHIFT;
350 *event_slot_index = ev->event_id; 350 *event_slot_index = ev->event_id;
351 } 351 }
@@ -496,7 +496,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
496 pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", 496 pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
497 partial_id, valid_id_bits); 497 partial_id, valid_id_bits);
498 498
499 if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { 499 if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) {
500 /* With relatively few events, it's faster to 500 /* With relatively few events, it's faster to
501 * iterate over the event IDR 501 * iterate over the event IDR
502 */ 502 */
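
The threshold change above tightens when the signal handler walks the event IDR rather than scanning the whole signal page. A sketch of that decision; the value of KFD_SIGNAL_EVENT_LIMIT is assumed for illustration:

#include <stdbool.h>
#include <stdio.h>

#define KFD_SIGNAL_EVENT_LIMIT 4096	/* assumed; defined in kfd_priv.h */

/* Mirrors the check in kfd_signal_event_interrupt(): with few live
 * events, iterating the IDR beats scanning every signal-page slot.
 */
static bool use_idr_walk(unsigned int signal_event_count)
{
	return signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64;
}

int main(void)
{
	printf("%d\n", use_idr_walk(32));	/* 1: walk the IDR */
	printf("%d\n", use_idr_walk(512));	/* 0: scan the signal page */
	return 0;
}
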
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 66852de410c8..97d5423c5673 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -275,23 +275,35 @@
275 * for FLAT_* / S_LOAD operations. 275 * for FLAT_* / S_LOAD operations.
276 */ 276 */
277 277
278#define MAKE_GPUVM_APP_BASE(gpu_num) \ 278#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \
279 (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) 279 (((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
280 280
281#define MAKE_GPUVM_APP_LIMIT(base, size) \ 281#define MAKE_GPUVM_APP_LIMIT(base, size) \
282 (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) 282 (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
283 283
284#define MAKE_SCRATCH_APP_BASE() \ 284#define MAKE_SCRATCH_APP_BASE_VI() \
285 (((uint64_t)(0x1UL) << 61) + 0x100000000L) 285 (((uint64_t)(0x1UL) << 61) + 0x100000000L)
286 286
287#define MAKE_SCRATCH_APP_LIMIT(base) \ 287#define MAKE_SCRATCH_APP_LIMIT(base) \
288 (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 288 (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
289 289
290#define MAKE_LDS_APP_BASE() \ 290#define MAKE_LDS_APP_BASE_VI() \
291 (((uint64_t)(0x1UL) << 61) + 0x0) 291 (((uint64_t)(0x1UL) << 61) + 0x0)
292#define MAKE_LDS_APP_LIMIT(base) \ 292#define MAKE_LDS_APP_LIMIT(base) \
293 (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 293 (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
294 294
295/* On GFXv9 the LDS and scratch apertures are programmed independently
296 * using the high 16 bits of the 64-bit virtual address. They must be
297 * in the hole, which will be the case as long as the high 16 bits are
298 * not 0.
299 *
300 * The aperture sizes are still 4GB implicitly.
301 *
302 * A GPUVM aperture is not applicable on GFXv9.
303 */
304#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
305#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
306
295/* User mode manages most of the SVM aperture address space. The low 307/* User mode manages most of the SVM aperture address space. The low
296 * 16MB are reserved for kernel use (CWSR trap handler and kernel IB 308 * 16MB are reserved for kernel use (CWSR trap handler and kernel IB
297 * for now). 309 * for now).
@@ -300,6 +312,55 @@
300#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) 312#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE)
301#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) 313#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE)
302 314
315static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
316{
317 /*
 318	 * the node id cannot be 0 - the three MSB bits of the
 319	 * aperture shouldn't be 0
320 */
321 pdd->lds_base = MAKE_LDS_APP_BASE_VI();
322 pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
323
324 if (!pdd->dev->device_info->needs_iommu_device) {
325 /* dGPUs: SVM aperture starting at 0
326 * with small reserved space for kernel.
327 * Set them to CANONICAL addresses.
328 */
329 pdd->gpuvm_base = SVM_USER_BASE;
330 pdd->gpuvm_limit =
331 pdd->dev->shared_resources.gpuvm_size - 1;
332 } else {
 333		/* Set them to non-CANONICAL addresses; no SVM is
334 * allocated.
335 */
336 pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
337 pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base,
338 pdd->dev->shared_resources.gpuvm_size);
339 }
340
341 pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
342 pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
343}
344
345static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
346{
347 pdd->lds_base = MAKE_LDS_APP_BASE_V9();
348 pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
349
 350	/* Raven needs SVM to support graphics handles, etc. Leave the small
351 * reserved space before SVM on Raven as well, even though we don't
352 * have to.
353 * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they
354 * are used in Thunk to reserve SVM.
355 */
356 pdd->gpuvm_base = SVM_USER_BASE;
357 pdd->gpuvm_limit =
358 pdd->dev->shared_resources.gpuvm_size - 1;
359
360 pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
361 pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
362}
363
303int kfd_init_apertures(struct kfd_process *process) 364int kfd_init_apertures(struct kfd_process *process)
304{ 365{
305 uint8_t id = 0; 366 uint8_t id = 0;
@@ -307,9 +368,7 @@ int kfd_init_apertures(struct kfd_process *process)
307 struct kfd_process_device *pdd; 368 struct kfd_process_device *pdd;
308 369
309 /*Iterating over all devices*/ 370 /*Iterating over all devices*/
310 while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && 371 while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
311 id < NUM_OF_SUPPORTED_GPUS) {
312
313 if (!dev) { 372 if (!dev) {
314 id++; /* Skip non GPU devices */ 373 id++; /* Skip non GPU devices */
315 continue; 374 continue;
@@ -318,7 +377,7 @@ int kfd_init_apertures(struct kfd_process *process)
318 pdd = kfd_create_process_device_data(dev, process); 377 pdd = kfd_create_process_device_data(dev, process);
319 if (!pdd) { 378 if (!pdd) {
320 pr_err("Failed to create process device data\n"); 379 pr_err("Failed to create process device data\n");
321 return -1; 380 return -ENOMEM;
322 } 381 }
323 /* 382 /*
324 * For 64 bit process apertures will be statically reserved in 383 * For 64 bit process apertures will be statically reserved in
@@ -330,32 +389,30 @@ int kfd_init_apertures(struct kfd_process *process)
330 pdd->gpuvm_base = pdd->gpuvm_limit = 0; 389 pdd->gpuvm_base = pdd->gpuvm_limit = 0;
331 pdd->scratch_base = pdd->scratch_limit = 0; 390 pdd->scratch_base = pdd->scratch_limit = 0;
332 } else { 391 } else {
333 /* Same LDS and scratch apertures can be used 392 switch (dev->device_info->asic_family) {
334 * on all GPUs. This allows using more dGPUs 393 case CHIP_KAVERI:
335 * than placement options for apertures. 394 case CHIP_HAWAII:
336 */ 395 case CHIP_CARRIZO:
337 pdd->lds_base = MAKE_LDS_APP_BASE(); 396 case CHIP_TONGA:
338 pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 397 case CHIP_FIJI:
339 398 case CHIP_POLARIS10:
340 pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); 399 case CHIP_POLARIS11:
341 pdd->scratch_limit = 400 kfd_init_apertures_vi(pdd, id);
342 MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 401 break;
402 case CHIP_VEGA10:
403 case CHIP_RAVEN:
404 kfd_init_apertures_v9(pdd, id);
405 break;
406 default:
407 WARN(1, "Unexpected ASIC family %u",
408 dev->device_info->asic_family);
409 return -EINVAL;
410 }
343 411
344 if (dev->device_info->needs_iommu_device) { 412 if (!dev->device_info->needs_iommu_device) {
345 /* APUs: GPUVM aperture in 413 /* dGPUs: the reserved space for kernel
346 * non-canonical address space 414 * before SVM
347 */
348 pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
349 pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
350 pdd->gpuvm_base,
351 dev->shared_resources.gpuvm_size);
352 } else {
353 /* dGPUs: SVM aperture starting at 0
354 * with small reserved space for kernel
355 */ 415 */
356 pdd->gpuvm_base = SVM_USER_BASE;
357 pdd->gpuvm_limit =
358 dev->shared_resources.gpuvm_size - 1;
359 pdd->qpd.cwsr_base = SVM_CWSR_BASE; 416 pdd->qpd.cwsr_base = SVM_CWSR_BASE;
360 pdd->qpd.ib_base = SVM_IB_BASE; 417 pdd->qpd.ib_base = SVM_IB_BASE;
361 } 418 }
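
To make the GFXv9 aperture placement concrete, the macros above can be evaluated directly; this standalone sketch just prints the resulting bases and their 4 GB limits (both LDS and scratch use the identical limit computation):

#include <stdio.h>
#include <stdint.h>

/* Same definitions as in kfd_flat_memory.c above. */
#define MAKE_LDS_APP_BASE_V9()     ((uint64_t)(0x1UL) << 48)
#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
#define MAKE_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)

int main(void)
{
	uint64_t lds = MAKE_LDS_APP_BASE_V9();
	uint64_t scratch = MAKE_SCRATCH_APP_BASE_V9();

	/* The high 16 bits are non-zero, so both bases fall in the
	 * non-canonical hole; each aperture is implicitly 4 GB.
	 */
	printf("LDS:     0x%016llx - 0x%016llx\n",
	       (unsigned long long)lds,
	       (unsigned long long)MAKE_APP_LIMIT(lds));
	printf("scratch: 0x%016llx - 0x%016llx\n",
	       (unsigned long long)scratch,
	       (unsigned long long)MAKE_APP_LIMIT(scratch));
	return 0;
}
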
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
new file mode 100644
index 000000000000..37029baa3346
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -0,0 +1,92 @@
1/*
2 * Copyright 2016-2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "kfd_priv.h"
24#include "kfd_events.h"
25#include "soc15_int.h"
26
27
28static bool event_interrupt_isr_v9(struct kfd_dev *dev,
29 const uint32_t *ih_ring_entry)
30{
31 uint16_t source_id, client_id, pasid, vmid;
32 const uint32_t *data = ih_ring_entry;
33
34 /* Only handle interrupts from KFD VMIDs */
35 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
36 if (vmid < dev->vm_info.first_vmid_kfd ||
37 vmid > dev->vm_info.last_vmid_kfd)
38 return 0;
39
40 /* If there is no valid PASID, it's likely a firmware bug */
41 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
42 if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
43 return 0;
44
45 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
46 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
47
48 pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
49 client_id, source_id, pasid);
50 pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
51 data[0], data[1], data[2], data[3],
52 data[4], data[5], data[6], data[7]);
53
54 /* Interrupt types we care about: various signals and faults.
55 * They will be forwarded to a work queue (see below).
56 */
57 return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
58 source_id == SOC15_INTSRC_SDMA_TRAP ||
59 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
60 source_id == SOC15_INTSRC_CP_BAD_OPCODE;
61}
62
63static void event_interrupt_wq_v9(struct kfd_dev *dev,
64 const uint32_t *ih_ring_entry)
65{
66 uint16_t source_id, client_id, pasid, vmid;
67 uint32_t context_id;
68
69 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
70 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
71 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
72 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
73 context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
74
75 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
76 kfd_signal_event_interrupt(pasid, context_id, 32);
77 else if (source_id == SOC15_INTSRC_SDMA_TRAP)
78 kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28);
79 else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG)
80 kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24);
81 else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
82 kfd_signal_hw_exception_event(pasid);
83 else if (client_id == SOC15_IH_CLIENTID_VMC ||
84 client_id == SOC15_IH_CLIENTID_UTCL2) {
85 /* TODO */
86 }
87}
88
89const struct kfd_event_interrupt_class event_interrupt_class_v9 = {
90 .interrupt_isr = event_interrupt_isr_v9,
91 .interrupt_wq = event_interrupt_wq_v9,
92};
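
The handler above passes a different number of valid context-ID bits per source (32 for CP, 28 for SDMA, 24 for SQ). A loose model of how a partial ID can still select events; the table and matching loop are purely illustrative, the driver actually matches against its event IDR and signal page:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical event IDs for illustration only. */
static const uint32_t event_ids[] = { 0x0012345, 0x1012345, 0xABCDEF };

/* Only the low valid_id_bits of the interrupt's context ID are
 * meaningful, per the handler above.
 */
static void signal_matching(uint32_t partial_id, unsigned int valid_id_bits)
{
	uint64_t mask = (1ULL << valid_id_bits) - 1;
	size_t i;

	for (i = 0; i < sizeof(event_ids) / sizeof(event_ids[0]); i++)
		if ((event_ids[i] & mask) == (partial_id & mask))
			printf("signal event 0x%07X\n", event_ids[i]);
}

int main(void)
{
	/* With 24 valid bits, 0x0012345 and 0x1012345 both match. */
	signal_matching(0x12345, 24);
	return 0;
}
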
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index 035c351f47c5..db6d9336b80d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -139,10 +139,12 @@ static void interrupt_wq(struct work_struct *work)
139{ 139{
140 struct kfd_dev *dev = container_of(work, struct kfd_dev, 140 struct kfd_dev *dev = container_of(work, struct kfd_dev,
141 interrupt_work); 141 interrupt_work);
142 uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE];
142 143
143 uint32_t ih_ring_entry[DIV_ROUND_UP( 144 if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) {
144 dev->device_info->ih_ring_entry_size, 145 dev_err_once(kfd_chardev(), "Ring entry too small\n");
145 sizeof(uint32_t))]; 146 return;
147 }
146 148
147 while (dequeue_ih_ring_entry(dev, ih_ring_entry)) 149 while (dequeue_ih_ring_entry(dev, ih_ring_entry))
148 dev->device_info->event_interrupt_class->interrupt_wq(dev, 150 dev->device_info->event_interrupt_class->interrupt_wq(dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 69f496485331..476951d8c91c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
99 kq->rptr_kernel = kq->rptr_mem->cpu_ptr; 99 kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
100 kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; 100 kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
101 101
102 retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), 102 retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size,
103 &kq->wptr_mem); 103 &kq->wptr_mem);
104 104
105 if (retval != 0) 105 if (retval != 0)
@@ -208,6 +208,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
208 size_t available_size; 208 size_t available_size;
209 size_t queue_size_dwords; 209 size_t queue_size_dwords;
210 uint32_t wptr, rptr; 210 uint32_t wptr, rptr;
211 uint64_t wptr64;
211 unsigned int *queue_address; 212 unsigned int *queue_address;
212 213
213 /* When rptr == wptr, the buffer is empty. 214 /* When rptr == wptr, the buffer is empty.
@@ -216,7 +217,8 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
216 * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 217 * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
217 */ 218 */
218 rptr = *kq->rptr_kernel; 219 rptr = *kq->rptr_kernel;
219 wptr = *kq->wptr_kernel; 220 wptr = kq->pending_wptr;
221 wptr64 = kq->pending_wptr64;
220 queue_address = (unsigned int *)kq->pq_kernel_addr; 222 queue_address = (unsigned int *)kq->pq_kernel_addr;
221 queue_size_dwords = kq->queue->properties.queue_size / 4; 223 queue_size_dwords = kq->queue->properties.queue_size / 4;
222 224
@@ -232,29 +234,33 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
232 * make sure calling functions know 234 * make sure calling functions know
233 * acquire_packet_buffer() failed 235 * acquire_packet_buffer() failed
234 */ 236 */
235 *buffer_ptr = NULL; 237 goto err_no_space;
236 return -ENOMEM;
237 } 238 }
238 239
239 if (wptr + packet_size_in_dwords >= queue_size_dwords) { 240 if (wptr + packet_size_in_dwords >= queue_size_dwords) {
240 /* make sure after rolling back to position 0, there is 241 /* make sure after rolling back to position 0, there is
241 * still enough space. 242 * still enough space.
242 */ 243 */
243 if (packet_size_in_dwords >= rptr) { 244 if (packet_size_in_dwords >= rptr)
244 *buffer_ptr = NULL; 245 goto err_no_space;
245 return -ENOMEM; 246
246 }
247 /* fill nops, roll back and start at position 0 */ 247 /* fill nops, roll back and start at position 0 */
248 while (wptr > 0) { 248 while (wptr > 0) {
249 queue_address[wptr] = kq->nop_packet; 249 queue_address[wptr] = kq->nop_packet;
250 wptr = (wptr + 1) % queue_size_dwords; 250 wptr = (wptr + 1) % queue_size_dwords;
251 wptr64++;
251 } 252 }
252 } 253 }
253 254
254 *buffer_ptr = &queue_address[wptr]; 255 *buffer_ptr = &queue_address[wptr];
255 kq->pending_wptr = wptr + packet_size_in_dwords; 256 kq->pending_wptr = wptr + packet_size_in_dwords;
257 kq->pending_wptr64 = wptr64 + packet_size_in_dwords;
256 258
257 return 0; 259 return 0;
260
261err_no_space:
262 *buffer_ptr = NULL;
263 return -ENOMEM;
258} 264}
259 265
260static void submit_packet(struct kernel_queue *kq) 266static void submit_packet(struct kernel_queue *kq)
@@ -270,14 +276,18 @@ static void submit_packet(struct kernel_queue *kq)
270 pr_debug("\n"); 276 pr_debug("\n");
271#endif 277#endif
272 278
273 *kq->wptr_kernel = kq->pending_wptr; 279 kq->ops_asic_specific.submit_packet(kq);
274 write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
275 kq->pending_wptr);
276} 280}
277 281
278static void rollback_packet(struct kernel_queue *kq) 282static void rollback_packet(struct kernel_queue *kq)
279{ 283{
280 kq->pending_wptr = *kq->queue->properties.write_ptr; 284 if (kq->dev->device_info->doorbell_size == 8) {
285 kq->pending_wptr64 = *kq->wptr64_kernel;
286 kq->pending_wptr = *kq->wptr_kernel %
287 (kq->queue->properties.queue_size / 4);
288 } else {
289 kq->pending_wptr = *kq->wptr_kernel;
290 }
281} 291}
282 292
283struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, 293struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -308,6 +318,11 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
308 case CHIP_HAWAII: 318 case CHIP_HAWAII:
309 kernel_queue_init_cik(&kq->ops_asic_specific); 319 kernel_queue_init_cik(&kq->ops_asic_specific);
310 break; 320 break;
321
322 case CHIP_VEGA10:
323 case CHIP_RAVEN:
324 kernel_queue_init_v9(&kq->ops_asic_specific);
325 break;
311 default: 326 default:
312 WARN(1, "Unexpected ASIC family %u", 327 WARN(1, "Unexpected ASIC family %u",
313 dev->device_info->asic_family); 328 dev->device_info->asic_family);
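
acquire_packet_buffer() above relies on the classic ring invariant spelled out in its comment: rptr == wptr means empty, so at most queue_size_dwords - 1 dwords may ever be in flight. A tiny sketch of that free-space rule, with made-up numbers; this shows the invariant, not the driver's literal computation:

#include <stdio.h>

static unsigned int free_dwords(unsigned int rptr, unsigned int wptr,
				unsigned int size)
{
	/* Reserve one slot so a full ring never looks empty. */
	return wptr >= rptr ? size - (wptr - rptr) - 1
			    : rptr - wptr - 1;
}

int main(void)
{
	unsigned int size = 1024;

	printf("%u\n", free_dwords(0, 0, size));	/* empty: 1023 */
	printf("%u\n", free_dwords(100, 90, size));	/* wrapped: 9 */
	return 0;
}
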
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index 594053136ee4..97aff2041a5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -72,6 +72,7 @@ struct kernel_queue {
72 struct kfd_dev *dev; 72 struct kfd_dev *dev;
73 struct mqd_manager *mqd; 73 struct mqd_manager *mqd;
74 struct queue *queue; 74 struct queue *queue;
75 uint64_t pending_wptr64;
75 uint32_t pending_wptr; 76 uint32_t pending_wptr;
76 unsigned int nop_packet; 77 unsigned int nop_packet;
77 78
@@ -79,7 +80,10 @@ struct kernel_queue {
79 uint32_t *rptr_kernel; 80 uint32_t *rptr_kernel;
80 uint64_t rptr_gpu_addr; 81 uint64_t rptr_gpu_addr;
81 struct kfd_mem_obj *wptr_mem; 82 struct kfd_mem_obj *wptr_mem;
82 uint32_t *wptr_kernel; 83 union {
84 uint64_t *wptr64_kernel;
85 uint32_t *wptr_kernel;
86 };
83 uint64_t wptr_gpu_addr; 87 uint64_t wptr_gpu_addr;
84 struct kfd_mem_obj *pq; 88 struct kfd_mem_obj *pq;
85 uint64_t pq_gpu_addr; 89 uint64_t pq_gpu_addr;
@@ -97,5 +101,6 @@ struct kernel_queue {
97 101
98void kernel_queue_init_cik(struct kernel_queue_ops *ops); 102void kernel_queue_init_cik(struct kernel_queue_ops *ops);
99void kernel_queue_init_vi(struct kernel_queue_ops *ops); 103void kernel_queue_init_vi(struct kernel_queue_ops *ops);
104void kernel_queue_init_v9(struct kernel_queue_ops *ops);
100 105
101#endif /* KFD_KERNEL_QUEUE_H_ */ 106#endif /* KFD_KERNEL_QUEUE_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
index a90eb440b1fb..19e54acb4125 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
@@ -26,11 +26,13 @@
26static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, 26static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
27 enum kfd_queue_type type, unsigned int queue_size); 27 enum kfd_queue_type type, unsigned int queue_size);
28static void uninitialize_cik(struct kernel_queue *kq); 28static void uninitialize_cik(struct kernel_queue *kq);
29static void submit_packet_cik(struct kernel_queue *kq);
29 30
30void kernel_queue_init_cik(struct kernel_queue_ops *ops) 31void kernel_queue_init_cik(struct kernel_queue_ops *ops)
31{ 32{
32 ops->initialize = initialize_cik; 33 ops->initialize = initialize_cik;
33 ops->uninitialize = uninitialize_cik; 34 ops->uninitialize = uninitialize_cik;
35 ops->submit_packet = submit_packet_cik;
34} 36}
35 37
36static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, 38static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
@@ -42,3 +44,10 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
42static void uninitialize_cik(struct kernel_queue *kq) 44static void uninitialize_cik(struct kernel_queue *kq)
43{ 45{
44} 46}
47
48static void submit_packet_cik(struct kernel_queue *kq)
49{
50 *kq->wptr_kernel = kq->pending_wptr;
51 write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
52 kq->pending_wptr);
53}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
new file mode 100644
index 000000000000..684a3bf07efd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -0,0 +1,340 @@
1/*
2 * Copyright 2016-2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#include "kfd_kernel_queue.h"
25#include "kfd_device_queue_manager.h"
26#include "kfd_pm4_headers_ai.h"
27#include "kfd_pm4_opcodes.h"
28
29static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
30 enum kfd_queue_type type, unsigned int queue_size);
31static void uninitialize_v9(struct kernel_queue *kq);
32static void submit_packet_v9(struct kernel_queue *kq);
33
34void kernel_queue_init_v9(struct kernel_queue_ops *ops)
35{
36 ops->initialize = initialize_v9;
37 ops->uninitialize = uninitialize_v9;
38 ops->submit_packet = submit_packet_v9;
39}
40
41static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
42 enum kfd_queue_type type, unsigned int queue_size)
43{
44 int retval;
45
46 retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
47 if (retval)
48 return false;
49
50 kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
51 kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
52
53 memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
54
55 return true;
56}
57
58static void uninitialize_v9(struct kernel_queue *kq)
59{
60 kfd_gtt_sa_free(kq->dev, kq->eop_mem);
61}
62
63static void submit_packet_v9(struct kernel_queue *kq)
64{
65 *kq->wptr64_kernel = kq->pending_wptr64;
66 write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
67 kq->pending_wptr64);
68}
69
70static int pm_map_process_v9(struct packet_manager *pm,
71 uint32_t *buffer, struct qcm_process_device *qpd)
72{
73 struct pm4_mes_map_process *packet;
74 uint64_t vm_page_table_base_addr =
75 (uint64_t)(qpd->page_table_base) << 12;
76
77 packet = (struct pm4_mes_map_process *)buffer;
78 memset(buffer, 0, sizeof(struct pm4_mes_map_process));
79
80 packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
81 sizeof(struct pm4_mes_map_process));
82 packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
83 packet->bitfields2.process_quantum = 1;
84 packet->bitfields2.pasid = qpd->pqm->process->pasid;
85 packet->bitfields14.gds_size = qpd->gds_size;
86 packet->bitfields14.num_gws = qpd->num_gws;
87 packet->bitfields14.num_oac = qpd->num_oac;
88 packet->bitfields14.sdma_enable = 1;
89 packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
90
91 packet->sh_mem_config = qpd->sh_mem_config;
92 packet->sh_mem_bases = qpd->sh_mem_bases;
93 packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
94 packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8);
95 packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
96 packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
97
98 packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
99 packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
100
101 packet->vm_context_page_table_base_addr_lo32 =
102 lower_32_bits(vm_page_table_base_addr);
103 packet->vm_context_page_table_base_addr_hi32 =
104 upper_32_bits(vm_page_table_base_addr);
105
106 return 0;
107}
108
109static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
110 uint64_t ib, size_t ib_size_in_dwords, bool chain)
111{
112 struct pm4_mes_runlist *packet;
113
114 int concurrent_proc_cnt = 0;
115 struct kfd_dev *kfd = pm->dqm->dev;
116
117 /* Determine the number of processes to map together to HW:
 118	 * it cannot exceed the number of VMIDs available to the
 119	 * scheduler, and it is determined by the smaller of the number
 120	 * of processes in the runlist and the kfd module parameter
121 * hws_max_conc_proc.
122 * Note: the arbitration between the number of VMIDs and
123 * hws_max_conc_proc has been done in
124 * kgd2kfd_device_init().
125 */
126 concurrent_proc_cnt = min(pm->dqm->processes_count,
127 kfd->max_proc_per_quantum);
128
129 packet = (struct pm4_mes_runlist *)buffer;
130
131 memset(buffer, 0, sizeof(struct pm4_mes_runlist));
132 packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
133 sizeof(struct pm4_mes_runlist));
134
135 packet->bitfields4.ib_size = ib_size_in_dwords;
136 packet->bitfields4.chain = chain ? 1 : 0;
137 packet->bitfields4.offload_polling = 0;
138 packet->bitfields4.valid = 1;
139 packet->bitfields4.process_cnt = concurrent_proc_cnt;
140 packet->ordinal2 = lower_32_bits(ib);
141 packet->ib_base_hi = upper_32_bits(ib);
142
143 return 0;
144}
145
146static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
147 struct queue *q, bool is_static)
148{
149 struct pm4_mes_map_queues *packet;
150 bool use_static = is_static;
151
152 packet = (struct pm4_mes_map_queues *)buffer;
153 memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
154
155 packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
156 sizeof(struct pm4_mes_map_queues));
157 packet->bitfields2.alloc_format =
158 alloc_format__mes_map_queues__one_per_pipe_vi;
159 packet->bitfields2.num_queues = 1;
160 packet->bitfields2.queue_sel =
161 queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
162
163 packet->bitfields2.engine_sel =
164 engine_sel__mes_map_queues__compute_vi;
165 packet->bitfields2.queue_type =
166 queue_type__mes_map_queues__normal_compute_vi;
167
168 switch (q->properties.type) {
169 case KFD_QUEUE_TYPE_COMPUTE:
170 if (use_static)
171 packet->bitfields2.queue_type =
172 queue_type__mes_map_queues__normal_latency_static_queue_vi;
173 break;
174 case KFD_QUEUE_TYPE_DIQ:
175 packet->bitfields2.queue_type =
176 queue_type__mes_map_queues__debug_interface_queue_vi;
177 break;
178 case KFD_QUEUE_TYPE_SDMA:
179 packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
180 engine_sel__mes_map_queues__sdma0_vi;
181 use_static = false; /* no static queues under SDMA */
182 break;
183 default:
184 WARN(1, "queue type %d", q->properties.type);
185 return -EINVAL;
186 }
187 packet->bitfields3.doorbell_offset =
188 q->properties.doorbell_off;
189
190 packet->mqd_addr_lo =
191 lower_32_bits(q->gart_mqd_addr);
192
193 packet->mqd_addr_hi =
194 upper_32_bits(q->gart_mqd_addr);
195
196 packet->wptr_addr_lo =
197 lower_32_bits((uint64_t)q->properties.write_ptr);
198
199 packet->wptr_addr_hi =
200 upper_32_bits((uint64_t)q->properties.write_ptr);
201
202 return 0;
203}
204
205static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
206 enum kfd_queue_type type,
207 enum kfd_unmap_queues_filter filter,
208 uint32_t filter_param, bool reset,
209 unsigned int sdma_engine)
210{
211 struct pm4_mes_unmap_queues *packet;
212
213 packet = (struct pm4_mes_unmap_queues *)buffer;
214 memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
215
216 packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
217 sizeof(struct pm4_mes_unmap_queues));
218 switch (type) {
219 case KFD_QUEUE_TYPE_COMPUTE:
220 case KFD_QUEUE_TYPE_DIQ:
221 packet->bitfields2.engine_sel =
222 engine_sel__mes_unmap_queues__compute;
223 break;
224 case KFD_QUEUE_TYPE_SDMA:
225 packet->bitfields2.engine_sel =
226 engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
227 break;
228 default:
229 WARN(1, "queue type %d", type);
230 return -EINVAL;
231 }
232
233 if (reset)
234 packet->bitfields2.action =
235 action__mes_unmap_queues__reset_queues;
236 else
237 packet->bitfields2.action =
238 action__mes_unmap_queues__preempt_queues;
239
240 switch (filter) {
241 case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
242 packet->bitfields2.queue_sel =
243 queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
244 packet->bitfields2.num_queues = 1;
245 packet->bitfields3b.doorbell_offset0 = filter_param;
246 break;
247 case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
248 packet->bitfields2.queue_sel =
249 queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
250 packet->bitfields3a.pasid = filter_param;
251 break;
252 case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
253 packet->bitfields2.queue_sel =
254 queue_sel__mes_unmap_queues__unmap_all_queues;
255 break;
256 case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
257 /* in this case, we do not preempt static queues */
258 packet->bitfields2.queue_sel =
259 queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
260 break;
261 default:
262 WARN(1, "filter %d", filter);
263 return -EINVAL;
264 }
265
266 return 0;
267
268}
269
270static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer,
271 uint64_t fence_address, uint32_t fence_value)
272{
273 struct pm4_mes_query_status *packet;
274
275 packet = (struct pm4_mes_query_status *)buffer;
276 memset(buffer, 0, sizeof(struct pm4_mes_query_status));
277
278
279 packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
280 sizeof(struct pm4_mes_query_status));
281
282 packet->bitfields2.context_id = 0;
283 packet->bitfields2.interrupt_sel =
284 interrupt_sel__mes_query_status__completion_status;
285 packet->bitfields2.command =
286 command__mes_query_status__fence_only_after_write_ack;
287
288 packet->addr_hi = upper_32_bits((uint64_t)fence_address);
289 packet->addr_lo = lower_32_bits((uint64_t)fence_address);
290 packet->data_hi = upper_32_bits((uint64_t)fence_value);
291 packet->data_lo = lower_32_bits((uint64_t)fence_value);
292
293 return 0;
294}
295
296
297static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
298{
299 struct pm4_mec_release_mem *packet;
300
301 packet = (struct pm4_mec_release_mem *)buffer;
302 memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
303
304 packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
305 sizeof(struct pm4_mec_release_mem));
306
307 packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
308 packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
309 packet->bitfields2.tcl1_action_ena = 1;
310 packet->bitfields2.tc_action_ena = 1;
311 packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
312
313 packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
314 packet->bitfields3.int_sel =
315 int_sel__mec_release_mem__send_interrupt_after_write_confirm;
316
317 packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
318 packet->address_hi = upper_32_bits(gpu_addr);
319
320 packet->data_lo = 0;
321
322 return 0;
323}
324
325const struct packet_manager_funcs kfd_v9_pm_funcs = {
326 .map_process = pm_map_process_v9,
327 .runlist = pm_runlist_v9,
328 .set_resources = pm_set_resources_vi,
329 .map_queues = pm_map_queues_v9,
330 .unmap_queues = pm_unmap_queues_v9,
331 .query_status = pm_query_status_v9,
332 .release_mem = pm_release_mem_v9,
333 .map_process_size = sizeof(struct pm4_mes_map_process),
334 .runlist_size = sizeof(struct pm4_mes_runlist),
335 .set_resources_size = sizeof(struct pm4_mes_set_resources),
336 .map_queues_size = sizeof(struct pm4_mes_map_queues),
337 .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
338 .query_status_size = sizeof(struct pm4_mes_query_status),
339 .release_mem_size = sizeof(struct pm4_mec_release_mem)
340};
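
submit_packet_v9() above writes a monotonically increasing 64-bit write pointer and rings the 64-bit doorbell, and rollback_packet() in kfd_kernel_queue.c recovers the 32-bit ring slot by taking that pointer modulo the queue size in dwords. A small sketch of the relationship, with made-up sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t wptr64 = 0;
	unsigned int queue_size_dwords = 1024;	/* made-up ring size */

	/* Submissions only ever advance the 64-bit pointer. */
	wptr64 += 1500;

	/* The 32-bit ring slot is the pointer modulo the ring size,
	 * exactly how rollback_packet() recovers pending_wptr.
	 */
	unsigned int wptr = (unsigned int)(wptr64 % queue_size_dwords);

	printf("wptr64=%llu slot=%u\n", (unsigned long long)wptr64, wptr);
	return 0;
}
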
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index f1d48281e322..bf20c6d32ef3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -22,15 +22,20 @@
22 */ 22 */
23 23
24#include "kfd_kernel_queue.h" 24#include "kfd_kernel_queue.h"
25#include "kfd_device_queue_manager.h"
26#include "kfd_pm4_headers_vi.h"
27#include "kfd_pm4_opcodes.h"
25 28
26static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, 29static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
27 enum kfd_queue_type type, unsigned int queue_size); 30 enum kfd_queue_type type, unsigned int queue_size);
28static void uninitialize_vi(struct kernel_queue *kq); 31static void uninitialize_vi(struct kernel_queue *kq);
32static void submit_packet_vi(struct kernel_queue *kq);
29 33
30void kernel_queue_init_vi(struct kernel_queue_ops *ops) 34void kernel_queue_init_vi(struct kernel_queue_ops *ops)
31{ 35{
32 ops->initialize = initialize_vi; 36 ops->initialize = initialize_vi;
33 ops->uninitialize = uninitialize_vi; 37 ops->uninitialize = uninitialize_vi;
38 ops->submit_packet = submit_packet_vi;
34} 39}
35 40
36static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, 41static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
@@ -54,3 +59,317 @@ static void uninitialize_vi(struct kernel_queue *kq)
54{ 59{
55 kfd_gtt_sa_free(kq->dev, kq->eop_mem); 60 kfd_gtt_sa_free(kq->dev, kq->eop_mem);
56} 61}
62
63static void submit_packet_vi(struct kernel_queue *kq)
64{
65 *kq->wptr_kernel = kq->pending_wptr;
66 write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
67 kq->pending_wptr);
68}
69
70unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
71{
72 union PM4_MES_TYPE_3_HEADER header;
73
74 header.u32All = 0;
75 header.opcode = opcode;
76 header.count = packet_size / 4 - 2;
77 header.type = PM4_TYPE_3;
78
79 return header.u32All;
80}
81
82static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
83 struct qcm_process_device *qpd)
84{
85 struct pm4_mes_map_process *packet;
86
87 packet = (struct pm4_mes_map_process *)buffer;
88
89 memset(buffer, 0, sizeof(struct pm4_mes_map_process));
90
91 packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
92 sizeof(struct pm4_mes_map_process));
93 packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
94 packet->bitfields2.process_quantum = 1;
95 packet->bitfields2.pasid = qpd->pqm->process->pasid;
96 packet->bitfields3.page_table_base = qpd->page_table_base;
97 packet->bitfields10.gds_size = qpd->gds_size;
98 packet->bitfields10.num_gws = qpd->num_gws;
99 packet->bitfields10.num_oac = qpd->num_oac;
100 packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
101
102 packet->sh_mem_config = qpd->sh_mem_config;
103 packet->sh_mem_bases = qpd->sh_mem_bases;
104 packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
105 packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
106
107 packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
108
109 packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
110 packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
111
112 return 0;
113}
114
115static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
116 uint64_t ib, size_t ib_size_in_dwords, bool chain)
117{
118 struct pm4_mes_runlist *packet;
119 int concurrent_proc_cnt = 0;
120 struct kfd_dev *kfd = pm->dqm->dev;
121
122 if (WARN_ON(!ib))
123 return -EFAULT;
124
125 /* Determine the number of processes to map together to HW:
 126	 * it cannot exceed the number of VMIDs available to the
 127	 * scheduler, and it is determined by the smaller of the number
 128	 * of processes in the runlist and the kfd module parameter
129 * hws_max_conc_proc.
130 * Note: the arbitration between the number of VMIDs and
131 * hws_max_conc_proc has been done in
132 * kgd2kfd_device_init().
133 */
134 concurrent_proc_cnt = min(pm->dqm->processes_count,
135 kfd->max_proc_per_quantum);
136
137 packet = (struct pm4_mes_runlist *)buffer;
138
139 memset(buffer, 0, sizeof(struct pm4_mes_runlist));
140 packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
141 sizeof(struct pm4_mes_runlist));
142
143 packet->bitfields4.ib_size = ib_size_in_dwords;
144 packet->bitfields4.chain = chain ? 1 : 0;
145 packet->bitfields4.offload_polling = 0;
146 packet->bitfields4.valid = 1;
147 packet->bitfields4.process_cnt = concurrent_proc_cnt;
148 packet->ordinal2 = lower_32_bits(ib);
149 packet->bitfields3.ib_base_hi = upper_32_bits(ib);
150
151 return 0;
152}
153
154int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
155 struct scheduling_resources *res)
156{
157 struct pm4_mes_set_resources *packet;
158
159 packet = (struct pm4_mes_set_resources *)buffer;
160 memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
161
162 packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
163 sizeof(struct pm4_mes_set_resources));
164
165 packet->bitfields2.queue_type =
166 queue_type__mes_set_resources__hsa_interface_queue_hiq;
167 packet->bitfields2.vmid_mask = res->vmid_mask;
168 packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
169 packet->bitfields7.oac_mask = res->oac_mask;
170 packet->bitfields8.gds_heap_base = res->gds_heap_base;
171 packet->bitfields8.gds_heap_size = res->gds_heap_size;
172
173 packet->gws_mask_lo = lower_32_bits(res->gws_mask);
174 packet->gws_mask_hi = upper_32_bits(res->gws_mask);
175
176 packet->queue_mask_lo = lower_32_bits(res->queue_mask);
177 packet->queue_mask_hi = upper_32_bits(res->queue_mask);
178
179 return 0;
180}
181
182static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
183 struct queue *q, bool is_static)
184{
185 struct pm4_mes_map_queues *packet;
186 bool use_static = is_static;
187
188 packet = (struct pm4_mes_map_queues *)buffer;
189 memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
190
191 packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
192 sizeof(struct pm4_mes_map_queues));
193 packet->bitfields2.alloc_format =
194 alloc_format__mes_map_queues__one_per_pipe_vi;
195 packet->bitfields2.num_queues = 1;
196 packet->bitfields2.queue_sel =
197 queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
198
199 packet->bitfields2.engine_sel =
200 engine_sel__mes_map_queues__compute_vi;
201 packet->bitfields2.queue_type =
202 queue_type__mes_map_queues__normal_compute_vi;
203
204 switch (q->properties.type) {
205 case KFD_QUEUE_TYPE_COMPUTE:
206 if (use_static)
207 packet->bitfields2.queue_type =
208 queue_type__mes_map_queues__normal_latency_static_queue_vi;
209 break;
210 case KFD_QUEUE_TYPE_DIQ:
211 packet->bitfields2.queue_type =
212 queue_type__mes_map_queues__debug_interface_queue_vi;
213 break;
214 case KFD_QUEUE_TYPE_SDMA:
215 packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
216 engine_sel__mes_map_queues__sdma0_vi;
217 use_static = false; /* no static queues under SDMA */
218 break;
219 default:
220 WARN(1, "queue type %d", q->properties.type);
221 return -EINVAL;
222 }
223 packet->bitfields3.doorbell_offset =
224 q->properties.doorbell_off;
225
226 packet->mqd_addr_lo =
227 lower_32_bits(q->gart_mqd_addr);
228
229 packet->mqd_addr_hi =
230 upper_32_bits(q->gart_mqd_addr);
231
232 packet->wptr_addr_lo =
233 lower_32_bits((uint64_t)q->properties.write_ptr);
234
235 packet->wptr_addr_hi =
236 upper_32_bits((uint64_t)q->properties.write_ptr);
237
238 return 0;
239}
240
241static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
242 enum kfd_queue_type type,
243 enum kfd_unmap_queues_filter filter,
244 uint32_t filter_param, bool reset,
245 unsigned int sdma_engine)
246{
247 struct pm4_mes_unmap_queues *packet;
248
249 packet = (struct pm4_mes_unmap_queues *)buffer;
250 memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
251
252 packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
253 sizeof(struct pm4_mes_unmap_queues));
254 switch (type) {
255 case KFD_QUEUE_TYPE_COMPUTE:
256 case KFD_QUEUE_TYPE_DIQ:
257 packet->bitfields2.engine_sel =
258 engine_sel__mes_unmap_queues__compute;
259 break;
260 case KFD_QUEUE_TYPE_SDMA:
261 packet->bitfields2.engine_sel =
262 engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
263 break;
264 default:
265 WARN(1, "queue type %d", type);
266 return -EINVAL;
267 }
268
269 if (reset)
270 packet->bitfields2.action =
271 action__mes_unmap_queues__reset_queues;
272 else
273 packet->bitfields2.action =
274 action__mes_unmap_queues__preempt_queues;
275
276 switch (filter) {
277 case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
278 packet->bitfields2.queue_sel =
279 queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
280 packet->bitfields2.num_queues = 1;
281 packet->bitfields3b.doorbell_offset0 = filter_param;
282 break;
283 case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
284 packet->bitfields2.queue_sel =
285 queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
286 packet->bitfields3a.pasid = filter_param;
287 break;
288 case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
289 packet->bitfields2.queue_sel =
290 queue_sel__mes_unmap_queues__unmap_all_queues;
291 break;
292 case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
293 /* in this case, we do not preempt static queues */
294 packet->bitfields2.queue_sel =
295 queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
296 break;
297 default:
298 WARN(1, "filter %d", filter);
299 return -EINVAL;
300 }
301
302 return 0;
303
304}
305
306static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
307 uint64_t fence_address, uint32_t fence_value)
308{
309 struct pm4_mes_query_status *packet;
310
311 packet = (struct pm4_mes_query_status *)buffer;
312 memset(buffer, 0, sizeof(struct pm4_mes_query_status));
313
314 packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
315 sizeof(struct pm4_mes_query_status));
316
317 packet->bitfields2.context_id = 0;
318 packet->bitfields2.interrupt_sel =
319 interrupt_sel__mes_query_status__completion_status;
320 packet->bitfields2.command =
321 command__mes_query_status__fence_only_after_write_ack;
322
323 packet->addr_hi = upper_32_bits((uint64_t)fence_address);
324 packet->addr_lo = lower_32_bits((uint64_t)fence_address);
325 packet->data_hi = upper_32_bits((uint64_t)fence_value);
326 packet->data_lo = lower_32_bits((uint64_t)fence_value);
327
328 return 0;
329}
330
331static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
332{
333 struct pm4_mec_release_mem *packet;
334
335 packet = (struct pm4_mec_release_mem *)buffer;
336 memset(buffer, 0, sizeof(*packet));
337
338 packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
339 sizeof(*packet));
340
341 packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
342 packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
343 packet->bitfields2.tcl1_action_ena = 1;
344 packet->bitfields2.tc_action_ena = 1;
345 packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
346 packet->bitfields2.atc = 0;
347
348 packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
349 packet->bitfields3.int_sel =
350 int_sel___release_mem__send_interrupt_after_write_confirm;
351
352 packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
353 packet->address_hi = upper_32_bits(gpu_addr);
354
355 packet->data_lo = 0;
356
357 return 0;
358}
359
360const struct packet_manager_funcs kfd_vi_pm_funcs = {
361 .map_process = pm_map_process_vi,
362 .runlist = pm_runlist_vi,
363 .set_resources = pm_set_resources_vi,
364 .map_queues = pm_map_queues_vi,
365 .unmap_queues = pm_unmap_queues_vi,
366 .query_status = pm_query_status_vi,
367 .release_mem = pm_release_mem_vi,
368 .map_process_size = sizeof(struct pm4_mes_map_process),
369 .runlist_size = sizeof(struct pm4_mes_runlist),
370 .set_resources_size = sizeof(struct pm4_mes_set_resources),
371 .map_queues_size = sizeof(struct pm4_mes_map_queues),
372 .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
373 .query_status_size = sizeof(struct pm4_mes_query_status),
374 .release_mem_size = sizeof(struct pm4_mec_release_mem)
375};
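
pm_build_pm4_header() above packs the standard PM4 type-3 header: the count field holds the packet length in dwords minus two (the header dword plus the count bias). A sketch using a C bitfield; the field order assumes a little-endian, GCC-style layout, and the opcode value is hypothetical:

#include <stdio.h>
#include <stdint.h>

/* PM4 type-3 header as packed by pm_build_pm4_header() above. */
union pm4_type3_header {
	struct {
		uint32_t reserved : 8;
		uint32_t opcode   : 8;
		uint32_t count    : 14;
		uint32_t type     : 2;
	};
	uint32_t u32All;
};

int main(void)
{
	union pm4_type3_header h = { .u32All = 0 };

	h.opcode = 0x5a;	/* hypothetical opcode value */
	h.count = 16 / 4 - 2;	/* a 16-byte packet: 2 payload dwords */
	h.type = 3;		/* PM4_TYPE_3 */

	printf("header 0x%08X\n", h.u32All);	/* 0xC0025A00 here */
	return 0;
}
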
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index e0c07d24d251..76bf2dc8aec4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -43,6 +43,8 @@ static const struct kgd2kfd_calls kgd2kfd = {
43 .interrupt = kgd2kfd_interrupt, 43 .interrupt = kgd2kfd_interrupt,
44 .suspend = kgd2kfd_suspend, 44 .suspend = kgd2kfd_suspend,
45 .resume = kgd2kfd_resume, 45 .resume = kgd2kfd_resume,
46 .quiesce_mm = kgd2kfd_quiesce_mm,
47 .resume_mm = kgd2kfd_resume_mm,
46 .schedule_evict_and_restore_process = 48 .schedule_evict_and_restore_process =
47 kgd2kfd_schedule_evict_and_restore_process, 49 kgd2kfd_schedule_evict_and_restore_process,
48}; 50};
@@ -81,6 +83,11 @@ module_param(ignore_crat, int, 0444);
81MODULE_PARM_DESC(ignore_crat, 83MODULE_PARM_DESC(ignore_crat,
82 "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); 84 "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
83 85
86int vega10_noretry;
87module_param_named(noretry, vega10_noretry, int, 0644);
88MODULE_PARM_DESC(noretry,
89 "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
90
84static int amdkfd_init_completed; 91static int amdkfd_init_completed;
85 92
86int kgd2kfd_init(unsigned int interface_version, 93int kgd2kfd_init(unsigned int interface_version,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index ee7061e1c466..4b8eb506642b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -38,6 +38,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
38 case CHIP_POLARIS10: 38 case CHIP_POLARIS10:
39 case CHIP_POLARIS11: 39 case CHIP_POLARIS11:
40 return mqd_manager_init_vi_tonga(type, dev); 40 return mqd_manager_init_vi_tonga(type, dev);
41 case CHIP_VEGA10:
42 case CHIP_RAVEN:
43 return mqd_manager_init_v9(type, dev);
41 default: 44 default:
42 WARN(1, "Unexpected ASIC family %u", 45 WARN(1, "Unexpected ASIC family %u",
43 dev->device_info->asic_family); 46 dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index c00c325ed3c9..06eaa218eba6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -79,10 +79,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
79 m->cp_mqd_base_addr_lo = lower_32_bits(addr); 79 m->cp_mqd_base_addr_lo = lower_32_bits(addr);
80 m->cp_mqd_base_addr_hi = upper_32_bits(addr); 80 m->cp_mqd_base_addr_hi = upper_32_bits(addr);
81 81
82 m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN;
83 /* Although WinKFD writes this, I suspect it should not be necessary */
84 m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE;
85
86 m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | 82 m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
87 QUANTUM_DURATION(10); 83 QUANTUM_DURATION(10);
88 84
@@ -412,7 +408,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
412 if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) 408 if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
413 return NULL; 409 return NULL;
414 410
415 mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); 411 mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
416 if (!mqd) 412 if (!mqd)
417 return NULL; 413 return NULL;
418 414
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
new file mode 100644
index 000000000000..684054ff02cd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -0,0 +1,443 @@
1/*
2 * Copyright 2016-2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#include <linux/printk.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include "kfd_priv.h"
28#include "kfd_mqd_manager.h"
29#include "v9_structs.h"
30#include "gc/gc_9_0_offset.h"
31#include "gc/gc_9_0_sh_mask.h"
32#include "sdma0/sdma0_4_0_sh_mask.h"
33
34static inline struct v9_mqd *get_mqd(void *mqd)
35{
36 return (struct v9_mqd *)mqd;
37}
38
39static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
40{
41 return (struct v9_sdma_mqd *)mqd;
42}
43
44static int init_mqd(struct mqd_manager *mm, void **mqd,
45 struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
46 struct queue_properties *q)
47{
48 int retval;
49 uint64_t addr;
50 struct v9_mqd *m;
51 struct kfd_dev *kfd = mm->dev;
52
 53	/* From V9 on, for CWSR, the control stack is located on the next page
 54	 * boundary after the MQD, so we use the GTT allocation function
 55	 * instead of the sub-allocation function.
56 */
57 if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
58 *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
59 if (!*mqd_mem_obj)
60 return -ENOMEM;
61 retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd,
62 ALIGN(q->ctl_stack_size, PAGE_SIZE) +
63 ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
64 &((*mqd_mem_obj)->gtt_mem),
65 &((*mqd_mem_obj)->gpu_addr),
66 (void *)&((*mqd_mem_obj)->cpu_ptr));
67 } else
68 retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
69 mqd_mem_obj);
70 if (retval != 0)
71 return -ENOMEM;
72
73 m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
74 addr = (*mqd_mem_obj)->gpu_addr;
75
76 memset(m, 0, sizeof(struct v9_mqd));
77
78 m->header = 0xC0310800;
79 m->compute_pipelinestat_enable = 1;
80 m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
81 m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
82 m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
83 m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
84
85 m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
86 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
87
88 m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
89
90 m->cp_mqd_base_addr_lo = lower_32_bits(addr);
91 m->cp_mqd_base_addr_hi = upper_32_bits(addr);
92
93 m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
94 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
95 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
96
97 m->cp_hqd_pipe_priority = 1;
98 m->cp_hqd_queue_priority = 15;
99
100 if (q->format == KFD_QUEUE_FORMAT_AQL) {
101 m->cp_hqd_aql_control =
102 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
103 }
104
105 if (q->tba_addr) {
106 m->compute_pgm_rsrc2 |=
107 (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
108 }
109
110 if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
111 m->cp_hqd_persistent_state |=
112 (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
113 m->cp_hqd_ctx_save_base_addr_lo =
114 lower_32_bits(q->ctx_save_restore_area_address);
115 m->cp_hqd_ctx_save_base_addr_hi =
116 upper_32_bits(q->ctx_save_restore_area_address);
117 m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
118 m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
119 m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
120 m->cp_hqd_wg_state_offset = q->ctl_stack_size;
121 }
122
123 *mqd = m;
124 if (gart_addr)
125 *gart_addr = addr;
126 retval = mm->update_mqd(mm, m, q);
127
128 return retval;
129}
130
131static int load_mqd(struct mqd_manager *mm, void *mqd,
132 uint32_t pipe_id, uint32_t queue_id,
133 struct queue_properties *p, struct mm_struct *mms)
134{
135 /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
136 uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
137
138 return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
139 (uint32_t __user *)p->write_ptr,
140 wptr_shift, 0, mms);
141}
142
143static int update_mqd(struct mqd_manager *mm, void *mqd,
144 struct queue_properties *q)
145{
146 struct v9_mqd *m;
147
148 m = get_mqd(mqd);
149
150 m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
151 m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
152 pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
153
154 m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
155 m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
156
157 m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
158 m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
159 m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
160 m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
161
162 m->cp_hqd_pq_doorbell_control =
163 q->doorbell_off <<
164 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
165 pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
166 m->cp_hqd_pq_doorbell_control);
167
168 m->cp_hqd_ib_control =
169 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
170 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;
171
172 /*
173 * HW does not clamp this field correctly. Maximum EOP queue size
174 * is constrained by per-SE EOP done signal count, which is 8-bit.
175 * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
176 * more than (EOP entry count - 1) so a queue size of 0x800 dwords
177 * is safe, giving a maximum field value of 0xA.
178 */
179 m->cp_hqd_eop_control = min(0xA,
180 order_base_2(q->eop_ring_buffer_size / 4) - 1);
181 m->cp_hqd_eop_base_addr_lo =
182 lower_32_bits(q->eop_ring_buffer_address >> 8);
183 m->cp_hqd_eop_base_addr_hi =
184 upper_32_bits(q->eop_ring_buffer_address >> 8);
185
186 m->cp_hqd_iq_timer = 0;
187
188 m->cp_hqd_vmid = q->vmid;
189
190 if (q->format == KFD_QUEUE_FORMAT_AQL) {
191 m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
192 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
193 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
194 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
195 m->cp_hqd_pq_doorbell_control |= 1 <<
196 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
197 }
198 if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
199 m->cp_hqd_ctx_save_control = 0;
200
201 q->is_active = (q->queue_size > 0 &&
202 q->queue_address != 0 &&
203 q->queue_percent > 0 &&
204 !q->is_evicted);
205
206 return 0;
207}
208
209
210static int destroy_mqd(struct mqd_manager *mm, void *mqd,
211 enum kfd_preempt_type type,
212 unsigned int timeout, uint32_t pipe_id,
213 uint32_t queue_id)
214{
215 return mm->dev->kfd2kgd->hqd_destroy
216 (mm->dev->kgd, mqd, type, timeout,
217 pipe_id, queue_id);
218}
219
220static void uninit_mqd(struct mqd_manager *mm, void *mqd,
221 struct kfd_mem_obj *mqd_mem_obj)
222{
223 struct kfd_dev *kfd = mm->dev;
224
225 if (mqd_mem_obj->gtt_mem) {
226 kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
227 kfree(mqd_mem_obj);
228 } else {
229 kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
230 }
231}
232
233static bool is_occupied(struct mqd_manager *mm, void *mqd,
234 uint64_t queue_address, uint32_t pipe_id,
235 uint32_t queue_id)
236{
237 return mm->dev->kfd2kgd->hqd_is_occupied(
238 mm->dev->kgd, queue_address,
239 pipe_id, queue_id);
240}
241
242static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
243 struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
244 struct queue_properties *q)
245{
246 struct v9_mqd *m;
247 int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
248
249 if (retval != 0)
250 return retval;
251
252 m = get_mqd(*mqd);
253
254 m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
255 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
256
257 return retval;
258}
259
260static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
261 struct queue_properties *q)
262{
263 struct v9_mqd *m;
264 int retval = update_mqd(mm, mqd, q);
265
266 if (retval != 0)
267 return retval;
268
269 /* TODO: what's the point? update_mqd already does this. */
270 m = get_mqd(mqd);
271 m->cp_hqd_vmid = q->vmid;
272 return retval;
273}
274
275static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
276 struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
277 struct queue_properties *q)
278{
279 int retval;
280 struct v9_sdma_mqd *m;
281
282
283 retval = kfd_gtt_sa_allocate(mm->dev,
284 sizeof(struct v9_sdma_mqd),
285 mqd_mem_obj);
286
287 if (retval != 0)
288 return -ENOMEM;
289
290 m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
291
292 memset(m, 0, sizeof(struct v9_sdma_mqd));
293
294 *mqd = m;
295 if (gart_addr)
296 *gart_addr = (*mqd_mem_obj)->gpu_addr;
297
298 retval = mm->update_mqd(mm, m, q);
299
300 return retval;
301}
302
303static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
304 struct kfd_mem_obj *mqd_mem_obj)
305{
306 kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
307}
308
309static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
310 uint32_t pipe_id, uint32_t queue_id,
311 struct queue_properties *p, struct mm_struct *mms)
312{
313 return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
314 (uint32_t __user *)p->write_ptr,
315 mms);
316}
317
318#define SDMA_RLC_DUMMY_DEFAULT 0xf
319
320static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
321 struct queue_properties *q)
322{
323 struct v9_sdma_mqd *m;
324
325 m = get_sdma_mqd(mqd);
326 m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
327 << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
328 q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
329 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
330 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
331
332 m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
333 m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
334 m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
335 m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
336 m->sdmax_rlcx_doorbell_offset =
337 q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
338
339 m->sdma_engine_id = q->sdma_engine_id;
340 m->sdma_queue_id = q->sdma_queue_id;
341 m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
342
343 q->is_active = (q->queue_size > 0 &&
344 q->queue_address != 0 &&
345 q->queue_percent > 0 &&
346 !q->is_evicted);
347
348 return 0;
349}
350
351/*
352 * * preempt type here is ignored because there is only one way
353 * * to preempt sdma queue
354 */
355static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
356 enum kfd_preempt_type type,
357 unsigned int timeout, uint32_t pipe_id,
358 uint32_t queue_id)
359{
360 return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
361}
362
363static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
364 uint64_t queue_address, uint32_t pipe_id,
365 uint32_t queue_id)
366{
367 return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
368}
369
370#if defined(CONFIG_DEBUG_FS)
371
372static int debugfs_show_mqd(struct seq_file *m, void *data)
373{
374 seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
375 data, sizeof(struct v9_mqd), false);
376 return 0;
377}
378
379static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
380{
381 seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
382 data, sizeof(struct v9_sdma_mqd), false);
383 return 0;
384}
385
386#endif
387
388struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
389 struct kfd_dev *dev)
390{
391 struct mqd_manager *mqd;
392
393 if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
394 return NULL;
395
396 mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
397 if (!mqd)
398 return NULL;
399
400 mqd->dev = dev;
401
402 switch (type) {
403 case KFD_MQD_TYPE_CP:
404 case KFD_MQD_TYPE_COMPUTE:
405 mqd->init_mqd = init_mqd;
406 mqd->uninit_mqd = uninit_mqd;
407 mqd->load_mqd = load_mqd;
408 mqd->update_mqd = update_mqd;
409 mqd->destroy_mqd = destroy_mqd;
410 mqd->is_occupied = is_occupied;
411#if defined(CONFIG_DEBUG_FS)
412 mqd->debugfs_show_mqd = debugfs_show_mqd;
413#endif
414 break;
415 case KFD_MQD_TYPE_HIQ:
416 mqd->init_mqd = init_mqd_hiq;
417 mqd->uninit_mqd = uninit_mqd;
418 mqd->load_mqd = load_mqd;
419 mqd->update_mqd = update_mqd_hiq;
420 mqd->destroy_mqd = destroy_mqd;
421 mqd->is_occupied = is_occupied;
422#if defined(CONFIG_DEBUG_FS)
423 mqd->debugfs_show_mqd = debugfs_show_mqd;
424#endif
425 break;
426 case KFD_MQD_TYPE_SDMA:
427 mqd->init_mqd = init_mqd_sdma;
428 mqd->uninit_mqd = uninit_mqd_sdma;
429 mqd->load_mqd = load_mqd_sdma;
430 mqd->update_mqd = update_mqd_sdma;
431 mqd->destroy_mqd = destroy_mqd_sdma;
432 mqd->is_occupied = is_occupied_sdma;
433#if defined(CONFIG_DEBUG_FS)
434 mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
435#endif
436 break;
437 default:
438 kfree(mqd);
439 return NULL;
440 }
441
442 return mqd;
443}
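For readers new to the v9 MQD layout: update_mqd() above encodes the ring size into cp_hqd_pq_control as log2 of the dword count minus one, alongside a fixed RPTR_BLOCK_SIZE of 5. A minimal sketch of that encoding (macro names as in the gc_9_0 headers included by the new file; not part of the patch):

#include <linux/log2.h>

/* e.g. a 4 KiB ring = 1024 dwords; order_base_2(1024) - 1 = 9 */
static uint32_t example_pq_control(uint32_t queue_size_bytes)
{
	uint32_t ctl = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;

	ctl |= order_base_2(queue_size_bytes / 4) - 1;
	return ctl;
}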
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 89e4242e43e7..481307b8b4db 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -394,7 +394,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
 		return NULL;
 
-	mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+	mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
 	if (!mqd)
 		return NULL;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 89ba4c670ec5..c317feb43f69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -26,8 +26,6 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_kernel_queue.h"
 #include "kfd_priv.h"
-#include "kfd_pm4_headers_vi.h"
-#include "kfd_pm4_opcodes.h"
 
 static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
 				unsigned int buffer_size_bytes)
@@ -39,18 +37,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
 	*wptr = temp;
 }
 
-static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
-{
-	union PM4_MES_TYPE_3_HEADER header;
-
-	header.u32All = 0;
-	header.opcode = opcode;
-	header.count = packet_size / 4 - 2;
-	header.type = PM4_TYPE_3;
-
-	return header.u32All;
-}
-
 static void pm_calc_rlib_size(struct packet_manager *pm,
 				unsigned int *rlib_size,
 				bool *over_subscription)
@@ -80,9 +66,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 		pr_debug("Over subscribed runlist\n");
 	}
 
-	map_queue_size = sizeof(struct pm4_mes_map_queues);
+	map_queue_size = pm->pmf->map_queues_size;
 	/* calculate run list ib allocation size */
-	*rlib_size = process_count * sizeof(struct pm4_mes_map_process) +
+	*rlib_size = process_count * pm->pmf->map_process_size +
 			queue_count * map_queue_size;
 
 	/*
@@ -90,7 +76,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 	 * when over subscription
 	 */
 	if (*over_subscription)
-		*rlib_size += sizeof(struct pm4_mes_runlist);
+		*rlib_size += pm->pmf->runlist_size;
 
 	pr_debug("runlist ib size %d\n", *rlib_size);
 }
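The sizing above is easiest to see with numbers. Assuming the v9 packet layouts added in kfd_pm4_headers_ai.h further down (pm4_mes_map_process is 16 dwords = 64 bytes, pm4_mes_map_queues 7 dwords = 28 bytes, pm4_mes_runlist 4 dwords = 16 bytes), a worked sketch, illustrative only:

/* Two processes, five queues total, over-subscribed runlist. */
static unsigned int example_rlib_size(void)
{
	unsigned int process_count = 2, queue_count = 5;
	unsigned int size = process_count * 64 + queue_count * 28;

	/* a chained RUN_LIST packet is appended when over-subscribed */
	size += 16;

	return size;	/* 284 bytes */
}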
@@ -108,12 +94,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
 
 	pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
 
+	mutex_lock(&pm->lock);
+
 	retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
 					&pm->ib_buffer_obj);
 
 	if (retval) {
 		pr_err("Failed to allocate runlist IB\n");
-		return retval;
+		goto out;
 	}
 
 	*(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
@@ -121,138 +109,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
 
 	memset(*rl_buffer, 0, *rl_buffer_size);
 	pm->allocated = true;
-	return retval;
-}
-
-static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
-			uint64_t ib, size_t ib_size_in_dwords, bool chain)
-{
-	struct pm4_mes_runlist *packet;
-	int concurrent_proc_cnt = 0;
-	struct kfd_dev *kfd = pm->dqm->dev;
-
-	if (WARN_ON(!ib))
-		return -EFAULT;
-
-	/* Determine the number of processes to map together to HW:
-	 * it can not exceed the number of VMIDs available to the
-	 * scheduler, and it is determined by the smaller of the number
-	 * of processes in the runlist and kfd module parameter
-	 * hws_max_conc_proc.
-	 * Note: the arbitration between the number of VMIDs and
-	 * hws_max_conc_proc has been done in
-	 * kgd2kfd_device_init().
-	 */
-	concurrent_proc_cnt = min(pm->dqm->processes_count,
-			kfd->max_proc_per_quantum);
-
-	packet = (struct pm4_mes_runlist *)buffer;
-
-	memset(buffer, 0, sizeof(struct pm4_mes_runlist));
-	packet->header.u32All = build_pm4_header(IT_RUN_LIST,
-						sizeof(struct pm4_mes_runlist));
-
-	packet->bitfields4.ib_size = ib_size_in_dwords;
-	packet->bitfields4.chain = chain ? 1 : 0;
-	packet->bitfields4.offload_polling = 0;
-	packet->bitfields4.valid = 1;
-	packet->bitfields4.process_cnt = concurrent_proc_cnt;
-	packet->ordinal2 = lower_32_bits(ib);
-	packet->bitfields3.ib_base_hi = upper_32_bits(ib);
-
-	return 0;
-}
-
-static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
-				struct qcm_process_device *qpd)
-{
-	struct pm4_mes_map_process *packet;
-
-	packet = (struct pm4_mes_map_process *)buffer;
-
-	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
-
-	packet->header.u32All = build_pm4_header(IT_MAP_PROCESS,
-					sizeof(struct pm4_mes_map_process));
-	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
-	packet->bitfields2.process_quantum = 1;
-	packet->bitfields2.pasid = qpd->pqm->process->pasid;
-	packet->bitfields3.page_table_base = qpd->page_table_base;
-	packet->bitfields10.gds_size = qpd->gds_size;
-	packet->bitfields10.num_gws = qpd->num_gws;
-	packet->bitfields10.num_oac = qpd->num_oac;
-	packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
-
-	packet->sh_mem_config = qpd->sh_mem_config;
-	packet->sh_mem_bases = qpd->sh_mem_bases;
-	packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
-	packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
-
-	packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
-
-	packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
-	packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
-
-	return 0;
-}
-
-static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
-		struct queue *q, bool is_static)
-{
-	struct pm4_mes_map_queues *packet;
-	bool use_static = is_static;
-
-	packet = (struct pm4_mes_map_queues *)buffer;
-	memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
-
-	packet->header.u32All = build_pm4_header(IT_MAP_QUEUES,
-						sizeof(struct pm4_mes_map_queues));
-	packet->bitfields2.alloc_format =
-		alloc_format__mes_map_queues__one_per_pipe_vi;
-	packet->bitfields2.num_queues = 1;
-	packet->bitfields2.queue_sel =
-		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
-
-	packet->bitfields2.engine_sel =
-		engine_sel__mes_map_queues__compute_vi;
-	packet->bitfields2.queue_type =
-		queue_type__mes_map_queues__normal_compute_vi;
-
-	switch (q->properties.type) {
-	case KFD_QUEUE_TYPE_COMPUTE:
-		if (use_static)
-			packet->bitfields2.queue_type =
-			queue_type__mes_map_queues__normal_latency_static_queue_vi;
-		break;
-	case KFD_QUEUE_TYPE_DIQ:
-		packet->bitfields2.queue_type =
-			queue_type__mes_map_queues__debug_interface_queue_vi;
-		break;
-	case KFD_QUEUE_TYPE_SDMA:
-		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
-				engine_sel__mes_map_queues__sdma0_vi;
-		use_static = false; /* no static queues under SDMA */
-		break;
-	default:
-		WARN(1, "queue type %d", q->properties.type);
-		return -EINVAL;
-	}
-	packet->bitfields3.doorbell_offset =
-			q->properties.doorbell_off;
-
-	packet->mqd_addr_lo =
-			lower_32_bits(q->gart_mqd_addr);
-
-	packet->mqd_addr_hi =
-			upper_32_bits(q->gart_mqd_addr);
-
-	packet->wptr_addr_lo =
-			lower_32_bits((uint64_t)q->properties.write_ptr);
-
-	packet->wptr_addr_hi =
-			upper_32_bits((uint64_t)q->properties.write_ptr);
-
-	return 0;
-}
+
+out:
+	mutex_unlock(&pm->lock);
+	return retval;
 }
 
 static int pm_create_runlist_ib(struct packet_manager *pm,
@@ -292,12 +152,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 		return -ENOMEM;
 	}
 
-	retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
+	retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
 	if (retval)
 		return retval;
 
 	proccesses_mapped++;
-	inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process),
+	inc_wptr(&rl_wptr, pm->pmf->map_process_size,
 			alloc_size_bytes);
 
 	list_for_each_entry(kq, &qpd->priv_queue_list, list) {
@@ -307,7 +167,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 			pr_debug("static_queue, mapping kernel q %d, is debug status %d\n",
 				kq->queue->queue, qpd->is_debug);
 
-			retval = pm_create_map_queue(pm,
+			retval = pm->pmf->map_queues(pm,
 						&rl_buffer[rl_wptr],
 						kq->queue,
 						qpd->is_debug);
@@ -315,7 +175,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 				return retval;
 
 			inc_wptr(&rl_wptr,
-				sizeof(struct pm4_mes_map_queues),
+				pm->pmf->map_queues_size,
 				alloc_size_bytes);
 		}
 
@@ -326,7 +186,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 			pr_debug("static_queue, mapping user queue %d, is debug status %d\n",
 				q->queue, qpd->is_debug);
 
-			retval = pm_create_map_queue(pm,
+			retval = pm->pmf->map_queues(pm,
 						&rl_buffer[rl_wptr],
 						q,
 						qpd->is_debug);
@@ -335,7 +195,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 				return retval;
 
 			inc_wptr(&rl_wptr,
-				sizeof(struct pm4_mes_map_queues),
+				pm->pmf->map_queues_size,
 				alloc_size_bytes);
 		}
 	}
@@ -343,7 +203,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 	pr_debug("Finished map process and queues to runlist\n");
 
 	if (is_over_subscription)
-		retval = pm_create_runlist(pm, &rl_buffer[rl_wptr],
+		retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
 						*rl_gpu_addr,
 						alloc_size_bytes / sizeof(uint32_t),
 						true);
@@ -355,45 +215,29 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 	return retval;
 }
 
-/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size
- * of this packet
- * @gpu_addr - GPU address of the packet. It's a virtual address.
- * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer
- * Return - length of the packet
- */
-uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
-{
-	struct pm4_mec_release_mem *packet;
-
-	WARN_ON(!buffer);
-
-	packet = (struct pm4_mec_release_mem *)buffer;
-	memset(buffer, 0, sizeof(*packet));
-
-	packet->header.u32All = build_pm4_header(IT_RELEASE_MEM,
-					sizeof(*packet));
-
-	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
-	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
-	packet->bitfields2.tcl1_action_ena = 1;
-	packet->bitfields2.tc_action_ena = 1;
-	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
-	packet->bitfields2.atc = 0;
-
-	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
-	packet->bitfields3.int_sel =
-		int_sel___release_mem__send_interrupt_after_write_confirm;
-
-	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
-	packet->address_hi = upper_32_bits(gpu_addr);
-
-	packet->data_lo = 0;
-
-	return sizeof(*packet) / sizeof(unsigned int);
-}
-
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 {
+	switch (dqm->dev->device_info->asic_family) {
+	case CHIP_KAVERI:
+	case CHIP_HAWAII:
+		/* PM4 packet structures on CIK are the same as on VI */
+	case CHIP_CARRIZO:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+		pm->pmf = &kfd_vi_pm_funcs;
+		break;
+	case CHIP_VEGA10:
+	case CHIP_RAVEN:
+		pm->pmf = &kfd_v9_pm_funcs;
+		break;
+	default:
+		WARN(1, "Unexpected ASIC family %u",
+		     dqm->dev->device_info->asic_family);
+		return -EINVAL;
+	}
+
 	pm->dqm = dqm;
 	mutex_init(&pm->lock);
 	pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
@@ -415,38 +259,25 @@ void pm_uninit(struct packet_manager *pm)
 int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res)
 {
-	struct pm4_mes_set_resources *packet;
+	uint32_t *buffer, size;
 	int retval = 0;
 
+	size = pm->pmf->set_resources_size;
 	mutex_lock(&pm->lock);
 	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
-					sizeof(*packet) / sizeof(uint32_t),
-					(unsigned int **)&packet);
-	if (!packet) {
+					size / sizeof(uint32_t),
+					(unsigned int **)&buffer);
+	if (!buffer) {
 		pr_err("Failed to allocate buffer on kernel queue\n");
 		retval = -ENOMEM;
 		goto out;
 	}
 
-	memset(packet, 0, sizeof(struct pm4_mes_set_resources));
-	packet->header.u32All = build_pm4_header(IT_SET_RESOURCES,
-					sizeof(struct pm4_mes_set_resources));
-
-	packet->bitfields2.queue_type =
-			queue_type__mes_set_resources__hsa_interface_queue_hiq;
-	packet->bitfields2.vmid_mask = res->vmid_mask;
-	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
-	packet->bitfields7.oac_mask = res->oac_mask;
-	packet->bitfields8.gds_heap_base = res->gds_heap_base;
-	packet->bitfields8.gds_heap_size = res->gds_heap_size;
-
-	packet->gws_mask_lo = lower_32_bits(res->gws_mask);
-	packet->gws_mask_hi = upper_32_bits(res->gws_mask);
-
-	packet->queue_mask_lo = lower_32_bits(res->queue_mask);
-	packet->queue_mask_hi = upper_32_bits(res->queue_mask);
-
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	retval = pm->pmf->set_resources(pm, buffer, res);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
 
 out:
 	mutex_unlock(&pm->lock);
@@ -468,7 +299,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 
 	pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
 
-	packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t);
+	packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
 	mutex_lock(&pm->lock);
 
 	retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
@@ -476,7 +307,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 	if (retval)
 		goto fail_acquire_packet_buffer;
 
-	retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
+	retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
 					rl_ib_size / sizeof(uint32_t), false);
 	if (retval)
 		goto fail_create_runlist;
@@ -499,37 +330,29 @@ fail_create_runlist_ib:
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 			uint32_t fence_value)
 {
-	int retval;
-	struct pm4_mes_query_status *packet;
+	uint32_t *buffer, size;
+	int retval = 0;
 
 	if (WARN_ON(!fence_address))
 		return -EFAULT;
 
+	size = pm->pmf->query_status_size;
 	mutex_lock(&pm->lock);
-	retval = pm->priv_queue->ops.acquire_packet_buffer(
-			pm->priv_queue,
-			sizeof(struct pm4_mes_query_status) / sizeof(uint32_t),
-			(unsigned int **)&packet);
-	if (retval)
-		goto fail_acquire_packet_buffer;
-
-	packet->header.u32All = build_pm4_header(IT_QUERY_STATUS,
-			sizeof(struct pm4_mes_query_status));
-
-	packet->bitfields2.context_id = 0;
-	packet->bitfields2.interrupt_sel =
-			interrupt_sel__mes_query_status__completion_status;
-	packet->bitfields2.command =
-			command__mes_query_status__fence_only_after_write_ack;
-
-	packet->addr_hi = upper_32_bits((uint64_t)fence_address);
-	packet->addr_lo = lower_32_bits((uint64_t)fence_address);
-	packet->data_hi = upper_32_bits((uint64_t)fence_value);
-	packet->data_lo = lower_32_bits((uint64_t)fence_value);
+	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+			size / sizeof(uint32_t), (unsigned int **)&buffer);
+	if (!buffer) {
+		pr_err("Failed to allocate buffer on kernel queue\n");
+		retval = -ENOMEM;
+		goto out;
+	}
 
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
 
-fail_acquire_packet_buffer:
+out:
 	mutex_unlock(&pm->lock);
 	return retval;
 }
@@ -539,82 +362,27 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
 			uint32_t filter_param, bool reset,
 			unsigned int sdma_engine)
 {
-	int retval;
-	uint32_t *buffer;
-	struct pm4_mes_unmap_queues *packet;
+	uint32_t *buffer, size;
+	int retval = 0;
 
+	size = pm->pmf->unmap_queues_size;
 	mutex_lock(&pm->lock);
-	retval = pm->priv_queue->ops.acquire_packet_buffer(
-			pm->priv_queue,
-			sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t),
-			&buffer);
-	if (retval)
-		goto err_acquire_packet_buffer;
-
-	packet = (struct pm4_mes_unmap_queues *)buffer;
-	memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
-	pr_debug("static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n",
-		filter, reset, type);
-	packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES,
-					sizeof(struct pm4_mes_unmap_queues));
-	switch (type) {
-	case KFD_QUEUE_TYPE_COMPUTE:
-	case KFD_QUEUE_TYPE_DIQ:
-		packet->bitfields2.engine_sel =
-			engine_sel__mes_unmap_queues__compute;
-		break;
-	case KFD_QUEUE_TYPE_SDMA:
-		packet->bitfields2.engine_sel =
-			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
-		break;
-	default:
-		WARN(1, "queue type %d", type);
-		retval = -EINVAL;
-		goto err_invalid;
-	}
-
-	if (reset)
-		packet->bitfields2.action =
-				action__mes_unmap_queues__reset_queues;
-	else
-		packet->bitfields2.action =
-				action__mes_unmap_queues__preempt_queues;
-
-	switch (filter) {
-	case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
-		packet->bitfields2.queue_sel =
-			queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
-		packet->bitfields2.num_queues = 1;
-		packet->bitfields3b.doorbell_offset0 = filter_param;
-		break;
-	case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
-		packet->bitfields2.queue_sel =
-			queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
-		packet->bitfields3a.pasid = filter_param;
-		break;
-	case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
-		packet->bitfields2.queue_sel =
-			queue_sel__mes_unmap_queues__unmap_all_queues;
-		break;
-	case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
-		/* in this case, we do not preempt static queues */
-		packet->bitfields2.queue_sel =
-			queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
-		break;
-	default:
-		WARN(1, "filter %d", filter);
-		retval = -EINVAL;
-		goto err_invalid;
-	}
-
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
-
-	mutex_unlock(&pm->lock);
-	return 0;
-
-err_invalid:
-	pm->priv_queue->ops.rollback_packet(pm->priv_queue);
-err_acquire_packet_buffer:
+	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+			size / sizeof(uint32_t), (unsigned int **)&buffer);
+	if (!buffer) {
+		pr_err("Failed to allocate buffer on kernel queue\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
+			reset, sdma_engine);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+
+out:
 	mutex_unlock(&pm->lock);
 	return retval;
 }
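The net effect of this rewrite is that pm_send_set_resources(), pm_send_query_status() and pm_send_unmap_queue() now share one shape: look up the packet size from pm->pmf, acquire that much space on the HIQ, call the per-ASIC builder, then submit or roll back under pm->lock. A hypothetical helper showing the common pattern (pm_build_and_submit is not in the patch; the real code keeps the three functions separate):

static int pm_build_and_submit(struct packet_manager *pm, uint32_t size,
		int (*build)(struct packet_manager *pm, uint32_t *buffer))
{
	uint32_t *buffer;
	int retval = 0;

	mutex_lock(&pm->lock);
	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
			size / sizeof(uint32_t), (unsigned int **)&buffer);
	if (!buffer) {
		retval = -ENOMEM;
		goto out;
	}

	retval = build(pm, buffer);
	if (!retval)
		pm->priv_queue->ops.submit_packet(pm->priv_queue);
	else
		pm->priv_queue->ops.rollback_packet(pm->priv_queue);

out:
	mutex_unlock(&pm->lock);
	return retval;
}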
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
new file mode 100644
index 000000000000..f2bcf5c092ea
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -0,0 +1,583 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef F32_MES_PM4_PACKETS_H
+#define F32_MES_PM4_PACKETS_H
+
+#ifndef PM4_MES_HEADER_DEFINED
+#define PM4_MES_HEADER_DEFINED
+union PM4_MES_TYPE_3_HEADER {
+	struct {
+		uint32_t reserved1 : 8; /* < reserved */
+		uint32_t opcode    : 8; /* < IT opcode */
+		uint32_t count     : 14;/* < number of DWORDs - 1 in the
+					 * information body.
+					 */
+		uint32_t type      : 2; /* < packet identifier.
+					 * It should be 3 for type 3 packets
+					 */
+	};
+	uint32_t u32All;
+};
+#endif /* PM4_MES_HEADER_DEFINED */
+
+/*--------------------MES_SET_RESOURCES--------------------*/
+
+#ifndef PM4_MES_SET_RESOURCES_DEFINED
+#define PM4_MES_SET_RESOURCES_DEFINED
+enum mes_set_resources_queue_type_enum {
+	queue_type__mes_set_resources__kernel_interface_queue_kiq = 0,
+	queue_type__mes_set_resources__hsa_interface_queue_hiq = 1,
+	queue_type__mes_set_resources__hsa_debug_interface_queue = 4
+};
+
+
+struct pm4_mes_set_resources {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			uint32_t vmid_mask:16;
+			uint32_t unmap_latency:8;
+			uint32_t reserved1:5;
+			enum mes_set_resources_queue_type_enum queue_type:3;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	uint32_t queue_mask_lo;
+	uint32_t queue_mask_hi;
+	uint32_t gws_mask_lo;
+	uint32_t gws_mask_hi;
+
+	union {
+		struct {
+			uint32_t oac_mask:16;
+			uint32_t reserved2:16;
+		} bitfields7;
+		uint32_t ordinal7;
+	};
+
+	union {
+		struct {
+			uint32_t gds_heap_base:6;
+			uint32_t reserved3:5;
+			uint32_t gds_heap_size:6;
+			uint32_t reserved4:15;
+		} bitfields8;
+		uint32_t ordinal8;
+	};
+
+};
+#endif
+
+/*--------------------MES_RUN_LIST--------------------*/
+
+#ifndef PM4_MES_RUN_LIST_DEFINED
+#define PM4_MES_RUN_LIST_DEFINED
+
+struct pm4_mes_runlist {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			uint32_t reserved1:2;
+			uint32_t ib_base_lo:30;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	uint32_t ib_base_hi;
+
+	union {
+		struct {
+			uint32_t ib_size:20;
+			uint32_t chain:1;
+			uint32_t offload_polling:1;
+			uint32_t reserved2:1;
+			uint32_t valid:1;
+			uint32_t process_cnt:4;
+			uint32_t reserved3:4;
+		} bitfields4;
+		uint32_t ordinal4;
+	};
+
+};
+#endif
+
+/*--------------------MES_MAP_PROCESS--------------------*/
+
+#ifndef PM4_MES_MAP_PROCESS_DEFINED
+#define PM4_MES_MAP_PROCESS_DEFINED
+
+struct pm4_mes_map_process {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			uint32_t pasid:16;
+			uint32_t reserved1:8;
+			uint32_t diq_enable:1;
+			uint32_t process_quantum:7;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	uint32_t vm_context_page_table_base_addr_lo32;
+
+	uint32_t vm_context_page_table_base_addr_hi32;
+
+	uint32_t sh_mem_bases;
+
+	uint32_t sh_mem_config;
+
+	uint32_t sq_shader_tba_lo;
+
+	uint32_t sq_shader_tba_hi;
+
+	uint32_t sq_shader_tma_lo;
+
+	uint32_t sq_shader_tma_hi;
+
+	uint32_t reserved6;
+
+	uint32_t gds_addr_lo;
+
+	uint32_t gds_addr_hi;
+
+	union {
+		struct {
+			uint32_t num_gws:6;
+			uint32_t reserved7:1;
+			uint32_t sdma_enable:1;
+			uint32_t num_oac:4;
+			uint32_t reserved8:4;
+			uint32_t gds_size:6;
+			uint32_t num_queues:10;
+		} bitfields14;
+		uint32_t ordinal14;
+	};
+
+	uint32_t completion_signal_lo;
+
+	uint32_t completion_signal_hi;
+
+};
+
+#endif
+
+/*--------------------MES_MAP_PROCESS_VM--------------------*/
+
+#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED
+#define PM4_MES_MAP_PROCESS_VM_DEFINED
+
+struct PM4_MES_MAP_PROCESS_VM {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	uint32_t reserved1;
+
+	uint32_t vm_context_cntl;
+
+	uint32_t reserved2;
+
+	uint32_t vm_context_page_table_end_addr_lo32;
+
+	uint32_t vm_context_page_table_end_addr_hi32;
+
+	uint32_t vm_context_page_table_start_addr_lo32;
+
+	uint32_t vm_context_page_table_start_addr_hi32;
+
+	uint32_t reserved3;
+
+	uint32_t reserved4;
+
+	uint32_t reserved5;
+
+	uint32_t reserved6;
+
+	uint32_t reserved7;
+
+	uint32_t reserved8;
+
+	uint32_t completion_signal_lo32;
+
+	uint32_t completion_signal_hi32;
+
+};
+#endif
+
+/*--------------------MES_MAP_QUEUES--------------------*/
+
+#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED
+#define PM4_MES_MAP_QUEUES_VI_DEFINED
+enum mes_map_queues_queue_sel_enum {
+	queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0,
+	queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1
+};
+
+enum mes_map_queues_queue_type_enum {
+	queue_type__mes_map_queues__normal_compute_vi = 0,
+	queue_type__mes_map_queues__debug_interface_queue_vi = 1,
+	queue_type__mes_map_queues__normal_latency_static_queue_vi = 2,
+	queue_type__mes_map_queues__low_latency_static_queue_vi = 3
+};
+
+enum mes_map_queues_alloc_format_enum {
+	alloc_format__mes_map_queues__one_per_pipe_vi = 0,
+	alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
+};
+
+enum mes_map_queues_engine_sel_enum {
+	engine_sel__mes_map_queues__compute_vi = 0,
+	engine_sel__mes_map_queues__sdma0_vi = 2,
+	engine_sel__mes_map_queues__sdma1_vi = 3
+};
+
+
+struct pm4_mes_map_queues {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			uint32_t reserved1:4;
+			enum mes_map_queues_queue_sel_enum queue_sel:2;
+			uint32_t reserved2:15;
+			enum mes_map_queues_queue_type_enum queue_type:3;
+			enum mes_map_queues_alloc_format_enum alloc_format:2;
+			enum mes_map_queues_engine_sel_enum engine_sel:3;
+			uint32_t num_queues:3;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	union {
+		struct {
+			uint32_t reserved3:1;
+			uint32_t check_disable:1;
+			uint32_t doorbell_offset:26;
+			uint32_t reserved4:4;
+		} bitfields3;
+		uint32_t ordinal3;
+	};
+
+	uint32_t mqd_addr_lo;
+	uint32_t mqd_addr_hi;
+	uint32_t wptr_addr_lo;
+	uint32_t wptr_addr_hi;
+};
+#endif
+
+/*--------------------MES_QUERY_STATUS--------------------*/
+
+#ifndef PM4_MES_QUERY_STATUS_DEFINED
+#define PM4_MES_QUERY_STATUS_DEFINED
+enum mes_query_status_interrupt_sel_enum {
+	interrupt_sel__mes_query_status__completion_status = 0,
+	interrupt_sel__mes_query_status__process_status = 1,
+	interrupt_sel__mes_query_status__queue_status = 2
+};
+
+enum mes_query_status_command_enum {
+	command__mes_query_status__interrupt_only = 0,
+	command__mes_query_status__fence_only_immediate = 1,
+	command__mes_query_status__fence_only_after_write_ack = 2,
+	command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3
+};
+
+enum mes_query_status_engine_sel_enum {
+	engine_sel__mes_query_status__compute = 0,
+	engine_sel__mes_query_status__sdma0_queue = 2,
+	engine_sel__mes_query_status__sdma1_queue = 3
+};
+
+struct pm4_mes_query_status {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			uint32_t context_id:28;
+			enum mes_query_status_interrupt_sel_enum interrupt_sel:2;
+			enum mes_query_status_command_enum command:2;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	union {
+		struct {
+			uint32_t pasid:16;
+			uint32_t reserved1:16;
+		} bitfields3a;
+		struct {
+			uint32_t reserved2:2;
+			uint32_t doorbell_offset:26;
+			enum mes_query_status_engine_sel_enum engine_sel:3;
+			uint32_t reserved3:1;
+		} bitfields3b;
+		uint32_t ordinal3;
+	};
+
+	uint32_t addr_lo;
+	uint32_t addr_hi;
+	uint32_t data_lo;
+	uint32_t data_hi;
+};
+#endif
+
+/*--------------------MES_UNMAP_QUEUES--------------------*/
+
+#ifndef PM4_MES_UNMAP_QUEUES_DEFINED
+#define PM4_MES_UNMAP_QUEUES_DEFINED
+enum mes_unmap_queues_action_enum {
+	action__mes_unmap_queues__preempt_queues = 0,
+	action__mes_unmap_queues__reset_queues = 1,
+	action__mes_unmap_queues__disable_process_queues = 2,
+	action__mes_unmap_queues__reserved = 3
+};
+
+enum mes_unmap_queues_queue_sel_enum {
+	queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0,
+	queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1,
+	queue_sel__mes_unmap_queues__unmap_all_queues = 2,
+	queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3
+};
+
+enum mes_unmap_queues_engine_sel_enum {
+	engine_sel__mes_unmap_queues__compute = 0,
+	engine_sel__mes_unmap_queues__sdma0 = 2,
+	engine_sel__mes_unmap_queues__sdmal = 3
+};
+
+struct pm4_mes_unmap_queues {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/* header */
+		uint32_t ordinal1;
+	};
+
+	union {
+		struct {
+			enum mes_unmap_queues_action_enum action:2;
+			uint32_t reserved1:2;
+			enum mes_unmap_queues_queue_sel_enum queue_sel:2;
+			uint32_t reserved2:20;
+			enum mes_unmap_queues_engine_sel_enum engine_sel:3;
+			uint32_t num_queues:3;
+		} bitfields2;
+		uint32_t ordinal2;
+	};
+
+	union {
+		struct {
+			uint32_t pasid:16;
+			uint32_t reserved3:16;
+		} bitfields3a;
+		struct {
+			uint32_t reserved4:2;
+			uint32_t doorbell_offset0:26;
+			int32_t reserved5:4;
+		} bitfields3b;
+		uint32_t ordinal3;
+	};
+
+	union {
+		struct {
+			uint32_t reserved6:2;
+			uint32_t doorbell_offset1:26;
+			uint32_t reserved7:4;
+		} bitfields4;
+		uint32_t ordinal4;
+	};
+
+	union {
+		struct {
+			uint32_t reserved8:2;
+			uint32_t doorbell_offset2:26;
+			uint32_t reserved9:4;
+		} bitfields5;
+		uint32_t ordinal5;
+	};
+
+	union {
+		struct {
+			uint32_t reserved10:2;
+			uint32_t doorbell_offset3:26;
+			uint32_t reserved11:4;
+		} bitfields6;
+		uint32_t ordinal6;
+	};
+};
+#endif
+
+#ifndef PM4_MEC_RELEASE_MEM_DEFINED
+#define PM4_MEC_RELEASE_MEM_DEFINED
+
+enum mec_release_mem_event_index_enum {
+	event_index__mec_release_mem__end_of_pipe = 5,
+	event_index__mec_release_mem__shader_done = 6
+};
+
+enum mec_release_mem_cache_policy_enum {
+	cache_policy__mec_release_mem__lru = 0,
+	cache_policy__mec_release_mem__stream = 1
+};
+
+enum mec_release_mem_pq_exe_status_enum {
+	pq_exe_status__mec_release_mem__default = 0,
+	pq_exe_status__mec_release_mem__phase_update = 1
+};
+
+enum mec_release_mem_dst_sel_enum {
+	dst_sel__mec_release_mem__memory_controller = 0,
+	dst_sel__mec_release_mem__tc_l2 = 1,
+	dst_sel__mec_release_mem__queue_write_pointer_register = 2,
+	dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
+};
+
+enum mec_release_mem_int_sel_enum {
+	int_sel__mec_release_mem__none = 0,
+	int_sel__mec_release_mem__send_interrupt_only = 1,
+	int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2,
+	int_sel__mec_release_mem__send_data_after_write_confirm = 3,
+	int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4,
+	int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5,
+	int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
+};
+
+enum mec_release_mem_data_sel_enum {
+	data_sel__mec_release_mem__none = 0,
+	data_sel__mec_release_mem__send_32_bit_low = 1,
+	data_sel__mec_release_mem__send_64_bit_data = 2,
+	data_sel__mec_release_mem__send_gpu_clock_counter = 3,
+	data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4,
+	data_sel__mec_release_mem__store_gds_data_to_memory = 5
+};
+
+struct pm4_mec_release_mem {
+	union {
+		union PM4_MES_TYPE_3_HEADER header;	/*header */
+		unsigned int ordinal1;
+	};
+
+	union {
+		struct {
+			unsigned int event_type:6;
+			unsigned int reserved1:2;
+			enum mec_release_mem_event_index_enum event_index:4;
+			unsigned int tcl1_vol_action_ena:1;
+			unsigned int tc_vol_action_ena:1;
+			unsigned int reserved2:1;
+			unsigned int tc_wb_action_ena:1;
+			unsigned int tcl1_action_ena:1;
+			unsigned int tc_action_ena:1;
+			uint32_t reserved3:1;
+			uint32_t tc_nc_action_ena:1;
+			uint32_t tc_wc_action_ena:1;
+			uint32_t tc_md_action_ena:1;
+			uint32_t reserved4:3;
+			enum mec_release_mem_cache_policy_enum cache_policy:2;
+			uint32_t reserved5:2;
+			enum mec_release_mem_pq_exe_status_enum pq_exe_status:1;
+			uint32_t reserved6:2;
+		} bitfields2;
+		unsigned int ordinal2;
+	};
+
+	union {
+		struct {
+			uint32_t reserved7:16;
+			enum mec_release_mem_dst_sel_enum dst_sel:2;
+			uint32_t reserved8:6;
+			enum mec_release_mem_int_sel_enum int_sel:3;
+			uint32_t reserved9:2;
+			enum mec_release_mem_data_sel_enum data_sel:3;
+		} bitfields3;
+		unsigned int ordinal3;
+	};
+
+	union {
+		struct {
+			uint32_t reserved10:2;
+			unsigned int address_lo_32b:30;
+		} bitfields4;
+		struct {
+			uint32_t reserved11:3;
+			uint32_t address_lo_64b:29;
+		} bitfields4b;
+		uint32_t reserved12;
+		unsigned int ordinal4;
+	};
+
+	union {
+		uint32_t address_hi;
+		uint32_t reserved13;
+		uint32_t ordinal5;
+	};
+
+	union {
+		uint32_t data_lo;
+		uint32_t cmp_data_lo;
+		struct {
+			uint32_t dw_offset:16;
+			uint32_t num_dwords:16;
+		} bitfields6c;
+		uint32_t reserved14;
+		uint32_t ordinal6;
+	};
+
+	union {
+		uint32_t data_hi;
+		uint32_t cmp_data_hi;
+		uint32_t reserved15;
+		uint32_t reserved16;
+		uint32_t ordinal7;
+	};
+
+	uint32_t int_ctxid;
+
+};
+
+#endif
+
+enum {
+	CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
+};
+#endif
+
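All of these packets start with the same type-3 header. The build_pm4_header() helper removed from kfd_packet_manager.c above is reintroduced alongside the per-ASIC builders; its logic is restated here for reference (count holds the packet length in dwords minus two; IT_* opcodes and PM4_TYPE_3 come from kfd_pm4_opcodes.h):

static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
{
	union PM4_MES_TYPE_3_HEADER header;

	header.u32All = 0;
	header.opcode = opcode;
	header.count = packet_size / 4 - 2;	/* dwords after the first two */
	header.type = PM4_TYPE_3;

	return header.u32All;
}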
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 96a9cc0f02c9..5e3990bb4c4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -39,11 +39,37 @@
 
 #include "amd_shared.h"
 
+#define KFD_MAX_RING_ENTRY_SIZE 8
+
 #define KFD_SYSFS_FILE_MODE 0444
 
-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000ull
-#define KFD_MMAP_EVENTS_MASK 0x4000000000000ull
-#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000ull
+/* GPU ID hash width in bits */
+#define KFD_GPU_ID_HASH_WIDTH 16
+
+/* Use upper bits of mmap offset to store KFD driver specific information.
+ * BITS[63:62] - Encode MMAP type
+ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to
+ * BITS[45:0]  - MMAP offset value
+ *
+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+ *  defines are w.r.t to PAGE_SIZE
+ */
+#define KFD_MMAP_TYPE_SHIFT	(62 - PAGE_SHIFT)
+#define KFD_MMAP_TYPE_MASK	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_DOORBELL	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_EVENTS	(0x2ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_RESERVED_MEM	(0x1ULL << KFD_MMAP_TYPE_SHIFT)
+
+#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
+#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
+				<< KFD_MMAP_GPU_ID_SHIFT)
+#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\
+				& KFD_MMAP_GPU_ID_MASK)
+#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \
+				>> KFD_MMAP_GPU_ID_SHIFT)
+
+#define KFD_MMAP_OFFSET_VALUE_MASK	(0x3FFFFFFFFFFFULL >> PAGE_SHIFT)
+#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK)
 
 /*
  * When working with cp scheduler we should assign the HIQ manually or via
@@ -55,9 +81,6 @@
 #define KFD_CIK_HIQ_PIPE 4
 #define KFD_CIK_HIQ_QUEUE 0
 
-/* GPU ID hash width in bits */
-#define KFD_GPU_ID_HASH_WIDTH 16
-
 /* Macro for allocating structures */
 #define kfd_alloc_struct(ptr_to_struct) \
 	((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
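Because the masks above are already expressed relative to PAGE_SIZE, they apply directly to vm_area_struct.vm_pgoff. A minimal sketch of encoding and decoding such an offset (the map_doorbell/map_events helpers are hypothetical; the real routing lives in the kfd mmap handler):

static u64 kfd_doorbell_mmap_pgoff(u32 gpu_id)
{
	return KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(gpu_id);
}

static int kfd_mmap_dispatch(struct vm_area_struct *vma)
{
	u32 gpu_id = KFD_MMAP_GPU_ID_GET(vma->vm_pgoff);

	switch (vma->vm_pgoff & KFD_MMAP_TYPE_MASK) {
	case KFD_MMAP_TYPE_DOORBELL:
		return map_doorbell(gpu_id, vma);	/* hypothetical */
	case KFD_MMAP_TYPE_EVENTS:
		return map_events(gpu_id, vma);		/* hypothetical */
	default:
		return -EINVAL;
	}
}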
@@ -116,6 +139,11 @@ extern int debug_largebar;
116 */ 139 */
117extern int ignore_crat; 140extern int ignore_crat;
118 141
142/*
143 * Set sh_mem_config.retry_disable on Vega10
144 */
145extern int vega10_noretry;
146
119/** 147/**
120 * enum kfd_sched_policy 148 * enum kfd_sched_policy
121 * 149 *
@@ -148,6 +176,8 @@ enum cache_policy {
148 cache_policy_noncoherent 176 cache_policy_noncoherent
149}; 177};
150 178
179#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10)
180
151struct kfd_event_interrupt_class { 181struct kfd_event_interrupt_class {
152 bool (*interrupt_isr)(struct kfd_dev *dev, 182 bool (*interrupt_isr)(struct kfd_dev *dev,
153 const uint32_t *ih_ring_entry); 183 const uint32_t *ih_ring_entry);
@@ -160,6 +190,7 @@ struct kfd_device_info {
160 const struct kfd_event_interrupt_class *event_interrupt_class; 190 const struct kfd_event_interrupt_class *event_interrupt_class;
161 unsigned int max_pasid_bits; 191 unsigned int max_pasid_bits;
162 unsigned int max_no_of_hqd; 192 unsigned int max_no_of_hqd;
193 unsigned int doorbell_size;
163 size_t ih_ring_entry_size; 194 size_t ih_ring_entry_size;
164 uint8_t num_of_watch_points; 195 uint8_t num_of_watch_points;
165 uint16_t mqd_size_aligned; 196 uint16_t mqd_size_aligned;
@@ -173,6 +204,7 @@ struct kfd_mem_obj {
173 uint32_t range_end; 204 uint32_t range_end;
174 uint64_t gpu_addr; 205 uint64_t gpu_addr;
175 uint32_t *cpu_ptr; 206 uint32_t *cpu_ptr;
207 void *gtt_mem;
176}; 208};
177 209
178struct kfd_vmid_info { 210struct kfd_vmid_info {
@@ -364,7 +396,7 @@ struct queue_properties {
364 uint32_t queue_percent; 396 uint32_t queue_percent;
365 uint32_t *read_ptr; 397 uint32_t *read_ptr;
366 uint32_t *write_ptr; 398 uint32_t *write_ptr;
367 uint32_t __iomem *doorbell_ptr; 399 void __iomem *doorbell_ptr;
368 uint32_t doorbell_off; 400 uint32_t doorbell_off;
369 bool is_interop; 401 bool is_interop;
370 bool is_evicted; 402 bool is_evicted;
@@ -427,6 +459,7 @@ struct queue {
427 uint32_t queue; 459 uint32_t queue;
428 460
429 unsigned int sdma_id; 461 unsigned int sdma_id;
462 unsigned int doorbell_id;
430 463
431 struct kfd_process *process; 464 struct kfd_process *process;
432 struct kfd_dev *device; 465 struct kfd_dev *device;
@@ -501,6 +534,9 @@ struct qcm_process_device {
501 /* IB memory */ 534 /* IB memory */
502 uint64_t ib_base; 535 uint64_t ib_base;
503 void *ib_kaddr; 536 void *ib_kaddr;
537
538 /* doorbell resources per process per device */
539 unsigned long *doorbell_bitmap;
504}; 540};
505 541
506/* KFD Memory Eviction */ 542/* KFD Memory Eviction */
@@ -512,6 +548,8 @@ struct qcm_process_device {
512/* Approx. time before evicting the process again */ 548/* Approx. time before evicting the process again */
513#define PROCESS_ACTIVE_TIME_MS 10 549#define PROCESS_ACTIVE_TIME_MS 10
514 550
551int kgd2kfd_quiesce_mm(struct mm_struct *mm);
552int kgd2kfd_resume_mm(struct mm_struct *mm);
515int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, 553int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
516 struct dma_fence *fence); 554 struct dma_fence *fence);
517 555
@@ -681,6 +719,8 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
681struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); 719struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
682struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); 720struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
683void kfd_unref_process(struct kfd_process *p); 721void kfd_unref_process(struct kfd_process *p);
722int kfd_process_evict_queues(struct kfd_process *p);
723int kfd_process_restore_queues(struct kfd_process *p);
684void kfd_suspend_all_processes(void); 724void kfd_suspend_all_processes(void);
685int kfd_resume_all_processes(void); 725int kfd_resume_all_processes(void);
686 726
@@ -693,7 +733,7 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
693struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, 733struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
694 struct kfd_process *p); 734 struct kfd_process *p);
695 735
696int kfd_reserved_mem_mmap(struct kfd_process *process, 736int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
697 struct vm_area_struct *vma); 737 struct vm_area_struct *vma);
698 738
699/* KFD process API for creating and translating handles */ 739/* KFD process API for creating and translating handles */
@@ -721,17 +761,20 @@ unsigned int kfd_pasid_alloc(void);
721void kfd_pasid_free(unsigned int pasid); 761void kfd_pasid_free(unsigned int pasid);
722 762
723/* Doorbells */ 763/* Doorbells */
764size_t kfd_doorbell_process_slice(struct kfd_dev *kfd);
724int kfd_doorbell_init(struct kfd_dev *kfd); 765int kfd_doorbell_init(struct kfd_dev *kfd);
725void kfd_doorbell_fini(struct kfd_dev *kfd); 766void kfd_doorbell_fini(struct kfd_dev *kfd);
726int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); 767int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
727u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 768 struct vm_area_struct *vma);
769void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
728 unsigned int *doorbell_off); 770 unsigned int *doorbell_off);
729void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); 771void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr);
730u32 read_kernel_doorbell(u32 __iomem *db); 772u32 read_kernel_doorbell(u32 __iomem *db);
731void write_kernel_doorbell(u32 __iomem *db, u32 value); 773void write_kernel_doorbell(void __iomem *db, u32 value);
732unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, 774void write_kernel_doorbell64(void __iomem *db, u64 value);
775unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
733 struct kfd_process *process, 776 struct kfd_process *process,
734 unsigned int queue_id); 777 unsigned int doorbell_id);
735phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, 778phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
736 struct kfd_process *process); 779 struct kfd_process *process);
737int kfd_alloc_process_doorbells(struct kfd_process *process); 780int kfd_alloc_process_doorbells(struct kfd_process *process);
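[On SOC15 parts each doorbell slot is 64 bits wide, which is why kfd_get_kernel_doorbell() now returns void __iomem * and write_kernel_doorbell64() is added. A minimal sketch of ringing a kernel-queue doorbell under this API; the 'soc15' flag and wptr values are illustrative, not from the patch:]

    /* Sketch only: ring a kernel-queue doorbell on either generation. */
    static int example_ring_doorbell(struct kfd_dev *kfd, bool soc15,
                                     u32 wptr32, u64 wptr64)
    {
            unsigned int doorbell_off;
            void __iomem *db = kfd_get_kernel_doorbell(kfd, &doorbell_off);

            if (!db)
                    return -ENOMEM;

            if (soc15)
                    write_kernel_doorbell64(db, wptr64);  /* 64-bit slot */
            else
                    write_kernel_doorbell(db, wptr32);    /* legacy 32-bit slot */
            return 0;
    }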
@@ -788,6 +831,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
788 struct kfd_dev *dev); 831 struct kfd_dev *dev);
789struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, 832struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
790 struct kfd_dev *dev); 833 struct kfd_dev *dev);
834struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
835 struct kfd_dev *dev);
791struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); 836struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
792void device_queue_manager_uninit(struct device_queue_manager *dqm); 837void device_queue_manager_uninit(struct device_queue_manager *dqm);
793struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, 838struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -832,8 +877,42 @@ struct packet_manager {
832 bool allocated; 877 bool allocated;
833 struct kfd_mem_obj *ib_buffer_obj; 878 struct kfd_mem_obj *ib_buffer_obj;
834 unsigned int ib_size_bytes; 879 unsigned int ib_size_bytes;
880
881 const struct packet_manager_funcs *pmf;
882};
883
884struct packet_manager_funcs {
885 /* Support ASIC-specific packet formats for PM4 packets */
886 int (*map_process)(struct packet_manager *pm, uint32_t *buffer,
887 struct qcm_process_device *qpd);
888 int (*runlist)(struct packet_manager *pm, uint32_t *buffer,
889 uint64_t ib, size_t ib_size_in_dwords, bool chain);
890 int (*set_resources)(struct packet_manager *pm, uint32_t *buffer,
891 struct scheduling_resources *res);
892 int (*map_queues)(struct packet_manager *pm, uint32_t *buffer,
893 struct queue *q, bool is_static);
894 int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
895 enum kfd_queue_type type,
896 enum kfd_unmap_queues_filter mode,
897 uint32_t filter_param, bool reset,
898 unsigned int sdma_engine);
899 int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
900 uint64_t fence_address, uint32_t fence_value);
901 int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
902
903 /* Packet sizes */
904 int map_process_size;
905 int runlist_size;
906 int set_resources_size;
907 int map_queues_size;
908 int unmap_queues_size;
909 int query_status_size;
910 int release_mem_size;
835}; 911};
836 912
913extern const struct packet_manager_funcs kfd_vi_pm_funcs;
914extern const struct packet_manager_funcs kfd_v9_pm_funcs;
915
837int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); 916int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
838void pm_uninit(struct packet_manager *pm); 917void pm_uninit(struct packet_manager *pm);
839int pm_send_set_resources(struct packet_manager *pm, 918int pm_send_set_resources(struct packet_manager *pm,
@@ -849,12 +928,17 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
849 928
850void pm_release_ib(struct packet_manager *pm); 929void pm_release_ib(struct packet_manager *pm);
851 930
852uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer); 931/* The following PM functions can be shared between VI and AI */
932unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
933int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
934 struct scheduling_resources *res);
853 935
854uint64_t kfd_get_number_elems(struct kfd_dev *kfd); 936uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
855 937
856/* Events */ 938/* Events */
857extern const struct kfd_event_interrupt_class event_interrupt_class_cik; 939extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
940extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
941
858extern const struct kfd_device_global_init_class device_global_init_class_cik; 942extern const struct kfd_device_global_init_class device_global_init_class_cik;
859 943
860void kfd_event_init_process(struct kfd_process *p); 944void kfd_event_init_process(struct kfd_process *p);
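[The new packet_manager_funcs table keeps the generic pm_send_*() helpers ASIC-agnostic: each entry builds one PM4 packet in that ASIC's format, and the *_size fields tell the caller how much packet space to reserve. A minimal sketch of dispatching through the table; the helper name is hypothetical, not part of this pull:]

    static int example_build_set_resources(struct packet_manager *pm,
                                           uint32_t *buffer,
                                           struct scheduling_resources *res)
    {
            /* pm->pmf points at kfd_vi_pm_funcs or kfd_v9_pm_funcs,
             * selected once at packet-manager init based on the ASIC.
             */
            memset(buffer, 0, pm->pmf->set_resources_size);
            return pm->pmf->set_resources(pm, buffer, res);
    }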
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 1711ad0642f7..1d80b4f7c681 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,6 +332,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
332 free_pages((unsigned long)pdd->qpd.cwsr_kaddr, 332 free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
333 get_order(KFD_CWSR_TBA_TMA_SIZE)); 333 get_order(KFD_CWSR_TBA_TMA_SIZE));
334 334
335 kfree(pdd->qpd.doorbell_bitmap);
335 idr_destroy(&pdd->alloc_idr); 336 idr_destroy(&pdd->alloc_idr);
336 337
337 kfree(pdd); 338 kfree(pdd);
@@ -451,7 +452,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
451 if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) 452 if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base)
452 continue; 453 continue;
453 454
454 offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; 455 offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id))
456 << PAGE_SHIFT;
455 qpd->tba_addr = (int64_t)vm_mmap(filep, 0, 457 qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
456 KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, 458 KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
457 MAP_SHARED, offset); 459 MAP_SHARED, offset);
@@ -585,6 +587,31 @@ err_alloc_process:
585 return ERR_PTR(err); 587 return ERR_PTR(err);
586} 588}
587 589
590static int init_doorbell_bitmap(struct qcm_process_device *qpd,
591 struct kfd_dev *dev)
592{
593 unsigned int i;
594
595 if (!KFD_IS_SOC15(dev->device_info->asic_family))
596 return 0;
597
598 qpd->doorbell_bitmap =
599 kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
600 BITS_PER_BYTE), GFP_KERNEL);
601 if (!qpd->doorbell_bitmap)
602 return -ENOMEM;
603
604 /* Mask out any reserved doorbells */
605 for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++)
606 if ((dev->shared_resources.reserved_doorbell_mask & i) ==
607 dev->shared_resources.reserved_doorbell_val) {
608 set_bit(i, qpd->doorbell_bitmap);
609 pr_debug("reserved doorbell 0x%03x\n", i);
610 }
611
612 return 0;
613}
614
588struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, 615struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
589 struct kfd_process *p) 616 struct kfd_process *p)
590{ 617{
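[With the reserved slots pre-set in init_doorbell_bitmap(), later doorbell allocation can simply scan for a clear bit. A sketch of the consuming side, assuming the standard kernel bitmap helpers; the actual allocation lives in the device queue manager:]

    unsigned int found;

    found = find_first_zero_bit(qpd->doorbell_bitmap,
                                KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
    if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
            return -EBUSY;                  /* no free doorbell */
    set_bit(found, qpd->doorbell_bitmap);   /* claim it */
    q->doorbell_id = found;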
@@ -606,6 +633,12 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
606 if (!pdd) 633 if (!pdd)
607 return NULL; 634 return NULL;
608 635
636 if (init_doorbell_bitmap(&pdd->qpd, dev)) {
637 pr_err("Failed to init doorbell bitmap for process\n");
638 kfree(pdd);
639 return NULL;
640 }
641
609 pdd->dev = dev; 642 pdd->dev = dev;
610 INIT_LIST_HEAD(&pdd->qpd.queues_list); 643 INIT_LIST_HEAD(&pdd->qpd.queues_list);
611 INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); 644 INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
@@ -808,7 +841,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
808 * Eviction is reference-counted per process-device. This means multiple 841 * Eviction is reference-counted per process-device. This means multiple
809 * evictions from different sources can be nested safely. 842 * evictions from different sources can be nested safely.
810 */ 843 */
811static int process_evict_queues(struct kfd_process *p) 844int kfd_process_evict_queues(struct kfd_process *p)
812{ 845{
813 struct kfd_process_device *pdd; 846 struct kfd_process_device *pdd;
814 int r = 0; 847 int r = 0;
@@ -844,7 +877,7 @@ fail:
844} 877}
845 878
846/* process_restore_queues - Restore all user queues of a process */ 879/* process_restore_queues - Restore all user queues of a process */
847static int process_restore_queues(struct kfd_process *p) 880int kfd_process_restore_queues(struct kfd_process *p)
848{ 881{
849 struct kfd_process_device *pdd; 882 struct kfd_process_device *pdd;
850 int r, ret = 0; 883 int r, ret = 0;
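[Because the eviction count is kept per process-device, exporting these two functions lets independent callers nest safely. A hypothetical call sequence:]

    kfd_process_evict_queues(p);    /* e.g. from a userptr MMU notifier */
    kfd_process_evict_queues(p);    /* e.g. from a BO eviction fence    */
    kfd_process_restore_queues(p);  /* queues remain evicted            */
    kfd_process_restore_queues(p);  /* count reaches 0: queues restored */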
@@ -886,7 +919,7 @@ static void evict_process_worker(struct work_struct *work)
886 flush_delayed_work(&p->restore_work); 919 flush_delayed_work(&p->restore_work);
887 920
888 pr_debug("Started evicting pasid %d\n", p->pasid); 921 pr_debug("Started evicting pasid %d\n", p->pasid);
889 ret = process_evict_queues(p); 922 ret = kfd_process_evict_queues(p);
890 if (!ret) { 923 if (!ret) {
891 dma_fence_signal(p->ef); 924 dma_fence_signal(p->ef);
892 dma_fence_put(p->ef); 925 dma_fence_put(p->ef);
@@ -946,7 +979,7 @@ static void restore_process_worker(struct work_struct *work)
946 return; 979 return;
947 } 980 }
948 981
949 ret = process_restore_queues(p); 982 ret = kfd_process_restore_queues(p);
950 if (!ret) 983 if (!ret)
951 pr_debug("Finished restoring pasid %d\n", p->pasid); 984 pr_debug("Finished restoring pasid %d\n", p->pasid);
952 else 985 else
@@ -963,7 +996,7 @@ void kfd_suspend_all_processes(void)
963 cancel_delayed_work_sync(&p->eviction_work); 996 cancel_delayed_work_sync(&p->eviction_work);
964 cancel_delayed_work_sync(&p->restore_work); 997 cancel_delayed_work_sync(&p->restore_work);
965 998
966 if (process_evict_queues(p)) 999 if (kfd_process_evict_queues(p))
967 pr_err("Failed to suspend process %d\n", p->pasid); 1000 pr_err("Failed to suspend process %d\n", p->pasid);
968 dma_fence_signal(p->ef); 1001 dma_fence_signal(p->ef);
969 dma_fence_put(p->ef); 1002 dma_fence_put(p->ef);
@@ -989,15 +1022,12 @@ int kfd_resume_all_processes(void)
989 return ret; 1022 return ret;
990} 1023}
991 1024
992int kfd_reserved_mem_mmap(struct kfd_process *process, 1025int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
993 struct vm_area_struct *vma) 1026 struct vm_area_struct *vma)
994{ 1027{
995 struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
996 struct kfd_process_device *pdd; 1028 struct kfd_process_device *pdd;
997 struct qcm_process_device *qpd; 1029 struct qcm_process_device *qpd;
998 1030
999 if (!dev)
1000 return -EINVAL;
1001 if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { 1031 if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
1002 pr_err("Incorrect CWSR mapping size.\n"); 1032 pr_err("Incorrect CWSR mapping size.\n");
1003 return -EINVAL; 1033 return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 7817e327ea6d..d65ce0436b31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -119,9 +119,6 @@ static int create_cp_queue(struct process_queue_manager *pqm,
119 /* Doorbell initialized in user space */ 119 /* Doorbell initialized in user space */
120 q_properties->doorbell_ptr = NULL; 120 q_properties->doorbell_ptr = NULL;
121 121
122 q_properties->doorbell_off =
123 kfd_queue_id_to_doorbell(dev, pqm->process, qid);
124
125 /* let DQM handle it */ 122 /* let DQM handle it */
126 q_properties->vmid = 0; 123 q_properties->vmid = 0;
127 q_properties->queue_id = qid; 124 q_properties->queue_id = qid;
@@ -244,10 +241,20 @@ int pqm_create_queue(struct process_queue_manager *pqm,
244 } 241 }
245 242
246 if (retval != 0) { 243 if (retval != 0) {
247 pr_err("DQM create queue failed\n"); 244 pr_err("Pasid %d DQM create queue %d failed. ret %d\n",
245 pqm->process->pasid, type, retval);
248 goto err_create_queue; 246 goto err_create_queue;
249 } 247 }
250 248
249 if (q)
250 /* Return the doorbell offset (in bytes) within the
251 * doorbell page to the caller so it can be passed up
252 * to user mode.
253 */
254 properties->doorbell_off =
255 (q->properties.doorbell_off * sizeof(uint32_t)) &
256 (kfd_doorbell_process_slice(dev) - 1);
257
251 pr_debug("PQM After DQM create queue\n"); 258 pr_debug("PQM After DQM create queue\n");
252 259
253 list_add(&pqn->process_queue_list, &pqm->queues); 260 list_add(&pqn->process_queue_list, &pqm->queues);
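[The conversion above turns the absolute doorbell index (in dwords) into a byte offset within the process's doorbell page. A worked example, assuming a 4 KiB process slice:]

    /* doorbell_off = 0x408 dwords, slice = 4096 bytes (assumed) */
    offset = (0x408 * sizeof(uint32_t)) & (4096 - 1);
    /* = 0x1020 & 0xfff = 0x020: byte 0x20 inside the page */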
@@ -313,8 +320,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
313 dqm = pqn->q->device->dqm; 320 dqm = pqn->q->device->dqm;
314 retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); 321 retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
315 if (retval) { 322 if (retval) {
316 pr_debug("Destroy queue failed, returned %d\n", retval); 323 pr_err("Pasid %d destroy queue %d failed, ret %d\n",
317 goto err_destroy_queue; 324 pqm->process->pasid,
325 pqn->q->properties.queue_id, retval);
326 if (retval != -ETIME)
327 goto err_destroy_queue;
318 } 328 }
319 uninit_queue(pqn->q); 329 uninit_queue(pqn->q);
320 } 330 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index a5315d4f1c95..6dcd621e5b71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -36,8 +36,8 @@ void print_queue_properties(struct queue_properties *q)
36 pr_debug("Queue Address: 0x%llX\n", q->queue_address); 36 pr_debug("Queue Address: 0x%llX\n", q->queue_address);
37 pr_debug("Queue Id: %u\n", q->queue_id); 37 pr_debug("Queue Id: %u\n", q->queue_id);
38 pr_debug("Queue Process Vmid: %u\n", q->vmid); 38 pr_debug("Queue Process Vmid: %u\n", q->vmid);
39 pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); 39 pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
40 pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); 40 pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
41 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); 41 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
42 pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); 42 pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
43} 43}
@@ -53,8 +53,8 @@ void print_queue(struct queue *q)
53 pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); 53 pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
54 pr_debug("Queue Id: %u\n", q->properties.queue_id); 54 pr_debug("Queue Id: %u\n", q->properties.queue_id);
55 pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); 55 pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
56 pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); 56 pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
57 pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); 57 pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
58 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); 58 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
59 pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); 59 pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
60 pr_debug("Queue MQD Address: 0x%p\n", q->mqd); 60 pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index ac28abc94e57..bc95d4dfee2e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1239,6 +1239,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
1239 HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & 1239 HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
1240 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); 1240 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
1241 break; 1241 break;
1242 case CHIP_VEGA10:
1243 case CHIP_RAVEN:
1244 dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
1245 HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
1246 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
1247 break;
1242 default: 1248 default:
1243 WARN(1, "Unexpected ASIC family %u", 1249 WARN(1, "Unexpected ASIC family %u",
1244 dev->gpu->device_info->asic_family); 1250 dev->gpu->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index eb54cfcaf039..7d9c3f948dff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -45,6 +45,7 @@
45 45
46#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 46#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
47#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 47#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
48#define HSA_CAP_DOORBELL_TYPE_2_0 0x2
48#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 49#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
49 50
50struct kfd_node_properties { 51struct kfd_node_properties {
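[User mode recovers the doorbell type by reversing the shift/mask used in kfd_topology.c above. A sketch, where 'props' is a hypothetical parsed node-properties struct:]

    unsigned int db_type;

    db_type = (props->capability & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK)
                    >> HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
    /* db_type == HSA_CAP_DOORBELL_TYPE_2_0 (0x2) on Vega10/Raven,
     * per the kfd_topology.c change above
     */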
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
new file mode 100644
index 000000000000..0bc0b25cb410
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright 2016-2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef HSA_SOC15_INT_H_INCLUDED
24#define HSA_SOC15_INT_H_INCLUDED
25
26#include "soc15_ih_clientid.h"
27
28#define SOC15_INTSRC_CP_END_OF_PIPE 181
29#define SOC15_INTSRC_CP_BAD_OPCODE 183
30#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239
31#define SOC15_INTSRC_VMC_FAULT 0
32#define SOC15_INTSRC_SDMA_TRAP 224
33
34
35#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
36#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
37#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
38#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
39#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
40#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
41#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
42#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
43#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
44#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
45
46#endif
47
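[These macros decode the 8-dword SOC15 IH ring entry in place. A minimal decoding sketch, assuming 'ih_ring_entry' points at one little-endian ring entry:]

    const uint32_t *entry = ih_ring_entry;
    uint16_t source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(entry);
    uint16_t client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(entry);
    uint16_t pasid = SOC15_PASID_FROM_IH_ENTRY(entry);

    if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
            pr_debug("EOP irq: client 0x%x pasid %u\n", client_id, pasid);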
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 237289a72bb7..5733fbee07f7 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -100,6 +100,21 @@ struct kgd2kfd_shared_resources {
100 /* Bit n == 1 means Queue n is available for KFD */ 100 /* Bit n == 1 means Queue n is available for KFD */
101 DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); 101 DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES);
102 102
103 /* Doorbell assignments (SOC15 and later chips only). Only
104 * specific doorbells are routed to each SDMA engine; others
105 * are routed to IH and VCN. Neither kind is usable by the CP.
106 *
107 * Any doorbell number D that satisfies the following condition
108 * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val
109 *
110 * KFD currently uses 1024 (= 0x400) doorbells per process. If
111 * doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7 are reserved, the
112 * mask would be set to 0x1f8 and val set to 0x0f0.
113 */
114 unsigned int sdma_doorbell[2][2];
115 unsigned int reserved_doorbell_mask;
116 unsigned int reserved_doorbell_val;
117
103 /* Base address of doorbell aperture. */ 118 /* Base address of doorbell aperture. */
104 phys_addr_t doorbell_physical_address; 119 phys_addr_t doorbell_physical_address;
105 120
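[The mask/val test in practice; this helper is illustrative, not part of the interface:]

    static inline bool doorbell_is_reserved(unsigned int d,
                    const struct kgd2kfd_shared_resources *res)
    {
            return (d & res->reserved_doorbell_mask) ==
                   res->reserved_doorbell_val;
    }

    /* With mask = 0x1f8, val = 0x0f0: 0x0f3 & 0x1f8 == 0x0f0 (reserved),
     * while 0x0f8 & 0x1f8 == 0x0f8 != 0x0f0 (usable).
     */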
@@ -173,8 +188,6 @@ struct tile_config {
173 * @set_pasid_vmid_mapping: Exposes pasid/vmid pair to the H/W. Only 188
174 * used for no cp scheduling mode. 189
175 * 190 *
176 * @init_pipeline: Initialized the compute pipelines.
177 *
178 * @hqd_load: Loads the mqd structure to a H/W hqd slot. Used only for no cp 191
179 * scheduling mode. 192
180 * 193 *
@@ -274,9 +287,6 @@ struct kfd2kgd_calls {
274 int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, 287 int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid,
275 unsigned int vmid); 288 unsigned int vmid);
276 289
277 int (*init_pipeline)(struct kgd_dev *kgd, uint32_t pipe_id,
278 uint32_t hpd_size, uint64_t hpd_gpu_addr);
279
280 int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); 290 int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id);
281 291
282 int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 292 int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
@@ -382,6 +392,10 @@ struct kfd2kgd_calls {
382 * 392 *
383 * @resume: Notifies amdkfd about a resume action done to a kgd device 393 * @resume: Notifies amdkfd about a resume action done to a kgd device
384 * 394 *
395 * @quiesce_mm: Quiesce all user queue access to specified MM address space
396 *
397 * @resume_mm: Resume user queue access to specified MM address space
398 *
385 * @schedule_evict_and_restore_process: Schedules work queue that will prepare 399 * @schedule_evict_and_restore_process: Schedules work queue that will prepare
386 * for safe eviction of KFD BOs that belong to the specified process. 400 * for safe eviction of KFD BOs that belong to the specified process.
387 * 401 *
@@ -399,6 +413,8 @@ struct kgd2kfd_calls {
399 void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); 413 void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
400 void (*suspend)(struct kfd_dev *kfd); 414 void (*suspend)(struct kfd_dev *kfd);
401 int (*resume)(struct kfd_dev *kfd); 415 int (*resume)(struct kfd_dev *kfd);
416 int (*quiesce_mm)(struct mm_struct *mm);
417 int (*resume_mm)(struct mm_struct *mm);
402 int (*schedule_evict_and_restore_process)(struct mm_struct *mm, 418 int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
403 struct dma_fence *fence); 419 struct dma_fence *fence);
404}; 420};
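[These two calls give amdgpu a way to stop and restart all user queues of the process owning an mm, e.g. around a userptr invalidation. A hedged sketch of the caller side; the surrounding function is illustrative, the real hook lives in the amdgpu MMU-notifier path:]

    /* invalidate_range_start: stop GPU access before the pages go away */
    r = kgd2kfd->quiesce_mm(mm);
    if (r)
            pr_err("Failed to quiesce KFD queues\n");

    /* ... unpin/update the userptr pages ... */

    /* later, once the mapping is valid again */
    kgd2kfd->resume_mm(mm);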
diff --git a/drivers/gpu/drm/amd/include/v9_structs.h b/drivers/gpu/drm/amd/include/v9_structs.h
index 2fb25abaf7c8..ceaf4932258d 100644
--- a/drivers/gpu/drm/amd/include/v9_structs.h
+++ b/drivers/gpu/drm/amd/include/v9_structs.h
@@ -29,10 +29,10 @@ struct v9_sdma_mqd {
29 uint32_t sdmax_rlcx_rb_base; 29 uint32_t sdmax_rlcx_rb_base;
30 uint32_t sdmax_rlcx_rb_base_hi; 30 uint32_t sdmax_rlcx_rb_base_hi;
31 uint32_t sdmax_rlcx_rb_rptr; 31 uint32_t sdmax_rlcx_rb_rptr;
32 uint32_t sdmax_rlcx_rb_rptr_hi;
32 uint32_t sdmax_rlcx_rb_wptr; 33 uint32_t sdmax_rlcx_rb_wptr;
34 uint32_t sdmax_rlcx_rb_wptr_hi;
33 uint32_t sdmax_rlcx_rb_wptr_poll_cntl; 35 uint32_t sdmax_rlcx_rb_wptr_poll_cntl;
34 uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi;
35 uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo;
36 uint32_t sdmax_rlcx_rb_rptr_addr_hi; 36 uint32_t sdmax_rlcx_rb_rptr_addr_hi;
37 uint32_t sdmax_rlcx_rb_rptr_addr_lo; 37 uint32_t sdmax_rlcx_rb_rptr_addr_lo;
38 uint32_t sdmax_rlcx_ib_cntl; 38 uint32_t sdmax_rlcx_ib_cntl;
@@ -44,29 +44,29 @@ struct v9_sdma_mqd {
44 uint32_t sdmax_rlcx_skip_cntl; 44 uint32_t sdmax_rlcx_skip_cntl;
45 uint32_t sdmax_rlcx_context_status; 45 uint32_t sdmax_rlcx_context_status;
46 uint32_t sdmax_rlcx_doorbell; 46 uint32_t sdmax_rlcx_doorbell;
47 uint32_t sdmax_rlcx_virtual_addr; 47 uint32_t sdmax_rlcx_status;
48 uint32_t sdmax_rlcx_ape1_cntl;
49 uint32_t sdmax_rlcx_doorbell_log; 48 uint32_t sdmax_rlcx_doorbell_log;
50 uint32_t reserved_22; 49 uint32_t sdmax_rlcx_watermark;
51 uint32_t reserved_23; 50 uint32_t sdmax_rlcx_doorbell_offset;
52 uint32_t reserved_24; 51 uint32_t sdmax_rlcx_csa_addr_lo;
53 uint32_t reserved_25; 52 uint32_t sdmax_rlcx_csa_addr_hi;
54 uint32_t reserved_26; 53 uint32_t sdmax_rlcx_ib_sub_remain;
55 uint32_t reserved_27; 54 uint32_t sdmax_rlcx_preempt;
56 uint32_t reserved_28; 55 uint32_t sdmax_rlcx_dummy_reg;
57 uint32_t reserved_29; 56 uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi;
58 uint32_t reserved_30; 57 uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo;
59 uint32_t reserved_31; 58 uint32_t sdmax_rlcx_rb_aql_cntl;
60 uint32_t reserved_32; 59 uint32_t sdmax_rlcx_minor_ptr_update;
61 uint32_t reserved_33; 60 uint32_t sdmax_rlcx_midcmd_data0;
62 uint32_t reserved_34; 61 uint32_t sdmax_rlcx_midcmd_data1;
63 uint32_t reserved_35; 62 uint32_t sdmax_rlcx_midcmd_data2;
64 uint32_t reserved_36; 63 uint32_t sdmax_rlcx_midcmd_data3;
65 uint32_t reserved_37; 64 uint32_t sdmax_rlcx_midcmd_data4;
66 uint32_t reserved_38; 65 uint32_t sdmax_rlcx_midcmd_data5;
67 uint32_t reserved_39; 66 uint32_t sdmax_rlcx_midcmd_data6;
68 uint32_t reserved_40; 67 uint32_t sdmax_rlcx_midcmd_data7;
69 uint32_t reserved_41; 68 uint32_t sdmax_rlcx_midcmd_data8;
69 uint32_t sdmax_rlcx_midcmd_cntl;
70 uint32_t reserved_42; 70 uint32_t reserved_42;
71 uint32_t reserved_43; 71 uint32_t reserved_43;
72 uint32_t reserved_44; 72 uint32_t reserved_44;