author	Felix Kuehling <Felix.Kuehling@amd.com>	2018-04-10 17:33:16 -0400
committer	Oded Gabbay <oded.gabbay@gmail.com>	2018-04-10 17:33:16 -0400
commit	3e76c2399b55483b1a28499b090f9d6600ab9eff (patch)
tree	f2ee5a6bf3426a7dabbde1524ab80b5eea97cfc6
parent	70a31d16ccac518c701b9fbfacce5460a226bfd9 (diff)
drm/amdkfd: Add GFXv9 CWSR trap handler
Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r--	drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm	1495
-rw-r--r--	drivers/gpu/drm/amd/amdkfd/kfd_device.c	13
2 files changed, 1505 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
new file mode 100644
index 000000000000..033580c997ea
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -0,0 +1,1495 @@
/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#if 0
HW (GFX9) source code for CWSR trap handler
#Version 18 + multiple trap handler

// This performance-optimized version was originally from Seven Xu at SRDC

// Revision #18 --...
/* Rev History
** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1, #50, #51, #52-53 (skip, already fixed by PV), #54-56 (merged), #57-58 (merged, skipped - already fixed by PV)
** #4. SR Memory Layout:
**       1. VGPR-SGPR-HWREG-{LDS}
**       2. tba_hi.bits.26 - reconfigured as the first-wave-in-tg bit, to defer saving LDS for a threadgroup (performance concern)
** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (no need, already matches the swizzle pattern; more investigation)
** #7. Update: 1. Don't barrier if there is no LDS
** #8. Branch: 1. Branch to ver#0, which is very similar to the gc dv version
**             2. Fix SQ issue by s_sleep 2
** #9. Update: 1. Fix the SCC-restore-failed issue; restore wave_status last
**             2. Optimize s_buffer save by bursting 16 SGPRs...
** #10. Update 1. Optimize SGPR restore by bursting 16 SGPRs.
** #11. Update 1. Add 2 more timestamps for the debug version
** #12. Update 1. Add VGPR SR using DWx4; some cases improve and some cases drop performance
** #13. Integ  1. Always use MUBUF for the PV trap shader...
** #14. Update 1. s_buffer_store soft clause...
** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combining; a large performance improvement.
** #16. Update 1. PERF - UNROLL of LDS_DMA saved 2500 cycles in the IP tree
** #17. Update 1. FUNC - LDS_DMA has issues with ATC; replace it with ds_read/buffer_store for the save part [TODO: restore part]
**             2. PERF - Save LDS before saving VGPRs to cover the long LDS-save latency...
** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
**             2. FUNC - Handle non-CWSR traps
*/

var G8SR_WDMEM_HWREG_OFFSET = 0
var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes

// Keep these definitions in sync with the app shader; these two timestamps are part of the app shader. They should be written before any save and after the restore.

var G8SR_DEBUG_TIMESTAMP = 0
var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
var s_g8sr_ts_save_s = s[34:35] // save start
var s_g8sr_ts_sq_save_msg = s[36:37] // the save shader sends the SAVEWAVE msg to SPI
var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI writes the sr address to SQ
var s_g8sr_ts_save_d = s[40:41] // save end
var s_g8sr_ts_restore_s = s[42:43] // restore start
var s_g8sr_ts_restore_d = s[44:45] // restore end

var G8SR_VGPR_SR_IN_DWX4 = 0
var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4 bytes
var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4


/*************************************************************************/
/* control on how to run the shader */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready, or because there is no compute save & restore in an EMU run)
var EMU_RUN_HACK = 0
var EMU_RUN_HACK_RESTORE_NORMAL = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS = 1
var WG_BASE_ADDR_LO = 0x9000a000
var WG_BASE_ADDR_HI = 0x0
var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL = 0x0
var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready, or because there is no compute save & restore in an RTL run)
var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //MTBUF overloads the DFMT field to carry 4 more bits of stride; TC EMU currently asserts on a DFMT of 0 for MUBUF opcodes
var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency

/**************************************************************************/
/* variables */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits

var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800

var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME

var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27

var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000
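
// Illustrative only (not part of the shader): a C sketch of the bit packing
// the two definitions above describe - the 6-bit {RCNT, FIRST_REPLAY} field
// is moved from IB_STS[20:15] into the otherwise-unused ttmp11[31:26] space.

static unsigned int pack_rcnt_first_replay(unsigned int ib_sts,
					   unsigned int ttmp11)
{
	unsigned int field = ib_sts & 0x1F8000u;	/* IB_STS[20:15] */

	field <<= (26 - 15);		/* FIRST_REPLAY bit 15 -> ttmp11 bit 26 */
	return (ttmp11 & ~0xFC000000u) | field;
}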

/* Save */
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE

var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi

var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_tmp = ttmp4
var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo = ttmp6
var s_save_xnack_mask_hi = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_status = ttmp12
var s_save_mem_offset = ttmp14
var s_save_alloc_size = s_save_trapsts //conflict
var s_save_m0 = ttmp15
var s_save_ttmps_lo = s_save_tmp //no conflict
var s_save_ttmps_hi = s_save_trapsts //no conflict
/* Restore */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi

var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp //no conflict

var s_restore_m0 = s_restore_alloc_size //no conflict

var s_restore_mode = ttmp7

var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp14
var s_restore_exec_hi = ttmp15
var s_restore_status = ttmp4
var s_restore_trapsts = ttmp5
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_ttmps_lo = s_restore_tmp //no conflict
var s_restore_ttmps_hi = s_restore_alloc_size //no conflict

/**************************************************************************/
/* trap handler entry points */
/**************************************************************************/
/* Shader Main */

shader main
	asic(GFX9)
	type(CS)


	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
		//FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
		s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
		s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
		s_cbranch_scc0 L_JUMP_TO_RESTORE //no need to recover STATUS here since we are going to RESTORE
		//FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
		s_branch L_SKIP_RESTORE //NOT a restore; actually a SAVE
	else
		s_branch L_SKIP_RESTORE //NOT a restore; might be a regular trap or a save
	end

L_JUMP_TO_RESTORE:
	s_branch L_RESTORE //restore

L_SKIP_RESTORE:

	s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
	s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //clear the SPI priority bits in the saved STATUS
	s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
	s_cbranch_scc1 L_SAVE //this is the operation for save

	// ********* Handle non-CWSR traps *******************
if (!EMU_RUN_HACK)
	// Illegal instruction is a non-maskable exception which blocks context save.
	// Halt the wavefront and return from the trap.
	s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
	s_cbranch_scc1 L_HALT_WAVE

	// If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
	// Instead, halt the wavefront and return from the trap.
	s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
	s_cbranch_scc0 L_FETCH_2ND_TRAP

L_HALT_WAVE:
	// If STATUS.HALT is set then this fault must come from SQC instruction fetch.
	// We cannot prevent further faults so just terminate the wavefront.
	s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
	s_cbranch_scc0 L_NOT_ALREADY_HALTED
	s_endpgm
L_NOT_ALREADY_HALTED:
	s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
	// Rewind the PC to prevent this from occurring. The debugger compensates for this.
	s_sub_u32 ttmp0, ttmp0, 0x8
	s_subb_u32 ttmp1, ttmp1, 0x0
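
// Illustrative only (not part of the shader): s_sub_u32/s_subb_u32 form a
// 64-bit subtraction in two 32-bit halves, with SCC carrying the borrow.
// The rewind above is equivalent to this C sketch:

static void rewind_pc_8(unsigned int *ttmp0, unsigned int *ttmp1)
{
	unsigned long long pc = ((unsigned long long)*ttmp1 << 32) | *ttmp0;

	pc -= 0x8;			/* back the PC up by 8 bytes */
	*ttmp0 = (unsigned int)pc;
	*ttmp1 = (unsigned int)(pc >> 32);
}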
L_FETCH_2ND_TRAP:
	// Preserve and clear scalar XNACK state before issuing scalar reads.
	// Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
	s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
	s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
	s_or_b32 ttmp11, ttmp11, ttmp3

	s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
	s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
	s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
	s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
	s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
	s_waitcnt lgkmcnt(0)
	s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
	s_waitcnt lgkmcnt(0)
	s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
	s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
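
// Illustrative only (not part of the shader): assuming the layout implied by
// the two loads above, the first-level TMA (after the <<8 byte-address
// conversion) points at a 16-byte descriptor holding the second-level handler
// addresses. A zero TBA (tested via the s_and_b64 setting SCC) means no
// second-level handler is installed and control falls to L_NO_NEXT_TRAP.

struct second_level_desc {
	unsigned long long tba;		/* +0x0: second-level handler entry */
	unsigned long long tma;		/* +0x8: second-level TMA pointer   */
};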

L_NO_NEXT_TRAP:
	s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
	s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
	s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
	s_addc_u32 ttmp1, ttmp1, 0
L_EXCP_CASE:
	s_and_b32 ttmp1, ttmp1, 0xFFFF

	// Restore SQ_WAVE_IB_STS.
	s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2

	// Restore SQ_WAVE_STATUS.
	s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status

	s_rfe_b64 [ttmp0, ttmp1]
end
	// ********* End handling of non-CWSR traps *******************

/**************************************************************************/
/* save routine */
/**************************************************************************/

L_SAVE:

if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_save_s
	s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
end

	s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]

	s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
	s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit

	s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
	s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
	s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
	s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
	s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

	s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp

	/* inform SPI of our readiness and wait for SPI's go signal */
	s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
	s_mov_b32 s_save_exec_hi, exec_hi
	s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_sq_save_msg
	s_waitcnt lgkmcnt(0)
end

	if (EMU_RUN_HACK)

	else
		s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
	end

	L_SLEEP:
	s_sleep 0x2 // sleep 1 (64 clk) is not enough for 8 waves per SIMD and would hang SQ: the 7th/8th waves cannot get arbitration to execute instructions while the other waves are stuck in this sleep loop waiting for wrexec != 0

	if (EMU_RUN_HACK)

	else
		s_cbranch_execz L_SLEEP
	end
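
// Illustrative only (not part of the shader): the save handshake above, as
// C-like pseudocode. exec_write/exec_read/send_message/sleep_clocks are
// hypothetical helpers standing in for hardware behavior.

static void savewave_handshake(void)
{
	exec_write(0);			/* clear EXEC so SPI's write is visible */
	send_message(MSG_SAVEWAVE);	/* tell SPI this wave is ready to save  */
	do {
		sleep_clocks(2 * 64);	/* s_sleep 0x2 */
	} while (exec_read() == 0);	/* s_cbranch_execz: wait for SPI        */
}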

if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_spi_wrexec
	s_waitcnt lgkmcnt(0)
end

	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
	//calculate wd_addr using absolute thread id
	v_readlane_b32 s_save_tmp, v9, 0
	s_lshr_b32 s_save_tmp, s_save_tmp, 6
	s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
	s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
	s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
	s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
	else
	end
	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
	s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
	s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
	s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
	else
	end

	// Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
	get_vgpr_size_bytes(s_save_ttmps_lo)
	get_sgpr_size_bytes(s_save_ttmps_hi)
	s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
	s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
	s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
	s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
	ack_sqc_store_workaround()
	s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
	ack_sqc_store_workaround()
	s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
	ack_sqc_store_workaround()
	s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
	ack_sqc_store_workaround()
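
// Illustrative only (not part of the shader): the ttmp slots stored above,
// expressed as a C overlay of the HWREG block (which starts at
// size(VGPR) + size(SGPR) in the save area):

struct ttmp_save_slots {
	unsigned int hwreg[16];		/* 0x00-0x3F: HWREG save slots */
	unsigned int ttmp6_7[2];	/* 0x40: s_store_dwordx2       */
	unsigned int ttmp8_11[4];	/* 0x48: s_store_dwordx4       */
	unsigned int ttmp13;		/* 0x58: s_store_dword         */
	unsigned int ttmp14_15[2];	/* 0x5C: s_store_dwordx2       */
};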

	/* setup the Resource Constants */
	s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
	s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
	s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
	s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
	s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
	s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
	s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
	s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
	s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE

	//FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
	s_mov_b32 s_save_m0, m0 //save M0

	/* global mem offset */
	s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0




	/* save HW registers */
	//////////////////////////////

	L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset)
	get_sgpr_size_bytes(s_save_tmp)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp


	s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
	if (SWIZZLE_EN)
		s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end


	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0

	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
	end

	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
	write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS

	//s_save_trapsts conflicts with s_save_alloc_size
	s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS

	write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
	write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI

	//using s_save_tmp here would introduce a conflict between s_save_tmp and s_save_buf_rsrc2
	s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)



	/* the first wave in the threadgroup */
	s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract the first-wave bit
	s_mov_b32 s_save_exec_hi, 0x0
	s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save the first-wave bit to s_save_exec_hi.bits[26]


	/* save SGPRs */
	// Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
	//////////////////////////////

	// SGPR SR memory offset : size(VGPR)
	get_vgpr_size_bytes(s_save_mem_offset)
	// TODO, change RSRC word to rearrange memory layout for SGPRS

	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

	if (SGPR_SAVE_USE_SQC)
		s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
	else
		s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
	end

	if (SWIZZLE_EN)
		s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end


	// back up s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since the write_16sgpr_to_mem function will change rsrc0
	//s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
	s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
	s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
	s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

	s_mov_b32 m0, 0x0 //SGPR initial index value = 0
	s_nop 0x0 //Manually inserted wait states
	L_SAVE_SGPR_LOOP:
	// SGPRs are allocated in 16-SGPR granularity
	s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]

	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance would come from using s_buffer_store_dwordx4
	s_add_u32 m0, m0, 16 //next sgpr index
	s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
	// restore s_save_buf_rsrc0,1
	//s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
	s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
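
// Illustrative only (not part of the shader): s_movrels reads s[N+M0], so each
// loop pass above copies s[m0..m0+15] into s0-s15 and stores that group. The
// whole loop, restated as a C sketch:

static void save_sgprs(unsigned int *dst, const unsigned int *sgpr,
		       unsigned int num_sgprs)
{
	unsigned int m0, i;

	for (m0 = 0; m0 < num_sgprs; m0 += 16)	/* 16-SGPR granularity */
		for (i = 0; i < 16; i++)	/* write_16sgpr_to_mem */
			dst[m0 + i] = sgpr[m0 + i];
}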




	/* save the first 4 VGPRs, so the LDS save can use them afterwards */
	// each wave allocates at least 4 VGPRs...
	/////////////////////////////////////////////////////////////////////////////////////

	s_mov_b32 s_save_mem_offset, 0
	s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
	s_mov_b32 exec_hi, 0xFFFFFFFF
	s_mov_b32 xnack_mask_lo, 0x0
	s_mov_b32 xnack_mask_hi, 0x0

	if (SWIZZLE_EN)
		s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end


	// VGPRs are allocated in 4-VGPR granularity

if G8SR_VGPR_SR_IN_DWX4
	// the const stride for DWx4 is 4*4 bytes
	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // set const stride to 4*4 bytes

	buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
else
	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
end



	/* save LDS */
	//////////////////////////////

	L_SAVE_LDS:

	// Change EXEC to all threads...
	s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
	s_mov_b32 exec_hi, 0xFFFFFFFF

	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
	s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
	s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE

	s_barrier //LDS is used? wait for the other waves in the same TG
	s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
	s_cbranch_scc0 L_SAVE_LDS_DONE

	// only the first wave does the LDS save;

	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
	s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset)
	get_sgpr_size_bytes(s_save_tmp)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


	if (SWIZZLE_EN)
		s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end

	s_mov_b32 m0, 0x0 //lds_offset initial value = 0


var LDS_DMA_ENABLE = 0
var UNROLL = 0
if UNROLL==0 && LDS_DMA_ENABLE==1
	s_mov_b32 s3, 256*2
	s_nop 0
	s_nop 0
	s_nop 0
	L_SAVE_LDS_LOOP:
	//TODO: it looks like the 2-buffer_store/load clause for s/r will hurt performance???
	if (SAVE_LDS) //SPI always allocates LDS space in 128DW granularity
		buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
		buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
	end

	s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
	s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?

elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache misses
	// store from the highest LDS address to the lowest
	s_mov_b32 s3, 256*2
	s_sub_u32 m0, s_save_alloc_size, s3
	s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
	s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
	s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from the highest addr to the lowest
	s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment; each LDS save block costs 6*4 bytes of instructions
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //accounts for the instructions below: s_add, s_addc and s_setpc
	s_nop 0
	s_nop 0
	s_nop 0 //pad 3 dw to align LDS_DMA with 64 bytes
	s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] is already saved
	s_add_u32 s0, s0, s_save_alloc_size
	s_addc_u32 s1, s1, 0
	s_setpc_b64 s[0:1]


	for var i = 0; i < 128; i++
		// be careful to make this a 64-byte-aligned address, which could improve performance...
		buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
		buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW

		if i!=127
			s_sub_u32 m0, m0, s3 // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA instructions into one cacheline
			s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
		end
	end

else // BUFFER_STORE
	v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
	v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
	v_mul_i32_i24 v2, v3, 8 // tid*8
	v_mov_b32 v3, 256*2
	s_mov_b32 m0, 0x10000
	s_mov_b32 s0, s_save_buf_rsrc3
	s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
	s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT

L_SAVE_LDS_LOOP_VECTOR:
	ds_read_b64 v[0:1], v2 //x = LDS[a], byte address
	s_waitcnt lgkmcnt(0)
	buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
//	s_waitcnt vmcnt(0)
//	v_add_u32 v2, vcc[0:1], v2, v3
	v_add_u32 v2, v2, v3
	v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
	s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

	// restore rsrc3
	s_mov_b32 s_save_buf_rsrc3, s0

end
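
// Illustrative only (not part of the shader): in the BUFFER_STORE path each of
// the 64 lanes reads 8 bytes of LDS at tid*8 and stores them, so one pass of
// the loop covers 512 bytes. Per lane, as a C sketch (lds/dst model the LDS
// window and the save-area LDS image):

static void save_lds_lane(unsigned int tid, const unsigned long long *lds,
			  unsigned long long *dst, unsigned int lds_bytes)
{
	unsigned int addr;

	for (addr = tid * 8; addr < lds_bytes; addr += 64 * 8)
		dst[addr / 8] = lds[addr / 8];	/* ds_read_b64 + dwordx2 store */
}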

L_SAVE_LDS_DONE:


	/* save VGPRs - save the remaining VGPRs */
	//////////////////////////////////////////////////////////////////////////////////////
	L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	// TODO rearrange the RSRC words to use swizzle for VGPR save...

	s_mov_b32 s_save_mem_offset, (0+256*4) // for the remaining VGPRs
	s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
	s_mov_b32 exec_hi, 0xFFFFFFFF

	s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
	s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
	if (SWIZZLE_EN)
		s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end


	// VGPRs are allocated in 4-VGPR granularity

if G8SR_VGPR_SR_IN_DWX4
	// the const stride for DWx4 is 4*4 bytes
	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // set const stride to 4*4 bytes

	s_mov_b32 m0, 4 // skip the first 4 VGPRs
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs

	s_set_gpr_idx_on m0, 0x1 // This will change M0
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because the instruction above changed m0
L_SAVE_VGPR_LOOP:
	v_mov_b32 v0, v0 // v0 = v[0+m0]
	v_mov_b32 v1, v1
	v_mov_b32 v2, v2
	v_mov_b32 v3, v3


	buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	s_add_u32 m0, m0, 4
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
	s_set_gpr_idx_off
L_SAVE_VGPR_LOOP_END:

	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
else
	// VGPR store using dw burst
	s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (the first 4 VGPRs are already saved)
	s_cmp_lt_u32 m0, s_save_alloc_size
	s_cbranch_scc0 L_SAVE_VGPR_END


	s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
	s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since m0 now carries the gpr-idx mode bit when we compare against it later

	L_SAVE_VGPR_LOOP:
	v_mov_b32 v0, v0 //v0 = v[0+m0]
	v_mov_b32 v1, v1 //v1 = v[1+m0]
	v_mov_b32 v2, v2 //v2 = v[2+m0]
	v_mov_b32 v3, v3 //v3 = v[3+m0]

	if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
	else
		buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
		buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
		buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
		buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
	end

	s_add_u32 m0, m0, 4 //next vgpr index
	s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
	s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
	s_set_gpr_idx_off
end

L_SAVE_VGPR_END:






	/* S_PGM_END_SAVED */ //FIXME graphics ONLY
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
		s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
		s_rfe_b64 s_save_pc_lo //Return to the main shader program
	else
	end

// Save Done timestamp
if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_save_d
	// SGPR SR memory offset : size(VGPR)
	get_vgpr_size_bytes(s_save_mem_offset)
	s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
	s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
	// Need to reset rsrc2??
	s_mov_b32 m0, s_save_mem_offset
	s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
end


	s_branch L_END_PGM



/**************************************************************************/
/* restore routine */
/**************************************************************************/

L_RESTORE:
	/* Setup Resource Constants */
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
		//calculate wd_addr using absolute thread id
		v_readlane_b32 s_restore_tmp, v9, 0
		s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
		s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
		s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
		s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
		s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
	else
	end

if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_restore_s
	s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
	// tma_lo/hi are sgprs 110, 111, which will not be used in the 112-SGPR-allocation case...
	s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
	s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //back up the ts to ttmp0/1, since exec will eventually be restored..
end



	s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
	s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
	s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
	s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
	s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
	s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
	s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
	s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
	s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE

	/* global mem offset */
//	s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0

	/* the first wave in the threadgroup */
	s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0 L_RESTORE_VGPR

	/* restore LDS */
	//////////////////////////////
	L_RESTORE_LDS:

	s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
	s_mov_b32 exec_hi, 0xFFFFFFFF

	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
	s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
	s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
	s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_restore_mem_offset)
	get_sgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???


	if (SWIZZLE_EN)
		s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end
	s_mov_b32 m0, 0x0 //lds_offset initial value = 0

	L_RESTORE_LDS_LOOP:
	if (SAVE_LDS)
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
	end
	s_add_u32 m0, m0, 256*2 // 128 DW
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
	s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?


	/* restore VGPRs */
	//////////////////////////////
	L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32 s_restore_mem_offset, 0x0
	s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
	s_mov_b32 exec_hi, 0xFFFFFFFF

	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
	s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
	s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
	if (SWIZZLE_EN)
		s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end

if G8SR_VGPR_SR_IN_DWX4
	get_vgpr_size_bytes(s_restore_mem_offset)
	s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4

	// the const stride for DWx4 is 4*4 bytes
	s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // set const stride to 4*4 bytes

	s_mov_b32 m0, s_restore_alloc_size
	s_set_gpr_idx_on m0, 0x8 // Note: this will change m0

L_RESTORE_VGPR_LOOP:
	buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	s_waitcnt vmcnt(0)
	s_sub_u32 m0, m0, 4
	v_mov_b32 v0, v0 // v[0+m0] = v0
	v_mov_b32 v1, v1
	v_mov_b32 v2, v2
	v_mov_b32 v3, v3
	s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_cmp_eq_u32 m0, 0x8000
	s_cbranch_scc0 L_RESTORE_VGPR_LOOP
	s_set_gpr_idx_off

	s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
	s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes

else
	// VGPR load using dw burst
	s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // the restore starts with v1; v0 will be the last
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32 m0, 4 //VGPR initial index value = 4
	s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
	s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later

	L_RESTORE_VGPR_LOOP:
	if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
	else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
		buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
		buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
		buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
	end
	s_waitcnt vmcnt(0) //ensure data ready
	v_mov_b32 v0, v0 //v[0+m0] = v0
	v_mov_b32 v1, v1
	v_mov_b32 v2, v2
	v_mov_b32 v3, v3
	s_add_u32 m0, m0, 4 //next vgpr index
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
	s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
	s_set_gpr_idx_off
	/* VGPR restore on v0 */
	if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
	else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
		buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
		buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
		buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
	end

end

	/* restore SGPRs */
	//////////////////////////////

	// SGPR SR memory offset : size(VGPR)
	get_vgpr_size_bytes(s_restore_mem_offset)
	get_sgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPRs from S[n] down to S[0], in groups of 16
	// TODO, change RSRC word to rearrange memory layout for SGPRS

	s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
	s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

	if (SGPR_SAVE_USE_SQC)
		s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
	else
		s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
	end
	if (SWIZZLE_EN)
		s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end

	s_mov_b32 m0, s_restore_alloc_size

	L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvements can be made
	s_waitcnt lgkmcnt(0) //ensure data ready

	s_sub_u32 m0, m0, 16 // Restore from S[n] down to S[0]
	s_nop 0 // hazard: SALU M0 => S_MOVREL

	s_movreld_b64 s0, s0 //s[0+m0] = s0
	s_movreld_b64 s2, s2
	s_movreld_b64 s4, s4
	s_movreld_b64 s6, s6
	s_movreld_b64 s8, s8
	s_movreld_b64 s10, s10
	s_movreld_b64 s12, s12
	s_movreld_b64 s14, s14

	s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
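
// Illustrative only (not part of the shader): the restore loop above walks
// downward from S[n] in 16-SGPR groups, pairing read_16sgpr_from_mem with
// s_movreld. As a C sketch (src models the SGPR area of the save buffer):

static void restore_sgprs(unsigned int *sgpr, const unsigned int *src,
			  unsigned int num_sgprs)
{
	unsigned int m0, i;

	for (m0 = num_sgprs; m0 != 0; ) {
		m0 -= 16;
		for (i = 0; i < 16; i++)
			sgpr[m0 + i] = src[m0 + i];
	}
}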

	/* restore HW registers */
	//////////////////////////////
	L_RESTORE_HWREG:


if G8SR_DEBUG_TIMESTAMP
	s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
	s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
end

	// HWREG SR memory offset : size(VGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset)
	get_sgpr_size_bytes(s_restore_tmp)
	s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp


	s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
	if (SWIZZLE_EN)
		s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
	end

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
	read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
	read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE

	s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS

	s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here so as not to affect STATUS

	//for a normal save & restore, the saved PC points to the next instruction to execute, so no adjustment needs to be made; otherwise:
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
		s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
	end
	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // the save is hacked through s_trap but the restore is normal
		s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
	end

	s_mov_b32 m0, s_restore_m0
	s_mov_b32 exec_lo, s_restore_exec_lo
	s_mov_b32 exec_hi, s_restore_exec_hi

	s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
	s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
	s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
	s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
	s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
	//s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite the SAVECTX bit as it may be set through an external SAVECTX during the restore
	s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

	// Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
	get_vgpr_size_bytes(s_restore_ttmps_lo)
	get_sgpr_size_bytes(s_restore_ttmps_hi)
	s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
	s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
	s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
	s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
	s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
	s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
	s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
	s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
	s_waitcnt lgkmcnt(0)

	//reuse s_restore_m0 as a temp register
	s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
	s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
	s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
	s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
	s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
	s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
	s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
	s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
	s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
	s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
	s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp

	s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included; it was changed by the preceding SALU instructions

	s_barrier //barrier to ensure LDS is ready before any other wave in the same TG attempts to access it //FIXME not performance-optimal at this time

if G8SR_DEBUG_TIMESTAMP
	s_memrealtime s_g8sr_ts_restore_d
	s_waitcnt lgkmcnt(0)
end

//	s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
	s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc


/**************************************************************************/
/* the END */
/**************************************************************************/
L_END_PGM:
	s_endpgm

end


/**************************************************************************/
/* the helper functions */
/**************************************************************************/

//Only used to save hwreg to mem
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
	s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
	s_mov_b32 m0, s_mem_offset
	s_buffer_store_dword s, s_rsrc, m0 glc:1
	ack_sqc_store_workaround()
	s_add_u32 s_mem_offset, s_mem_offset, 4
	s_mov_b32 m0, exec_lo
end


// HWREGs are saved before SGPRs, so all HWREGs can be used.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)

	s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
	ack_sqc_store_workaround()
	s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
	ack_sqc_store_workaround()
	s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
	ack_sqc_store_workaround()
	s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
	ack_sqc_store_workaround()
	s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
end


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
	s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
	s_sub_u32 s_mem_offset, s_mem_offset, 4*16
end



function get_lds_size_bytes(s_lds_size_byte)
	// SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity of 128DW
	s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
	s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes // granularity 64DW
end

function get_vgpr_size_bytes(s_vgpr_size_byte)
	s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
	s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPR bytes = (vgpr_size + 1) * 4 regs * 64 lanes * 4 bytes (non-zero value) //FIXME for GFX, zero is possible
end

function get_sgpr_size_bytes(s_sgpr_size_byte)
	s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
	s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
	s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPR bytes = (sgpr_size + 1) * 16 regs * 4 bytes (non-zero value)
end

function get_hwreg_size_bytes
	return 128 //HWREG size is 128 bytes
end
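
// Illustrative only (not part of the shader): the save-area layout the
// helpers above imply - VGPRs at offset 0, then SGPRs, then the 128-byte
// HWREG block (ttmps at +0x40 inside it), then the LDS image. As a C sketch:

struct cwsr_offsets {
	unsigned int sgpr;	/* = VGPR bytes                */
	unsigned int hwreg;	/* = VGPR + SGPR bytes         */
	unsigned int lds;	/* = VGPR + SGPR + HWREG bytes */
};

static struct cwsr_offsets cwsr_wave_offsets(unsigned int vgpr_size,
					     unsigned int sgpr_size)
{
	struct cwsr_offsets o;

	o.sgpr = (vgpr_size + 1) << (2 + 8);	   /* get_vgpr_size_bytes    */
	o.hwreg = o.sgpr + ((sgpr_size + 1) << 6); /* get_sgpr_size_bytes    */
	o.lds = o.hwreg + 128;			   /* get_hwreg_size_bytes() */
	return o;
}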

function ack_sqc_store_workaround
	if ACK_SQC_STORE
		s_waitcnt lgkmcnt(0)
	end
end


#endif

1212static const uint32_t cwsr_trap_gfx9_hex[] = {
1213 0xbf820001, 0xbf820158,
1214 0xb8f8f802, 0x89788678,
1215 0xb8f1f803, 0x866eff71,
1216 0x00000400, 0xbf850034,
1217 0x866eff71, 0x00000800,
1218 0xbf850003, 0x866eff71,
1219 0x00000100, 0xbf840008,
1220 0x866eff78, 0x00002000,
1221 0xbf840001, 0xbf810000,
1222 0x8778ff78, 0x00002000,
1223 0x80ec886c, 0x82ed806d,
1224 0xb8eef807, 0x866fff6e,
1225 0x001f8000, 0x8e6f8b6f,
1226 0x8977ff77, 0xfc000000,
1227 0x87776f77, 0x896eff6e,
1228 0x001f8000, 0xb96ef807,
1229 0xb8f0f812, 0xb8f1f813,
1230 0x8ef08870, 0xc0071bb8,
1231 0x00000000, 0xbf8cc07f,
1232 0xc0071c38, 0x00000008,
1233 0xbf8cc07f, 0x86ee6e6e,
1234 0xbf840001, 0xbe801d6e,
1235 0xb8f1f803, 0x8671ff71,
1236 0x000001ff, 0xbf850002,
1237 0x806c846c, 0x826d806d,
1238 0x866dff6d, 0x0000ffff,
1239 0x8f6e8b77, 0x866eff6e,
1240 0x001f8000, 0xb96ef807,
1241 0x86fe7e7e, 0x86ea6a6a,
1242 0xb978f802, 0xbe801f6c,
1243 0x866dff6d, 0x0000ffff,
1244 0xbef00080, 0xb9700283,
1245 0xb8f02407, 0x8e709c70,
1246 0x876d706d, 0xb8f003c7,
1247 0x8e709b70, 0x876d706d,
1248 0xb8f0f807, 0x8670ff70,
1249 0x00007fff, 0xb970f807,
1250 0xbeee007e, 0xbeef007f,
1251 0xbefe0180, 0xbf900004,
1252 0xbf8e0002, 0xbf88fffe,
1253 0xb8f02a05, 0x80708170,
1254 0x8e708a70, 0xb8f11605,
1255 0x80718171, 0x8e718671,
1256 0x80707170, 0x80707e70,
1257 0x8271807f, 0x8671ff71,
1258 0x0000ffff, 0xc0471cb8,
1259 0x00000040, 0xbf8cc07f,
1260 0xc04b1d38, 0x00000048,
1261 0xbf8cc07f, 0xc0431e78,
1262 0x00000058, 0xbf8cc07f,
1263 0xc0471eb8, 0x0000005c,
1264 0xbf8cc07f, 0xbef4007e,
1265 0x8675ff7f, 0x0000ffff,
1266 0x8775ff75, 0x00040000,
1267 0xbef60080, 0xbef700ff,
1268 0x00807fac, 0x8670ff7f,
1269 0x08000000, 0x8f708370,
1270 0x87777077, 0x8670ff7f,
1271 0x70000000, 0x8f708170,
1272 0x87777077, 0xbefb007c,
1273 0xbefa0080, 0xb8fa2a05,
1274 0x807a817a, 0x8e7a8a7a,
1275 0xb8f01605, 0x80708170,
1276 0x8e708670, 0x807a707a,
1277 0xbef60084, 0xbef600ff,
1278 0x01000000, 0xbefe007c,
1279 0xbefc007a, 0xc0611efa,
1280 0x0000007c, 0xbf8cc07f,
1281 0x807a847a, 0xbefc007e,
1282 0xbefe007c, 0xbefc007a,
1283 0xc0611b3a, 0x0000007c,
1284 0xbf8cc07f, 0x807a847a,
1285 0xbefc007e, 0xbefe007c,
1286 0xbefc007a, 0xc0611b7a,
1287 0x0000007c, 0xbf8cc07f,
1288 0x807a847a, 0xbefc007e,
1289 0xbefe007c, 0xbefc007a,
1290 0xc0611bba, 0x0000007c,
1291 0xbf8cc07f, 0x807a847a,
1292 0xbefc007e, 0xbefe007c,
1293 0xbefc007a, 0xc0611bfa,
1294 0x0000007c, 0xbf8cc07f,
1295 0x807a847a, 0xbefc007e,
1296 0xbefe007c, 0xbefc007a,
1297 0xc0611e3a, 0x0000007c,
1298 0xbf8cc07f, 0x807a847a,
1299 0xbefc007e, 0xb8f1f803,
1300 0xbefe007c, 0xbefc007a,
1301 0xc0611c7a, 0x0000007c,
1302 0xbf8cc07f, 0x807a847a,
1303 0xbefc007e, 0xbefe007c,
1304 0xbefc007a, 0xc0611a3a,
1305 0x0000007c, 0xbf8cc07f,
1306 0x807a847a, 0xbefc007e,
1307 0xbefe007c, 0xbefc007a,
1308 0xc0611a7a, 0x0000007c,
1309 0xbf8cc07f, 0x807a847a,
1310 0xbefc007e, 0xb8fbf801,
1311 0xbefe007c, 0xbefc007a,
1312 0xc0611efa, 0x0000007c,
1313 0xbf8cc07f, 0x807a847a,
1314 0xbefc007e, 0x8670ff7f,
1315 0x04000000, 0xbeef0080,
1316 0x876f6f70, 0xb8fa2a05,
1317 0x807a817a, 0x8e7a8a7a,
1318 0xb8f11605, 0x80718171,
1319 0x8e718471, 0x8e768271,
1320 0xbef600ff, 0x01000000,
1321 0xbef20174, 0x80747a74,
1322 0x82758075, 0xbefc0080,
1323 0xbf800000, 0xbe802b00,
1324 0xbe822b02, 0xbe842b04,
1325 0xbe862b06, 0xbe882b08,
1326 0xbe8a2b0a, 0xbe8c2b0c,
1327 0xbe8e2b0e, 0xc06b003a,
1328 0x00000000, 0xbf8cc07f,
1329 0xc06b013a, 0x00000010,
1330 0xbf8cc07f, 0xc06b023a,
1331 0x00000020, 0xbf8cc07f,
1332 0xc06b033a, 0x00000030,
1333 0xbf8cc07f, 0x8074c074,
1334 0x82758075, 0x807c907c,
1335 0xbf0a717c, 0xbf85ffe7,
1336 0xbef40172, 0xbefa0080,
1337 0xbefe00c1, 0xbeff00c1,
1338 0xbee80080, 0xbee90080,
1339 0xbef600ff, 0x01000000,
1340 0xe0724000, 0x7a1d0000,
1341 0xe0724100, 0x7a1d0100,
1342 0xe0724200, 0x7a1d0200,
1343 0xe0724300, 0x7a1d0300,
1344 0xbefe00c1, 0xbeff00c1,
1345 0xb8f14306, 0x8671c171,
1346 0xbf84002c, 0xbf8a0000,
1347 0x8670ff6f, 0x04000000,
1348 0xbf840028, 0x8e718671,
1349 0x8e718271, 0xbef60071,
1350 0xb8fa2a05, 0x807a817a,
1351 0x8e7a8a7a, 0xb8f01605,
1352 0x80708170, 0x8e708670,
1353 0x807a707a, 0x807aff7a,
1354 0x00000080, 0xbef600ff,
1355 0x01000000, 0xbefc0080,
1356 0xd28c0002, 0x000100c1,
1357 0xd28d0003, 0x000204c1,
1358 0xd1060002, 0x00011103,
1359 0x7e0602ff, 0x00000200,
1360 0xbefc00ff, 0x00010000,
1361 0xbe800077, 0x8677ff77,
1362 0xff7fffff, 0x8777ff77,
1363 0x00058000, 0xd8ec0000,
1364 0x00000002, 0xbf8cc07f,
1365 0xe0765000, 0x7a1d0002,
1366 0x68040702, 0xd0c9006a,
1367 0x0000e302, 0xbf87fff7,
1368 0xbef70000, 0xbefa00ff,
1369 0x00000400, 0xbefe00c1,
1370 0xbeff00c1, 0xb8f12a05,
1371 0x80718171, 0x8e718271,
1372 0x8e768871, 0xbef600ff,
1373 0x01000000, 0xbefc0084,
1374 0xbf0a717c, 0xbf840015,
1375 0xbf11017c, 0x8071ff71,
1376 0x00001000, 0x7e000300,
1377 0x7e020301, 0x7e040302,
1378 0x7e060303, 0xe0724000,
1379 0x7a1d0000, 0xe0724100,
1380 0x7a1d0100, 0xe0724200,
1381 0x7a1d0200, 0xe0724300,
1382 0x7a1d0300, 0x807c847c,
1383 0x807aff7a, 0x00000400,
1384 0xbf0a717c, 0xbf85ffef,
1385 0xbf9c0000, 0xbf8200d9,
1386 0xbef4007e, 0x8675ff7f,
1387 0x0000ffff, 0x8775ff75,
1388 0x00040000, 0xbef60080,
1389 0xbef700ff, 0x00807fac,
1390 0x866eff7f, 0x08000000,
1391 0x8f6e836e, 0x87776e77,
1392 0x866eff7f, 0x70000000,
1393 0x8f6e816e, 0x87776e77,
1394 0x866eff7f, 0x04000000,
1395 0xbf84001e, 0xbefe00c1,
1396 0xbeff00c1, 0xb8ef4306,
1397 0x866fc16f, 0xbf840019,
1398 0x8e6f866f, 0x8e6f826f,
1399 0xbef6006f, 0xb8f82a05,
1400 0x80788178, 0x8e788a78,
1401 0xb8ee1605, 0x806e816e,
1402 0x8e6e866e, 0x80786e78,
1403 0x8078ff78, 0x00000080,
1404 0xbef600ff, 0x01000000,
1405 0xbefc0080, 0xe0510000,
1406 0x781d0000, 0xe0510100,
1407 0x781d0000, 0x807cff7c,
1408 0x00000200, 0x8078ff78,
1409 0x00000200, 0xbf0a6f7c,
1410 0xbf85fff6, 0xbef80080,
1411 0xbefe00c1, 0xbeff00c1,
1412 0xb8ef2a05, 0x806f816f,
1413 0x8e6f826f, 0x8e76886f,
1414 0xbef600ff, 0x01000000,
1415 0xbeee0078, 0x8078ff78,
1416 0x00000400, 0xbefc0084,
1417 0xbf11087c, 0x806fff6f,
1418 0x00008000, 0xe0524000,
1419 0x781d0000, 0xe0524100,
1420 0x781d0100, 0xe0524200,
1421 0x781d0200, 0xe0524300,
1422 0x781d0300, 0xbf8c0f70,
1423 0x7e000300, 0x7e020301,
1424 0x7e040302, 0x7e060303,
1425 0x807c847c, 0x8078ff78,
1426 0x00000400, 0xbf0a6f7c,
1427 0xbf85ffee, 0xbf9c0000,
1428 0xe0524000, 0x6e1d0000,
1429 0xe0524100, 0x6e1d0100,
1430 0xe0524200, 0x6e1d0200,
1431 0xe0524300, 0x6e1d0300,
1432 0xb8f82a05, 0x80788178,
1433 0x8e788a78, 0xb8ee1605,
1434 0x806e816e, 0x8e6e866e,
1435 0x80786e78, 0x80f8c078,
1436 0xb8ef1605, 0x806f816f,
1437 0x8e6f846f, 0x8e76826f,
1438 0xbef600ff, 0x01000000,
1439 0xbefc006f, 0xc031003a,
1440 0x00000078, 0x80f8c078,
1441 0xbf8cc07f, 0x80fc907c,
1442 0xbf800000, 0xbe802d00,
1443 0xbe822d02, 0xbe842d04,
1444 0xbe862d06, 0xbe882d08,
1445 0xbe8a2d0a, 0xbe8c2d0c,
1446 0xbe8e2d0e, 0xbf06807c,
1447 0xbf84fff0, 0xb8f82a05,
1448 0x80788178, 0x8e788a78,
1449 0xb8ee1605, 0x806e816e,
1450 0x8e6e866e, 0x80786e78,
1451 0xbef60084, 0xbef600ff,
1452 0x01000000, 0xc0211bfa,
1453 0x00000078, 0x80788478,
1454 0xc0211b3a, 0x00000078,
1455 0x80788478, 0xc0211b7a,
1456 0x00000078, 0x80788478,
1457 0xc0211eba, 0x00000078,
1458 0x80788478, 0xc0211efa,
1459 0x00000078, 0x80788478,
1460 0xc0211c3a, 0x00000078,
1461 0x80788478, 0xc0211c7a,
1462 0x00000078, 0x80788478,
1463 0xc0211a3a, 0x00000078,
1464 0x80788478, 0xc0211a7a,
1465 0x00000078, 0x80788478,
1466 0xc0211cfa, 0x00000078,
1467 0x80788478, 0xbf8cc07f,
1468 0x866dff6d, 0x0000ffff,
1469 0xbefc006f, 0xbefe007a,
1470 0xbeff007b, 0x866f71ff,
1471 0x000003ff, 0xb96f4803,
1472 0x866f71ff, 0xfffff800,
1473 0x8f6f8b6f, 0xb96fa2c3,
1474 0xb973f801, 0xb8ee2a05,
1475 0x806e816e, 0x8e6e8a6e,
1476 0xb8ef1605, 0x806f816f,
1477 0x8e6f866f, 0x806e6f6e,
1478 0x806e746e, 0x826f8075,
1479 0x866fff6f, 0x0000ffff,
1480 0xc0071cb7, 0x00000040,
1481 0xc00b1d37, 0x00000048,
1482 0xc0031e77, 0x00000058,
1483 0xc0071eb7, 0x0000005c,
1484 0xbf8cc07f, 0x866fff6d,
1485 0xf0000000, 0x8f6f9c6f,
1486 0x8e6f906f, 0xbeee0080,
1487 0x876e6f6e, 0x866fff6d,
1488 0x08000000, 0x8f6f9b6f,
1489 0x8e6f8f6f, 0x876e6f6e,
1490 0x866fff70, 0x00800000,
1491 0x8f6f976f, 0xb96ef807,
1492 0x86fe7e7e, 0x86ea6a6a,
1493 0xb970f802, 0xbf8a0000,
1494 0x95806f6c, 0xbf810000,
1495};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c368ce3e96ff..053f1d0f80b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -30,6 +30,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_pm4_headers_vi.h"
 #include "cwsr_trap_handler_gfx8.asm"
+#include "cwsr_trap_handler_gfx9.asm"
 #include "kfd_iommu.h"
 
 #define MQD_SIZE_ALIGNED 768
@@ -333,10 +334,16 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 static void kfd_cwsr_init(struct kfd_dev *kfd)
 {
 	if (cwsr_enable && kfd->device_info->supports_cwsr) {
-		BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+		if (kfd->device_info->asic_family < CHIP_VEGA10) {
+			BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+			kfd->cwsr_isa = cwsr_trap_gfx8_hex;
+			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
+		} else {
+			BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
+			kfd->cwsr_isa = cwsr_trap_gfx9_hex;
+			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
+		}
 
-		kfd->cwsr_isa = cwsr_trap_gfx8_hex;
-		kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
 		kfd->cwsr_enabled = true;
 	}
 }
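
For context, kfd_cwsr_init() only records which trap handler image to use; a later per-process step copies it into the page the hardware fetches the handler from. A minimal sketch of that consuming step, assuming a cwsr_kaddr pointer to the reserved page (the function name and signature are illustrative, not part of this patch):

/*
 * Illustrative sketch only -- not part of this patch. Shows the shape of
 * the step that consumes kfd->cwsr_isa / kfd->cwsr_isa_size as selected
 * by kfd_cwsr_init() above; cwsr_kaddr is assumed to point at the
 * zero-initialized page reserved for the trap handler.
 */
static void kfd_cwsr_copy_sketch(struct kfd_dev *kfd, void *cwsr_kaddr)
{
	if (!kfd->cwsr_enabled)
		return;
	/* Guaranteed at compile time by the BUILD_BUG_ON checks above. */
	WARN_ON(kfd->cwsr_isa_size > PAGE_SIZE);
	memcpy(cwsr_kaddr, kfd->cwsr_isa, kfd->cwsr_isa_size);
}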