Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c')
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 200 ++++++++++++++++++++++----------------------
 1 file changed, 105 insertions(+), 95 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 271452d3999a..8fb933c62cf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -33,56 +33,34 @@
 
 static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
 {
-	u32 reg;
-	int timeout = AI_MAILBOX_TIMEDOUT;
-	u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_ACK, 1);
-	WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-				       mmBIF_BX_PF0_MAILBOX_CONTROL), reg);
-
-	/*Wait for RCV_MSG_VALID to be 0*/
-	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	while (reg & mask) {
-		if (timeout <= 0) {
-			pr_err("RCV_MSG_VALID is not cleared\n");
-			break;
-		}
-		mdelay(1);
-		timeout -=1;
-
-		reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	}
+	WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
 }
 
 static void xgpu_ai_mailbox_set_valid(struct amdgpu_device *adev, bool val)
 {
-	u32 reg;
+	WREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE, val ? 1 : 0);
+}
 
-	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL,
-			    TRN_MSG_VALID, val ? 1 : 0);
-	WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL),
-		      reg);
+/*
+ * This peek_msg may *only* be called from the IRQ routine, because in the
+ * IRQ routine the RCV_MSG_VALID field of BIF_BX_PF0_MAILBOX_CONTROL has
+ * already been set to 1 by the host.
+ *
+ * If called outside the IRQ routine, peek_msg is not guaranteed to return
+ * the correct value, since RCV_DW0 is only meaningful while RCV_MSG_VALID
+ * is set by the host.
+ */
+static enum idh_event xgpu_ai_mailbox_peek_msg(struct amdgpu_device *adev)
+{
+	return RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+				mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
 }
 
+
 static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
 				   enum idh_event event)
 {
 	u32 reg;
-	u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-	if (event != IDH_FLR_NOTIFICATION_CMPL) {
-		reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-				     mmBIF_BX_PF0_MAILBOX_CONTROL));
-		if (!(reg & mask))
-			return -ENOENT;
-	}
 
 	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
 			     mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
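Note: the rewritten helpers above touch a single byte of
mmBIF_BX_PF0_MAILBOX_CONTROL instead of read-modify-writing the whole 32-bit
register, so the VF can flip its own mailbox bits without racing the
host-owned ones, and the old busy-wait for RCV_MSG_VALID to clear goes away.
A minimal sketch of what the byte-offset macros are assumed to expand to
(the authoritative definitions live in mxgpu_ai.h; treat these as
illustrative):

	/* Sketch: byte-granular aliases of the 32-bit mailbox control
	 * register. Byte 0 is assumed to carry the VF's TRN_MSG_VALID and
	 * TRN_MSG_ACK bits, byte 1 the RCV_MSG_VALID and RCV_MSG_ACK bits,
	 * so "WREG8(..., 2)" and "& 2" address bit 1 of the respective byte.
	 */
	#define AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE \
		(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4)
	#define AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE \
		(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4 + 1)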
@@ -94,54 +72,67 @@ static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
 	return 0;
 }
 
+static uint8_t xgpu_ai_peek_ack(struct amdgpu_device *adev) {
+	return RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE) & 2;
+}
+
 static int xgpu_ai_poll_ack(struct amdgpu_device *adev)
 {
-	int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
-	u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, TRN_MSG_ACK);
-	u32 reg;
+	int timeout = AI_MAILBOX_POLL_ACK_TIMEDOUT;
+	u8 reg;
+
+	do {
+		reg = RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE);
+		if (reg & 2)
+			return 0;
 
-	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	while (!(reg & mask)) {
-		if (timeout <= 0) {
-			pr_err("Doesn't get ack from pf.\n");
-			r = -ETIME;
-			break;
-		}
 		mdelay(5);
 		timeout -= 5;
+	} while (timeout > 1);
 
-		reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-					     mmBIF_BX_PF0_MAILBOX_CONTROL));
-	}
+	pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", AI_MAILBOX_POLL_ACK_TIMEDOUT);
 
-	return r;
+	return -ETIME;
 }
 
 static int xgpu_ai_poll_msg(struct amdgpu_device *adev, enum idh_event event)
 {
-	int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
-
-	r = xgpu_ai_mailbox_rcv_msg(adev, event);
-	while (r) {
-		if (timeout <= 0) {
-			pr_err("Doesn't get msg:%d from pf.\n", event);
-			r = -ETIME;
-			break;
-		}
-		mdelay(5);
-		timeout -= 5;
+	int r, timeout = AI_MAILBOX_POLL_MSG_TIMEDOUT;
 
+	do {
 		r = xgpu_ai_mailbox_rcv_msg(adev, event);
-	}
+		if (!r)
+			return 0;
 
-	return r;
+		msleep(10);
+		timeout -= 10;
+	} while (timeout > 1);
+
+	pr_err("Doesn't get msg:%d from pf, error=%d\n", event, r);
+
+	return -ETIME;
 }
 
 static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device *adev,
 	enum idh_request req, u32 data1, u32 data2, u32 data3) {
 	u32 reg;
 	int r;
+	uint8_t trn;
+
+	/* IMPORTANT:
+	 * Clear TRN_MSG_VALID to make the host clear its RCV_MSG_VALID; with
+	 * that cleared, the hardware automatically clears the host's
+	 * RCV_MSG_ACK, which in turn clears the VF's TRN_MSG_ACK. Otherwise
+	 * the xgpu_ai_poll_ack() below would return immediately on stale
+	 * state.
+	 */
+	do {
+		xgpu_ai_mailbox_set_valid(adev, false);
+		trn = xgpu_ai_peek_ack(adev);
+		if (trn) {
+			pr_err("trn=%x ACK should not assert! wait again!\n", trn);
+			msleep(1);
+		}
+	} while (trn);
 
 	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
 			     mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
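Note: taken together, the three functions above implement the VF side of the
mailbox handshake. trans_msg() first drops TRN_MSG_VALID and spins until the
stale TRN_MSG_ACK clears, then writes TRN_DW0..3, raises TRN_MSG_VALID and
polls for the host's ack internally; poll_msg() is the matching receive loop
for a reply in RCV_DW0. A hedged sketch of a caller (the real caller in this
file is xgpu_ai_send_access_requests(); the request value shown is one
example from enum idh_request):

	static int example_request_gpu_access(struct amdgpu_device *adev)
	{
		/* writes the request, raises TRN_MSG_VALID and polls for
		 * TRN_MSG_ACK before returning */
		xgpu_ai_mailbox_trans_msg(adev, IDH_REQ_GPU_INIT_ACCESS, 0, 0, 0);

		/* then poll RCV_DW0 until the host answers */
		return xgpu_ai_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
	}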
@@ -245,15 +236,36 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 {
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
-
-	/* wait until RCV_MSG become 3 */
-	if (xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
-		pr_err("failed to recieve FLR_CMPL\n");
-		return;
-	}
-
-	/* Trigger recovery due to world switch failure */
-	amdgpu_device_gpu_recover(adev, NULL, false);
+	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+	int locked;
+
+	/* Block amdgpu_gpu_recover till msg FLR COMPLETE is received,
+	 * otherwise the mailbox msg will be ruined/reset by
+	 * the VF FLR.
+	 *
+	 * We can unlock lock_reset to allow "amdgpu_job_timedout"
+	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL is received,
+	 * which means the host side has finished this VF's FLR.
+	 */
+	locked = mutex_trylock(&adev->lock_reset);
+	if (locked)
+		adev->in_gpu_reset = 1;
+
+	do {
+		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
+			goto flr_done;
+
+		msleep(10);
+		timeout -= 10;
+	} while (timeout > 1);
+
+flr_done:
+	if (locked)
+		mutex_unlock(&adev->lock_reset);
+
+	/* Trigger recovery for world switch failure if no TDR */
+	if (amdgpu_lockup_timeout == 0)
+		amdgpu_device_gpu_recover(adev, NULL, true);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
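Note: the reworked flr_work pins adev->lock_reset for the whole wait so that
a TDR-triggered reset cannot run while the host is still performing the VF
FLR, which would trash the mailbox state; mutex_trylock() is used because a
recovery that already owns the lock may legitimately be in flight, and
flr_work must not deadlock against it. A sketch of the assumed counterpart
on the recovery side (simplified; the actual logic lives in
amdgpu_device_gpu_recover()):

	int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...)
	{
		mutex_lock(&adev->lock_reset);	/* blocks while flr_work holds it */
		adev->in_gpu_reset = 1;
		/* ... suspend rings, reset the ASIC, resume ... */
		adev->in_gpu_reset = 0;
		mutex_unlock(&adev->lock_reset);
		return 0;
	}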
@@ -274,24 +286,22 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 				   struct amdgpu_irq_src *source,
 				   struct amdgpu_iv_entry *entry)
 {
-	int r;
+	enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
 
-	/* trigger gpu-reset by hypervisor only if TDR disbaled */
-	if (!amdgpu_gpu_recovery) {
-		/* see what event we get */
-		r = xgpu_ai_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
-
-		/* sometimes the interrupt is delayed to inject to VM, so under such case
-		 * the IDH_FLR_NOTIFICATION is overwritten by VF FLR from GIM side, thus
-		 * above recieve message could be failed, we should schedule the flr_work
-		 * anyway
-		 */
-		if (r) {
-			DRM_ERROR("FLR_NOTIFICATION is missed\n");
-			xgpu_ai_mailbox_send_ack(adev);
-		}
-
-		schedule_work(&adev->virt.flr_work);
+	switch (event) {
+	case IDH_FLR_NOTIFICATION:
+		if (amdgpu_sriov_runtime(adev))
+			schedule_work(&adev->virt.flr_work);
+		break;
+	/* READY_TO_ACCESS_GPU is fetched by kernel polling, so the IRQ
+	 * handler can ignore it here; the polling thread will handle it.
+	 * Other messages, such as FLR complete, are likewise not handled
+	 * here.
+	 */
+	case IDH_CLR_MSG_BUF:
+	case IDH_FLR_NOTIFICATION_CMPL:
+	case IDH_READY_TO_ACCESS_GPU:
+	default:
+		break;
 	}
 
 	return 0;
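Note: the switch above works because peek_msg() returns RCV_DW0 verbatim and
the message IDs are plain integers. For reference, the event codes are
assumed to be defined along these lines (a sketch; the authoritative enum is
in mxgpu_ai.h):

	enum idh_event {
		IDH_CLR_MSG_BUF = 0,
		IDH_READY_TO_ACCESS_GPU,
		IDH_FLR_NOTIFICATION,
		IDH_FLR_NOTIFICATION_CMPL,
		IDH_EVENT_MAX
	};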
@@ -319,11 +329,11 @@ int xgpu_ai_mailbox_add_irq_id(struct amdgpu_device *adev)
 {
 	int r;
 
-	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_BIF, 135, &adev->virt.rcv_irq);
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, 135, &adev->virt.rcv_irq);
 	if (r)
 		return r;
 
-	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_BIF, 138, &adev->virt.ack_irq);
+	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, 138, &adev->virt.ack_irq);
 	if (r) {
 		amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 		return r;
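Note: the hunk above is the mechanical rename from AMDGPU_IH_CLIENTID_BIF to
SOC15_IH_CLIENTID_BIF; src id 135 feeds adev->virt.rcv_irq and 138 feeds
adev->virt.ack_irq. A sketch of how such a source is assumed to be wired to
the handlers seen earlier in this file (hedged; the funcs tables live
elsewhere in mxgpu_ai.c):

	static const struct amdgpu_irq_src_funcs xgpu_ai_mailbox_rcv_irq_funcs = {
		.set = xgpu_ai_set_mailbox_rcv_irq,	/* gates src id 135 */
		.process = xgpu_ai_mailbox_rcv_irq,	/* dispatches on peek_msg() */
	};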