author     Monk Liu <Monk.Liu@amd.com>            2018-01-15 00:44:30 -0500
committer  Alex Deucher <alexander.deucher@amd.com>  2018-03-14 15:38:27 -0400
commit     48527e5296edc7b952fb2c1c40fd8c388cc935ed
tree       ccc065f565a23fb6c28113a8abde6033457c2b41 /drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
parent     421a2a30c121660c4628e4494dcca1fceab8a4be
drm/amdgpu: refactor mailbox to fix TDR handshake bugs (v2)
This patch refactors the mailbox implementation. All of the changes below
are needed together to fix the mailbox handshake issues exposed by heavy
TDR testing.

1) Refactor all mailbox functions to access the mailbox control register
   byte by byte. This avoids touching unrelated bits when writing the
   TRN/RCV halves of MAILBOX_CONTROL, which prevents spurious interrupts
   from being sent to the hypervisor side and fixes a couple of the
   handshake bugs.

2) Re-implement the trans_msg function: invalidate the message before
   transmitting, to make sure the ACK bit is in a cleared state;
   otherwise the ACK may already be asserted before the message goes out,
   leading to fake ACK polling. (The hypervisor side has tricks to work
   around the ACK bit being corrupted by VF FLR, with the side effect
   that the guest-side ACK bit can be asserted wrongly.) Also clear the
   TRANS_MSG words after the message has been transferred.

3) Rework mailbox_flr_work: when invoked, it now takes the mutex lock
   first, to keep GPU recovery from participating too early while the
   hypervisor side is still doing the VF FLR. (The hypervisor sends
   FLR_NOTIFY to the guest before doing the VF FLR and sends FLR_COMPLETE
   after the VF FLR is done; the FLR_NOTIFY triggers an interrupt in the
   guest, which is what invokes mailbox_flr_work.) This avoids the
   mailbox trans msg being wiped out by the VF FLR.

4) The mailbox_rcv_irq IRQ routine should only peek at the message and
   schedule mailbox_flr_work, instead of ACKing the hypervisor itself,
   because the FLR_NOTIFY message sent from the hypervisor side doesn't
   need the VF's ACK: a VF ACK makes the hypervisor clear its
   trans_valid/msg, and that causes a handshake bug when trans_valid/msg
   is cleared not by a correct VF ACK but by a wrong one such as this
   FLR_NOTIFY ACK. This fixes the handshake bug where the guest sometimes
   could never receive the READY_TO_ACCESS_GPU message from the
   hypervisor.

5) Separate the polling time limits accordingly:
   - polling for ACK costs no more than 500 msec
   - polling for MSG costs no more than 12000 msec
   - polling for FLR finish costs no more than 500 msec

6) We still need to put adev into in_gpu_reset mode after receiving
   FLR_NOTIFY from the host side; this prevents an innocent app from
   wrongly succeeding in opening the amdgpu DRI device. FLR_NOTIFY is
   received because an IDLE hang was detected on the hypervisor side,
   which indicates that the GPU is already dead in this VF.

v2:
- use a macro for the offset of the mailbox control register
- don't test for the NOTIFY_CMPL event in rcv_msg since it won't receive
  that message anymore

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Pixel Ding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
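Note: the byte-wide accessors (RREG8/WREG8 on AI_MAIBOX_CONTROL_*_OFFSET_BYTE)
and the three poll budgets used in the diff below come from macros in
mxgpu_ai.h, which this page's diffstat excludes. A sketch of what those
definitions plausibly look like, with the timeout values taken from point 5
of the commit message (the macro names match the code in the diff; the exact
header contents should be verified against mxgpu_ai.h):

/* Sketch only: inferred from the commit message and the diff below,
 * not part of this diff. Poll budgets, in msec.
 */
#define AI_MAILBOX_POLL_ACK_TIMEDOUT    500
#define AI_MAILBOX_POLL_MSG_TIMEDOUT    12000
#define AI_MAILBOX_POLL_FLR_TIMEDOUT    500

/* MAILBOX_CONTROL is a single 32-bit register; the TRN bits (TRN_MSG_VALID,
 * TRN_MSG_ACK) live in its low byte and the RCV bits (RCV_MSG_VALID,
 * RCV_MSG_ACK) in the next byte, so byte-wide accesses can write one side
 * without read-modify-writing the other, which is the point of change 1).
 */
#define AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE \
        (SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4)
#define AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE \
        (SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL) * 4 + 1)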
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 196
1 file changed, 103 insertions(+), 93 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 271452d3999a..8b47484e169a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -33,56 +33,34 @@
 
 static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
 {
-        u32 reg;
-        int timeout = AI_MAILBOX_TIMEDOUT;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_ACK, 1);
-        WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                       mmBIF_BX_PF0_MAILBOX_CONTROL), reg);
-
-        /*Wait for RCV_MSG_VALID to be 0*/
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        while (reg & mask) {
-                if (timeout <= 0) {
-                        pr_err("RCV_MSG_VALID is not cleared\n");
-                        break;
-                }
-                mdelay(1);
-                timeout -=1;
-
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-        }
+        WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
 }
 
 static void xgpu_ai_mailbox_set_valid(struct amdgpu_device *adev, bool val)
 {
-        u32 reg;
-
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_CONTROL,
-                            TRN_MSG_VALID, val ? 1 : 0);
-        WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_CONTROL),
-                      reg);
+        WREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE, val ? 1 : 0);
 }
 
+/*
+ * This peek_msg may *only* be called from the IRQ routine, because in the
+ * IRQ routine the RCV_MSG_VALID field of BIF_BX_PF0_MAILBOX_CONTROL has
+ * already been set to 1 by the host.
+ *
+ * If called outside the IRQ routine, peek_msg is not guaranteed to return
+ * the correct value, since it does not check that RCV_MSG_VALID is set by
+ * the host before reading RCV_DW0.
+ */
+static enum idh_event xgpu_ai_mailbox_peek_msg(struct amdgpu_device *adev)
+{
+        return RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+                                mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
+}
+
+
 static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
                                    enum idh_event event)
 {
         u32 reg;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, RCV_MSG_VALID);
-
-        if (event != IDH_FLR_NOTIFICATION_CMPL) {
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-                if (!(reg & mask))
-                        return -ENOENT;
-        }
 
         reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
                                 mmBIF_BX_PF0_MAILBOX_MSGBUF_RCV_DW0));
@@ -94,54 +72,67 @@ static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
         return 0;
 }
 
+static uint8_t xgpu_ai_peek_ack(struct amdgpu_device *adev) {
+        return RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE) & 2;
+}
+
 static int xgpu_ai_poll_ack(struct amdgpu_device *adev)
 {
-        int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
-        u32 mask = REG_FIELD_MASK(BIF_BX_PF0_MAILBOX_CONTROL, TRN_MSG_ACK);
-        u32 reg;
+        int timeout = AI_MAILBOX_POLL_ACK_TIMEDOUT;
+        u8 reg;
+
+        do {
+                reg = RREG8(AI_MAIBOX_CONTROL_TRN_OFFSET_BYTE);
+                if (reg & 2)
+                        return 0;
 
-        reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                             mmBIF_BX_PF0_MAILBOX_CONTROL));
-        while (!(reg & mask)) {
-                if (timeout <= 0) {
-                        pr_err("Doesn't get ack from pf.\n");
-                        r = -ETIME;
-                        break;
-                }
                 mdelay(5);
                 timeout -= 5;
+        } while (timeout > 1);
 
-                reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                                     mmBIF_BX_PF0_MAILBOX_CONTROL));
-        }
+        pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", AI_MAILBOX_POLL_ACK_TIMEDOUT);
 
-        return r;
+        return -ETIME;
 }
 
 static int xgpu_ai_poll_msg(struct amdgpu_device *adev, enum idh_event event)
 {
-        int r = 0, timeout = AI_MAILBOX_TIMEDOUT;
+        int r, timeout = AI_MAILBOX_POLL_MSG_TIMEDOUT;
 
-        r = xgpu_ai_mailbox_rcv_msg(adev, event);
-        while (r) {
-                if (timeout <= 0) {
-                        pr_err("Doesn't get msg:%d from pf.\n", event);
-                        r = -ETIME;
-                        break;
-                }
-                mdelay(5);
-                timeout -= 5;
-
+        do {
                 r = xgpu_ai_mailbox_rcv_msg(adev, event);
-        }
+                if (!r)
+                        return 0;
 
-        return r;
+                msleep(10);
+                timeout -= 10;
+        } while (timeout > 1);
+
+        pr_err("Doesn't get msg:%d from pf, error=%d\n", event, r);
+
+        return -ETIME;
 }
 
 static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device *adev,
               enum idh_request req, u32 data1, u32 data2, u32 data3) {
         u32 reg;
         int r;
+        uint8_t trn;
+
+        /* IMPORTANT:
+         * Clear TRN_MSG_VALID first: with TRN_MSG_VALID cleared, the host's
+         * RCV_MSG_VALID is cleared and hw then automatically clears the
+         * host's RCV_MSG_ACK, which in turn clears the VF's TRN_MSG_ACK;
+         * otherwise the xgpu_ai_poll_ack() below would return immediately
+         * on a stale ACK.
+         */
+        do {
+                xgpu_ai_mailbox_set_valid(adev, false);
+                trn = xgpu_ai_peek_ack(adev);
+                if (trn) {
+                        pr_err("trn=%x ACK should not assert! wait again!\n", trn);
+                        msleep(1);
+                }
+        } while (trn);
 
         reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
                             mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
@@ -245,15 +236,36 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 {
         struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
         struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+        int locked;
 
-        /* wait until RCV_MSG become 3 */
-        if (xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
-                pr_err("failed to recieve FLR_CMPL\n");
-                return;
-        }
-
-        /* Trigger recovery due to world switch failure */
-        amdgpu_device_gpu_recover(adev, NULL, false);
+        /* Block amdgpu_gpu_recover till msg FLR COMPLETE is received,
+         * otherwise the mailbox msg will be ruined/reset by
+         * the VF FLR.
+         *
+         * We can unlock lock_reset to allow "amdgpu_job_timedout"
+         * to run gpu_recover() after FLR_NOTIFICATION_CMPL is received,
+         * which means the host side has finished this VF's FLR.
+         */
+        locked = mutex_trylock(&adev->lock_reset);
+        if (locked)
+                adev->in_gpu_reset = 1;
+
+        do {
+                if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
+                        goto flr_done;
+
+                msleep(10);
+                timeout -= 10;
+        } while (timeout > 1);
+
+flr_done:
+        if (locked)
+                mutex_unlock(&adev->lock_reset);
+
+        /* Trigger recovery for world switch failure if no TDR */
+        if (amdgpu_lockup_timeout == 0)
+                amdgpu_device_gpu_recover(adev, NULL, true);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -274,24 +286,22 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
                                    struct amdgpu_irq_src *source,
                                    struct amdgpu_iv_entry *entry)
 {
-        int r;
-
-        /* trigger gpu-reset by hypervisor only if TDR disbaled */
-        if (!amdgpu_gpu_recovery) {
-                /* see what event we get */
-                r = xgpu_ai_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
-
-                /* sometimes the interrupt is delayed to inject to VM, so under such case
-                 * the IDH_FLR_NOTIFICATION is overwritten by VF FLR from GIM side, thus
-                 * above recieve message could be failed, we should schedule the flr_work
-                 * anyway
-                 */
-                if (r) {
-                        DRM_ERROR("FLR_NOTIFICATION is missed\n");
-                        xgpu_ai_mailbox_send_ack(adev);
-                }
-
-                schedule_work(&adev->virt.flr_work);
-        }
+        enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
+
+        switch (event) {
+        case IDH_FLR_NOTIFICATION:
+                if (amdgpu_sriov_runtime(adev))
+                        schedule_work(&adev->virt.flr_work);
+                break;
+        /* READY_TO_ACCESS_GPU is fetched by kernel polling; the IRQ can
+         * ignore it here since the polling thread will handle it. Other
+         * msgs such as FLR complete are not handled here either.
+         */
+        case IDH_CLR_MSG_BUF:
+        case IDH_FLR_NOTIFICATION_CMPL:
+        case IDH_READY_TO_ACCESS_GPU:
+        default:
+                break;
+        }
 
         return 0;
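
For context on change 4): the READY_TO_ACCESS_GPU reply is consumed by a
polling caller in this same file rather than by the IRQ routine above. A
rough sketch of that request path (the function exists in mxgpu_ai.c but is
untouched by this diff; the body below is an approximation, not the
committed code):

static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
                                        enum idh_request req)
{
        int r;

        /* trans_msg() itself invalidates the old message, writes DW0..DW3,
         * sets TRN_MSG_VALID and polls for TRN_MSG_ACK (change 2 above).
         */
        xgpu_ai_mailbox_trans_msg(adev, req, 0, 0, 0);

        /* Only access requests expect a READY_TO_ACCESS_GPU reply; this
         * polling is why xgpu_ai_mailbox_rcv_irq() can ignore that event.
         */
        if (req == IDH_REQ_GPU_INIT_ACCESS ||
            req == IDH_REQ_GPU_FINI_ACCESS ||
            req == IDH_REQ_GPU_RESET_ACCESS) {
                r = xgpu_ai_poll_msg(adev, IDH_READY_TO_ACCESS_GPU);
                if (r) {
                        pr_err("Doesn't get READY_TO_ACCESS_GPU from pf, give up\n");
                        return r;
                }
        }

        return 0;
}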