aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Omer Shpigelman <oshpigelman@habana.ai> 2019-03-13 07:36:28 -0400
committer Oded Gabbay <oded.gabbay@gmail.com> 2019-03-13 07:36:28 -0400
commit f650a95b71026f5940804f273f9c36b60634131f (patch)
tree bafe49399cb3acc158e5799f3a9008f2baf1b172
parent 4eb1d1253ddd95e985c57fc99e9de6802dd2d867 (diff)
habanalabs: complete user context cleanup before hard reset
This patch fixes a bug that led to a crash during the hard reset flow.

Before a hard reset is executed, we wait a few seconds for the user context cleanup to complete. If it has not completed, we kill the user process and move on to the reset flow. Killing the user process starts the context cleanup flow, which may take a while due to MMU unmaps. Meanwhile, the driver reset flow changes the PCI DRAM bar location, which can interfere with the MMU that uses the bar. If the context cleanup flow does not finish quickly, a crash may occur due to PCI DRAM bar mislocation during the MMU unmap.

Hence, add a wait between killing the user process and the start of the reset flow.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- drivers/misc/habanalabs/device.c 24
1 files changed, 23 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index de46aa6ed154..93d67983ddba 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -11,6 +11,8 @@
11#include <linux/sched/signal.h> 11#include <linux/sched/signal.h>
12#include <linux/hwmon.h> 12#include <linux/hwmon.h>
13 13
14#define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10)
15
14bool hl_device_disabled_or_in_reset(struct hl_device *hdev) 16bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
15{ 17{
16 if ((hdev->disabled) || (atomic_read(&hdev->in_reset))) 18 if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
@@ -462,9 +464,16 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
462 struct hl_device_reset_work *device_reset_work = 464 struct hl_device_reset_work *device_reset_work =
463 container_of(work, struct hl_device_reset_work, reset_work); 465 container_of(work, struct hl_device_reset_work, reset_work);
464 struct hl_device *hdev = device_reset_work->hdev; 466 struct hl_device *hdev = device_reset_work->hdev;
465 u16 pending_cnt = HL_PENDING_RESET_PER_SEC; 467 u16 pending_total, pending_cnt;
466 struct task_struct *task = NULL; 468 struct task_struct *task = NULL;
467 469
470 if (hdev->pldm)
471 pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
472 else
473 pending_total = HL_PENDING_RESET_PER_SEC;
474
475 pending_cnt = pending_total;
476
468 /* Flush all processes that are inside hl_open */ 477 /* Flush all processes that are inside hl_open */
469 mutex_lock(&hdev->fd_open_cnt_lock); 478 mutex_lock(&hdev->fd_open_cnt_lock);
470 479
@@ -489,6 +498,19 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
489 } 498 }
490 } 499 }
491 500
501 pending_cnt = pending_total;
502
503 while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
504
505 pending_cnt--;
506
507 ssleep(1);
508 }
509
510 if (atomic_read(&hdev->fd_open_cnt))
511 dev_crit(hdev->dev,
512 "Going to hard reset with open user contexts\n");
513
492 mutex_unlock(&hdev->fd_open_cnt_lock); 514 mutex_unlock(&hdev->fd_open_cnt_lock);
493 515
494 hl_device_reset(hdev, true, true); 516 hl_device_reset(hdev, true, true);