From 469308becaff326da02fcf791e803e812e1cf9f8 Mon Sep 17 00:00:00 2001
From: David Nieto <dmartineznie@nvidia.com>
Date: Mon, 13 Feb 2017 11:22:59 -0800
Subject: gpu: nvgpu: fix arbiter teardown on PCI

The driver does not properly tear down the clock arbiter when the PCI
driver is unloaded. This change makes sure that the arbiter workqueues
are drained before the driver is torn down.

bug 200277762
JIRA: EVLR-1023

Change-Id: If98fd00e27949ba1569dd26e2af02b75897231a7
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1320147
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/nvgpu/clk/clk_arb.c      | 67 +++++++++++++++++++++++++-----------
 drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c | 18 +++++++---
 drivers/gpu/nvgpu/pci.c              |  4 +++
 3 files changed, 63 insertions(+), 26 deletions(-)

(limited to 'drivers')

diff --git a/drivers/gpu/nvgpu/clk/clk_arb.c b/drivers/gpu/nvgpu/clk/clk_arb.c
index 44b442d8..30447d3e 100644
--- a/drivers/gpu/nvgpu/clk/clk_arb.c
+++ b/drivers/gpu/nvgpu/clk/clk_arb.c
@@ -403,7 +403,8 @@ void nvgpu_clk_arb_schedule_alarm(struct gk20a *g, u32 alarm)
 	struct nvgpu_clk_arb *arb = g->clk_arb;
 
 	nvgpu_clk_arb_set_global_alarm(g, alarm);
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 }
 
 static void nvgpu_clk_arb_clear_global_alarm(struct gk20a *g, u32 alarm)
@@ -455,8 +456,30 @@ static void nvgpu_clk_arb_set_global_alarm(struct gk20a *g, u32 alarm)
 
 void nvgpu_clk_arb_cleanup_arbiter(struct gk20a *g)
 {
+	struct nvgpu_clk_arb *arb = g->clk_arb;
+	int index;
+
+	if (arb) {
+		cancel_work_sync(&arb->vf_table_fn_work);
+		destroy_workqueue(arb->vf_table_work_queue);
+		arb->vf_table_work_queue = NULL;
+
+		cancel_work_sync(&arb->update_fn_work);
+		destroy_workqueue(arb->update_work_queue);
+		arb->update_work_queue = NULL;
+
+		kfree(arb->gpc2clk_f_points);
+		kfree(arb->mclk_f_points);
+
+		for (index = 0; index < 2; index++) {
+			kfree(arb->vf_table_pool[index].gpc2clk_points);
+			kfree(arb->vf_table_pool[index].mclk_points);
+		}
+	}
+
 	nvgpu_mutex_destroy(&g->clk_arb->pstate_lock);
 	kfree(g->clk_arb);
+	g->clk_arb = NULL;
 }
 
 static int nvgpu_clk_arb_install_fd(struct gk20a *g,
@@ -575,9 +598,11 @@ static void nvgpu_clk_arb_free_session(struct kref *refcount)
 
 	gk20a_dbg_fn("");
 
-	nvgpu_spinlock_acquire(&arb->sessions_lock);
-	list_del_rcu(&session->link);
-	nvgpu_spinlock_release(&arb->sessions_lock);
+	if (arb) {
+		nvgpu_spinlock_acquire(&arb->sessions_lock);
+		list_del_rcu(&session->link);
+		nvgpu_spinlock_release(&arb->sessions_lock);
+	}
 
 	head = llist_del_all(&session->targets);
 	llist_for_each_entry_safe(dev, tmp, head, node) {
@@ -596,8 +621,8 @@ void nvgpu_clk_arb_release_session(struct gk20a *g,
 
 	session->zombie = true;
 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb && arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 }
 
 int nvgpu_clk_arb_install_event_fd(struct gk20a *g,
@@ -964,8 +989,8 @@ exit_vf_table:
 	if (status < 0)
 		nvgpu_clk_arb_set_global_alarm(g,
 			EVENT(ALARM_VF_TABLE_UPDATE_FAILED));
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 
 	return status;
 }
@@ -973,8 +998,8 @@ exit_vf_table:
 void nvgpu_clk_arb_schedule_vf_table_update(struct gk20a *g)
 {
 	struct nvgpu_clk_arb *arb = g->clk_arb;
-
-	queue_work(arb->vf_table_work_queue, &arb->vf_table_fn_work);
+	if (arb->vf_table_work_queue)
+		queue_work(arb->vf_table_work_queue, &arb->vf_table_fn_work);
 }
 
 static void nvgpu_clk_arb_run_vf_table_cb(struct work_struct *work)
@@ -991,8 +1016,9 @@ static void nvgpu_clk_arb_run_vf_table_cb(struct work_struct *work)
 			"failed to cache VF table");
 		nvgpu_clk_arb_set_global_alarm(g,
 			EVENT(ALARM_VF_TABLE_UPDATE_FAILED));
-
-		queue_work(arb->update_work_queue, &arb->update_fn_work);
+		if (arb->update_work_queue)
+			queue_work(arb->update_work_queue,
+				&arb->update_fn_work);
 
 		return;
 	}
@@ -1490,8 +1516,8 @@ int nvgpu_clk_arb_commit_request_fd(struct gk20a *g,
 	}
 	kref_get(&dev->refcount);
 	llist_add(&dev->node, &session->targets);
-
-	queue_work(arb->update_work_queue, &arb->update_fn_work);
+	if (arb->update_work_queue)
+		queue_work(arb->update_work_queue, &arb->update_fn_work);
 
 fdput_fd:
 	fdput(fd);
@@ -1568,15 +1594,12 @@ static int nvgpu_clk_arb_release_completion_dev(struct inode *inode,
 {
 	struct nvgpu_clk_dev *dev = filp->private_data;
 	struct nvgpu_clk_session *session = dev->session;
-	struct nvgpu_clk_arb *arb;
 
-	arb = session->g->clk_arb;
 
 	gk20a_dbg_fn("");
 
 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);
 	kref_put(&dev->refcount, nvgpu_clk_arb_free_fd);
-
 	return 0;
 }
 
@@ -1591,15 +1614,17 @@ static int nvgpu_clk_arb_release_event_dev(struct inode *inode,
 
 	gk20a_dbg_fn("");
 
-	nvgpu_spinlock_acquire(&arb->users_lock);
-	list_del_rcu(&dev->link);
-	nvgpu_spinlock_release(&arb->users_lock);
+	if (arb) {
+		nvgpu_spinlock_acquire(&arb->users_lock);
+		list_del_rcu(&dev->link);
+		nvgpu_spinlock_release(&arb->users_lock);
+	}
 
 	synchronize_rcu();
 	kref_put(&session->refcount, nvgpu_clk_arb_free_session);
 
 	nvgpu_clk_notification_queue_free(&dev->queue);
-	kfree(dev);
+	kref_put(&dev->refcount, nvgpu_clk_arb_free_fd);
 
 	return 0;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 5a4a2251..753623fa 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -39,6 +39,7 @@
 
 struct gk20a_ctrl_priv {
 	struct device *dev;
+	struct gk20a *g;
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	struct nvgpu_clk_session *clk_session;
 #endif
@@ -58,35 +59,42 @@ int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 	priv = kzalloc(sizeof(struct gk20a_ctrl_priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
-
 	filp->private_data = priv;
 	priv->dev = g->dev;
+	/*
+	 * We don't close the arbiter fds after driver teardown, in order to
+	 * support GPU_LOST events, so we store g here instead of dereferencing
+	 * the dev structure on teardown.
+	 */
+	priv->g = g;
 
 	if (!g->gr.sw_ready) {
 		err = gk20a_busy(g->dev);
 		if (err)
 			return err;
-
 		gk20a_idle(g->dev);
 	}
 
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	err = nvgpu_clk_arb_init_session(g, &priv->clk_session);
+	if (err)
+		return err;
 #endif
+
 	return err;
 }
-
 int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
+	struct gk20a *g = priv->g;
 
 	gk20a_dbg_fn("");
 
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	if (priv->clk_session)
-		nvgpu_clk_arb_release_session(gk20a_from_dev(priv->dev),
-				priv->clk_session);
+		nvgpu_clk_arb_release_session(g, priv->clk_session);
 #endif
+
 	kfree(priv);
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c
index 69e16267..7ef626c2 100644
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -460,6 +460,10 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
 	gk20a_wait_for_idle(&pdev->dev);
 	gk20a_dbg(gpu_dbg_shutdown, "Driver idle.\n");
 
+#ifdef CONFIG_ARCH_TEGRA_18x_SOC
+	nvgpu_clk_arb_cleanup_arbiter(g);
+#endif
+
 	gk20a_user_deinit(g->dev, &nvgpu_pci_class);
 	gk20a_dbg(gpu_dbg_shutdown, "User de-init done.\b");
 
-- 
cgit v1.2.2