From 74fe1caa2b56aab24c17ad4dd2524128fc237894 Mon Sep 17 00:00:00 2001
From: David Nieto <dmartineznie@nvidia.com>
Date: Mon, 13 Feb 2017 11:22:59 -0800
Subject: gpu: nvgpu: Add refcounting to driver fds

The main driver structure is not refcounted properly,
so when the driver unload, file desciptors associated to the
driver are kept open with dangling references to the main object.

This change adds referencing to the gk20a structure.

bug 200277762
JIRA: EVLR-1023

Change-Id: Id892e9e1677a344789e99bf649088c076f0bf8de
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1317420
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/common/nvgpu_common.c     |  2 +
 drivers/gpu/nvgpu/gk20a/as_gk20a.c          | 11 ++++-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c     | 35 ++++++++++++----
 drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c        | 18 +++++---
 drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c | 18 +++++---
 drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c     | 10 ++++-
 drivers/gpu/nvgpu/gk20a/gk20a.c             | 64 ++++++++++++++++++++++++++++-
 drivers/gpu/nvgpu/gk20a/gk20a.h             |  4 ++
 drivers/gpu/nvgpu/gk20a/sched_gk20a.c       | 23 ++++++++---
 drivers/gpu/nvgpu/gk20a/tsg_gk20a.c         | 38 ++++++++++++-----
 drivers/gpu/nvgpu/pci.c                     |  3 +-
 drivers/gpu/nvgpu/vgpu/vgpu.c               |  5 ++-
 12 files changed, 190 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/nvgpu_common.c b/drivers/gpu/nvgpu/common/nvgpu_common.c
index 16640aa6..f1109684 100644
--- a/drivers/gpu/nvgpu/common/nvgpu_common.c
+++ b/drivers/gpu/nvgpu/common/nvgpu_common.c
@@ -172,6 +172,8 @@ int nvgpu_probe(struct gk20a *g,
 
 	g->remove_support = gk20a_remove_support;
 
+	kref_init(&g->refcount);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index e4bd8b73..a3c8c1ec 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * GK20A Address Spaces
  *
- * Copyright (c) 2011-2015, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -46,6 +46,9 @@ int gk20a_as_alloc_share(struct gk20a_as *as,
 	int err = 0;
 
 	gk20a_dbg_fn("");
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
 
 	*out = NULL;
 	as_share = kzalloc(sizeof(*as_share), GFP_KERNEL);
@@ -85,15 +88,19 @@ int gk20a_as_release_share(struct gk20a_as_share *as_share)
 	gk20a_dbg_fn("");
 
 	err = gk20a_busy(g->dev);
+
 	if (err)
-		return err;
+		goto release_fail;
 
 	err = gk20a_vm_release_share(as_share);
 
 	gk20a_idle(g->dev);
 
+release_fail:
 	release_as_share_id(as_share->as, as_share->id);
+	gk20a_put(g);
 	kfree(as_share);
+
 	return err;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 73cc18d2..c09539ff 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1234,7 +1234,7 @@ int gk20a_channel_release(struct inode *inode, struct file *filp)
 	err = gk20a_busy(g->dev);
 	if (err) {
 		gk20a_err(dev_from_gk20a(g), "failed to release a channel!");
-		return err;
+		goto channel_release;
 	}
 
 	trace_gk20a_channel_release(dev_name(g->dev));
@@ -1242,6 +1242,8 @@ int gk20a_channel_release(struct inode *inode, struct file *filp)
 	gk20a_channel_close(ch);
 	gk20a_idle(g->dev);
 
+channel_release:
+	gk20a_put(g);
 	kfree(filp->private_data);
 	filp->private_data = NULL;
 	return 0;
@@ -1382,11 +1384,17 @@ static int __gk20a_channel_open(struct gk20a *g, struct file *filp, s32 runlist_
 
 	gk20a_dbg_fn("");
 
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
+
 	trace_gk20a_channel_open(dev_name(g->dev));
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
+	if (!priv) {
+		err = -ENOMEM;
+		goto free_ref;
+	}
 
 	err = gk20a_busy(g->dev);
 	if (err) {
@@ -1414,6 +1422,8 @@ static int __gk20a_channel_open(struct gk20a *g, struct file *filp, s32 runlist_
 
 fail_busy:
 	kfree(priv);
+free_ref:
+	gk20a_put(g);
 	return err;
 }
 
@@ -3509,6 +3519,7 @@ static int gk20a_event_id_release(struct inode *inode, struct file *filp)
 	}
 
 	nvgpu_mutex_destroy(&event_id_data->lock);
+	gk20a_put(g);
 	kfree(event_id_data);
 	filp->private_data = NULL;
 
@@ -3573,20 +3584,28 @@ static int gk20a_channel_event_id_enable(struct channel_gk20a *ch,
 					 int event_id,
 					 int *fd)
 {
+	struct gk20a *g;
 	int err = 0;
 	int local_fd;
 	struct file *file;
 	char *name;
 	struct gk20a_event_id_data *event_id_data;
 
+	g = gk20a_get(ch->g);
+	if (!g)
+		return -ENODEV;
+
 	err = gk20a_channel_get_event_data_from_id(ch,
 				event_id, &event_id_data);
-	if (err == 0) /* We already have event enabled */
-		return -EINVAL;
+	if (err == 0) {
+		/* We already have event enabled */
+		err = -EINVAL;
+		goto free_ref;
+	}
 
 	err = get_unused_fd_flags(O_RDWR);
 	if (err < 0)
-		return err;
+		goto free_ref;
 	local_fd = err;
 
 	name = kasprintf(GFP_KERNEL, "nvgpu-event%d-fd%d",
@@ -3605,7 +3624,7 @@ static int gk20a_channel_event_id_enable(struct channel_gk20a *ch,
 		err = -ENOMEM;
 		goto clean_up_file;
 	}
-	event_id_data->g = ch->g;
+	event_id_data->g = g;
 	event_id_data->id = ch->hw_chid;
 	event_id_data->is_tsg = false;
 	event_id_data->event_id = event_id;
@@ -3633,6 +3652,8 @@ clean_up_file:
 	fput(file);
 clean_up:
 	put_unused_fd(local_fd);
+free_ref:
+	gk20a_put(g);
 	return err;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 753623fa..e6626c4a 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -55,10 +55,15 @@ int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 
 	g = container_of(inode->i_cdev,
 			 struct gk20a, ctrl.cdev);
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
 
 	priv = kzalloc(sizeof(struct gk20a_ctrl_priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
+	if (!priv) {
+		err = -ENOMEM;
+		goto free_ref;
+	}
 	filp->private_data = priv;
 	priv->dev = g->dev;
 	/*
@@ -71,16 +76,16 @@ int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 	if (!g->gr.sw_ready) {
 		err = gk20a_busy(g->dev);
 		if (err)
-			return err;
+			goto free_ref;
 		gk20a_idle(g->dev);
 	}
 
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	err = nvgpu_clk_arb_init_session(g, &priv->clk_session);
-	if (err)
-		return err;
 #endif
-
+free_ref:
+	if (err)
+		gk20a_put(g);
 	return err;
 }
 int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
@@ -95,6 +100,7 @@ int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
 		nvgpu_clk_arb_release_session(g, priv->clk_session);
 #endif
 
+	gk20a_put(g);
 	kfree(priv);
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
index e33477f6..4ad2abd6 100644
--- a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
@@ -276,14 +276,20 @@ int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
 	const int vmid = 0;
 
 	g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
+
 	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (!capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto free_ref;
+	}
 
 	err = gk20a_busy(g->dev);
 	if (err)
-		return err;
+		goto free_ref;
 
 	trace = g->ctxsw_trace;
 	if (!trace) {
@@ -325,7 +331,9 @@ done:
 
 idle:
 	gk20a_idle(g->dev);
-
+free_ref:
+	if (err)
+		gk20a_put(g);
 	return err;
 }
 
@@ -346,7 +354,7 @@ int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
 		dev->g->ops.fecs_trace.free_user_buffer(dev->g);
 		dev->hdr = NULL;
 	}
-
+	gk20a_put(g);
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
index 821951ad..7eb742ed 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -101,13 +101,17 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	else
 		g = container_of(inode->i_cdev,
 				 struct gk20a, prof.cdev);
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
+
 	dev = g->dev;
 
 	gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "dbg session: %s", dev_name(dev));
 
 	err  = alloc_session(&dbg_session);
 	if (err)
-		return err;
+		goto free_ref;
 
 	filp->private_data = dbg_session;
 	dbg_session->dev   = dev;
@@ -133,6 +137,8 @@ err_destroy_lock:
 	nvgpu_mutex_destroy(&dbg_session->ch_list_lock);
 err_free_session:
 	kfree(dbg_session);
+free_ref:
+	gk20a_put(g);
 	return err;
 }
 
@@ -494,6 +500,8 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 	nvgpu_mutex_destroy(&dbg_s->ioctl_lock);
 
 	kfree(dbg_s);
+	gk20a_put(g);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 30c0f2fb..1d6fb0e9 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1712,7 +1712,7 @@ static int __exit gk20a_remove(struct platform_device *pdev)
 		platform->remove(dev);
 
 	set_gk20a(pdev, NULL);
-	kfree(g);
+	gk20a_put(g);
 
 	gk20a_dbg_fn("removed");
 
@@ -2274,6 +2274,68 @@ int gk20a_read_ptimer(struct gk20a *g, u64 *value)
 	return -EBUSY;
 }
 
+/*
+ * Free the gk20a struct.
+ */
+static void gk20a_free_cb(struct kref *refcount)
+{
+	struct gk20a *g = container_of(refcount,
+		struct gk20a, refcount);
+
+	gk20a_dbg(gpu_dbg_shutdown, "Freeing GK20A struct!");
+	kfree(g);
+}
+
+/**
+ * gk20a_get() - Increment ref count on driver
+ *
+ * @g The driver to increment
+ * This will fail if the driver is in the process of being released. In that
+ * case it will return NULL. Otherwise a pointer to the driver passed in will
+ * be returned.
+ */
+struct gk20a * __must_check gk20a_get(struct gk20a *g)
+{
+	int success;
+
+	/*
+	 * Handle the possibility we are still freeing the gk20a struct while
+	 * gk20a_get() is called. Unlikely but plausible race condition. Ideally
+	 * the code will never be in such a situation that this race is
+	 * possible.
+	 */
+	success = kref_get_unless_zero(&g->refcount);
+
+	gk20a_dbg(gpu_dbg_shutdown, "GET: refs currently %d %s",
+		atomic_read(&g->refcount.refcount), success ? "" : "(FAILED)");
+
+	return success ? g : NULL;
+}
+
+/**
+ * gk20a_put() - Decrement ref count on driver
+ *
+ * @g - The driver to decrement
+ *
+ * Decrement the driver ref-count. If neccesary also free the underlying driver
+ * memory
+ */
+void gk20a_put(struct gk20a *g)
+{
+	/*
+	 * Note - this is racy, two instances of this could run before the
+	 * actual kref_put(0 runs, you could see something like:
+	 *
+	 *  ... PUT: refs currently 2
+	 *  ... PUT: refs currently 2
+	 *  ... Freeing GK20A struct!
+	 */
+	gk20a_dbg(gpu_dbg_shutdown, "PUT: refs currently %d",
+		atomic_read(&g->refcount.refcount));
+
+	kref_put(&g->refcount, gk20a_free_cb);
+}
+
 MODULE_LICENSE("GPL v2");
 module_init(gk20a_init);
 module_exit(gk20a_exit);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index a4f0799a..3ba05e84 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -857,6 +857,7 @@ struct gk20a {
 	atomic_t nonstall_ops;
 	struct work_struct nonstall_fn_work;
 	struct workqueue_struct *nonstall_work_queue;
+	struct kref refcount;
 
 	struct resource *reg_mem;
 	void __iomem *regs;
@@ -1468,6 +1469,9 @@ static inline void gk20a_channel_trace_sched_param(
 
 void nvgpu_wait_for_deferred_interrupts(struct gk20a *g);
 
+struct gk20a * __must_check gk20a_get(struct gk20a *g);
+void gk20a_put(struct gk20a *g);
+
 #ifdef CONFIG_DEBUG_FS
 int gk20a_railgating_debugfs_init(struct device *dev);
 #endif
diff --git a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
index 6fdc2774..a73e7993 100644
--- a/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/sched_gk20a.c
@@ -377,21 +377,28 @@ int gk20a_sched_dev_open(struct inode *inode, struct file *filp)
 {
 	struct gk20a *g = container_of(inode->i_cdev,
 				struct gk20a, sched.cdev);
-	struct gk20a_sched_ctrl *sched = &g->sched_ctrl;
-	int err;
+	struct gk20a_sched_ctrl *sched;
+	int err = 0;
+
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
+	sched = &g->sched_ctrl;
 
 	gk20a_dbg(gpu_dbg_fn | gpu_dbg_sched, "g=%p", g);
 
 	if (!sched->sw_ready) {
 		err = gk20a_busy(g->dev);
 		if (err)
-			return err;
+			goto free_ref;
 
 		gk20a_idle(g->dev);
 	}
 
-	if (!nvgpu_mutex_tryacquire(&sched->busy_lock))
-		return -EBUSY;
+	if (!nvgpu_mutex_tryacquire(&sched->busy_lock)) {
+		err = -EBUSY;
+		goto free_ref;
+	}
 
 	memcpy(sched->recent_tsg_bitmap, sched->active_tsg_bitmap,
 			sched->bitmap_size);
@@ -400,7 +407,10 @@ int gk20a_sched_dev_open(struct inode *inode, struct file *filp)
 	filp->private_data = sched;
 	gk20a_dbg(gpu_dbg_sched, "filp=%p sched=%p", filp, sched);
 
-	return 0;
+free_ref:
+	if (err)
+		gk20a_put(g);
+	return err;
 }
 
 long gk20a_sched_dev_ioctl(struct file *filp, unsigned int cmd,
@@ -511,6 +521,7 @@ int gk20a_sched_dev_release(struct inode *inode, struct file *filp)
 	nvgpu_mutex_release(&sched->control_lock);
 
 	nvgpu_mutex_release(&sched->busy_lock);
+	gk20a_put(g);
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index e1424f2b..270fed85 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -259,15 +259,23 @@ static int gk20a_tsg_event_id_enable(struct tsg_gk20a *tsg,
 	struct file *file;
 	char *name;
 	struct gk20a_event_id_data *event_id_data;
+	struct gk20a *g;
+
+	g = gk20a_get(tsg->g);
+	if (!g)
+		return -ENODEV;
 
 	err = gk20a_tsg_get_event_data_from_id(tsg,
 				event_id, &event_id_data);
-	if (err == 0) /* We already have event enabled */
-		return -EINVAL;
+	if (err == 0) {
+		/* We already have event enabled */
+		err = -EINVAL;
+		goto free_ref;
+	}
 
 	err = get_unused_fd_flags(O_RDWR);
 	if (err < 0)
-		return err;
+		goto free_ref;
 	local_fd = err;
 
 	name = kasprintf(GFP_KERNEL, "nvgpu-event%d-fd%d",
@@ -286,7 +294,7 @@ static int gk20a_tsg_event_id_enable(struct tsg_gk20a *tsg,
 		err = -ENOMEM;
 		goto clean_up_file;
 	}
-	event_id_data->g = tsg->g;
+	event_id_data->g = g;
 	event_id_data->id = tsg->tsgid;
 	event_id_data->is_tsg = true;
 	event_id_data->event_id = event_id;
@@ -315,6 +323,8 @@ clean_up_file:
 	fput(file);
 clean_up:
 	put_unused_fd(local_fd);
+free_ref:
+	gk20a_put(g);
 	return err;
 }
 
@@ -410,18 +420,25 @@ int gk20a_tsg_open(struct gk20a *g, struct file *filp)
 	struct device *dev;
 	int err;
 
+	g = gk20a_get(g);
+	if (!g)
+		return -ENODEV;
+
 	dev  = dev_from_gk20a(g);
 
 	gk20a_dbg(gpu_dbg_fn, "tsg: %s", dev_name(dev));
 
 	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
+	if (!priv) {
+		err = -ENOMEM;
+		goto free_ref;
+	}
 
 	tsg = acquire_unused_tsg(&g->fifo);
 	if (!tsg) {
 		kfree(priv);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto free_ref;
 	}
 
 	tsg->g = g;
@@ -458,6 +475,8 @@ int gk20a_tsg_open(struct gk20a *g, struct file *filp)
 
 clean_up:
 	kref_put(&tsg->refcount, gk20a_tsg_release);
+free_ref:
+	gk20a_put(g);
 	return err;
 }
 
@@ -505,16 +524,13 @@ void gk20a_tsg_release(struct kref *ref)
 	tsg->runlist_id = ~0;
 
 	gk20a_dbg(gpu_dbg_fn, "tsg released %d\n", tsg->tsgid);
+	gk20a_put(g);
 }
 
 int gk20a_tsg_dev_release(struct inode *inode, struct file *filp)
 {
 	struct tsg_private *priv = filp->private_data;
 	struct tsg_gk20a *tsg = priv->tsg;
-	struct gk20a *g = priv->g;
-
-	if (g->driver_is_dying)
-		return -ENODEV;
 
 	kref_put(&tsg->refcount, gk20a_tsg_release);
 	kfree(priv);
diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c
index 7ef626c2..0ed621ce 100644
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -481,7 +481,8 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
 
 	enable_irq(g->irq_stall);
 
-	kfree(g);
+	gk20a_get_platform(&pdev->dev)->g = NULL;
+	gk20a_put(g);
 }
 
 static struct pci_driver nvgpu_pci_driver = {
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index d5eb05ac..a97c179f 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -638,6 +638,8 @@ int vgpu_probe(struct platform_device *pdev)
 	vgpu_create_sysfs(dev);
 	gk20a_init_gr(gk20a);
 
+	kref_init(&gk20a->refcount);
+
 	return 0;
 }
 
@@ -656,6 +658,7 @@ int vgpu_remove(struct platform_device *pdev)
 	gk20a_user_deinit(dev, &nvgpu_class);
 	vgpu_remove_sysfs(dev);
 	gk20a_get_platform(dev)->g = NULL;
-	kfree(g);
+	gk20a_put(g);
+
 	return 0;
 }
-- 
cgit v1.2.2