author	David Nieto <dmartineznie@nvidia.com>	2017-04-06 18:46:36 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-09-22 18:49:37 -0400
commit	7eabc16b8488e20a6cbfe1a80dc99a0b046750eb (patch)
tree	1adf2b98e4473d575ebce14bb45298eb9efc7e35 /drivers/gpu
parent	90568a2ce58c03f457bdd4fab6675cd327ed13fd (diff)
gpu: nvgpu: defer channel worker initialization
kthread_run can fail if SIGKILL is triggered on an application during
driver load. In this change we defer the channel worker init to the
enqueue path to avoid this condition during driver power-on, which
would otherwise corrupt the driver state and leave subsequent attempts
to load the driver unsuccessful. Because the worker now starts at a
later time, its task structure must be protected with a mutex.

JIRA: EVLR-956
Bug 1816515

Change-Id: I3a159de2d1f03e70b2a3969730a927532ede2d6e
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1462490
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Vladislav Buzov <vbuzov@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1460689
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
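For readers unfamiliar with the pattern, the new __nvgpu_channel_worker_start()
in the diff below is a double-checked lazy start: a cheap unlocked check on the
hot enqueue path, then a second check under start_lock before actually creating
the thread. What follows is a minimal, self-contained sketch of the same idea in
plain C with pthreads; worker_state, worker_start and worker_fn are illustrative
names, not nvgpu API, and a production version would make the unlocked flag an
atomic (the driver relies on the synchronization inside nvgpu_thread_is_running(),
per the comment in the diff).

	#include <pthread.h>
	#include <stdbool.h>

	/* Illustrative stand-in for the driver's channel_worker state. */
	struct worker_state {
		pthread_t task;
		bool running;              /* would be atomic in real code */
		pthread_mutex_t start_lock;
	};

	static void *worker_fn(void *arg)
	{
		/* ... poll for and process enqueued work items ... */
		return NULL;
	}

	/* Deferred start: called from the enqueue path, not from probe. */
	static int worker_start(struct worker_state *w)
	{
		int err = 0;

		/* Fast path: avoid taking the mutex on every enqueue. */
		if (w->running)
			return 0;

		pthread_mutex_lock(&w->start_lock);
		/*
		 * Re-check under the lock: another caller may have started
		 * the worker between our unlocked check and acquiring the
		 * mutex.
		 */
		if (!w->running) {
			err = pthread_create(&w->task, NULL, worker_fn, w);
			if (err == 0)
				w->running = true;
		}
		pthread_mutex_unlock(&w->start_lock);
		return err;
	}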
Diffstat (limited to 'drivers/gpu')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.c	56
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gk20a.h	1
2 files changed, 51 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index ea69d7cb..a0494e31 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1878,34 +1878,70 @@ static int gk20a_channel_poll_worker(void *arg)
 	return 0;
 }
 
+static int __nvgpu_channel_worker_start(struct gk20a *g)
+{
+	char thread_name[64];
+	int err = 0;
+
+	if (nvgpu_thread_is_running(&g->channel_worker.poll_task))
+		return err;
+
+	nvgpu_mutex_acquire(&g->channel_worker.start_lock);
+
+	/*
+	 * We don't want to grab a mutex on every channel update so we check
+	 * again if the worker has been initialized before creating a new thread
+	 */
+
+	/*
+	 * Mutexes have implicit barriers, so there is no risk of a thread
+	 * having a stale copy of the poll_task variable as the call to
+	 * thread_is_running is volatile
+	 */
+
+	if (nvgpu_thread_is_running(&g->channel_worker.poll_task)) {
+		nvgpu_mutex_release(&g->channel_worker.start_lock);
+		return err;
+	}
+
+	snprintf(thread_name, sizeof(thread_name),
+		 "nvgpu_channel_poll_%s", g->name);
+
+	err = nvgpu_thread_create(&g->channel_worker.poll_task, g,
+			gk20a_channel_poll_worker, thread_name);
+
+	nvgpu_mutex_release(&g->channel_worker.start_lock);
+	return err;
+}
 /**
  * Initialize the channel worker's metadata and start the background thread.
  */
 int nvgpu_channel_worker_init(struct gk20a *g)
 {
 	int err;
-	char thread_name[64];
 
 	nvgpu_atomic_set(&g->channel_worker.put, 0);
 	nvgpu_cond_init(&g->channel_worker.wq);
 	nvgpu_init_list_node(&g->channel_worker.items);
 	nvgpu_spinlock_init(&g->channel_worker.items_lock);
-	snprintf(thread_name, sizeof(thread_name),
-		 "nvgpu_channel_poll_%s", g->name);
+	err = nvgpu_mutex_init(&g->channel_worker.start_lock);
+	if (err)
+		goto error_check;
 
-	err = nvgpu_thread_create(&g->channel_worker.poll_task, g,
-			gk20a_channel_poll_worker, thread_name);
+	err = __nvgpu_channel_worker_start(g);
+error_check:
 	if (err) {
 		nvgpu_err(g, "failed to start channel poller thread");
 		return err;
 	}
-
 	return 0;
 }
 
 void nvgpu_channel_worker_deinit(struct gk20a *g)
 {
+	nvgpu_mutex_acquire(&g->channel_worker.start_lock);
 	nvgpu_thread_stop(&g->channel_worker.poll_task);
+	nvgpu_mutex_release(&g->channel_worker.start_lock);
 }
 
 /**
@@ -1924,6 +1960,14 @@ static void gk20a_channel_worker_enqueue(struct channel_gk20a *ch)
 	gk20a_dbg_fn("");
 
 	/*
+	 * Warn if worker thread cannot run
+	 */
+	if (WARN_ON(__nvgpu_channel_worker_start(g))) {
+		nvgpu_warn(g, "channel worker cannot run!");
+		return;
+	}
+
+	/*
 	 * Ref released when this item gets processed. The caller should hold
 	 * one ref already, so normally shouldn't fail, but the channel could
 	 * end up being freed between the time the caller got its reference and
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 0cd77d1e..35d58ef1 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -1215,6 +1215,7 @@ struct gk20a {
 		struct nvgpu_cond wq;
 		struct nvgpu_list_node items;
 		struct nvgpu_spinlock items_lock;
+		struct nvgpu_mutex start_lock;
 	} channel_worker;
 
 	struct gk20a_scale_profile *scale_profile;
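
Note also the deinit hunk above: stopping the worker takes the same start_lock,
so a teardown racing with a deferred start from the enqueue path cannot observe
a half-created thread. Continuing the hypothetical pthread sketch from earlier,
the matching stop path would look roughly like this:

	/* Teardown: serialize with worker_start() via the same lock. */
	static void worker_stop(struct worker_state *w)
	{
		pthread_mutex_lock(&w->start_lock);
		if (w->running) {
			/*
			 * Real code first signals the thread to leave its
			 * polling loop before waiting for it, as the
			 * driver's nvgpu_thread_stop() does.
			 */
			pthread_join(w->task, NULL);
			w->running = false;
		}
		pthread_mutex_unlock(&w->start_lock);
	}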