author	David Nieto <dmartineznie@nvidia.com>	2017-04-06 18:46:36 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-09-22 18:49:37 -0400
commit	7eabc16b8488e20a6cbfe1a80dc99a0b046750eb (patch)
tree	1adf2b98e4473d575ebce14bb45298eb9efc7e35 /drivers/gpu
parent	90568a2ce58c03f457bdd4fab6675cd327ed13fd (diff)
gpu: nvgpu: defer channel worker initialization
kthread_run can fail if SIGKILL is triggered on an application during
driver load. In this change we defer the channel worker init to the
enqueue path to avoid this condition during driver power-on, which
would otherwise corrupt the driver state and leave subsequent attempts
to load the driver unsuccessful. Because the worker now starts at a
later time, its task structure must be protected with a mutex.

JIRA: EVLR-956
Bug 1816515

Change-Id: I3a159de2d1f03e70b2a3969730a927532ede2d6e
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1462490
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Vladislav Buzov <vbuzov@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1460689
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
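For readers unfamiliar with the pattern, the new __nvgpu_channel_worker_start()
in the diff below is a double-checked lazy start: a cheap unlocked check on the
hot enqueue path, then a second check under start_lock before actually creating
the thread. What follows is a minimal, self-contained sketch of the same idea in
plain C with pthreads; worker_state, worker_start and worker_fn are illustrative
names, not nvgpu API, and a production version would make the unlocked flag an
atomic (the driver relies on the synchronization inside nvgpu_thread_is_running(),
per the comment in the diff).

	#include <pthread.h>
	#include <stdbool.h>

	/* Illustrative stand-in for the driver's channel_worker state. */
	struct worker_state {
		pthread_t task;
		bool running;              /* would be atomic in real code */
		pthread_mutex_t start_lock;
	};

	static void *worker_fn(void *arg)
	{
		/* ... poll for and process enqueued work items ... */
		return NULL;
	}

	/* Deferred start: called from the enqueue path, not from probe. */
	static int worker_start(struct worker_state *w)
	{
		int err = 0;

		/* Fast path: avoid taking the mutex on every enqueue. */
		if (w->running)
			return 0;

		pthread_mutex_lock(&w->start_lock);
		/*
		 * Re-check under the lock: another caller may have started
		 * the worker between our unlocked check and acquiring the
		 * mutex.
		 */
		if (!w->running) {
			err = pthread_create(&w->task, NULL, worker_fn, w);
			if (err == 0)
				w->running = true;
		}
		pthread_mutex_unlock(&w->start_lock);
		return err;
	}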
Diffstat (limited to 'drivers/gpu')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.c	56
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gk20a.h	1
2 files changed, 51 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index ea69d7cb..a0494e31 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1878,34 +1878,70 @@ static int gk20a_channel_poll_worker(void *arg)
 	return 0;
 }
 
+static int __nvgpu_channel_worker_start(struct gk20a *g)
+{
+	char thread_name[64];
+	int err = 0;
+
+	if (nvgpu_thread_is_running(&g->channel_worker.poll_task))
+		return err;
+
+	nvgpu_mutex_acquire(&g->channel_worker.start_lock);
+
+	/*
+	 * We don't want to grab a mutex on every channel update so we check
+	 * again if the worker has been initialized before creating a new thread
+	 */
+
+	/*
+	 * Mutexes have implicit barriers, so there is no risk of a thread
+	 * having a stale copy of the poll_task variable as the call to
+	 * thread_is_running is volatile
+	 */
+
+	if (nvgpu_thread_is_running(&g->channel_worker.poll_task)) {
+		nvgpu_mutex_release(&g->channel_worker.start_lock);
+		return err;
+	}
+
+	snprintf(thread_name, sizeof(thread_name),
+		 "nvgpu_channel_poll_%s", g->name);
+
+	err = nvgpu_thread_create(&g->channel_worker.poll_task, g,
+			gk20a_channel_poll_worker, thread_name);
+
+	nvgpu_mutex_release(&g->channel_worker.start_lock);
+	return err;
+}
 /**
  * Initialize the channel worker's metadata and start the background thread.
  */
 int nvgpu_channel_worker_init(struct gk20a *g)
 {
 	int err;
-	char thread_name[64];
 
 	nvgpu_atomic_set(&g->channel_worker.put, 0);
 	nvgpu_cond_init(&g->channel_worker.wq);
 	nvgpu_init_list_node(&g->channel_worker.items);
 	nvgpu_spinlock_init(&g->channel_worker.items_lock);
-	snprintf(thread_name, sizeof(thread_name),
-		 "nvgpu_channel_poll_%s", g->name);
+	err = nvgpu_mutex_init(&g->channel_worker.start_lock);
+	if (err)
+		goto error_check;
 
-	err = nvgpu_thread_create(&g->channel_worker.poll_task, g,
-			gk20a_channel_poll_worker, thread_name);
+	err = __nvgpu_channel_worker_start(g);
+error_check:
 	if (err) {
 		nvgpu_err(g, "failed to start channel poller thread");
 		return err;
 	}
-
 	return 0;
 }
 
 void nvgpu_channel_worker_deinit(struct gk20a *g)
 {
+	nvgpu_mutex_acquire(&g->channel_worker.start_lock);
 	nvgpu_thread_stop(&g->channel_worker.poll_task);
+	nvgpu_mutex_release(&g->channel_worker.start_lock);
 }
 
 /**
@@ -1924,6 +1960,14 @@ static void gk20a_channel_worker_enqueue(struct channel_gk20a *ch)
 	gk20a_dbg_fn("");
 
 	/*
+	 * Warn if worker thread cannot run
+	 */
+	if (WARN_ON(__nvgpu_channel_worker_start(g))) {
+		nvgpu_warn(g, "channel worker cannot run!");
+		return;
+	}
+
+	/*
 	 * Ref released when this item gets processed. The caller should hold
 	 * one ref already, so normally shouldn't fail, but the channel could
 	 * end up being freed between the time the caller got its reference and
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 0cd77d1e..35d58ef1 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -1215,6 +1215,7 @@ struct gk20a {
 		struct nvgpu_cond wq;
 		struct nvgpu_list_node items;
 		struct nvgpu_spinlock items_lock;
+		struct nvgpu_mutex start_lock;
 	} channel_worker;
 
 	struct gk20a_scale_profile *scale_profile;
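
Note also the deinit hunk above: stopping the worker takes the same start_lock,
so a teardown racing with a deferred start from the enqueue path cannot observe
a half-created thread. Continuing the hypothetical pthread sketch from earlier,
the matching stop path would look roughly like this:

	/* Teardown: serialize with worker_start() via the same lock. */
	static void worker_stop(struct worker_state *w)
	{
		pthread_mutex_lock(&w->start_lock);
		if (w->running) {
			/*
			 * Real code first signals the thread to leave its
			 * polling loop before waiting for it, as the
			 * driver's nvgpu_thread_stop() does.
			 */
			pthread_join(w->task, NULL);
			w->running = false;
		}
		pthread_mutex_unlock(&w->start_lock);
	}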