diff options
author | Sachit Kadle <skadle@nvidia.com> | 2016-09-19 17:18:11 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2016-09-20 13:43:35 -0400 |
commit | 246dcb824bc299b836be92fb74416c07a57e085b (patch) | |
tree | f1ce23cdea84e873fbf965ce903e31c7effa4739 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |
parent | 3180ed70489113365203abc049223ad5956cb22e (diff) |
gpu: nvgpu: improve sync create/destroy logic
This change improves the aggressive sync creation
& destruction logic to avoid lock contention in
the submit path. It does the following:
1) Removes the global sync destruction (channel)
threshold, and adds a per-platform parameter.
2) Avoids lock contention in the clean-up/submit
path when aggressive sync destruction is disabled.
3) Creates sync object at gpfifo
allocation time (as long as we are not in aggressive
sync destroy mode), to enable faster first submits
Bug 1795076
Change-Id: Ifdb680100b08d00f37338063355bb2123ceb1b9f
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1202425
(cherry picked from commit ac0978711943a59c6f28c98c76b10759e0bff610)
Reviewed-on: http://git-master/r/1202427
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 77 |
1 file changed, 53 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 7df794bf..79d449e6 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -43,8 +43,6 @@ | |||
43 | 43 | ||
44 | #define NVMAP_HANDLE_PARAM_SIZE 1 | 44 | #define NVMAP_HANDLE_PARAM_SIZE 1 |
45 | 45 | ||
46 | #define NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT 64 /* channels */ | ||
47 | |||
48 | #define NVGPU_CHANNEL_MIN_TIMESLICE_US 1000 | 46 | #define NVGPU_CHANNEL_MIN_TIMESLICE_US 1000 |
49 | #define NVGPU_CHANNEL_MAX_TIMESLICE_US 50000 | 47 | #define NVGPU_CHANNEL_MAX_TIMESLICE_US 50000 |
50 | 48 | ||
@@ -91,7 +89,9 @@ static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) | |||
91 | } | 89 | } |
92 | mutex_unlock(&f->free_chs_mutex); | 90 | mutex_unlock(&f->free_chs_mutex); |
93 | 91 | ||
94 | if (f->used_channels > NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT) | 92 | if (platform->aggressive_sync_destroy_thresh && |
93 | (f->used_channels > | ||
94 | platform->aggressive_sync_destroy_thresh)) | ||
95 | platform->aggressive_sync_destroy = true; | 95 | platform->aggressive_sync_destroy = true; |
96 | 96 | ||
97 | return ch; | 97 | return ch; |
@@ -110,7 +110,9 @@ static void free_channel(struct fifo_gk20a *f, | |||
110 | f->used_channels--; | 110 | f->used_channels--; |
111 | mutex_unlock(&f->free_chs_mutex); | 111 | mutex_unlock(&f->free_chs_mutex); |
112 | 112 | ||
113 | if (f->used_channels < NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT) | 113 | if (platform->aggressive_sync_destroy_thresh && |
114 | (f->used_channels < | ||
115 | platform->aggressive_sync_destroy_thresh)) | ||
114 | platform->aggressive_sync_destroy = false; | 116 | platform->aggressive_sync_destroy = false; |
115 | } | 117 | } |
116 | 118 | ||
@@ -1424,6 +1426,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, | |||
1424 | { | 1426 | { |
1425 | struct gk20a *g = c->g; | 1427 | struct gk20a *g = c->g; |
1426 | struct device *d = dev_from_gk20a(g); | 1428 | struct device *d = dev_from_gk20a(g); |
1429 | struct gk20a_platform *platform = gk20a_get_platform(d); | ||
1427 | struct vm_gk20a *ch_vm; | 1430 | struct vm_gk20a *ch_vm; |
1428 | u32 gpfifo_size; | 1431 | u32 gpfifo_size; |
1429 | int err = 0; | 1432 | int err = 0; |
@@ -1487,26 +1490,46 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, | |||
1487 | 1490 | ||
1488 | channel_gk20a_setup_userd(c); | 1491 | channel_gk20a_setup_userd(c); |
1489 | 1492 | ||
1493 | if (!platform->aggressive_sync_destroy_thresh) { | ||
1494 | mutex_lock(&c->sync_lock); | ||
1495 | c->sync = gk20a_channel_sync_create(c); | ||
1496 | if (!c->sync) { | ||
1497 | err = -ENOMEM; | ||
1498 | mutex_unlock(&c->sync_lock); | ||
1499 | goto clean_up_unmap; | ||
1500 | } | ||
1501 | mutex_unlock(&c->sync_lock); | ||
1502 | |||
1503 | if (g->ops.fifo.resetup_ramfc) { | ||
1504 | err = g->ops.fifo.resetup_ramfc(c); | ||
1505 | if (err) | ||
1506 | goto clean_up_sync; | ||
1507 | } | ||
1508 | } | ||
1509 | |||
1490 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, | 1510 | err = g->ops.fifo.setup_ramfc(c, c->gpfifo.mem.gpu_va, |
1491 | c->gpfifo.entry_num, args->flags); | 1511 | c->gpfifo.entry_num, args->flags); |
1492 | if (err) | 1512 | if (err) |
1493 | goto clean_up_unmap; | 1513 | goto clean_up_sync; |
1494 | 1514 | ||
1495 | /* TBD: setup engine contexts */ | 1515 | /* TBD: setup engine contexts */ |
1496 | 1516 | ||
1497 | err = channel_gk20a_alloc_priv_cmdbuf(c); | 1517 | err = channel_gk20a_alloc_priv_cmdbuf(c); |
1498 | if (err) | 1518 | if (err) |
1499 | goto clean_up_unmap; | 1519 | goto clean_up_sync; |
1500 | 1520 | ||
1501 | err = channel_gk20a_update_runlist(c, true); | 1521 | err = channel_gk20a_update_runlist(c, true); |
1502 | if (err) | 1522 | if (err) |
1503 | goto clean_up_unmap; | 1523 | goto clean_up_sync; |
1504 | 1524 | ||
1505 | g->ops.fifo.bind_channel(c); | 1525 | g->ops.fifo.bind_channel(c); |
1506 | 1526 | ||
1507 | gk20a_dbg_fn("done"); | 1527 | gk20a_dbg_fn("done"); |
1508 | return 0; | 1528 | return 0; |
1509 | 1529 | ||
1530 | clean_up_sync: | ||
1531 | gk20a_channel_sync_destroy(c->sync); | ||
1532 | c->sync = NULL; | ||
1510 | clean_up_unmap: | 1533 | clean_up_unmap: |
1511 | nvgpu_free(c->gpfifo.pipe); | 1534 | nvgpu_free(c->gpfifo.pipe); |
1512 | gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem); | 1535 | gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem); |
@@ -1911,18 +1934,21 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work) | |||
1911 | 1934 | ||
1912 | gk20a_channel_timeout_stop(c); | 1935 | gk20a_channel_timeout_stop(c); |
1913 | 1936 | ||
1914 | mutex_lock(&c->sync_lock); | 1937 | WARN_ON(!c->sync); |
1938 | |||
1915 | if (c->sync) { | 1939 | if (c->sync) { |
1916 | c->sync->signal_timeline(c->sync); | 1940 | c->sync->signal_timeline(c->sync); |
1917 | if (atomic_dec_and_test(&c->sync->refcount) && | 1941 | |
1918 | platform->aggressive_sync_destroy) { | 1942 | if (platform->aggressive_sync_destroy_thresh) { |
1919 | gk20a_channel_sync_destroy(c->sync); | 1943 | mutex_lock(&c->sync_lock); |
1920 | c->sync = NULL; | 1944 | if (atomic_dec_and_test(&c->sync->refcount) && |
1945 | platform->aggressive_sync_destroy) { | ||
1946 | gk20a_channel_sync_destroy(c->sync); | ||
1947 | c->sync = NULL; | ||
1948 | } | ||
1949 | mutex_unlock(&c->sync_lock); | ||
1921 | } | 1950 | } |
1922 | } else { | ||
1923 | WARN_ON(1); | ||
1924 | } | 1951 | } |
1925 | mutex_unlock(&c->sync_lock); | ||
1926 | 1952 | ||
1927 | if (job->num_mapped_buffers) | 1953 | if (job->num_mapped_buffers) |
1928 | gk20a_vm_put_buffers(vm, job->mapped_buffers, | 1954 | gk20a_vm_put_buffers(vm, job->mapped_buffers, |
@@ -2099,6 +2125,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c, | |||
2099 | u32 flags) | 2125 | u32 flags) |
2100 | { | 2126 | { |
2101 | struct gk20a *g = c->g; | 2127 | struct gk20a *g = c->g; |
2128 | struct gk20a_platform *platform = gk20a_get_platform(g->dev); | ||
2102 | bool need_sync_fence = false; | 2129 | bool need_sync_fence = false; |
2103 | bool new_sync_created = false; | 2130 | bool new_sync_created = false; |
2104 | int wait_fence_fd = -1; | 2131 | int wait_fence_fd = -1; |
@@ -2112,18 +2139,20 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c, | |||
2112 | if (force_need_sync_fence) | 2139 | if (force_need_sync_fence) |
2113 | need_sync_fence = true; | 2140 | need_sync_fence = true; |
2114 | 2141 | ||
2115 | mutex_lock(&c->sync_lock); | 2142 | if (platform->aggressive_sync_destroy_thresh) { |
2116 | if (!c->sync) { | 2143 | mutex_lock(&c->sync_lock); |
2117 | c->sync = gk20a_channel_sync_create(c); | ||
2118 | if (!c->sync) { | 2144 | if (!c->sync) { |
2119 | err = -ENOMEM; | 2145 | c->sync = gk20a_channel_sync_create(c); |
2120 | mutex_unlock(&c->sync_lock); | 2146 | if (!c->sync) { |
2121 | goto fail; | 2147 | err = -ENOMEM; |
2148 | mutex_unlock(&c->sync_lock); | ||
2149 | goto fail; | ||
2150 | } | ||
2151 | new_sync_created = true; | ||
2122 | } | 2152 | } |
2123 | new_sync_created = true; | 2153 | atomic_inc(&c->sync->refcount); |
2154 | mutex_unlock(&c->sync_lock); | ||
2124 | } | 2155 | } |
2125 | atomic_inc(&c->sync->refcount); | ||
2126 | mutex_unlock(&c->sync_lock); | ||
2127 | 2156 | ||
2128 | if (g->ops.fifo.resetup_ramfc && new_sync_created) { | 2157 | if (g->ops.fifo.resetup_ramfc && new_sync_created) { |
2129 | err = g->ops.fifo.resetup_ramfc(c); | 2158 | err = g->ops.fifo.resetup_ramfc(c); |