author     Konsta Holtta <kholtta@nvidia.com>        2016-06-15 07:06:28 -0400
committer  Terje Bergstrom <tbergstrom@nvidia.com>   2016-06-20 10:45:33 -0400
commit     27baafaad1a5c999642939faef63cacab17c9ed6 (patch)
tree       9ad2ce8dbf044f120c8959f4c69d825dee3183f8 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c
parent     efb6113b65c4976cf718787b2adc64d495e8fd94 (diff)
gpu: nvgpu: use gpfifo_mem via gk20a_mem_{rd,wr}
Use gk20a_mem_*() accessors for gpfifo memory in work submission instead of
direct CPU accesses, in order to support apertures other than sysmem. The
gpfifo memory is still allocated from sysmem for dGPUs too. Split the copying
of the priv_cmd entries and of the main gpfifo submitted in
gk20a_submit_channel_gpfifo() into separate functions.

JIRA DNVGPU-21

Change-Id: If271ca8e7e34235f00d31855dbccf77c0008e10b
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1145923
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r--   drivers/gpu/nvgpu/gk20a/channel_gk20a.c   242
1 file changed, 138 insertions(+), 104 deletions(-)
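The essence of the change, before reading the hunks: gpfifo ring writes no longer go through the CPU mapping c->gpfifo.mem.cpu_va but through the gk20a_mem_wr_n() accessor, which handles both sysmem and vidmem (reached via PRAMIN) backed buffers. Below is a minimal sketch of that difference for a single ring entry; it reuses the driver types and the gk20a_mem_wr_n() call exactly as they appear in the diff, but the two helper names are invented for illustration and are not part of the patch.

/* Sketch only: two ways to write one gpfifo entry at the current put index. */
static void put_entry_cpu_va(struct channel_gk20a *c, struct nvgpu_gpfifo e)
{
        /* Old approach: only valid when the ring is CPU-mapped sysmem. */
        struct nvgpu_gpfifo *ring = c->gpfifo.mem.cpu_va;

        ring[c->gpfifo.put] = e;
        c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}

static void put_entry_mem_accessor(struct channel_gk20a *c, struct nvgpu_gpfifo e)
{
        /* New approach: gk20a_mem_wr_n() picks the right aperture for the write. */
        gk20a_mem_wr_n(c->g, &c->gpfifo.mem, c->gpfifo.put * sizeof(e),
                        &e, sizeof(e));
        c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}

gk20a_submit_append_priv_cmdbuf() in the diff is essentially the accessor variant plus the pushbuffer tracepoint, and gk20a_submit_append_gpfifo() applies the same idea to a whole batch of entries.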
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index fc1edd99..cc097ae4 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -918,7 +918,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 	memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub));
 
 	gk20a_gmmu_unmap_free(ch_vm, &ch->gpfifo.mem);
-
+	nvgpu_free(ch->gpfifo.pipe);
 	memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
 
 #if defined(CONFIG_GK20A_CYCLE_STATS)
@@ -1430,7 +1430,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 	c->ramfc.offset = 0;
 	c->ramfc.size = ram_in_ramfc_s() / 8;
 
-	if (c->gpfifo.mem.cpu_va) {
+	if (c->gpfifo.mem.size) {
 		gk20a_err(d, "channel %d :"
 			"gpfifo already allocated", c->hw_chid);
 		return -EEXIST;
@@ -1444,6 +1444,16 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up;
 	}
 
+	if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
+		c->gpfifo.pipe = nvgpu_alloc(
+				gpfifo_size * sizeof(struct nvgpu_gpfifo),
+				false);
+		if (!c->gpfifo.pipe) {
+			err = -ENOMEM;
+			goto clean_up_unmap;
+		}
+	}
+
 	c->gpfifo.entry_num = gpfifo_size;
 	c->gpfifo.get = c->gpfifo.put = 0;
 
@@ -1473,6 +1483,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 	return 0;
 
 clean_up_unmap:
+	kfree(c->gpfifo.pipe);
 	gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem);
 clean_up:
 	memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
@@ -1568,7 +1579,7 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
 
 static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 		struct nvgpu_gpfifo *g,
-		struct nvgpu_submit_gpfifo_args *args,
+		struct nvgpu_gpfifo __user *user_gpfifo,
 		int offset,
 		int count)
 {
@@ -1580,18 +1591,17 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 	if (!gk20a_debug_trace_cmdbuf)
 		return;
 
-	if (!g && !args)
+	if (!g && !user_gpfifo)
 		return;
 
 	if (!g) {
-		size = args->num_entries * sizeof(struct nvgpu_gpfifo);
+		size = count * sizeof(struct nvgpu_gpfifo);
 		if (size) {
 			g = nvgpu_alloc(size, false);
 			if (!g)
 				return;
 
-			if (copy_from_user(g,
-				(void __user *)(uintptr_t)args->gpfifo, size)) {
+			if (copy_from_user(g, user_gpfifo, size)) {
 				nvgpu_free(g);
 				return;
 			}
@@ -1984,6 +1994,116 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 	gk20a_channel_put(c);
 }
 
+static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
+		struct priv_cmd_entry *cmd)
+{
+	struct gk20a *g = c->g;
+	struct mem_desc *gpfifo_mem = &c->gpfifo.mem;
+	struct nvgpu_gpfifo x = {
+		.entry0 = u64_lo32(cmd->gva),
+		.entry1 = u64_hi32(cmd->gva) |
+			pbdma_gp_entry1_length_f(cmd->size)
+	};
+
+	gk20a_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
+			&x, sizeof(x));
+
+	if (cmd->mem->aperture == APERTURE_SYSMEM)
+		trace_gk20a_push_cmdbuf(dev_name(g->dev), 0, cmd->size, 0,
+				cmd->mem->cpu_va + cmd->off * sizeof(u32));
+
+	c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
+}
+
+/*
+ * Copy source gpfifo entries into the gpfifo ring buffer, potentially
+ * splitting into two memcpys to handle wrap-around.
+ */
+static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
+		struct nvgpu_gpfifo *kern_gpfifo,
+		struct nvgpu_gpfifo __user *user_gpfifo,
+		u32 num_entries)
+{
+	/* byte offsets */
+	u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
+	u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
+	u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
+	u32 end = start + len; /* exclusive */
+	struct mem_desc *gpfifo_mem = &c->gpfifo.mem;
+	struct nvgpu_gpfifo *cpu_src;
+	int err;
+
+	if (user_gpfifo && !c->gpfifo.pipe) {
+		/*
+		 * This path (from userspace to sysmem) is special in order to
+		 * avoid two copies unnecessarily (from user to pipe, then from
+		 * pipe to gpu sysmem buffer).
+		 *
+		 * As a special case, the pipe buffer exists if PRAMIN writes
+		 * are forced, although the buffers may not be in vidmem in
+		 * that case.
+		 */
+		if (end > gpfifo_size) {
+			/* wrap-around */
+			int length0 = gpfifo_size - start;
+			int length1 = len - length0;
+			void *user2 = (u8 *)user_gpfifo + length0;
+
+			err = copy_from_user(gpfifo_mem->cpu_va + start,
+					user_gpfifo, length0);
+			if (err)
+				return err;
+
+			err = copy_from_user(gpfifo_mem->cpu_va,
+					user2, length1);
+			if (err)
+				return err;
+		} else {
+			err = copy_from_user(gpfifo_mem->cpu_va + start,
+					user_gpfifo, len);
+			if (err)
+				return err;
+		}
+
+		trace_write_pushbuffer_range(c, NULL, user_gpfifo,
+				0, num_entries);
+		goto out;
+	} else if (user_gpfifo) {
+		/* from userspace to vidmem or sysmem when pramin forced, use
+		 * the common copy path below */
+		err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
+		if (err)
+			return err;
+
+		cpu_src = c->gpfifo.pipe;
+	} else {
+		/* from kernel to either sysmem or vidmem, don't need
+		 * copy_from_user so use the common path below */
+		cpu_src = kern_gpfifo;
+	}
+
+	if (end > gpfifo_size) {
+		/* wrap-around */
+		int length0 = gpfifo_size - start;
+		int length1 = len - length0;
+		void *src2 = (u8 *)cpu_src + length0;
+
+		gk20a_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
+		gk20a_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
+	} else {
+		gk20a_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
+
+	}
+
+	trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
+
+out:
+	c->gpfifo.put = (c->gpfifo.put + num_entries) &
+		(c->gpfifo.entry_num - 1);
+
+	return 0;
+}
+
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		struct nvgpu_gpfifo *gpfifo,
 		struct nvgpu_submit_gpfifo_args *args,
@@ -1996,7 +2116,6 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
 	int err = 0;
-	int start, end;
 	int wait_fence_fd = -1;
 	struct priv_cmd_entry *wait_cmd = NULL;
 	struct priv_cmd_entry *incr_cmd = NULL;
@@ -2006,11 +2125,12 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	 * and one for post fence. */
 	const int extra_entries = 2;
 	bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
-	struct nvgpu_gpfifo *gpfifo_mem = c->gpfifo.mem.cpu_va;
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
 	bool need_sync_fence = false;
 	bool new_sync_created = false;
+	struct nvgpu_gpfifo __user *user_gpfifo = args ?
+		(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : 0;
 
 	/*
 	 * If user wants to allocate sync_fence_fd always, then respect that;
@@ -2157,102 +2277,17 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up;
 	}
 
-	if (wait_cmd) {
-		gpfifo_mem[c->gpfifo.put].entry0 = u64_lo32(wait_cmd->gva);
-		gpfifo_mem[c->gpfifo.put].entry1 = u64_hi32(wait_cmd->gva) |
-			pbdma_gp_entry1_length_f(wait_cmd->size);
-		trace_gk20a_push_cmdbuf(dev_name(c->g->dev),
-			0, wait_cmd->size, 0,
-			wait_cmd->mem->cpu_va + wait_cmd->off *
-			sizeof(u32));
-
-		c->gpfifo.put = (c->gpfifo.put + 1) &
-			(c->gpfifo.entry_num - 1);
-	}
-
-	/*
-	 * Copy source gpfifo entries into the gpfifo ring buffer,
-	 * potentially splitting into two memcpies to handle the
-	 * ring buffer wrap-around case.
-	 */
-	start = c->gpfifo.put;
-	end = start + num_entries;
-
-	if (gpfifo) {
-		if (end > c->gpfifo.entry_num) {
-			int length0 = c->gpfifo.entry_num - start;
-			int length1 = num_entries - length0;
-
-			memcpy(gpfifo_mem + start, gpfifo,
-				length0 * sizeof(*gpfifo));
-
-			memcpy(gpfifo_mem, gpfifo + length0,
-				length1 * sizeof(*gpfifo));
-
-			trace_write_pushbuffer_range(c, gpfifo, NULL,
-				0, length0);
-			trace_write_pushbuffer_range(c, gpfifo, NULL,
-				length0, length1);
-		} else {
-			memcpy(gpfifo_mem + start, gpfifo,
-				num_entries * sizeof(*gpfifo));
-
-			trace_write_pushbuffer_range(c, gpfifo, NULL,
-				0, num_entries);
-		}
-	} else {
-		struct nvgpu_gpfifo __user *user_gpfifo =
-			(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo;
-		if (end > c->gpfifo.entry_num) {
-			int length0 = c->gpfifo.entry_num - start;
-			int length1 = num_entries - length0;
-
-			err = copy_from_user(gpfifo_mem + start,
-				user_gpfifo,
-				length0 * sizeof(*user_gpfifo));
-			if (err) {
-				goto clean_up;
-			}
-
-			err = copy_from_user(gpfifo_mem,
-				user_gpfifo + length0,
-				length1 * sizeof(*user_gpfifo));
-			if (err) {
-				goto clean_up;
-			}
-
-			trace_write_pushbuffer_range(c, NULL, args,
-				0, length0);
-			trace_write_pushbuffer_range(c, NULL, args,
-				length0, length1);
-		} else {
-			err = copy_from_user(gpfifo_mem + start,
-				user_gpfifo,
-				num_entries * sizeof(*user_gpfifo));
-			if (err) {
-				goto clean_up;
-			}
-
-			trace_write_pushbuffer_range(c, NULL, args,
-				0, num_entries);
-		}
-	}
-
-	c->gpfifo.put = (c->gpfifo.put + num_entries) &
-		(c->gpfifo.entry_num - 1);
-
-	if (incr_cmd) {
-		gpfifo_mem[c->gpfifo.put].entry0 = u64_lo32(incr_cmd->gva);
-		gpfifo_mem[c->gpfifo.put].entry1 = u64_hi32(incr_cmd->gva) |
-			pbdma_gp_entry1_length_f(incr_cmd->size);
-		trace_gk20a_push_cmdbuf(dev_name(c->g->dev),
-			0, incr_cmd->size, 0,
-			incr_cmd->mem->cpu_va + incr_cmd->off *
-			sizeof(u32));
-
-		c->gpfifo.put = (c->gpfifo.put + 1) &
-			(c->gpfifo.entry_num - 1);
-	}
-
+	if (wait_cmd)
+		gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
+
+	if (gpfifo || user_gpfifo)
+		err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
+				num_entries);
+	if (err)
+		goto clean_up;
+
+	if (incr_cmd)
+		gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
 
 	mutex_lock(&c->last_submit.fence_lock);
 	gk20a_fence_put(c->last_submit.pre_fence);
@@ -2892,7 +2927,6 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 {
 	struct gk20a_fence *fence_out;
 	int ret = 0;
-
 	gk20a_dbg_fn("");
 
 	if (ch->has_timedout)
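A note on the wrap-around handling in gk20a_submit_append_gpfifo() above: unlike the old code, which counted in entries, the new helper works in byte offsets, and when the appended entries would run past the end of the ring it splits the write into a tail piece and a head piece. The following standalone sketch walks through just that arithmetic with illustrative values (the 8-byte entry size matches the two u32 words of a gpfifo entry, but all numbers here are examples, not taken from the patch).

#include <stdio.h>

/* Sketch of the byte-offset split used when a gpfifo append wraps the ring. */
int main(void)
{
        unsigned int entry_size = 8;                        /* bytes per gpfifo entry (entry0 + entry1) */
        unsigned int entry_num = 512;                       /* ring entries, power of two */
        unsigned int gpfifo_size = entry_num * entry_size;  /* 4096 bytes */

        unsigned int put = 500;                             /* current put index */
        unsigned int num_entries = 20;                      /* entries to append */

        unsigned int start = put * entry_size;              /* 4000 */
        unsigned int len = num_entries * entry_size;        /* 160 */
        unsigned int end = start + len;                     /* 4160, exclusive */

        if (end > gpfifo_size) {
                unsigned int length0 = gpfifo_size - start; /* 96 bytes written at offset 4000 */
                unsigned int length1 = len - length0;       /* 64 bytes wrap around to offset 0 */
                printf("split copy: %u bytes at %u, %u bytes at 0\n",
                                length0, start, length1);
        } else {
                printf("single copy: %u bytes at %u\n", len, start);
        }

        /* put advances modulo the ring size; entry_num being a power of two makes the mask work */
        put = (put + num_entries) & (entry_num - 1);        /* 8 */
        printf("new put index: %u\n", put);
        return 0;
}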