summary | refs | log | tree | commit | diff | stats
path: root/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
diff options
context:
space:
mode:
author: Janne Hellsten <jhellsten@nvidia.com> 2015-01-23 08:40:36 -0500
committer: Dan Willemsen <dwillemsen@nvidia.com> 2015-04-04 21:03:43 -0400
commit: 9148a1e62757bebf27eb38ba7e866c0ee5e0e6f3 (patch)
tree: 357aff360c3f50baf68b07ddb8793553cdbca7ea /drivers/gpu/nvgpu/gk20a/channel_gk20a.c
parent: 624d7a2830370ec13402b964a1c8ff564249ddb6 (diff)
gpu: nvgpu: gk20a: Optimize gpfifo entry copy
Use memcpy for copying gpfifo inputs into the gpfifo ring buffer. This speeds up one command buffer heavy benchmark from 82 FPS to 86 FPS. Speed up is due to a) faster memory move and b) zero tracing overhead when PB tracing is disabled.

Bug 1550886

Change-Id: If95ebff53745bbf59edeac32ad4f32f10f1ea7ee
Signed-off-by: Janne Hellsten <jhellsten@nvidia.com>
Reviewed-on: http://git-master/r/676967
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r-- drivers/gpu/nvgpu/gk20a/channel_gk20a.c 52
1 files changed, 42 insertions, 10 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index ab0333d6..6573d9ca 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1365,7 +1365,8 @@ static u32 get_gp_free_count(struct channel_gk20a *c)
1365 return gp_free_count(c); 1365 return gp_free_count(c);
1366} 1366}
1367 1367
1368static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g) 1368static void trace_write_pushbuffer(struct channel_gk20a *c,
1369 struct nvgpu_gpfifo *g)
1369{ 1370{
1370 void *mem = NULL; 1371 void *mem = NULL;
1371 unsigned int words; 1372 unsigned int words;
@@ -1401,6 +1402,18 @@ static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g)
1401 } 1402 }
1402} 1403}
1403 1404
1405static void trace_write_pushbuffer_range(struct channel_gk20a *c,
1406 struct nvgpu_gpfifo *g,
1407 int count)
1408{
1409 if (gk20a_debug_trace_cmdbuf) {
1410 int i;
1411 struct nvgpu_gpfifo *gp = g;
1412 for (i = 0; i < count; i++, gp++)
1413 trace_write_pushbuffer(c, gp);
1414 }
1415}
1416
1404static int gk20a_channel_add_job(struct channel_gk20a *c, 1417static int gk20a_channel_add_job(struct channel_gk20a *c,
1405 struct gk20a_fence *pre_fence, 1418 struct gk20a_fence *pre_fence,
1406 struct gk20a_fence *post_fence) 1419 struct gk20a_fence *post_fence)
@@ -1502,7 +1515,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
1502 struct gk20a *g = c->g; 1515 struct gk20a *g = c->g;
1503 struct device *d = dev_from_gk20a(g); 1516 struct device *d = dev_from_gk20a(g);
1504 int err = 0; 1517 int err = 0;
1505 int i; 1518 int start, end;
1506 int wait_fence_fd = -1; 1519 int wait_fence_fd = -1;
1507 struct priv_cmd_entry *wait_cmd = NULL; 1520 struct priv_cmd_entry *wait_cmd = NULL;
1508 struct priv_cmd_entry *incr_cmd = NULL; 1521 struct priv_cmd_entry *incr_cmd = NULL;
@@ -1653,15 +1666,34 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
1653 wait_cmd->gp_put = c->gpfifo.put; 1666 wait_cmd->gp_put = c->gpfifo.put;
1654 } 1667 }
1655 1668
1656 for (i = 0; i < num_entries; i++) { 1669 /*
1657 c->gpfifo.cpu_va[c->gpfifo.put].entry0 = 1670 * Copy source gpfifo entries into the gpfifo ring buffer,
1658 gpfifo[i].entry0; /* cmd buf va low 32 */ 1671 * potentially splitting into two memcpies to handle the
1659 c->gpfifo.cpu_va[c->gpfifo.put].entry1 = 1672 * ring buffer wrap-around case.
1660 gpfifo[i].entry1; /* cmd buf va high 32 | words << 10 */ 1673 */
1661 trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]); 1674 start = c->gpfifo.put;
1662 c->gpfifo.put = (c->gpfifo.put + 1) & 1675 end = start + num_entries;
1663 (c->gpfifo.entry_num - 1); 1676
1677 if (end > c->gpfifo.entry_num) {
1678 int length0 = c->gpfifo.entry_num - start;
1679 int length1 = num_entries - length0;
1680
1681 memcpy(c->gpfifo.cpu_va + start, gpfifo,
1682 length0 * sizeof(*gpfifo));
1683
1684 memcpy(c->gpfifo.cpu_va, gpfifo + length0,
1685 length1 * sizeof(*gpfifo));
1686
1687 trace_write_pushbuffer_range(c, gpfifo, length0);
1688 trace_write_pushbuffer_range(c, gpfifo + length0, length1);
1689 } else {
1690 memcpy(c->gpfifo.cpu_va + start, gpfifo,
1691 num_entries * sizeof(*gpfifo));
1692
1693 trace_write_pushbuffer_range(c, gpfifo, num_entries);
1664 } 1694 }
1695 c->gpfifo.put = (c->gpfifo.put + num_entries) &
1696 (c->gpfifo.entry_num - 1);
1665 1697
1666 if (incr_cmd) { 1698 if (incr_cmd) {
1667 c->gpfifo.cpu_va[c->gpfifo.put].entry0 = 1699 c->gpfifo.cpu_va[c->gpfifo.put].entry0 =