diff options
author | Janne Hellsten <jhellsten@nvidia.com> | 2015-01-23 08:40:36 -0500 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-04-04 21:03:43 -0400 |
commit | 9148a1e62757bebf27eb38ba7e866c0ee5e0e6f3 (patch) | |
tree | 357aff360c3f50baf68b07ddb8793553cdbca7ea /drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |
parent | 624d7a2830370ec13402b964a1c8ff564249ddb6 (diff) |
gpu: nvgpu: gk20a: Optimize gpfifo entry copy
Use memcpy for copying gpfifo inputs into the gpfifo ring buffer.
This speeds up one command-buffer-heavy benchmark from 82 FPS to 86
FPS. The speedup is due to a) a faster memory move and b) zero tracing
overhead when pushbuffer (PB) tracing is disabled.
Bug 1550886
Change-Id: If95ebff53745bbf59edeac32ad4f32f10f1ea7ee
Signed-off-by: Janne Hellsten <jhellsten@nvidia.com>
Reviewed-on: http://git-master/r/676967
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 52 |
1 file changed, 42 insertions, 10 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index ab0333d6..6573d9ca 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1365,7 +1365,8 @@ static u32 get_gp_free_count(struct channel_gk20a *c) | |||
1365 | return gp_free_count(c); | 1365 | return gp_free_count(c); |
1366 | } | 1366 | } |
1367 | 1367 | ||
1368 | static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g) | 1368 | static void trace_write_pushbuffer(struct channel_gk20a *c, |
1369 | struct nvgpu_gpfifo *g) | ||
1369 | { | 1370 | { |
1370 | void *mem = NULL; | 1371 | void *mem = NULL; |
1371 | unsigned int words; | 1372 | unsigned int words; |
@@ -1401,6 +1402,18 @@ static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g) | |||
1401 | } | 1402 | } |
1402 | } | 1403 | } |
1403 | 1404 | ||
1405 | static void trace_write_pushbuffer_range(struct channel_gk20a *c, | ||
1406 | struct nvgpu_gpfifo *g, | ||
1407 | int count) | ||
1408 | { | ||
1409 | if (gk20a_debug_trace_cmdbuf) { | ||
1410 | int i; | ||
1411 | struct nvgpu_gpfifo *gp = g; | ||
1412 | for (i = 0; i < count; i++, gp++) | ||
1413 | trace_write_pushbuffer(c, gp); | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1404 | static int gk20a_channel_add_job(struct channel_gk20a *c, | 1417 | static int gk20a_channel_add_job(struct channel_gk20a *c, |
1405 | struct gk20a_fence *pre_fence, | 1418 | struct gk20a_fence *pre_fence, |
1406 | struct gk20a_fence *post_fence) | 1419 | struct gk20a_fence *post_fence) |
@@ -1502,7 +1515,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
1502 | struct gk20a *g = c->g; | 1515 | struct gk20a *g = c->g; |
1503 | struct device *d = dev_from_gk20a(g); | 1516 | struct device *d = dev_from_gk20a(g); |
1504 | int err = 0; | 1517 | int err = 0; |
1505 | int i; | 1518 | int start, end; |
1506 | int wait_fence_fd = -1; | 1519 | int wait_fence_fd = -1; |
1507 | struct priv_cmd_entry *wait_cmd = NULL; | 1520 | struct priv_cmd_entry *wait_cmd = NULL; |
1508 | struct priv_cmd_entry *incr_cmd = NULL; | 1521 | struct priv_cmd_entry *incr_cmd = NULL; |
@@ -1653,15 +1666,34 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, | |||
1653 | wait_cmd->gp_put = c->gpfifo.put; | 1666 | wait_cmd->gp_put = c->gpfifo.put; |
1654 | } | 1667 | } |
1655 | 1668 | ||
1656 | for (i = 0; i < num_entries; i++) { | 1669 | /* |
1657 | c->gpfifo.cpu_va[c->gpfifo.put].entry0 = | 1670 | * Copy source gpfifo entries into the gpfifo ring buffer, |
1658 | gpfifo[i].entry0; /* cmd buf va low 32 */ | 1671 | * potentially splitting into two memcpies to handle the |
1659 | c->gpfifo.cpu_va[c->gpfifo.put].entry1 = | 1672 | * ring buffer wrap-around case. |
1660 | gpfifo[i].entry1; /* cmd buf va high 32 | words << 10 */ | 1673 | */ |
1661 | trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]); | 1674 | start = c->gpfifo.put; |
1662 | c->gpfifo.put = (c->gpfifo.put + 1) & | 1675 | end = start + num_entries; |
1663 | (c->gpfifo.entry_num - 1); | 1676 | |
1677 | if (end > c->gpfifo.entry_num) { | ||
1678 | int length0 = c->gpfifo.entry_num - start; | ||
1679 | int length1 = num_entries - length0; | ||
1680 | |||
1681 | memcpy(c->gpfifo.cpu_va + start, gpfifo, | ||
1682 | length0 * sizeof(*gpfifo)); | ||
1683 | |||
1684 | memcpy(c->gpfifo.cpu_va, gpfifo + length0, | ||
1685 | length1 * sizeof(*gpfifo)); | ||
1686 | |||
1687 | trace_write_pushbuffer_range(c, gpfifo, length0); | ||
1688 | trace_write_pushbuffer_range(c, gpfifo + length0, length1); | ||
1689 | } else { | ||
1690 | memcpy(c->gpfifo.cpu_va + start, gpfifo, | ||
1691 | num_entries * sizeof(*gpfifo)); | ||
1692 | |||
1693 | trace_write_pushbuffer_range(c, gpfifo, num_entries); | ||
1664 | } | 1694 | } |
1695 | c->gpfifo.put = (c->gpfifo.put + num_entries) & | ||
1696 | (c->gpfifo.entry_num - 1); | ||
1665 | 1697 | ||
1666 | if (incr_cmd) { | 1698 | if (incr_cmd) { |
1667 | c->gpfifo.cpu_va[c->gpfifo.put].entry0 = | 1699 | c->gpfifo.cpu_va[c->gpfifo.put].entry0 = |