author     Sachit Kadle <skadle@nvidia.com>                     2016-08-15 17:32:39 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2016-10-20 11:14:11 -0400
commit     733fb79b39869665addcd80ccdf1c15f4a5aaa29 (patch)
tree       b3c9c1ba32406ed9c2af4bebee820e83ea6172e4 /drivers/gpu/nvgpu/gk20a
parent     63e8592e06939e20c7b9e56b430353ebbee31ad6 (diff)
gpu: nvgpu: add support for pre-allocated resources
Add support for pre-allocation of job tracking resources with a new
(extended) ioctl. The goal is to avoid dynamic memory allocation in
the submit path. This patch does the following:

1) Introduces a new ioctl, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX,
   which enables pre-allocation of tracking resources per job:
   a) 2x priv_cmd_entry
   b) 2x gk20a_fence
2) Implements a circular ring buffer for job tracking to avoid
   lock contention between producer (submitter) and consumer
   (clean-up)

Bug 1795076

Change-Id: I6b52e5c575871107ff380f9a5790f440a6969347
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1203300
(cherry picked from commit 9fd270c22b860935dffe244753dabd87454bef39)
Reviewed-on: http://git-master/r/1223934
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
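The ring buffer in item 2 boils down to two indices into a fixed array: the producer (submit path) only ever advances `put`, the consumer (clean-up worker) only ever advances `get`. The snippet below is a minimal, single-threaded userspace model of that idea, not driver code: names such as `job_alloc`, `job_free`, and `RING_LEN` are illustrative, and the real code in channel_gk20a.c uses the kernel's CIRC_SPACE()/CIRC_CNT() helpers on `ch->joblist.pre_alloc` plus paired wmb()/rmb() barriers so the two sides can proceed without sharing a lock.

```c
/*
 * Minimal userspace model of the pre-allocated job ring described above.
 * Single-threaded sketch: the memory barriers the driver needs between
 * writing a slot and publishing the index update are omitted here.
 */
#include <stdbool.h>
#include <stdio.h>

#define RING_LEN 8                      /* corresponds to num_inflight_jobs */

struct job { int id; bool in_use; };

static struct job jobs[RING_LEN];       /* vzalloc'd array in the driver */
static unsigned int put, get;           /* producer / consumer indices */

/* one slot is kept free so that put == get always means "empty" */
static unsigned int ring_space(void)
{
	return (get + RING_LEN - put - 1) % RING_LEN;
}

static bool ring_empty(void)
{
	return put == get;
}

/* submit path: claim the slot at 'put', then publish it */
static struct job *job_alloc(int id)
{
	struct job *j;

	if (!ring_space())
		return NULL;            /* driver returns -EAGAIN here */
	j = &jobs[put];
	j->id = id;
	j->in_use = true;
	put = (put + 1) % RING_LEN;     /* driver: wmb() before this update */
	return j;
}

/* clean-up path: retire the oldest job at 'get' */
static void job_free(void)
{
	if (ring_empty())
		return;
	jobs[get].in_use = false;
	get = (get + 1) % RING_LEN;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		job_alloc(i);
	while (!ring_empty())
		job_free();
	printf("ring drained, space = %u\n", ring_space());
	return 0;
}
```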
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.c        8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c        8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c  390
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h   28
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fence_gk20a.c     70
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fence_gk20a.h     15
6 files changed, 454 insertions(+), 65 deletions(-)
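For context, a userspace caller would opt into pre-allocation through the extended ioctl roughly as follows. This is a hedged sketch: the uapi header name and the full layout of struct nvgpu_alloc_gpfifo_ex_args (beyond the num_entries, flags, and num_inflight_jobs fields visible in the diff below) are assumptions, and obtaining the channel fd is not shown.

```c
/*
 * Illustrative only: assumes the nvgpu uapi header is <linux/nvgpu.h>
 * and that ch_fd is an already-opened nvgpu channel file descriptor.
 */
#include <linux/nvgpu.h>
#include <string.h>
#include <sys/ioctl.h>

static int alloc_gpfifo_prealloc(int ch_fd)
{
	struct nvgpu_alloc_gpfifo_ex_args args;

	memset(&args, 0, sizeof(args));
	args.num_entries = 1024;        /* gpfifo size, as in the in-kernel users */
	args.flags = 0;
	args.num_inflight_jobs = 32;    /* pre-allocate tracking for 32 jobs */

	/* returns 0 on success; submits later fail with EAGAIN if the ring is full */
	return ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX, &args);
}
```

Passing num_inflight_jobs = 0 (or using the legacy NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO, which this patch routes through the same function) keeps the old dynamically allocated job list.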
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ca785b19..17453489 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1126,9 +1126,9 @@ __releases(&cde_app->mutex)
1126 struct gk20a_cde_app *cde_app = &g->cde_app; 1126 struct gk20a_cde_app *cde_app = &g->cde_app;
1127 bool channel_idle; 1127 bool channel_idle;
1128 1128
1129 spin_lock(&ch->jobs_lock); 1129 channel_gk20a_joblist_lock(ch);
1130 channel_idle = list_empty(&ch->jobs); 1130 channel_idle = channel_gk20a_joblist_is_empty(ch);
1131 spin_unlock(&ch->jobs_lock); 1131 channel_gk20a_joblist_unlock(ch);
1132 1132
1133 if (!channel_idle) 1133 if (!channel_idle)
1134 return; 1134 return;
@@ -1207,7 +1207,7 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
1207 1207
1208 /* allocate gpfifo (1024 should be more than enough) */ 1208 /* allocate gpfifo (1024 should be more than enough) */
1209 err = gk20a_alloc_channel_gpfifo(ch, 1209 err = gk20a_alloc_channel_gpfifo(ch,
1210 &(struct nvgpu_alloc_gpfifo_args){1024, 0}); 1210 &(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
1211 if (err) { 1211 if (err) {
1212 gk20a_warn(cde_ctx->dev, "cde: unable to allocate gpfifo"); 1212 gk20a_warn(cde_ctx->dev, "cde: unable to allocate gpfifo");
1213 goto err_alloc_gpfifo; 1213 goto err_alloc_gpfifo;
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 109ec240..bfd183fb 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -126,9 +126,9 @@ static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
126 bool channel_idle; 126 bool channel_idle;
127 u32 event; 127 u32 event;
128 128
129 spin_lock(&ch->jobs_lock); 129 channel_gk20a_joblist_lock(ch);
130 channel_idle = list_empty(&ch->jobs); 130 channel_idle = channel_gk20a_joblist_is_empty(ch);
131 spin_unlock(&ch->jobs_lock); 131 channel_gk20a_joblist_unlock(ch);
132 132
133 if (!channel_idle) 133 if (!channel_idle)
134 return; 134 return;
@@ -462,7 +462,7 @@ u32 gk20a_ce_create_context_with_cb(struct device *dev,
462 462
463 /* allocate gpfifo (1024 should be more than enough) */ 463 /* allocate gpfifo (1024 should be more than enough) */
464 err = gk20a_alloc_channel_gpfifo(ce_ctx->ch, 464 err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
465 &(struct nvgpu_alloc_gpfifo_args){1024, 0}); 465 &(struct nvgpu_alloc_gpfifo_ex_args){1024, 0, 0, {}});
466 if (err) { 466 if (err) {
467 gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo"); 467 gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
468 goto end; 468 goto end;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 4019721a..cc3bbbd2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -26,6 +26,7 @@
26#include <linux/anon_inodes.h> 26#include <linux/anon_inodes.h>
27#include <linux/dma-buf.h> 27#include <linux/dma-buf.h>
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/circ_buf.h>
29 30
30#include "debug_gk20a.h" 31#include "debug_gk20a.h"
31#include "ctxsw_trace_gk20a.h" 32#include "ctxsw_trace_gk20a.h"
@@ -55,6 +56,15 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
55static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c); 56static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
56static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c); 57static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
57 58
59static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c);
60
61static void channel_gk20a_joblist_add(struct channel_gk20a *c,
62 struct channel_gk20a_job *job);
63static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
64 struct channel_gk20a_job *job);
65static struct channel_gk20a_job *channel_gk20a_joblist_peek(
66 struct channel_gk20a *c);
67
58static int channel_gk20a_commit_userd(struct channel_gk20a *c); 68static int channel_gk20a_commit_userd(struct channel_gk20a *c);
59static int channel_gk20a_setup_userd(struct channel_gk20a *c); 69static int channel_gk20a_setup_userd(struct channel_gk20a *c);
60 70
@@ -460,6 +470,7 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
460{ 470{
461 struct channel_gk20a_job *job, *n; 471 struct channel_gk20a_job *job, *n;
462 bool released_job_semaphore = false; 472 bool released_job_semaphore = false;
473 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(ch);
463 474
464 gk20a_channel_cancel_job_clean_up(ch, true); 475 gk20a_channel_cancel_job_clean_up(ch, true);
465 476
@@ -471,14 +482,37 @@ void gk20a_channel_abort_clean_up(struct channel_gk20a *ch)
471 482
472 /* release all job semaphores (applies only to jobs that use 483 /* release all job semaphores (applies only to jobs that use
473 semaphore synchronization) */ 484 semaphore synchronization) */
474 spin_lock(&ch->jobs_lock); 485 channel_gk20a_joblist_lock(ch);
475 list_for_each_entry_safe(job, n, &ch->jobs, list) { 486 if (pre_alloc_enabled) {
476 if (job->post_fence->semaphore) { 487 int tmp_get = ch->joblist.pre_alloc.get;
477 gk20a_semaphore_release(job->post_fence->semaphore); 488 int put = ch->joblist.pre_alloc.put;
478 released_job_semaphore = true; 489
490 /*
491 * ensure put is read before any subsequent reads.
492 * see corresponding wmb in gk20a_channel_add_job()
493 */
494 rmb();
495
496 while (tmp_get != put) {
497 job = &ch->joblist.pre_alloc.jobs[tmp_get];
498 if (job->post_fence->semaphore) {
499 gk20a_semaphore_release(
500 job->post_fence->semaphore);
501 released_job_semaphore = true;
502 }
503 tmp_get = (tmp_get + 1) % ch->joblist.pre_alloc.length;
504 }
505 } else {
506 list_for_each_entry_safe(job, n,
507 &ch->joblist.dynamic.jobs, list) {
508 if (job->post_fence->semaphore) {
509 gk20a_semaphore_release(
510 job->post_fence->semaphore);
511 released_job_semaphore = true;
512 }
479 } 513 }
480 } 514 }
481 spin_unlock(&ch->jobs_lock); 515 channel_gk20a_joblist_unlock(ch);
482 516
483 if (released_job_semaphore) 517 if (released_job_semaphore)
484 wake_up_interruptible_all(&ch->semaphore_wq); 518 wake_up_interruptible_all(&ch->semaphore_wq);
@@ -511,9 +545,9 @@ int gk20a_wait_channel_idle(struct channel_gk20a *ch)
511 msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g)); 545 msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g));
512 546
513 do { 547 do {
514 spin_lock(&ch->jobs_lock); 548 channel_gk20a_joblist_lock(ch);
515 channel_idle = list_empty(&ch->jobs); 549 channel_idle = channel_gk20a_joblist_is_empty(ch);
516 spin_unlock(&ch->jobs_lock); 550 channel_gk20a_joblist_unlock(ch);
517 if (channel_idle) 551 if (channel_idle)
518 break; 552 break;
519 553
@@ -1016,6 +1050,10 @@ unbind:
1016 1050
1017 mutex_unlock(&g->dbg_sessions_lock); 1051 mutex_unlock(&g->dbg_sessions_lock);
1018 1052
1053 /* free pre-allocated resources, if applicable */
1054 if (channel_gk20a_is_prealloc_enabled(ch))
1055 channel_gk20a_free_prealloc_resources(ch);
1056
1019 /* make sure we catch accesses of unopened channels in case 1057 /* make sure we catch accesses of unopened channels in case
1020 * there's non-refcounted channel pointers hanging around */ 1058 * there's non-refcounted channel pointers hanging around */
1021 ch->g = NULL; 1059 ch->g = NULL;
@@ -1422,7 +1460,10 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
1422 /* we already handled q->put + size > q->size so BUG_ON this */ 1460 /* we already handled q->put + size > q->size so BUG_ON this */
1423 BUG_ON(q->put > q->size); 1461 BUG_ON(q->put > q->size);
1424 1462
1425 /* commit the previous writes before making the entry valid */ 1463 /*
1464 * commit the previous writes before making the entry valid.
1465 * see the corresponding rmb() in gk20a_free_priv_cmdbuf().
1466 */
1426 wmb(); 1467 wmb();
1427 1468
1428 e->valid = true; 1469 e->valid = true;
@@ -1436,26 +1477,222 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
1436static void free_priv_cmdbuf(struct channel_gk20a *c, 1477static void free_priv_cmdbuf(struct channel_gk20a *c,
1437 struct priv_cmd_entry *e) 1478 struct priv_cmd_entry *e)
1438{ 1479{
1439 kfree(e); 1480 if (channel_gk20a_is_prealloc_enabled(c))
1481 memset(e, 0, sizeof(struct priv_cmd_entry));
1482 else
1483 kfree(e);
1484}
1485
1486static int channel_gk20a_alloc_job(struct channel_gk20a *c,
1487 struct channel_gk20a_job **job_out)
1488{
1489 int err = 0;
1490
1491 if (channel_gk20a_is_prealloc_enabled(c)) {
1492 int put = c->joblist.pre_alloc.put;
1493 int get = c->joblist.pre_alloc.get;
1494
1495 /*
1496 * ensure all subsequent reads happen after reading get.
1497 * see corresponding wmb in gk20a_channel_clean_up_jobs()
1498 */
1499 rmb();
1500
1501 if (CIRC_SPACE(put, get, c->joblist.pre_alloc.length))
1502 *job_out = &c->joblist.pre_alloc.jobs[put];
1503 else {
1504 gk20a_warn(dev_from_gk20a(c->g),
1505 "out of job ringbuffer space\n");
1506 err = -EAGAIN;
1507 }
1508 } else {
1509 *job_out = kzalloc(sizeof(struct channel_gk20a_job),
1510 GFP_KERNEL);
1511 if (!job_out)
1512 err = -ENOMEM;
1513 }
1514
1515 return err;
1516}
1517
1518static void channel_gk20a_free_job(struct channel_gk20a *c,
1519 struct channel_gk20a_job *job)
1520{
1521 /*
1522 * In case of pre_allocated jobs, we need to clean out
1523 * the job but maintain the pointers to the priv_cmd_entry,
1524 * since they're inherently tied to the job node.
1525 */
1526 if (channel_gk20a_is_prealloc_enabled(c)) {
1527 struct priv_cmd_entry *wait_cmd = job->wait_cmd;
1528 struct priv_cmd_entry *incr_cmd = job->incr_cmd;
1529 memset(job, 0, sizeof(*job));
1530 job->wait_cmd = wait_cmd;
1531 job->incr_cmd = incr_cmd;
1532 } else
1533 kfree(job);
1534}
1535
1536void channel_gk20a_joblist_lock(struct channel_gk20a *c)
1537{
1538 if (channel_gk20a_is_prealloc_enabled(c))
1539 mutex_lock(&c->joblist.pre_alloc.read_lock);
1540 else
1541 spin_lock(&c->joblist.dynamic.lock);
1440} 1542}
1441 1543
1442static struct channel_gk20a_job *channel_gk20a_alloc_job( 1544void channel_gk20a_joblist_unlock(struct channel_gk20a *c)
1545{
1546 if (channel_gk20a_is_prealloc_enabled(c))
1547 mutex_unlock(&c->joblist.pre_alloc.read_lock);
1548 else
1549 spin_unlock(&c->joblist.dynamic.lock);
1550}
1551
1552static struct channel_gk20a_job *channel_gk20a_joblist_peek(
1443 struct channel_gk20a *c) 1553 struct channel_gk20a *c)
1444{ 1554{
1555 int get;
1445 struct channel_gk20a_job *job = NULL; 1556 struct channel_gk20a_job *job = NULL;
1446 1557
1447 job = kzalloc(sizeof(*job), GFP_KERNEL); 1558 if (channel_gk20a_is_prealloc_enabled(c)) {
1559 if (!channel_gk20a_joblist_is_empty(c)) {
1560 get = c->joblist.pre_alloc.get;
1561 job = &c->joblist.pre_alloc.jobs[get];
1562 }
1563 } else {
1564 if (!list_empty(&c->joblist.dynamic.jobs))
1565 job = list_first_entry(&c->joblist.dynamic.jobs,
1566 struct channel_gk20a_job, list);
1567 }
1568
1448 return job; 1569 return job;
1449} 1570}
1450 1571
1451static void channel_gk20a_free_job(struct channel_gk20a *c, 1572static void channel_gk20a_joblist_add(struct channel_gk20a *c,
1452 struct channel_gk20a_job *job) 1573 struct channel_gk20a_job *job)
1453{ 1574{
1454 kfree(job); 1575 if (channel_gk20a_is_prealloc_enabled(c)) {
1576 c->joblist.pre_alloc.put = (c->joblist.pre_alloc.put + 1) %
1577 (c->joblist.pre_alloc.length);
1578 } else {
1579 list_add_tail(&job->list, &c->joblist.dynamic.jobs);
1580 }
1581}
1582
1583static void channel_gk20a_joblist_delete(struct channel_gk20a *c,
1584 struct channel_gk20a_job *job)
1585{
1586 if (channel_gk20a_is_prealloc_enabled(c)) {
1587 c->joblist.pre_alloc.get = (c->joblist.pre_alloc.get + 1) %
1588 (c->joblist.pre_alloc.length);
1589 } else {
1590 list_del_init(&job->list);
1591 }
1592}
1593
1594bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c)
1595{
1596 if (channel_gk20a_is_prealloc_enabled(c)) {
1597 int get = c->joblist.pre_alloc.get;
1598 int put = c->joblist.pre_alloc.put;
1599 return !(CIRC_CNT(put, get, c->joblist.pre_alloc.length));
1600 }
1601
1602 return list_empty(&c->joblist.dynamic.jobs);
1603}
1604
1605bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c)
1606{
1607 bool pre_alloc_enabled = c->joblist.pre_alloc.enabled;
1608
1609 rmb();
1610 return pre_alloc_enabled;
1611}
1612
1613static int channel_gk20a_prealloc_resources(struct channel_gk20a *c,
1614 unsigned int num_jobs)
1615{
1616 int i, err;
1617 size_t size;
1618 struct priv_cmd_entry *entries = NULL;
1619
1620 if (channel_gk20a_is_prealloc_enabled(c) || !num_jobs)
1621 return -EINVAL;
1622
1623 /*
1624 * pre-allocate the job list.
1625 * since vmalloc take in an unsigned long, we need
1626 * to make sure we don't hit an overflow condition
1627 */
1628 size = sizeof(struct channel_gk20a_job);
1629 if (num_jobs <= ULONG_MAX / size)
1630 c->joblist.pre_alloc.jobs = vzalloc(num_jobs * size);
1631 if (!c->joblist.pre_alloc.jobs) {
1632 err = -ENOMEM;
1633 goto clean_up;
1634 }
1635
1636 /*
1637 * pre-allocate 2x priv_cmd_entry for each job up front.
1638 * since vmalloc take in an unsigned long, we need
1639 * to make sure we don't hit an overflow condition
1640 */
1641 size = sizeof(struct priv_cmd_entry);
1642 if (num_jobs <= ULONG_MAX / (size << 1))
1643 entries = vzalloc((num_jobs << 1) * size);
1644 if (!entries) {
1645 err = -ENOMEM;
1646 goto clean_up_joblist;
1647 }
1648
1649 for (i = 0; i < num_jobs; i++) {
1650 c->joblist.pre_alloc.jobs[i].wait_cmd = &entries[i];
1651 c->joblist.pre_alloc.jobs[i].incr_cmd =
1652 &entries[i + num_jobs];
1653 }
1654
1655 /* pre-allocate a fence pool */
1656 err = gk20a_alloc_fence_pool(c, num_jobs);
1657 if (err)
1658 goto clean_up_priv_cmd;
1659
1660 c->joblist.pre_alloc.length = num_jobs;
1661
1662 /*
1663 * commit the previous writes before setting the flag.
1664 * see corresponding rmb in channel_gk20a_is_prealloc_enabled()
1665 */
1666 wmb();
1667 c->joblist.pre_alloc.enabled = true;
1668
1669 return 0;
1670
1671clean_up_priv_cmd:
1672 vfree(entries);
1673clean_up_joblist:
1674 vfree(c->joblist.pre_alloc.jobs);
1675clean_up:
1676 memset(&c->joblist.pre_alloc, 0, sizeof(c->joblist.pre_alloc));
1677 return err;
1678}
1679
1680static void channel_gk20a_free_prealloc_resources(struct channel_gk20a *c)
1681{
1682 vfree(c->joblist.pre_alloc.jobs[0].wait_cmd);
1683 vfree(c->joblist.pre_alloc.jobs);
1684 gk20a_free_fence_pool(c);
1685
1686 /*
1687 * commit the previous writes before disabling the flag.
1688 * see corresponding rmb in channel_gk20a_is_prealloc_enabled()
1689 */
1690 wmb();
1691 c->joblist.pre_alloc.enabled = false;
1455} 1692}
1456 1693
1457int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, 1694int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
1458 struct nvgpu_alloc_gpfifo_args *args) 1695 struct nvgpu_alloc_gpfifo_ex_args *args)
1459{ 1696{
1460 struct gk20a *g = c->g; 1697 struct gk20a *g = c->g;
1461 struct device *d = dev_from_gk20a(g); 1698 struct device *d = dev_from_gk20a(g);
@@ -1539,19 +1776,30 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
1539 1776
1540 /* TBD: setup engine contexts */ 1777 /* TBD: setup engine contexts */
1541 1778
1779 if (args->num_inflight_jobs) {
1780 err = channel_gk20a_prealloc_resources(c,
1781 args->num_inflight_jobs);
1782 if (err)
1783 goto clean_up_sync;
1784 }
1785
1542 err = channel_gk20a_alloc_priv_cmdbuf(c); 1786 err = channel_gk20a_alloc_priv_cmdbuf(c);
1543 if (err) 1787 if (err)
1544 goto clean_up_sync; 1788 goto clean_up_prealloc;
1545 1789
1546 err = channel_gk20a_update_runlist(c, true); 1790 err = channel_gk20a_update_runlist(c, true);
1547 if (err) 1791 if (err)
1548 goto clean_up_sync; 1792 goto clean_up_priv_cmd;
1549 1793
1550 g->ops.fifo.bind_channel(c); 1794 g->ops.fifo.bind_channel(c);
1551 1795
1552 gk20a_dbg_fn("done"); 1796 gk20a_dbg_fn("done");
1553 return 0; 1797 return 0;
1554 1798
1799clean_up_priv_cmd:
1800 channel_gk20a_free_priv_cmdbuf(c);
1801clean_up_prealloc:
1802 channel_gk20a_free_prealloc_resources(c);
1555clean_up_sync: 1803clean_up_sync:
1556 gk20a_channel_sync_destroy(c->sync); 1804 gk20a_channel_sync_destroy(c->sync);
1557 c->sync = NULL; 1805 c->sync = NULL;
@@ -1878,6 +2126,7 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
1878 struct vm_gk20a *vm = c->vm; 2126 struct vm_gk20a *vm = c->vm;
1879 struct mapped_buffer_node **mapped_buffers = NULL; 2127 struct mapped_buffer_node **mapped_buffers = NULL;
1880 int err = 0, num_mapped_buffers = 0; 2128 int err = 0, num_mapped_buffers = 0;
2129 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
1881 2130
1882 /* job needs reference to this vm (released in channel_update) */ 2131 /* job needs reference to this vm (released in channel_update) */
1883 gk20a_vm_get(vm); 2132 gk20a_vm_get(vm);
@@ -1898,9 +2147,19 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
1898 2147
1899 gk20a_channel_timeout_start(c, job); 2148 gk20a_channel_timeout_start(c, job);
1900 2149
1901 spin_lock(&c->jobs_lock); 2150 if (!pre_alloc_enabled)
1902 list_add_tail(&job->list, &c->jobs); 2151 channel_gk20a_joblist_lock(c);
1903 spin_unlock(&c->jobs_lock); 2152
2153 /*
2154 * ensure all pending write complete before adding to the list.
2155 * see corresponding rmb in gk20a_channel_clean_up_jobs() &
2156 * gk20a_channel_abort_clean_up()
2157 */
2158 wmb();
2159 channel_gk20a_joblist_add(c, job);
2160
2161 if (!pre_alloc_enabled)
2162 channel_gk20a_joblist_unlock(c);
1904 } else { 2163 } else {
1905 err = -ETIMEDOUT; 2164 err = -ETIMEDOUT;
1906 goto err_put_buffers; 2165 goto err_put_buffers;
@@ -1945,14 +2204,20 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
1945 while (1) { 2204 while (1) {
1946 bool completed; 2205 bool completed;
1947 2206
1948 spin_lock(&c->jobs_lock); 2207 channel_gk20a_joblist_lock(c);
1949 if (list_empty(&c->jobs)) { 2208 if (channel_gk20a_joblist_is_empty(c)) {
1950 spin_unlock(&c->jobs_lock); 2209 channel_gk20a_joblist_unlock(c);
1951 break; 2210 break;
1952 } 2211 }
1953 job = list_first_entry(&c->jobs, 2212
1954 struct channel_gk20a_job, list); 2213 /*
1955 spin_unlock(&c->jobs_lock); 2214 * ensure that all subsequent reads occur after checking
2215 * that we have a valid node. see corresponding wmb in
2216 * gk20a_channel_add_job().
2217 */
2218 rmb();
2219 job = channel_gk20a_joblist_peek(c);
2220 channel_gk20a_joblist_unlock(c);
1956 2221
1957 completed = gk20a_fence_is_expired(job->post_fence); 2222 completed = gk20a_fence_is_expired(job->post_fence);
1958 if (!completed) { 2223 if (!completed) {
@@ -1998,9 +2263,14 @@ static void gk20a_channel_clean_up_jobs(struct work_struct *work)
1998 * so this wouldn't get freed here. */ 2263 * so this wouldn't get freed here. */
1999 gk20a_channel_put(c); 2264 gk20a_channel_put(c);
2000 2265
2001 spin_lock(&c->jobs_lock); 2266 /*
2002 list_del_init(&job->list); 2267 * ensure all pending writes complete before deleting the node.
2003 spin_unlock(&c->jobs_lock); 2268 * see corresponding rmb in channel_gk20a_alloc_job().
2269 */
2270 wmb();
2271 channel_gk20a_joblist_lock(c);
2272 channel_gk20a_joblist_delete(c, job);
2273 channel_gk20a_joblist_unlock(c);
2004 2274
2005 channel_gk20a_free_job(c, job); 2275 channel_gk20a_free_job(c, job);
2006 job_finished = 1; 2276 job_finished = 1;
@@ -2160,6 +2430,7 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
2160 int wait_fence_fd = -1; 2430 int wait_fence_fd = -1;
2161 int err = 0; 2431 int err = 0;
2162 bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI); 2432 bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
2433 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
2163 2434
2164 /* 2435 /*
2165 * If user wants to always allocate sync_fence_fds then respect that; 2436 * If user wants to always allocate sync_fence_fds then respect that;
@@ -2197,9 +2468,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
2197 * this condition. 2468 * this condition.
2198 */ 2469 */
2199 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) { 2470 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
2200 job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
2201 GFP_KERNEL);
2202 job->pre_fence = gk20a_alloc_fence(c); 2471 job->pre_fence = gk20a_alloc_fence(c);
2472 if (!pre_alloc_enabled)
2473 job->wait_cmd = kzalloc(sizeof(struct priv_cmd_entry),
2474 GFP_KERNEL);
2203 2475
2204 if (!job->wait_cmd || !job->pre_fence) { 2476 if (!job->wait_cmd || !job->pre_fence) {
2205 err = -ENOMEM; 2477 err = -ENOMEM;
@@ -2233,8 +2505,10 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
2233 * is used to keep track of method completion for idle railgating. The 2505 * is used to keep track of method completion for idle railgating. The
2234 * sync_pt/semaphore PB is added to the GPFIFO later on in submit. 2506 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
2235 */ 2507 */
2236 job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
2237 job->post_fence = gk20a_alloc_fence(c); 2508 job->post_fence = gk20a_alloc_fence(c);
2509 if (!pre_alloc_enabled)
2510 job->incr_cmd = kzalloc(sizeof(struct priv_cmd_entry),
2511 GFP_KERNEL);
2238 2512
2239 if (!job->incr_cmd || !job->post_fence) { 2513 if (!job->incr_cmd || !job->post_fence) {
2240 err = -ENOMEM; 2514 err = -ENOMEM;
@@ -2256,15 +2530,17 @@ static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
2256 return 0; 2530 return 0;
2257 2531
2258clean_up_post_fence: 2532clean_up_post_fence:
2259 gk20a_free_priv_cmdbuf(c, job->incr_cmd);
2260 gk20a_fence_put(job->post_fence); 2533 gk20a_fence_put(job->post_fence);
2261 job->incr_cmd = NULL;
2262 job->post_fence = NULL; 2534 job->post_fence = NULL;
2535 free_priv_cmdbuf(c, job->incr_cmd);
2536 if (!pre_alloc_enabled)
2537 job->incr_cmd = NULL;
2263clean_up_pre_fence: 2538clean_up_pre_fence:
2264 gk20a_free_priv_cmdbuf(c, job->wait_cmd);
2265 gk20a_fence_put(job->pre_fence); 2539 gk20a_fence_put(job->pre_fence);
2266 job->wait_cmd = NULL;
2267 job->pre_fence = NULL; 2540 job->pre_fence = NULL;
2541 free_priv_cmdbuf(c, job->wait_cmd);
2542 if (!pre_alloc_enabled)
2543 job->wait_cmd = NULL;
2268 *wait_cmd = NULL; 2544 *wait_cmd = NULL;
2269 *pre_fence = NULL; 2545 *pre_fence = NULL;
2270fail: 2546fail:
@@ -2388,11 +2664,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
2388 } 2664 }
2389 2665
2390 if (need_job_tracking) { 2666 if (need_job_tracking) {
2391 job = channel_gk20a_alloc_job(c); 2667 err = channel_gk20a_alloc_job(c, &job);
2392 if (!job) { 2668 if (err)
2393 err = -ENOMEM;
2394 goto clean_up; 2669 goto clean_up;
2395 }
2396 2670
2397 err = gk20a_submit_prepare_syncs(c, fence, job, 2671 err = gk20a_submit_prepare_syncs(c, fence, job,
2398 &wait_cmd, &incr_cmd, 2672 &wait_cmd, &incr_cmd,
@@ -2463,13 +2737,14 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
2463 init_waitqueue_head(&c->ref_count_dec_wq); 2737 init_waitqueue_head(&c->ref_count_dec_wq);
2464 mutex_init(&c->ioctl_lock); 2738 mutex_init(&c->ioctl_lock);
2465 mutex_init(&c->error_notifier_mutex); 2739 mutex_init(&c->error_notifier_mutex);
2466 spin_lock_init(&c->jobs_lock); 2740 spin_lock_init(&c->joblist.dynamic.lock);
2741 mutex_init(&c->joblist.pre_alloc.read_lock);
2467 raw_spin_lock_init(&c->timeout.lock); 2742 raw_spin_lock_init(&c->timeout.lock);
2468 mutex_init(&c->sync_lock); 2743 mutex_init(&c->sync_lock);
2469 INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler); 2744 INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
2470 INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_jobs); 2745 INIT_DELAYED_WORK(&c->clean_up.wq, gk20a_channel_clean_up_jobs);
2471 mutex_init(&c->clean_up.lock); 2746 mutex_init(&c->clean_up.lock);
2472 INIT_LIST_HEAD(&c->jobs); 2747 INIT_LIST_HEAD(&c->joblist.dynamic.jobs);
2473#if defined(CONFIG_GK20A_CYCLE_STATS) 2748#if defined(CONFIG_GK20A_CYCLE_STATS)
2474 mutex_init(&c->cyclestate.cyclestate_buffer_mutex); 2749 mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
2475 mutex_init(&c->cs_client_mutex); 2750 mutex_init(&c->cs_client_mutex);
@@ -3119,7 +3394,7 @@ long gk20a_channel_ioctl(struct file *filp,
3119 (struct nvgpu_free_obj_ctx_args *)buf); 3394 (struct nvgpu_free_obj_ctx_args *)buf);
3120 gk20a_idle(dev); 3395 gk20a_idle(dev);
3121 break; 3396 break;
3122 case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO: 3397 case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO_EX:
3123 err = gk20a_busy(dev); 3398 err = gk20a_busy(dev);
3124 if (err) { 3399 if (err) {
3125 dev_err(dev, 3400 dev_err(dev,
@@ -3128,9 +3403,34 @@ long gk20a_channel_ioctl(struct file *filp,
3128 break; 3403 break;
3129 } 3404 }
3130 err = gk20a_alloc_channel_gpfifo(ch, 3405 err = gk20a_alloc_channel_gpfifo(ch,
3131 (struct nvgpu_alloc_gpfifo_args *)buf); 3406 (struct nvgpu_alloc_gpfifo_ex_args *)buf);
3407 gk20a_idle(dev);
3408 break;
3409 case NVGPU_IOCTL_CHANNEL_ALLOC_GPFIFO:
3410 {
3411 struct nvgpu_alloc_gpfifo_ex_args alloc_gpfifo_ex_args;
3412 struct nvgpu_alloc_gpfifo_args *alloc_gpfifo_args =
3413 (struct nvgpu_alloc_gpfifo_args *)buf;
3414
3415 err = gk20a_busy(dev);
3416 if (err) {
3417 dev_err(dev,
3418 "%s: failed to host gk20a for ioctl cmd: 0x%x",
3419 __func__, cmd);
3420 break;
3421 }
3422
3423 /* prepare new args structure */
3424 memset(&alloc_gpfifo_ex_args, 0,
3425 sizeof(struct nvgpu_alloc_gpfifo_ex_args));
3426 alloc_gpfifo_ex_args.num_entries =
3427 alloc_gpfifo_args->num_entries;
3428 alloc_gpfifo_ex_args.flags = alloc_gpfifo_args->flags;
3429
3430 err = gk20a_alloc_channel_gpfifo(ch, &alloc_gpfifo_ex_args);
3132 gk20a_idle(dev); 3431 gk20a_idle(dev);
3133 break; 3432 break;
3433 }
3134 case NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO: 3434 case NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO:
3135 err = gk20a_ioctl_channel_submit_gpfifo(ch, 3435 err = gk20a_ioctl_channel_submit_gpfifo(ch,
3136 (struct nvgpu_submit_gpfifo_args *)buf); 3436 (struct nvgpu_submit_gpfifo_args *)buf);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 0d8746b8..8cceb6b2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -70,6 +70,22 @@ struct channel_gk20a_job {
70 struct list_head list; 70 struct list_head list;
71}; 71};
72 72
73struct channel_gk20a_joblist {
74 struct {
75 bool enabled;
76 unsigned int length;
77 unsigned int put;
78 unsigned int get;
79 struct channel_gk20a_job *jobs;
80 struct mutex read_lock;
81 } pre_alloc;
82
83 struct {
84 struct list_head jobs;
85 spinlock_t lock;
86 } dynamic;
87};
88
73struct channel_gk20a_timeout { 89struct channel_gk20a_timeout {
74 struct delayed_work wq; 90 struct delayed_work wq;
75 raw_spinlock_t lock; 91 raw_spinlock_t lock;
@@ -115,6 +131,7 @@ struct channel_gk20a {
115 bool bound; 131 bool bound;
116 bool first_init; 132 bool first_init;
117 bool vpr; 133 bool vpr;
134 bool no_block;
118 bool cde; 135 bool cde;
119 pid_t pid; 136 pid_t pid;
120 pid_t tgid; 137 pid_t tgid;
@@ -123,8 +140,8 @@ struct channel_gk20a {
123 int tsgid; 140 int tsgid;
124 struct list_head ch_entry; /* channel's entry in TSG */ 141 struct list_head ch_entry; /* channel's entry in TSG */
125 142
126 struct list_head jobs; 143 struct channel_gk20a_joblist joblist;
127 spinlock_t jobs_lock; 144 struct gk20a_allocator fence_allocator;
128 145
129 struct vm_gk20a *vm; 146 struct vm_gk20a *vm;
130 147
@@ -272,7 +289,7 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
272 bool force_need_sync_fence); 289 bool force_need_sync_fence);
273 290
274int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, 291int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
275 struct nvgpu_alloc_gpfifo_args *args); 292 struct nvgpu_alloc_gpfifo_ex_args *args);
276 293
277void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a); 294void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
278void channel_gk20a_disable(struct channel_gk20a *ch); 295void channel_gk20a_disable(struct channel_gk20a *ch);
@@ -284,6 +301,11 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
284void channel_gk20a_enable(struct channel_gk20a *ch); 301void channel_gk20a_enable(struct channel_gk20a *ch);
285void gk20a_channel_timeout_restart_all_channels(struct gk20a *g); 302void gk20a_channel_timeout_restart_all_channels(struct gk20a *g);
286 303
304bool channel_gk20a_is_prealloc_enabled(struct channel_gk20a *c);
305void channel_gk20a_joblist_lock(struct channel_gk20a *c);
306void channel_gk20a_joblist_unlock(struct channel_gk20a *c);
307bool channel_gk20a_joblist_is_empty(struct channel_gk20a *c);
308
287int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, 309int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
288 int timeslice_period, 310 int timeslice_period,
289 int *__timeslice_timeout, int *__timeslice_scale); 311 int *__timeslice_timeout, int *__timeslice_scale);
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index f788829f..c11d363e 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -47,7 +47,12 @@ static void gk20a_fence_free(struct kref *ref)
47#endif 47#endif
48 if (f->semaphore) 48 if (f->semaphore)
49 gk20a_semaphore_put(f->semaphore); 49 gk20a_semaphore_put(f->semaphore);
50 kfree(f); 50
51 if (f->allocator) {
52 if (gk20a_alloc_initialized(f->allocator))
53 gk20a_free(f->allocator, (u64)f);
54 } else
55 kfree(f);
51} 56}
52 57
53void gk20a_fence_put(struct gk20a_fence *f) 58void gk20a_fence_put(struct gk20a_fence *f)
@@ -109,15 +114,66 @@ int gk20a_fence_install_fd(struct gk20a_fence *f)
109#endif 114#endif
110} 115}
111 116
112struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c) 117int gk20a_alloc_fence_pool(struct channel_gk20a *c, int count)
118{
119 int err;
120 size_t size;
121 struct gk20a_fence *fence_pool = NULL;
122
123 size = sizeof(struct gk20a_fence);
124 if (count <= ULONG_MAX / size) {
125 size = count * size;
126 fence_pool = vzalloc(size);
127 }
128
129 if (!fence_pool)
130 return -ENOMEM;
131
132 err = gk20a_lockless_allocator_init(&c->fence_allocator,
133 "fence_pool", (u64)fence_pool, size,
134 sizeof(struct gk20a_fence), 0);
135 if (err)
136 goto fail;
137
138 return 0;
139
140fail:
141 vfree(fence_pool);
142 return err;
143}
144
145void gk20a_free_fence_pool(struct channel_gk20a *c)
113{ 146{
114 struct gk20a_fence *fence; 147 if (gk20a_alloc_initialized(&c->fence_allocator)) {
148 void *base = (void *)gk20a_alloc_base(&c->fence_allocator);
149
150 gk20a_alloc_destroy(&c->fence_allocator);
151 vfree(base);
152 }
153}
115 154
116 fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL); 155struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
117 if (!fence) 156{
118 return NULL; 157 struct gk20a_fence *fence = NULL;
158
159 if (channel_gk20a_is_prealloc_enabled(c)) {
160 if (gk20a_alloc_initialized(&c->fence_allocator)) {
161 fence = (struct gk20a_fence *)
162 gk20a_alloc(&c->fence_allocator,
163 sizeof(struct gk20a_fence));
164
165 /* clear the node and reset the allocator pointer */
166 if (fence) {
167 memset(fence, 0, sizeof(*fence));
168 fence->allocator = &c->fence_allocator;
169 }
170 }
171 } else
172 fence = kzalloc(sizeof(struct gk20a_fence), GFP_KERNEL);
173
174 if (fence)
175 kref_init(&fence->ref);
119 176
120 kref_init(&fence->ref);
121 return fence; 177 return fence;
122} 178}
123 179
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
index 3fe2d8b2..97a7d957 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Fences 4 * GK20A Fences
5 * 5 *
6 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -45,6 +45,9 @@ struct gk20a_fence {
45 struct platform_device *host1x_pdev; 45 struct platform_device *host1x_pdev;
46 u32 syncpt_id; 46 u32 syncpt_id;
47 u32 syncpt_value; 47 u32 syncpt_value;
48
49 /* Valid for fences part of a pre-allocated fence pool */
50 struct gk20a_allocator *allocator;
48}; 51};
49 52
50/* Fences can be created from semaphores or syncpoint (id, value) pairs */ 53/* Fences can be created from semaphores or syncpoint (id, value) pairs */
@@ -62,7 +65,15 @@ int gk20a_fence_from_syncpt(
62 u32 id, u32 value, bool wfi, 65 u32 id, u32 value, bool wfi,
63 bool need_sync_fence); 66 bool need_sync_fence);
64 67
65struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c); 68int gk20a_alloc_fence_pool(
69 struct channel_gk20a *c,
70 int size);
71
72void gk20a_free_fence_pool(
73 struct channel_gk20a *c);
74
75struct gk20a_fence *gk20a_alloc_fence(
76 struct channel_gk20a *c);
66 77
67void gk20a_init_fence(struct gk20a_fence *f, 78void gk20a_init_fence(struct gk20a_fence *f,
68 const struct gk20a_fence_ops *ops, 79 const struct gk20a_fence_ops *ops,