author    Deepak Nibade <dnibade@nvidia.com>    2015-08-31 05:00:35 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>    2015-09-28 12:08:12 -0400
commit    613990cb391c74436384d63d12240221565011d5 (patch)
tree      27d7cd19bd84a6ce50fb579c5f6a08ada28ba5b7 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c
parent    cb8c102131ec96767e01981dc9a9d26e30593a70 (diff)
gpu: nvgpu: implement per-channel watchdog
Implement per-channel watchdog/timer as per below rules:
- start the timer while submitting first job on channel or if no timer is
  already running
- cancel the timer when job completes
- re-start the timer if there is any incomplete job left in the channel's queue
- trigger appropriate recovery method as part of timeout handling mechanism

Handle the timeout as per below:
- get timed out channel and job data
- disable activity on all engines
- check if fence is really pending
- get information on failing engine
- if no engine is failing, just abort the channel
- if engine is failing, trigger the recovery

Also, add flag "ch_wdt_enabled" to enable/disable channel watchdog mechanism.
Watchdog can also be disabled using global flag "timeouts_enabled".

Set the watchdog time to be 5s using macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS.

Bug 200133289

Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
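The rules above map directly onto the kernel's delayed-work API. The fragment below is a minimal, illustrative sketch of that arm/re-arm/cancel pattern using hypothetical "toy_wdt" names; it is not nvgpu code, and the handler/initialization side of the pattern is sketched after the diff. In the patch itself the equivalent state lives in ch->timeout (lock, wq, job, initialized), the arm path is gk20a_channel_timeout_start() and the cancel path is gk20a_channel_timeout_stop().

/* Illustrative sketch only: arm/re-arm/cancel of a per-object watchdog
 * built on a delayed work item (hypothetical names, not nvgpu code). */
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/jiffies.h>
#include <linux/types.h>

#define TOY_WDT_TIMEOUT_MS	5000	/* mirrors the 5 s default above */

struct toy_wdt {
	struct delayed_work wq;
	struct mutex lock;
	bool armed;		/* true while the timer is running */
};

/* Arm the watchdog if it is not already running: called when the first
 * job is submitted, and again when a completion leaves jobs pending. */
static void toy_wdt_start(struct toy_wdt *wdt)
{
	mutex_lock(&wdt->lock);
	if (!wdt->armed) {
		wdt->armed = true;
		schedule_delayed_work(&wdt->wq,
				      msecs_to_jiffies(TOY_WDT_TIMEOUT_MS));
	}
	mutex_unlock(&wdt->lock);
}

/* Cancel the watchdog once the job queue drains; cancel_delayed_work_sync()
 * also waits for a handler that is already running to finish. */
static void toy_wdt_stop(struct toy_wdt *wdt)
{
	bool was_armed;

	mutex_lock(&wdt->lock);
	was_armed = wdt->armed;
	wdt->armed = false;
	mutex_unlock(&wdt->lock);

	if (was_armed)
		cancel_delayed_work_sync(&wdt->wq);
}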
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c  124
1 file changed, 123 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index c18a4e5d..2dc8e9a0 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
 		ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
+static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
+{
+	if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled)
+		return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS;
+	else
+		return (u32)MAX_SCHEDULE_TIMEOUT;
+}
+
 static u32 get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 	}
 }
 
+static void gk20a_channel_timeout_start(struct channel_gk20a *ch,
+		struct channel_gk20a_job *job)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.job = job;
+	ch->timeout.initialized = true;
+	schedule_delayed_work(&ch->timeout.wq,
+		msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch)));
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+{
+	mutex_lock(&ch->timeout.lock);
+
+	if (!ch->timeout.initialized) {
+		mutex_unlock(&ch->timeout.lock);
+		return;
+	}
+
+	ch->timeout.initialized = false;
+	cancel_delayed_work_sync(&ch->timeout.wq);
+
+	mutex_unlock(&ch->timeout.lock);
+}
+
+static void gk20a_channel_timeout_handler(struct work_struct *work)
+{
+	struct channel_gk20a_job *job;
+	struct gk20a *g;
+	struct channel_gk20a *ch;
+	struct channel_gk20a *failing_ch;
+	u32 engine_id;
+	int id = -1;
+	bool is_tsg = false;
+
+	ch = container_of(to_delayed_work(work), struct channel_gk20a,
+		timeout.wq);
+	ch = gk20a_channel_get(ch);
+	if (!ch)
+		return;
+
+	g = ch->g;
+
+	/* Need global lock since multiple channels can timeout at a time */
+	mutex_lock(&g->ch_wdt_lock);
+
+	/* Get timed out job and reset the timer */
+	mutex_lock(&ch->timeout.lock);
+	job = ch->timeout.job;
+	ch->timeout.initialized = false;
+	mutex_unlock(&ch->timeout.lock);
+
+	if (gk20a_fifo_disable_all_engine_activity(g, true))
+		goto fail_unlock;
+
+	if (gk20a_fence_is_expired(job->post_fence))
+		goto fail_enable_engine_activity;
+
+	gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
+		ch->hw_chid);
+
+	/* Get failing engine data */
+	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
+
+	if (engine_id >= g->fifo.max_engines) {
+		/* If no failing engine, abort the channels */
+		if (gk20a_is_channel_marked_as_tsg(ch)) {
+			struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
+
+			gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+			gk20a_fifo_abort_tsg(g, ch->tsgid);
+		} else {
+			gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+			gk20a_channel_abort(ch);
+		}
+	} else {
+		/* If failing engine, trigger recovery */
+		failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
+		if (!failing_ch)
+			goto fail_enable_engine_activity;
+
+		if (failing_ch->hw_chid != ch->hw_chid)
+			gk20a_channel_timeout_start(ch, job);
+
+		gk20a_fifo_recover(g, BIT(engine_id),
+			failing_ch->hw_chid, is_tsg,
+			true, failing_ch->timeout_debug_dump);
+
+		gk20a_channel_put(failing_ch);
+	}
+
+fail_enable_engine_activity:
+	gk20a_fifo_enable_all_engine_activity(g);
+fail_unlock:
+	mutex_unlock(&g->ch_wdt_lock);
+	gk20a_channel_put(ch);
+}
+
 static int gk20a_channel_add_job(struct channel_gk20a *c,
 		struct gk20a_fence *pre_fence,
 		struct gk20a_fence *post_fence)
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
 	job->pre_fence = gk20a_fence_get(pre_fence);
 	job->post_fence = gk20a_fence_get(post_fence);
 
+	gk20a_channel_timeout_start(c, job);
+
 	mutex_lock(&c->jobs_lock);
 	list_add_tail(&job->list, &c->jobs);
 	mutex_unlock(&c->jobs_lock);
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 		struct gk20a *g = c->g;
 
 		bool completed = gk20a_fence_is_expired(job->post_fence);
-		if (!completed)
+		if (!completed) {
+			gk20a_channel_timeout_start(c, job);
 			break;
+		}
+
+		gk20a_channel_timeout_stop(c);
 
 		if (c->sync)
 			c->sync->signal_timeline(c->sync);
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
 	mutex_init(&c->ioctl_lock);
 	mutex_init(&c->jobs_lock);
 	mutex_init(&c->submit_lock);
+	mutex_init(&c->timeout.lock);
+	INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
 	INIT_LIST_HEAD(&c->jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
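
For readers unfamiliar with the to_delayed_work()/container_of() idiom that gk20a_channel_timeout_handler() uses to get back from the work_struct to its channel, the sketch below shows the same pattern in isolation, with a hypothetical toy_channel type rather than the driver's structures.

/* Illustrative sketch: recovering the embedding object inside a delayed-work
 * handler (hypothetical names, not nvgpu code). */
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct toy_channel {
	int hw_chid;
	struct delayed_work timeout_wq;	/* embedded work item */
};

static void toy_timeout_handler(struct work_struct *work)
{
	/* to_delayed_work() maps the work_struct to its delayed_work wrapper;
	 * container_of() then maps the embedded member back to toy_channel. */
	struct toy_channel *ch = container_of(to_delayed_work(work),
					      struct toy_channel, timeout_wq);

	pr_warn("channel %d watchdog fired\n", ch->hw_chid);
}

static void toy_channel_init(struct toy_channel *ch, int chid)
{
	ch->hw_chid = chid;
	INIT_DELAYED_WORK(&ch->timeout_wq, toy_timeout_handler);
}

In the patch, the corresponding initialization is the mutex_init(&c->timeout.lock) and INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler) calls added to gk20a_init_channel_support() in the last hunk above.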