diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2015-08-31 05:00:35 -0400 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-09-28 12:08:12 -0400 |
commit | 613990cb391c74436384d63d12240221565011d5 (patch) | |
tree | 27d7cd19bd84a6ce50fb579c5f6a08ada28ba5b7 /drivers/gpu/nvgpu | |
parent | cb8c102131ec96767e01981dc9a9d26e30593a70 (diff) |
gpu: nvgpu: implement per-channel watchdog
Implement per-channel watchdog/timer as per the rules below:
- start the timer while submitting first job on channel or if
no timer is already running
- cancel the timer when job completes
- re-start the timer if there is any incomplete job left
in the channel's queue
- trigger appropriate recovery method as part of timeout
handling mechanism
Handle the timeout as follows:
- get timed out channel, and job data
- disable activity on all engines
- check if fence is really pending
- get information on failing engine
- if no engine is failing, just abort the channel
- if engine is failing, trigger the recovery
Also, add flag "ch_wdt_enabled" to enable/disable channel
watchdog mechanism. Watchdog can also be disabled using
global flag "timeouts_enabled"
Set the watchdog time to be 5s using macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS
Bug 200133289
Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 124 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 11 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 6 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.h | 3 |
6 files changed, 147 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index c18a4e5d..2dc8e9a0 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | |||
1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; | 1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) | ||
1476 | { | ||
1477 | if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled) | ||
1478 | return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS; | ||
1479 | else | ||
1480 | return (u32)MAX_SCHEDULE_TIMEOUT; | ||
1481 | } | ||
1482 | |||
1475 | static u32 get_gp_free_count(struct channel_gk20a *c) | 1483 | static u32 get_gp_free_count(struct channel_gk20a *c) |
1476 | { | 1484 | { |
1477 | update_gp_get(c->g, c); | 1485 | update_gp_get(c->g, c); |
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, | |||
1527 | } | 1535 | } |
1528 | } | 1536 | } |
1529 | 1537 | ||
1538 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch, | ||
1539 | struct channel_gk20a_job *job) | ||
1540 | { | ||
1541 | mutex_lock(&ch->timeout.lock); | ||
1542 | |||
1543 | if (ch->timeout.initialized) { | ||
1544 | mutex_unlock(&ch->timeout.lock); | ||
1545 | return; | ||
1546 | } | ||
1547 | |||
1548 | ch->timeout.job = job; | ||
1549 | ch->timeout.initialized = true; | ||
1550 | schedule_delayed_work(&ch->timeout.wq, | ||
1551 | msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch))); | ||
1552 | |||
1553 | mutex_unlock(&ch->timeout.lock); | ||
1554 | } | ||
1555 | |||
1556 | static void gk20a_channel_timeout_stop(struct channel_gk20a *ch) | ||
1557 | { | ||
1558 | mutex_lock(&ch->timeout.lock); | ||
1559 | |||
1560 | if (!ch->timeout.initialized) { | ||
1561 | mutex_unlock(&ch->timeout.lock); | ||
1562 | return; | ||
1563 | } | ||
1564 | |||
1565 | ch->timeout.initialized = false; | ||
1566 | cancel_delayed_work_sync(&ch->timeout.wq); | ||
1567 | |||
1568 | mutex_unlock(&ch->timeout.lock); | ||
1569 | } | ||
1570 | |||
/*
 * Watchdog expiry handler (delayed-work callback) for a channel whose
 * tracked job did not complete within the watchdog period.
 *
 * Recovery sequence:
 *  1. take the global ch_wdt_lock (several channels may time out at once),
 *  2. fetch the timed-out job and clear the armed flag,
 *  3. disable activity on all engines,
 *  4. bail out silently if the job's post-fence actually expired,
 *  5. query the failing engine; if none, MMU-error + abort this channel
 *     (or its whole TSG), otherwise trigger full recovery on the failing
 *     channel.
 */
static void gk20a_channel_timeout_handler(struct work_struct *work)
{
	struct channel_gk20a_job *job;
	struct gk20a *g;
	struct channel_gk20a *ch;
	struct channel_gk20a *failing_ch;
	u32 engine_id;
	int id = -1;
	bool is_tsg = false;

	/* Recover the channel from the embedded delayed_work, then take a
	 * reference so it cannot be freed while we handle the timeout. */
	ch = container_of(to_delayed_work(work), struct channel_gk20a,
		timeout.wq);
	ch = gk20a_channel_get(ch);
	if (!ch)
		return;

	g = ch->g;

	/* Need global lock since multiple channels can timeout at a time */
	mutex_lock(&g->ch_wdt_lock);

	/* Get timed out job and reset the timer */
	mutex_lock(&ch->timeout.lock);
	job = ch->timeout.job;
	ch->timeout.initialized = false;
	mutex_unlock(&ch->timeout.lock);
	/* NOTE(review): job is used below without a reference of its own —
	 * presumably it cannot be freed while the watchdog is armed; confirm
	 * against the job-cleanup path in gk20a_channel_update(). */

	if (gk20a_fifo_disable_all_engine_activity(g, true))
		goto fail_unlock;

	/* False alarm: the job completed between expiry and now. */
	if (gk20a_fence_is_expired(job->post_fence))
		goto fail_enable_engine_activity;

	gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
		ch->hw_chid);

	/* Get failing engine data */
	engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);

	if (engine_id >= g->fifo.max_engines) {
		/* If no failing engine, abort the channels */
		if (gk20a_is_channel_marked_as_tsg(ch)) {
			struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];

			gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
			gk20a_fifo_abort_tsg(g, ch->tsgid);
		} else {
			gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
			gk20a_channel_abort(ch);
		}
	} else {
		/* If failing engine, trigger recovery */
		failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
		if (!failing_ch)
			goto fail_enable_engine_activity;

		/* Some other channel is at fault: re-arm our own watchdog so
		 * this channel is checked again after the recovery. */
		if (failing_ch->hw_chid != ch->hw_chid)
			gk20a_channel_timeout_start(ch, job);

		gk20a_fifo_recover(g, BIT(engine_id),
			failing_ch->hw_chid, is_tsg,
			true, failing_ch->timeout_debug_dump);

		gk20a_channel_put(failing_ch);
	}

fail_enable_engine_activity:
	gk20a_fifo_enable_all_engine_activity(g);
fail_unlock:
	mutex_unlock(&g->ch_wdt_lock);
	gk20a_channel_put(ch);
}
1643 | |||
1530 | static int gk20a_channel_add_job(struct channel_gk20a *c, | 1644 | static int gk20a_channel_add_job(struct channel_gk20a *c, |
1531 | struct gk20a_fence *pre_fence, | 1645 | struct gk20a_fence *pre_fence, |
1532 | struct gk20a_fence *post_fence) | 1646 | struct gk20a_fence *post_fence) |
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, | |||
1561 | job->pre_fence = gk20a_fence_get(pre_fence); | 1675 | job->pre_fence = gk20a_fence_get(pre_fence); |
1562 | job->post_fence = gk20a_fence_get(post_fence); | 1676 | job->post_fence = gk20a_fence_get(post_fence); |
1563 | 1677 | ||
1678 | gk20a_channel_timeout_start(c, job); | ||
1679 | |||
1564 | mutex_lock(&c->jobs_lock); | 1680 | mutex_lock(&c->jobs_lock); |
1565 | list_add_tail(&job->list, &c->jobs); | 1681 | list_add_tail(&job->list, &c->jobs); |
1566 | mutex_unlock(&c->jobs_lock); | 1682 | mutex_unlock(&c->jobs_lock); |
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) | |||
1586 | struct gk20a *g = c->g; | 1702 | struct gk20a *g = c->g; |
1587 | 1703 | ||
1588 | bool completed = gk20a_fence_is_expired(job->post_fence); | 1704 | bool completed = gk20a_fence_is_expired(job->post_fence); |
1589 | if (!completed) | 1705 | if (!completed) { |
1706 | gk20a_channel_timeout_start(c, job); | ||
1590 | break; | 1707 | break; |
1708 | } | ||
1709 | |||
1710 | gk20a_channel_timeout_stop(c); | ||
1591 | 1711 | ||
1592 | if (c->sync) | 1712 | if (c->sync) |
1593 | c->sync->signal_timeline(c->sync); | 1713 | c->sync->signal_timeline(c->sync); |
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) | |||
1926 | mutex_init(&c->ioctl_lock); | 2046 | mutex_init(&c->ioctl_lock); |
1927 | mutex_init(&c->jobs_lock); | 2047 | mutex_init(&c->jobs_lock); |
1928 | mutex_init(&c->submit_lock); | 2048 | mutex_init(&c->submit_lock); |
2049 | mutex_init(&c->timeout.lock); | ||
2050 | INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler); | ||
1929 | INIT_LIST_HEAD(&c->jobs); | 2051 | INIT_LIST_HEAD(&c->jobs); |
1930 | #if defined(CONFIG_GK20A_CYCLE_STATS) | 2052 | #if defined(CONFIG_GK20A_CYCLE_STATS) |
1931 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); | 2053 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 2ea5b4be..70930291 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -38,6 +38,8 @@ struct gk20a_fence; | |||
38 | #include "gr_gk20a.h" | 38 | #include "gr_gk20a.h" |
39 | #include "fence_gk20a.h" | 39 | #include "fence_gk20a.h" |
40 | 40 | ||
41 | #define NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS 5000 | ||
42 | |||
41 | struct gpfifo { | 43 | struct gpfifo { |
42 | u32 entry0; | 44 | u32 entry0; |
43 | u32 entry1; | 45 | u32 entry1; |
@@ -70,6 +72,13 @@ struct channel_gk20a_job { | |||
70 | struct list_head list; | 72 | struct list_head list; |
71 | }; | 73 | }; |
72 | 74 | ||
/* Per-channel watchdog state: a delayed work item that fires if the
 * tracked job does not complete within the watchdog period. */
struct channel_gk20a_timeout {
	struct delayed_work wq;		/* expiry handler work item */
	struct mutex lock;		/* guards all fields below */
	bool initialized;		/* true while the watchdog is armed */
	struct channel_gk20a_job *job;	/* job being watched for completion */
};
81 | |||
73 | struct channel_gk20a_poll_events { | 82 | struct channel_gk20a_poll_events { |
74 | struct mutex lock; | 83 | struct mutex lock; |
75 | bool events_enabled; | 84 | bool events_enabled; |
@@ -126,6 +135,8 @@ struct channel_gk20a { | |||
126 | u32 timeout_accumulated_ms; | 135 | u32 timeout_accumulated_ms; |
127 | u32 timeout_gpfifo_get; | 136 | u32 timeout_gpfifo_get; |
128 | 137 | ||
138 | struct channel_gk20a_timeout timeout; | ||
139 | |||
129 | bool cmds_pending; | 140 | bool cmds_pending; |
130 | struct { | 141 | struct { |
131 | /* These fences should be accessed with submit_lock held. */ | 142 | /* These fences should be accessed with submit_lock held. */ |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 069ea82a..f736fe8c 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -852,7 +852,7 @@ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, | |||
852 | return verbose; | 852 | return verbose; |
853 | } | 853 | } |
854 | 854 | ||
855 | static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g, | 855 | bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g, |
856 | struct channel_gk20a *ch) | 856 | struct channel_gk20a *ch) |
857 | { | 857 | { |
858 | gk20a_err(dev_from_gk20a(g), | 858 | gk20a_err(dev_from_gk20a(g), |
@@ -861,7 +861,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g, | |||
861 | return gk20a_fifo_set_ctx_mmu_error(g, ch); | 861 | return gk20a_fifo_set_ctx_mmu_error(g, ch); |
862 | } | 862 | } |
863 | 863 | ||
864 | static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, | 864 | bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, |
865 | struct tsg_gk20a *tsg) | 865 | struct tsg_gk20a *tsg) |
866 | { | 866 | { |
867 | bool ret = true; | 867 | bool ret = true; |
@@ -883,7 +883,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, | |||
883 | return ret; | 883 | return ret; |
884 | } | 884 | } |
885 | 885 | ||
886 | static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid) | 886 | void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid) |
887 | { | 887 | { |
888 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; | 888 | struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid]; |
889 | struct channel_gk20a *ch; | 889 | struct channel_gk20a *ch; |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 929b5c82..3f9fac54 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | |||
@@ -183,5 +183,10 @@ u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g); | |||
183 | u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g); | 183 | u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g); |
184 | u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, | 184 | u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, |
185 | int *__id, bool *__is_tsg); | 185 | int *__id, bool *__is_tsg); |
186 | bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, | ||
187 | struct tsg_gk20a *tsg); | ||
188 | void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid); | ||
189 | bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g, | ||
190 | struct channel_gk20a *ch); | ||
186 | 191 | ||
187 | #endif /*__GR_GK20A_H__*/ | 192 | #endif /*__GR_GK20A_H__*/ |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index c0889571..fb8b8b14 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -667,6 +667,7 @@ static int gk20a_init_support(struct platform_device *dev) | |||
667 | 667 | ||
668 | mutex_init(&g->dbg_sessions_lock); | 668 | mutex_init(&g->dbg_sessions_lock); |
669 | mutex_init(&g->client_lock); | 669 | mutex_init(&g->client_lock); |
670 | mutex_init(&g->ch_wdt_lock); | ||
670 | 671 | ||
671 | g->remove_support = gk20a_remove_support; | 672 | g->remove_support = gk20a_remove_support; |
672 | return 0; | 673 | return 0; |
@@ -1449,6 +1450,7 @@ static int gk20a_probe(struct platform_device *dev) | |||
1449 | CONFIG_GK20A_DEFAULT_TIMEOUT; | 1450 | CONFIG_GK20A_DEFAULT_TIMEOUT; |
1450 | if (tegra_platform_is_silicon()) | 1451 | if (tegra_platform_is_silicon()) |
1451 | gk20a->timeouts_enabled = true; | 1452 | gk20a->timeouts_enabled = true; |
1453 | gk20a->ch_wdt_enabled = true; | ||
1452 | 1454 | ||
1453 | /* Set up initial power settings. For non-slicon platforms, disable * | 1455 | /* Set up initial power settings. For non-slicon platforms, disable * |
1454 | * power features and for silicon platforms, read from platform data */ | 1456 | * power features and for silicon platforms, read from platform data */ |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index dd7a7ad4..46940744 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h | |||
@@ -475,6 +475,9 @@ struct gk20a { | |||
475 | u32 gr_idle_timeout_default; | 475 | u32 gr_idle_timeout_default; |
476 | u32 timeouts_enabled; | 476 | u32 timeouts_enabled; |
477 | 477 | ||
478 | u32 ch_wdt_enabled; | ||
479 | struct mutex ch_wdt_lock; | ||
480 | |||
478 | bool slcg_enabled; | 481 | bool slcg_enabled; |
479 | bool blcg_enabled; | 482 | bool blcg_enabled; |
480 | bool elcg_enabled; | 483 | bool elcg_enabled; |