diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2015-08-31 05:00:35 -0400 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-09-28 12:08:12 -0400 |
commit | 613990cb391c74436384d63d12240221565011d5 (patch) | |
tree | 27d7cd19bd84a6ce50fb579c5f6a08ada28ba5b7 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |
parent | cb8c102131ec96767e01981dc9a9d26e30593a70 (diff) |
gpu: nvgpu: implement per-channel watchdog
Implement a per-channel watchdog/timer according to the following rules:
- start the timer when submitting the first job on a channel, or if
no timer is already running
- cancel the timer when job completes
- re-start the timer if there is any incomplete job left
in the channel's queue
- trigger appropriate recovery method as part of timeout
handling mechanism
Handle the timeout as follows:
- get timed out channel, and job data
- disable activity on all engines
- check if fence is really pending
- get information on failing engine
- if no engine is failing, just abort the channel
- if engine is failing, trigger the recovery
Also, add the flag "ch_wdt_enabled" to enable/disable the channel
watchdog mechanism. The watchdog can also be disabled using the
global flag "timeouts_enabled".
Set the watchdog timeout to 5 s using the macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS.
Bug 200133289
Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 124 |
1 files changed, 123 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index c18a4e5d..2dc8e9a0 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | |||
1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; | 1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) | ||
1476 | { | ||
1477 | if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled) | ||
1478 | return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS; | ||
1479 | else | ||
1480 | return (u32)MAX_SCHEDULE_TIMEOUT; | ||
1481 | } | ||
1482 | |||
1475 | static u32 get_gp_free_count(struct channel_gk20a *c) | 1483 | static u32 get_gp_free_count(struct channel_gk20a *c) |
1476 | { | 1484 | { |
1477 | update_gp_get(c->g, c); | 1485 | update_gp_get(c->g, c); |
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, | |||
1527 | } | 1535 | } |
1528 | } | 1536 | } |
1529 | 1537 | ||
1538 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch, | ||
1539 | struct channel_gk20a_job *job) | ||
1540 | { | ||
1541 | mutex_lock(&ch->timeout.lock); | ||
1542 | |||
1543 | if (ch->timeout.initialized) { | ||
1544 | mutex_unlock(&ch->timeout.lock); | ||
1545 | return; | ||
1546 | } | ||
1547 | |||
1548 | ch->timeout.job = job; | ||
1549 | ch->timeout.initialized = true; | ||
1550 | schedule_delayed_work(&ch->timeout.wq, | ||
1551 | msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch))); | ||
1552 | |||
1553 | mutex_unlock(&ch->timeout.lock); | ||
1554 | } | ||
1555 | |||
1556 | static void gk20a_channel_timeout_stop(struct channel_gk20a *ch) | ||
1557 | { | ||
1558 | mutex_lock(&ch->timeout.lock); | ||
1559 | |||
1560 | if (!ch->timeout.initialized) { | ||
1561 | mutex_unlock(&ch->timeout.lock); | ||
1562 | return; | ||
1563 | } | ||
1564 | |||
1565 | ch->timeout.initialized = false; | ||
1566 | cancel_delayed_work_sync(&ch->timeout.wq); | ||
1567 | |||
1568 | mutex_unlock(&ch->timeout.lock); | ||
1569 | } | ||
1570 | |||
1571 | static void gk20a_channel_timeout_handler(struct work_struct *work) | ||
1572 | { | ||
1573 | struct channel_gk20a_job *job; | ||
1574 | struct gk20a *g; | ||
1575 | struct channel_gk20a *ch; | ||
1576 | struct channel_gk20a *failing_ch; | ||
1577 | u32 engine_id; | ||
1578 | int id = -1; | ||
1579 | bool is_tsg = false; | ||
1580 | |||
1581 | ch = container_of(to_delayed_work(work), struct channel_gk20a, | ||
1582 | timeout.wq); | ||
1583 | ch = gk20a_channel_get(ch); | ||
1584 | if (!ch) | ||
1585 | return; | ||
1586 | |||
1587 | g = ch->g; | ||
1588 | |||
1589 | /* Need global lock since multiple channels can timeout at a time */ | ||
1590 | mutex_lock(&g->ch_wdt_lock); | ||
1591 | |||
1592 | /* Get timed out job and reset the timer */ | ||
1593 | mutex_lock(&ch->timeout.lock); | ||
1594 | job = ch->timeout.job; | ||
1595 | ch->timeout.initialized = false; | ||
1596 | mutex_unlock(&ch->timeout.lock); | ||
1597 | |||
1598 | if (gk20a_fifo_disable_all_engine_activity(g, true)) | ||
1599 | goto fail_unlock; | ||
1600 | |||
1601 | if (gk20a_fence_is_expired(job->post_fence)) | ||
1602 | goto fail_enable_engine_activity; | ||
1603 | |||
1604 | gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n", | ||
1605 | ch->hw_chid); | ||
1606 | |||
1607 | /* Get failing engine data */ | ||
1608 | engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); | ||
1609 | |||
1610 | if (engine_id >= g->fifo.max_engines) { | ||
1611 | /* If no failing engine, abort the channels */ | ||
1612 | if (gk20a_is_channel_marked_as_tsg(ch)) { | ||
1613 | struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; | ||
1614 | |||
1615 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1616 | gk20a_fifo_abort_tsg(g, ch->tsgid); | ||
1617 | } else { | ||
1618 | gk20a_fifo_set_ctx_mmu_error_ch(g, ch); | ||
1619 | gk20a_channel_abort(ch); | ||
1620 | } | ||
1621 | } else { | ||
1622 | /* If failing engine, trigger recovery */ | ||
1623 | failing_ch = gk20a_channel_get(&g->fifo.channel[id]); | ||
1624 | if (!failing_ch) | ||
1625 | goto fail_enable_engine_activity; | ||
1626 | |||
1627 | if (failing_ch->hw_chid != ch->hw_chid) | ||
1628 | gk20a_channel_timeout_start(ch, job); | ||
1629 | |||
1630 | gk20a_fifo_recover(g, BIT(engine_id), | ||
1631 | failing_ch->hw_chid, is_tsg, | ||
1632 | true, failing_ch->timeout_debug_dump); | ||
1633 | |||
1634 | gk20a_channel_put(failing_ch); | ||
1635 | } | ||
1636 | |||
1637 | fail_enable_engine_activity: | ||
1638 | gk20a_fifo_enable_all_engine_activity(g); | ||
1639 | fail_unlock: | ||
1640 | mutex_unlock(&g->ch_wdt_lock); | ||
1641 | gk20a_channel_put(ch); | ||
1642 | } | ||
1643 | |||
1530 | static int gk20a_channel_add_job(struct channel_gk20a *c, | 1644 | static int gk20a_channel_add_job(struct channel_gk20a *c, |
1531 | struct gk20a_fence *pre_fence, | 1645 | struct gk20a_fence *pre_fence, |
1532 | struct gk20a_fence *post_fence) | 1646 | struct gk20a_fence *post_fence) |
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, | |||
1561 | job->pre_fence = gk20a_fence_get(pre_fence); | 1675 | job->pre_fence = gk20a_fence_get(pre_fence); |
1562 | job->post_fence = gk20a_fence_get(post_fence); | 1676 | job->post_fence = gk20a_fence_get(post_fence); |
1563 | 1677 | ||
1678 | gk20a_channel_timeout_start(c, job); | ||
1679 | |||
1564 | mutex_lock(&c->jobs_lock); | 1680 | mutex_lock(&c->jobs_lock); |
1565 | list_add_tail(&job->list, &c->jobs); | 1681 | list_add_tail(&job->list, &c->jobs); |
1566 | mutex_unlock(&c->jobs_lock); | 1682 | mutex_unlock(&c->jobs_lock); |
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) | |||
1586 | struct gk20a *g = c->g; | 1702 | struct gk20a *g = c->g; |
1587 | 1703 | ||
1588 | bool completed = gk20a_fence_is_expired(job->post_fence); | 1704 | bool completed = gk20a_fence_is_expired(job->post_fence); |
1589 | if (!completed) | 1705 | if (!completed) { |
1706 | gk20a_channel_timeout_start(c, job); | ||
1590 | break; | 1707 | break; |
1708 | } | ||
1709 | |||
1710 | gk20a_channel_timeout_stop(c); | ||
1591 | 1711 | ||
1592 | if (c->sync) | 1712 | if (c->sync) |
1593 | c->sync->signal_timeline(c->sync); | 1713 | c->sync->signal_timeline(c->sync); |
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) | |||
1926 | mutex_init(&c->ioctl_lock); | 2046 | mutex_init(&c->ioctl_lock); |
1927 | mutex_init(&c->jobs_lock); | 2047 | mutex_init(&c->jobs_lock); |
1928 | mutex_init(&c->submit_lock); | 2048 | mutex_init(&c->submit_lock); |
2049 | mutex_init(&c->timeout.lock); | ||
2050 | INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler); | ||
1929 | INIT_LIST_HEAD(&c->jobs); | 2051 | INIT_LIST_HEAD(&c->jobs); |
1930 | #if defined(CONFIG_GK20A_CYCLE_STATS) | 2052 | #if defined(CONFIG_GK20A_CYCLE_STATS) |
1931 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); | 2053 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); |