diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2015-08-31 05:00:35 -0400 |
---|---|---|
committer | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-09-28 12:08:12 -0400 |
commit | 613990cb391c74436384d63d12240221565011d5 (patch) | |
tree | 27d7cd19bd84a6ce50fb579c5f6a08ada28ba5b7 /drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |
parent | cb8c102131ec96767e01981dc9a9d26e30593a70 (diff) |
gpu: nvgpu: implement per-channel watchdog
Implement a per-channel watchdog/timer according to the following rules:
- start the timer when submitting the first job on a channel, or if
no timer is already running
- cancel the timer when job completes
- re-start the timer if there is any incomplete job left
in the channel's queue
- trigger appropriate recovery method as part of timeout
handling mechanism
Handle the timeout as follows:
- get timed out channel, and job data
- disable activity on all engines
- check if fence is really pending
- get information on failing engine
- if no engine is failing, just abort the channel
- if engine is failing, trigger the recovery
Also, add the flag "ch_wdt_enabled" to enable/disable the channel
watchdog mechanism. The watchdog can also be disabled using the
global flag "timeouts_enabled".
Set the watchdog timeout to 5 s using the macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS.
Bug 200133289
Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/797072
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 124 |
1 files changed, 123 insertions, 1 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index c18a4e5d..2dc8e9a0 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1472,6 +1472,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | |||
1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; | 1472 | ch->timeout_accumulated_ms > ch->timeout_ms_max; |
1473 | } | 1473 | } |
1474 | 1474 | ||
1475 | static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) | ||
1476 | { | ||
1477 | if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled) | ||
1478 | return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS; | ||
1479 | else | ||
1480 | return (u32)MAX_SCHEDULE_TIMEOUT; | ||
1481 | } | ||
1482 | |||
1475 | static u32 get_gp_free_count(struct channel_gk20a *c) | 1483 | static u32 get_gp_free_count(struct channel_gk20a *c) |
1476 | { | 1484 | { |
1477 | update_gp_get(c->g, c); | 1485 | update_gp_get(c->g, c); |
@@ -1527,6 +1535,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, | |||
1527 | } | 1535 | } |
1528 | } | 1536 | } |
1529 | 1537 | ||
1538 | static void gk20a_channel_timeout_start(struct channel_gk20a *ch, | ||
1539 | struct channel_gk20a_job *job) | ||
1540 | { | ||
1541 | mutex_lock(&ch->timeout.lock); | ||
1542 | |||
1543 | if (ch->timeout.initialized) { | ||
1544 | mutex_unlock(&ch->timeout.lock); | ||
1545 | return; | ||
1546 | } | ||
1547 | |||
1548 | ch->timeout.job = job; | ||
1549 | ch->timeout.initialized = true; | ||
1550 | schedule_delayed_work(&ch->timeout.wq, | ||
1551 | msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch))); | ||
1552 | |||
1553 | mutex_unlock(&ch->timeout.lock); | ||
1554 | } | ||
1555 | |||
1556 | static void gk20a_channel_timeout_stop(struct channel_gk20a *ch) | ||
1557 | { | ||
1558 | mutex_lock(&ch->timeout.lock); | ||
1559 | |||
1560 | if (!ch->timeout.initialized) { | ||
1561 | mutex_unlock(&ch->timeout.lock); | ||
1562 | return; | ||
1563 | } | ||
1564 | |||
1565 | ch->timeout.initialized = false; | ||
1566 | cancel_delayed_work_sync(&ch->timeout.wq); | ||
1567 | |||
1568 | mutex_unlock(&ch->timeout.lock); | ||
1569 | } | ||
1570 | |||
1571 | static void gk20a_channel_timeout_handler(struct work_struct *work) | ||
1572 | { | ||
1573 | struct channel_gk20a_job *job; | ||
1574 | struct gk20a *g; | ||
1575 | struct channel_gk20a *ch; | ||
1576 | struct channel_gk20a *failing_ch; | ||
1577 | u32 engine_id; | ||
1578 | int id = -1; | ||
1579 | bool is_tsg = false; | ||
1580 | |||
1581 | ch = container_of(to_delayed_work(work), struct channel_gk20a, | ||
1582 | timeout.wq); | ||
1583 | ch = gk20a_channel_get(ch); | ||
1584 | if (!ch) | ||
1585 | return; | ||
1586 | |||
1587 | g = ch->g; | ||
1588 | |||
1589 | /* Need global lock since multiple channels can timeout at a time */ | ||
1590 | mutex_lock(&g->ch_wdt_lock); | ||
1591 | |||
1592 | /* Get timed out job and reset the timer */ | ||
1593 | mutex_lock(&ch->timeout.lock); | ||
1594 | job = ch->timeout.job; | ||
1595 | ch->timeout.initialized = false; | ||
1596 | mutex_unlock(&ch->timeout.lock); | ||
1597 | |||
1598 | if (gk20a_fifo_disable_all_engine_activity(g, true)) | ||
1599 | goto fail_unlock; | ||
1600 | |||
1601 | if (gk20a_fence_is_expired(job->post_fence)) | ||
1602 | goto fail_enable_engine_activity; | ||
1603 | |||
1604 | gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n", | ||
1605 | ch->hw_chid); | ||
1606 | |||
1607 | /* Get failing engine data */ | ||
1608 | engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); | ||
1609 | |||
1610 | if (engine_id >= g->fifo.max_engines) { | ||
1611 | /* If no failing engine, abort the channels */ | ||
1612 | if (gk20a_is_channel_marked_as_tsg(ch)) { | ||
1613 | struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid]; | ||
1614 | |||
1615 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1616 | gk20a_fifo_abort_tsg(g, ch->tsgid); | ||
1617 | } else { | ||
1618 | gk20a_fifo_set_ctx_mmu_error_ch(g, ch); | ||
1619 | gk20a_channel_abort(ch); | ||
1620 | } | ||
1621 | } else { | ||
1622 | /* If failing engine, trigger recovery */ | ||
1623 | failing_ch = gk20a_channel_get(&g->fifo.channel[id]); | ||
1624 | if (!failing_ch) | ||
1625 | goto fail_enable_engine_activity; | ||
1626 | |||
1627 | if (failing_ch->hw_chid != ch->hw_chid) | ||
1628 | gk20a_channel_timeout_start(ch, job); | ||
1629 | |||
1630 | gk20a_fifo_recover(g, BIT(engine_id), | ||
1631 | failing_ch->hw_chid, is_tsg, | ||
1632 | true, failing_ch->timeout_debug_dump); | ||
1633 | |||
1634 | gk20a_channel_put(failing_ch); | ||
1635 | } | ||
1636 | |||
1637 | fail_enable_engine_activity: | ||
1638 | gk20a_fifo_enable_all_engine_activity(g); | ||
1639 | fail_unlock: | ||
1640 | mutex_unlock(&g->ch_wdt_lock); | ||
1641 | gk20a_channel_put(ch); | ||
1642 | } | ||
1643 | |||
1530 | static int gk20a_channel_add_job(struct channel_gk20a *c, | 1644 | static int gk20a_channel_add_job(struct channel_gk20a *c, |
1531 | struct gk20a_fence *pre_fence, | 1645 | struct gk20a_fence *pre_fence, |
1532 | struct gk20a_fence *post_fence) | 1646 | struct gk20a_fence *post_fence) |
@@ -1561,6 +1675,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c, | |||
1561 | job->pre_fence = gk20a_fence_get(pre_fence); | 1675 | job->pre_fence = gk20a_fence_get(pre_fence); |
1562 | job->post_fence = gk20a_fence_get(post_fence); | 1676 | job->post_fence = gk20a_fence_get(post_fence); |
1563 | 1677 | ||
1678 | gk20a_channel_timeout_start(c, job); | ||
1679 | |||
1564 | mutex_lock(&c->jobs_lock); | 1680 | mutex_lock(&c->jobs_lock); |
1565 | list_add_tail(&job->list, &c->jobs); | 1681 | list_add_tail(&job->list, &c->jobs); |
1566 | mutex_unlock(&c->jobs_lock); | 1682 | mutex_unlock(&c->jobs_lock); |
@@ -1586,8 +1702,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed) | |||
1586 | struct gk20a *g = c->g; | 1702 | struct gk20a *g = c->g; |
1587 | 1703 | ||
1588 | bool completed = gk20a_fence_is_expired(job->post_fence); | 1704 | bool completed = gk20a_fence_is_expired(job->post_fence); |
1589 | if (!completed) | 1705 | if (!completed) { |
1706 | gk20a_channel_timeout_start(c, job); | ||
1590 | break; | 1707 | break; |
1708 | } | ||
1709 | |||
1710 | gk20a_channel_timeout_stop(c); | ||
1591 | 1711 | ||
1592 | if (c->sync) | 1712 | if (c->sync) |
1593 | c->sync->signal_timeline(c->sync); | 1713 | c->sync->signal_timeline(c->sync); |
@@ -1926,6 +2046,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) | |||
1926 | mutex_init(&c->ioctl_lock); | 2046 | mutex_init(&c->ioctl_lock); |
1927 | mutex_init(&c->jobs_lock); | 2047 | mutex_init(&c->jobs_lock); |
1928 | mutex_init(&c->submit_lock); | 2048 | mutex_init(&c->submit_lock); |
2049 | mutex_init(&c->timeout.lock); | ||
2050 | INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler); | ||
1929 | INIT_LIST_HEAD(&c->jobs); | 2051 | INIT_LIST_HEAD(&c->jobs); |
1930 | #if defined(CONFIG_GK20A_CYCLE_STATS) | 2052 | #if defined(CONFIG_GK20A_CYCLE_STATS) |
1931 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); | 2053 | mutex_init(&c->cyclestate.cyclestate_buffer_mutex); |