summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author:    Richard Zhao <rizhao@nvidia.com>  2016-08-12 20:10:28 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com>  2016-08-18 18:03:54 -0400
commit:    198b895a881ca067a2411b7367579cb1d594ab5a (patch)
tree:      90e667382a589de9ba4d97bf511ca6bd42efe03c
parent:    deffbf8ee2017d4ea804f35946673dd0f6e0fcf2 (diff)
gpu: nvgpu: use force_reset_ch in ch wdt handler
- let force_reset_ch pass down err code
- force_reset_ch callback can cover vgpu too.

Bug 1776876
JIRA VFND-2151

Change-Id: I48f7890294c6455247198e0cab5f21f83f61f0e1
Signed-off-by: Richard Zhao <rizhao@nvidia.com>
Reviewed-on: http://git-master/r/1202255
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 52
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c    |  9
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    |  3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h         |  3
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fifo_vgpu.c      |  9
5 files changed, 17 insertions(+), 59 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index a8a39302..41fced99 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1721,10 +1721,6 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1721 struct channel_gk20a_job *job; 1721 struct channel_gk20a_job *job;
1722 struct gk20a *g; 1722 struct gk20a *g;
1723 struct channel_gk20a *ch; 1723 struct channel_gk20a *ch;
1724 struct channel_gk20a *failing_ch;
1725 u32 engine_id;
1726 int id = -1;
1727 bool is_tsg = false;
1728 1724
1729 ch = container_of(to_delayed_work(work), struct channel_gk20a, 1725 ch = container_of(to_delayed_work(work), struct channel_gk20a,
1730 timeout.wq); 1726 timeout.wq);
@@ -1746,16 +1742,11 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1746 ch->timeout.initialized = false; 1742 ch->timeout.initialized = false;
1747 mutex_unlock(&ch->timeout.lock); 1743 mutex_unlock(&ch->timeout.lock);
1748 1744
1749 if (gr_gk20a_disable_ctxsw(g)) {
1750 gk20a_err(dev_from_gk20a(g), "Unable to disable ctxsw!");
1751 goto fail_unlock;
1752 }
1753
1754 if (gk20a_fence_is_expired(job->post_fence)) { 1745 if (gk20a_fence_is_expired(job->post_fence)) {
1755 gk20a_err(dev_from_gk20a(g), 1746 gk20a_err(dev_from_gk20a(g),
1756 "Timed out fence is expired on c=%d!", 1747 "Timed out fence is expired on c=%d!",
1757 ch->hw_chid); 1748 ch->hw_chid);
1758 goto fail_enable_ctxsw; 1749 goto fail_unlock;
1759 } 1750 }
1760 1751
1761 gk20a_err(dev_from_gk20a(g), "Confirmed: job on channel %d timed out", 1752 gk20a_err(dev_from_gk20a(g), "Confirmed: job on channel %d timed out",
@@ -1764,43 +1755,9 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
1764 gk20a_debug_dump(g->dev); 1755 gk20a_debug_dump(g->dev);
1765 gk20a_gr_debug_dump(g->dev); 1756 gk20a_gr_debug_dump(g->dev);
1766 1757
1767 /* Get failing engine data */ 1758 g->ops.fifo.force_reset_ch(ch,
1768 engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg); 1759 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT, true);
1769
1770 if (!gk20a_fifo_is_valid_engine_id(g, engine_id)) {
1771 /* If no failing engine, abort the channels */
1772 if (gk20a_is_channel_marked_as_tsg(ch)) {
1773 struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
1774
1775 gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
1776 gk20a_fifo_abort_tsg(g, ch->tsgid, false);
1777 } else {
1778 gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
1779 gk20a_channel_abort(ch, false);
1780 }
1781 } else {
1782 /* If failing engine, trigger recovery */
1783 failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
1784 if (!failing_ch)
1785 goto fail_enable_ctxsw;
1786
1787 if (failing_ch->hw_chid != ch->hw_chid) {
1788 gk20a_channel_timeout_start(ch, job);
1789
1790 mutex_lock(&failing_ch->timeout.lock);
1791 failing_ch->timeout.initialized = false;
1792 mutex_unlock(&failing_ch->timeout.lock);
1793 }
1794
1795 gk20a_fifo_recover(g, BIT(engine_id),
1796 failing_ch->hw_chid, is_tsg,
1797 true, failing_ch->timeout_debug_dump);
1798
1799 gk20a_channel_put(failing_ch);
1800 }
1801 1760
1802fail_enable_ctxsw:
1803 gr_gk20a_enable_ctxsw(g);
1804fail_unlock: 1761fail_unlock:
1805 mutex_unlock(&g->ch_wdt_lock); 1762 mutex_unlock(&g->ch_wdt_lock);
1806 gk20a_channel_put(ch); 1763 gk20a_channel_put(ch);
@@ -3231,7 +3188,8 @@ long gk20a_channel_ioctl(struct file *filp,
3231 __func__, cmd); 3188 __func__, cmd);
3232 break; 3189 break;
3233 } 3190 }
3234 err = ch->g->ops.fifo.force_reset_ch(ch, true); 3191 err = ch->g->ops.fifo.force_reset_ch(ch,
3192 NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR, true);
3235 gk20a_idle(dev); 3193 gk20a_idle(dev);
3236 break; 3194 break;
3237 case NVGPU_IOCTL_CHANNEL_EVENT_ID_CTRL: 3195 case NVGPU_IOCTL_CHANNEL_EVENT_ID_CTRL:
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index b025f4d6..bd31656f 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1746,7 +1746,8 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
1746} 1746}
1747 1747
1748/* force reset channel and tsg (if it's part of one) */ 1748/* force reset channel and tsg (if it's part of one) */
1749int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) 1749int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
1750 u32 err_code, bool verbose)
1750{ 1751{
1751 struct tsg_gk20a *tsg = NULL; 1752 struct tsg_gk20a *tsg = NULL;
1752 struct channel_gk20a *ch_tsg = NULL; 1753 struct channel_gk20a *ch_tsg = NULL;
@@ -1759,8 +1760,7 @@ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose)
1759 1760
1760 list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { 1761 list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
1761 if (gk20a_channel_get(ch_tsg)) { 1762 if (gk20a_channel_get(ch_tsg)) {
1762 gk20a_set_error_notifier(ch_tsg, 1763 gk20a_set_error_notifier(ch_tsg, err_code);
1763 NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
1764 gk20a_channel_put(ch_tsg); 1764 gk20a_channel_put(ch_tsg);
1765 } 1765 }
1766 } 1766 }
@@ -1768,8 +1768,7 @@ int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose)
1768 mutex_unlock(&tsg->ch_list_lock); 1768 mutex_unlock(&tsg->ch_list_lock);
1769 gk20a_fifo_recover_tsg(g, ch->tsgid, verbose); 1769 gk20a_fifo_recover_tsg(g, ch->tsgid, verbose);
1770 } else { 1770 } else {
1771 gk20a_set_error_notifier(ch, 1771 gk20a_set_error_notifier(ch, err_code);
1772 NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
1773 gk20a_fifo_recover_ch(g, ch->hw_chid, verbose); 1772 gk20a_fifo_recover_ch(g, ch->hw_chid, verbose);
1774 } 1773 }
1775 1774
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index f5a73a12..17c6dbf6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -197,7 +197,8 @@ void gk20a_fifo_recover(struct gk20a *g,
197 bool id_is_known, bool verbose); 197 bool id_is_known, bool verbose);
198void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); 198void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose);
199void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); 199void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose);
200int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); 200int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
201 u32 err_code, bool verbose);
201void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id); 202void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id);
202int gk20a_init_fifo_reset_enable_hw(struct gk20a *g); 203int gk20a_init_fifo_reset_enable_hw(struct gk20a *g);
203void gk20a_init_fifo(struct gpu_ops *gops); 204void gk20a_init_fifo(struct gpu_ops *gops);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 6f735af9..463317e3 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -363,7 +363,8 @@ struct gpu_ops {
363 int (*channel_set_timeslice)(struct channel_gk20a *ch, 363 int (*channel_set_timeslice)(struct channel_gk20a *ch,
364 u32 timeslice); 364 u32 timeslice);
365 int (*tsg_set_timeslice)(struct tsg_gk20a *tsg, u32 timeslice); 365 int (*tsg_set_timeslice)(struct tsg_gk20a *tsg, u32 timeslice);
366 int (*force_reset_ch)(struct channel_gk20a *ch, bool verbose); 366 int (*force_reset_ch)(struct channel_gk20a *ch,
367 u32 err_code, bool verbose);
367 int (*engine_enum_from_type)(struct gk20a *g, u32 engine_type, 368 int (*engine_enum_from_type)(struct gk20a *g, u32 engine_type,
368 u32 *inst_id); 369 u32 *inst_id);
369 void (*device_info_data_parse)(struct gk20a *g, 370 void (*device_info_data_parse)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 9a8c319b..90e44e8c 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -634,7 +634,8 @@ static int vgpu_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice)
634 return err ? err : msg.ret; 634 return err ? err : msg.ret;
635} 635}
636 636
637static int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) 637static int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch,
638 u32 err_code, bool verbose)
638{ 639{
639 struct tsg_gk20a *tsg = NULL; 640 struct tsg_gk20a *tsg = NULL;
640 struct channel_gk20a *ch_tsg = NULL; 641 struct channel_gk20a *ch_tsg = NULL;
@@ -653,16 +654,14 @@ static int vgpu_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose)
653 654
654 list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) { 655 list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
655 if (gk20a_channel_get(ch_tsg)) { 656 if (gk20a_channel_get(ch_tsg)) {
656 gk20a_set_error_notifier(ch_tsg, 657 gk20a_set_error_notifier(ch_tsg, err_code);
657 NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
658 gk20a_channel_put(ch_tsg); 658 gk20a_channel_put(ch_tsg);
659 } 659 }
660 } 660 }
661 661
662 mutex_unlock(&tsg->ch_list_lock); 662 mutex_unlock(&tsg->ch_list_lock);
663 } else { 663 } else {
664 gk20a_set_error_notifier(ch, 664 gk20a_set_error_notifier(ch, err_code);
665 NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
666 } 665 }
667 666
668 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FORCE_RESET; 667 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FORCE_RESET;