author		Debarshi Dutta <ddutta@nvidia.com>	2019-04-30 05:41:31 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2019-05-09 17:42:33 -0400
commit		6509bb49da19ba9b19e3df64e473b01d54fd310d (patch)
tree		b34d19c88fc122f369b1f22094d9a5e22c67df92 /drivers/gpu/nvgpu
parent		4d8ad643d67ac4044f76976c4085a35fcc5d4095 (diff)
gpu: nvgpu: protect recovery with engines_reset_mutex
Rename gr_reset_mutex to engines_reset_mutex and acquire it before
initiating recovery. Recovery running in parallel with an engine reset
is not recommended.

On hitting engine reset, h/w drops the ctxsw_status to INVALID in the
fifo_engine_status register. Also, while the engine is held in reset,
h/w passes busy/idle straight through. The fifo_engine_status registers
are correct in that there is no context switch outstanding, as the
CTXSW is aborted when reset is asserted.

Use deferred_reset_mutex to protect the deferred_reset_pending
variable. If deferred_reset_pending is true, acquire
engines_reset_mutex and call gk20a_fifo_deferred_reset.
gk20a_fifo_deferred_reset also re-checks the value of
deferred_reset_pending before initiating the reset process.

Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287

Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry-picked from cb91bf1e13740023903282d1c2271d9154e940ba in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r--	drivers/gpu/nvgpu/common/fifo/channel.c	19
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.c	68
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.h	2
-rw-r--r--	drivers/gpu/nvgpu/gv11b/fifo_gv11b.c	47
4 files changed, 90 insertions, 46 deletions
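For reference, the locking protocol described in the commit message can be modeled as a small
self-contained program. This is only a sketch: pthread mutexes stand in for nvgpu_mutex, and
struct fifo_model, model_defer_reset() and model_channel_free() are illustrative names, not part
of the nvgpu driver.

/*
 * Minimal model of the deferred-reset locking protocol described above.
 * pthread mutexes stand in for nvgpu_mutex; the struct and function
 * names are simplified stand-ins, not the actual nvgpu API.
 * Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fifo_model {
	pthread_mutex_t deferred_reset_mutex; /* guards deferred_reset_pending */
	pthread_mutex_t engines_reset_mutex;  /* serializes engine reset vs. recovery */
	bool deferred_reset_pending;
};

/* Fault path: mark the reset as deferred, under deferred_reset_mutex. */
static void model_defer_reset(struct fifo_model *f)
{
	pthread_mutex_lock(&f->deferred_reset_mutex);
	f->deferred_reset_pending = true;
	pthread_mutex_unlock(&f->deferred_reset_mutex);
}

/* Channel-free path: snapshot the flag, then reset under engines_reset_mutex. */
static void model_channel_free(struct fifo_model *f)
{
	bool pending;

	pthread_mutex_lock(&f->deferred_reset_mutex);
	pending = f->deferred_reset_pending;
	pthread_mutex_unlock(&f->deferred_reset_mutex);

	if (pending) {
		/* Unconditional acquire replaces the old trylock-and-skip. */
		pthread_mutex_lock(&f->engines_reset_mutex);
		/* ... the deferred engine reset would happen here ... */
		pthread_mutex_lock(&f->deferred_reset_mutex);
		f->deferred_reset_pending = false;
		pthread_mutex_unlock(&f->deferred_reset_mutex);
		pthread_mutex_unlock(&f->engines_reset_mutex);
	}
}

int main(void)
{
	struct fifo_model f = {
		.deferred_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.engines_reset_mutex = PTHREAD_MUTEX_INITIALIZER,
		.deferred_reset_pending = false,
	};

	model_defer_reset(&f);
	model_channel_free(&f);
	printf("deferred_reset_pending = %d\n", f.deferred_reset_pending);
	return 0;
}

The point the model tries to show: the pending flag is only ever touched under
deferred_reset_mutex, which nests inside engines_reset_mutex, while the reset itself is done with
engines_reset_mutex held unconditionally instead of the old gr_reset_mutex trylock-and-skip.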
diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index d30b8ded..4bea032a 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -308,6 +308,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 	struct dbg_session_data *session_data, *tmp_s;
 	struct dbg_session_channel_data *ch_data, *tmp;
 	int err;
+	bool deferred_reset_pending;
 
 	nvgpu_log_fn(g, " ");
 
@@ -381,17 +382,17 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	/* if engine reset was deferred, perform it now */
 	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
-	if (g->fifo.deferred_reset_pending) {
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (deferred_reset_pending) {
 		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
 			" deferred, running now");
-		/* if lock is already taken, a reset is taking place
-		so no need to repeat */
-		if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-			gk20a_fifo_deferred_reset(g, ch);
-			nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-		}
+		nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+		gk20a_fifo_deferred_reset(g, ch);
+		nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 	}
-	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 
 	if (!gk20a_channel_as_bound(ch)) {
 		goto unbind;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index b96372b4..5aca7d62 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -910,9 +910,9 @@ int gk20a_init_fifo_setup_sw_common(struct gk20a *g)
 		return err;
 	}
 
-	err = nvgpu_mutex_init(&f->gr_reset_mutex);
+	err = nvgpu_mutex_init(&f->engines_reset_mutex);
 	if (err) {
-		nvgpu_err(g, "failed to init gr_reset_mutex");
+		nvgpu_err(g, "failed to init engines_reset_mutex");
 		return err;
 	}
 
@@ -1581,14 +1581,22 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 {
 	unsigned long engine_id, engines = 0U;
 	struct tsg_gk20a *tsg;
+	bool deferred_reset_pending;
+	struct fifo_gk20a *f = &g->fifo;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-	gr_gk20a_disable_ctxsw(g);
 
-	if (!g->fifo.deferred_reset_pending) {
-		goto clean_up;
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	deferred_reset_pending = g->fifo.deferred_reset_pending;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+	if (!deferred_reset_pending) {
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		return 0;
 	}
 
+	gr_gk20a_disable_ctxsw(g);
+
 	tsg = tsg_gk20a_from_ch(ch);
 	if (tsg != NULL) {
 		engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
@@ -1610,8 +1618,10 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 	g->fifo.deferred_fault_engines = 0;
 	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 clean_up:
 	gr_gk20a_enable_ctxsw(g);
@@ -1632,9 +1642,10 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	bool verbose = true;
 	u32 grfifo_ctl;
 
-	nvgpu_log_fn(g, " ");
+	bool deferred_reset_pending = false;
+	struct fifo_gk20a *f = &g->fifo;
 
-	g->fifo.deferred_reset_pending = false;
+	nvgpu_log_fn(g, " ");
 
 	/* Disable power management */
 	if (g->support_pmu) {
@@ -1661,6 +1672,9 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 		gk20a_debug_dump(g);
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
 
 	/* go through all faulted engines */
 	for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) {
@@ -1761,17 +1775,17 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 				g->fifo.deferred_fault_engines |= BIT(engine_id);
 
 				/* handled during channel free */
+				nvgpu_mutex_acquire(&f->deferred_reset_mutex);
 				g->fifo.deferred_reset_pending = true;
+				nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+				deferred_reset_pending = true;
+
 				nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
 					"sm debugger attached,"
 					" deferring channel recovery to channel free");
 			} else {
-				/* if lock is already taken, a reset is taking place
-				so no need to repeat */
-				if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) {
-					gk20a_fifo_reset_engine(g, engine_id);
-					nvgpu_mutex_release(&g->fifo.gr_reset_mutex);
-				}
+				gk20a_fifo_reset_engine(g, engine_id);
 			}
 		}
 
@@ -1784,7 +1798,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked(
 	 * Disable the channel/TSG from hw and increment syncpoints.
 	 */
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (!fake_fault) {
@@ -1847,6 +1861,9 @@ static bool gk20a_fifo_handle_mmu_fault(
 
 	nvgpu_log_fn(g, " ");
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -1859,6 +1876,10 @@ static bool gk20a_fifo_handle_mmu_fault(
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	return verbose;
 }
 
@@ -1954,6 +1975,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 	g->ops.fifo.disable_tsg(tsg);
 
 	/*
+	 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
+	 * fifo_engine_status register. Also while the engine is held in reset
+	 * h/w passes busy/idle straight through. fifo_engine_status registers
+	 * are correct in that there is no context switch outstanding
+	 * as the CTXSW is aborted when reset is asserted.
+	 */
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
+	/*
 	 * stop context switching to prevent engine assignments from
 	 * changing until engine status is checked to make sure tsg
 	 * being recovered is not loaded on the engines
@@ -1980,6 +2011,9 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
 		}
 	}
 
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
+
 	if (engines) {
 		gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
 				rc_type);
@@ -2030,6 +2064,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false;
 	u32 rlid;
 
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_info(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
@@ -2094,6 +2131,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 0c9d9101..26365cae 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -184,7 +184,7 @@ struct fifo_gk20a {
 	/* zero-kref'd channels here */
 	struct nvgpu_list_node free_chs;
 	struct nvgpu_mutex free_chs_mutex;
-	struct nvgpu_mutex gr_reset_mutex;
+	struct nvgpu_mutex engines_reset_mutex;
 
 	struct tsg_gk20a *tsg;
 	struct nvgpu_mutex tsg_inuse_mutex;
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index b3c59f84..3c2de4f2 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -1024,6 +1024,11 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	u32 num_runlists = 0;
 	unsigned long runlist_served_pbdmas;
 
+	bool deferred_reset_pending = false;
+
+	nvgpu_log_info(g, "acquire engines_reset_mutex");
+	nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
+
 	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 		nvgpu_mutex_acquire(&f->runlist_info[rlid].
@@ -1094,8 +1099,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 
-	g->fifo.deferred_reset_pending = false;
-
 	/* Disable power management */
 	if (g->support_pmu) {
 		if (nvgpu_cg_pg_disable(g) != 0) {
@@ -1143,6 +1146,10 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		}
 	}
 
+	nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+	g->fifo.deferred_reset_pending = false;
+	nvgpu_mutex_release(&f->deferred_reset_mutex);
+
 	/* check if engine reset should be deferred */
 	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
 
@@ -1159,28 +1166,21 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 					gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
 					g->fifo.deferred_fault_engines |=
 							BIT(engine_id);
 
 					/* handled during channel free */
-					g->fifo.deferred_reset_pending = true;
-					nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
-						"sm debugger attached,"
-						" deferring channel recovery to channel free");
+					nvgpu_mutex_acquire(&f->deferred_reset_mutex);
+					g->fifo.deferred_reset_pending = true;
+					nvgpu_mutex_release(&f->deferred_reset_mutex);
+
+					deferred_reset_pending = true;
+
+					nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
+						"sm debugger attached,"
+						" deferring channel recovery to channel free");
 				} else {
-					/*
-					 * if lock is already taken, a reset is
-					 * taking place so no need to repeat
-					 */
-					if (nvgpu_mutex_tryacquire(
-						&g->fifo.gr_reset_mutex)) {
-
-						gk20a_fifo_reset_engine(g,
-							engine_id);
-
-						nvgpu_mutex_release(
-							&g->fifo.gr_reset_mutex);
-					}
+					gk20a_fifo_reset_engine(g, engine_id);
 				}
 			}
 		}
@@ -1191,7 +1191,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	gk20a_ctxsw_trace_tsg_reset(g, tsg);
 #endif
 	if (tsg) {
-		if (g->fifo.deferred_reset_pending) {
+		if (deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
 		} else {
 			if (rc_type == RC_TYPE_MMU_FAULT) {
@@ -1228,6 +1228,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 				runlist_lock);
 		}
 	}
+
+	nvgpu_log_info(g, "release engines_reset_mutex");
+	nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
 }
 
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)