Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 247
 1 file changed, 178 insertions(+), 69 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 56b954a9..4ef310b2 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -515,6 +515,9 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
 
 	init_runlist(g, f);
 
+	INIT_LIST_HEAD(&f->free_chs);
+	mutex_init(&f->free_chs_mutex);
+
 	for (chid = 0; chid < f->num_channels; chid++) {
 		f->channel[chid].userd_cpu_va =
 			f->userd.cpu_va + chid * f->userd_entry_size;
@@ -527,7 +530,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
 		gk20a_init_channel_support(g, chid);
 		gk20a_init_tsg_support(g, chid);
 	}
-	mutex_init(&f->ch_inuse_mutex);
 	mutex_init(&f->tsg_inuse_mutex);
 
 	f->remove_support = gk20a_remove_fifo_support;
@@ -637,6 +639,7 @@ int gk20a_init_fifo_support(struct gk20a *g)
 	return err;
 }
 
+/* return with a reference to the channel, caller must put it back */
 static struct channel_gk20a *
 channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
 {
@@ -644,10 +647,16 @@ channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
 	if (unlikely(!f->channel))
 		return NULL;
 	for (ci = 0; ci < f->num_channels; ci++) {
-		struct channel_gk20a *c = f->channel+ci;
-		if (c->inst_block.cpu_va &&
-			(inst_ptr == gk20a_mem_phys(&c->inst_block)))
-			return f->channel+ci;
+		struct channel_gk20a *ch = gk20a_channel_get(&f->channel[ci]);
+		/* only alive channels are searched */
+		if (!ch)
+			continue;
+
+		if (ch->inst_block.cpu_va &&
+			(inst_ptr == gk20a_mem_phys(&ch->inst_block)))
+			return ch;
+
+		gk20a_channel_put(ch);
 	}
 	return NULL;
 }
@@ -803,6 +812,7 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
 	return true;
 }
 
+/* caller must hold a channel reference */
 static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 		struct channel_gk20a *ch)
 {
@@ -854,14 +864,38 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
 		"TSG %d generated a mmu fault", tsg->tsgid);
 
 	mutex_lock(&tsg->ch_list_lock);
-	list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-		ret = gk20a_fifo_set_ctx_mmu_error(g, ch);
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+		if (gk20a_channel_get(ch)) {
+			if (!gk20a_fifo_set_ctx_mmu_error(g, ch))
+				ret = false;
+			gk20a_channel_put(ch);
+		}
+	}
 	mutex_unlock(&tsg->ch_list_lock);
 
 	return ret;
 }
 
-static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
+static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
+{
+	struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
+	struct channel_gk20a *ch;
+
+	mutex_lock(&tsg->ch_list_lock);
+	list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+		if (gk20a_channel_get(ch)) {
+			gk20a_channel_abort(ch);
+			gk20a_channel_put(ch);
+		}
+	}
+	mutex_unlock(&tsg->ch_list_lock);
+}
+
+static bool gk20a_fifo_handle_mmu_fault(
+	struct gk20a *g,
+	u32 mmu_fault_engines, /* queried from HW if 0 */
+	u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0 */
+	bool id_is_tsg)
 {
 	bool fake_fault;
 	unsigned long fault_id;
@@ -894,10 +928,8 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 			grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
 			gr_gpfifo_ctl_semaphore_access_f(0));
 
-	/* If we have recovery in progress, MMU fault id is invalid */
-	if (g->fifo.mmu_fault_engines) {
-		fault_id = g->fifo.mmu_fault_engines;
-		g->fifo.mmu_fault_engines = 0;
+	if (mmu_fault_engines) {
+		fault_id = mmu_fault_engines;
 		fake_fault = true;
 	} else {
 		fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
@@ -914,6 +946,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		struct fifo_mmu_fault_info_gk20a f;
 		struct channel_gk20a *ch = NULL;
 		struct tsg_gk20a *tsg = NULL;
+		struct channel_gk20a *referenced_channel = 0;
 		/* read and parse engine status */
 		u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
 		u32 ctx_status = fifo_engine_status_ctx_status_v(status);
@@ -953,22 +986,34 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		/* get the channel/TSG */
 		if (fake_fault) {
 			/* use next_id if context load is failing */
-			u32 id = (ctx_status ==
-				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
-				fifo_engine_status_next_id_v(status) :
-				fifo_engine_status_id_v(status);
-			u32 type = (ctx_status ==
-				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
-				fifo_engine_status_next_id_type_v(status) :
-				fifo_engine_status_id_type_v(status);
+			u32 id, type;
+
+			if (hw_id == ~(u32)0) {
+				id = (ctx_status ==
+					fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+					fifo_engine_status_next_id_v(status) :
+					fifo_engine_status_id_v(status);
+				type = (ctx_status ==
+					fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+					fifo_engine_status_next_id_type_v(status) :
+					fifo_engine_status_id_type_v(status);
+			} else {
+				id = hw_id;
+				type = id_is_tsg ?
+					fifo_engine_status_id_type_tsgid_v() :
+					fifo_engine_status_id_type_chid_v();
+			}
 
 			if (type == fifo_engine_status_id_type_tsgid_v())
 				tsg = &g->fifo.tsg[id];
-			else if (type == fifo_engine_status_id_type_chid_v())
+			else if (type == fifo_engine_status_id_type_chid_v()) {
 				ch = &g->fifo.channel[id];
+				referenced_channel = gk20a_channel_get(ch);
+			}
 		} else {
 			/* read channel based on instruction pointer */
 			ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr);
+			referenced_channel = ch;
 		}
 
 		if (ch && gk20a_is_channel_marked_as_tsg(ch))
@@ -977,7 +1022,7 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		/* check if engine reset should be deferred */
 		if ((ch || tsg) && gk20a_fifo_should_defer_engine_reset(g,
 				engine_id, &f, fake_fault)) {
-			g->fifo.mmu_fault_engines = fault_id;
+			g->fifo.deferred_fault_engines = fault_id;
 
 			/* handled during channel free */
 			g->fifo.deferred_reset_pending = true;
@@ -988,19 +1033,31 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		 * syncpoints */
 
 		if (tsg) {
-			struct channel_gk20a *ch = NULL;
 			if (!g->fifo.deferred_reset_pending)
 				verbose =
 					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-			mutex_lock(&tsg->ch_list_lock);
-			list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-				gk20a_channel_abort(ch);
-			mutex_unlock(&tsg->ch_list_lock);
+
+			gk20a_fifo_abort_tsg(g, ch->tsgid);
+
+			/* put back the ref taken early above */
+			if (referenced_channel) {
+				gk20a_channel_put(ch);
+			} else {
+				gk20a_err(dev_from_gk20a(g),
+					"mmu error in freed tsg channel %d on tsgid %d",
+					ch->hw_chid, ch->tsgid);
+			}
 		} else if (ch) {
-			if (!g->fifo.deferred_reset_pending)
-				verbose =
-					gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
-			gk20a_channel_abort(ch);
+			if (referenced_channel) {
+				if (!g->fifo.deferred_reset_pending)
+					verbose = gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+				gk20a_channel_abort(ch);
+				gk20a_channel_put(ch);
+			} else {
+				gk20a_err(dev_from_gk20a(g),
+					"mmu error in freed channel %d",
+					ch->hw_chid);
+			}
 		} else if (f.inst_ptr ==
 				gk20a_mem_phys(&g->mm.bar1.inst_block)) {
 			gk20a_err(dev_from_gk20a(g), "mmu fault from bar1");
@@ -1133,46 +1190,69 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
 
 void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
 {
-	u32 engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
+	u32 engines;
+
+	/* stop context switching to prevent engine assignments from
+	   changing until channel is recovered */
+	mutex_lock(&g->dbg_sessions_lock);
+	gr_gk20a_disable_ctxsw(g);
+
+	engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
+
 	if (engines)
-		gk20a_fifo_recover(g, engines, verbose);
+		gk20a_fifo_recover(g, engines, hw_chid, false, verbose);
 	else {
-		struct channel_gk20a *ch =
-			g->fifo.channel + hw_chid;
+		struct channel_gk20a *ch = &g->fifo.channel[hw_chid];
 
-		gk20a_channel_abort(ch);
+		if (gk20a_channel_get(ch)) {
+			gk20a_channel_abort(ch);
 
-		if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch))
-			gk20a_debug_dump(g->dev);
+			if (gk20a_fifo_set_ctx_mmu_error_ch(g, ch))
+				gk20a_debug_dump(g->dev);
+
+			gk20a_channel_put(ch);
+		}
 	}
+
+	gr_gk20a_enable_ctxsw(g);
+	mutex_unlock(&g->dbg_sessions_lock);
 }
 
 void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 {
-	u32 engines = gk20a_fifo_engines_on_id(g, tsgid, true);
+	u32 engines;
+
+	/* stop context switching to prevent engine assignments from
+	   changing until TSG is recovered */
+	mutex_lock(&g->dbg_sessions_lock);
+	gr_gk20a_disable_ctxsw(g);
+
+	engines = gk20a_fifo_engines_on_id(g, tsgid, true);
+
 	if (engines)
-		gk20a_fifo_recover(g, engines, verbose);
+		gk20a_fifo_recover(g, engines, tsgid, true, verbose);
 	else {
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
-		struct channel_gk20a *ch;
 
 		if (gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg))
			gk20a_debug_dump(g->dev);
 
-		mutex_lock(&tsg->ch_list_lock);
-		list_for_each_entry(ch, &tsg->ch_list, ch_entry)
-			gk20a_channel_abort(ch);
-		mutex_unlock(&tsg->ch_list_lock);
+		gk20a_fifo_abort_tsg(g, tsgid);
 	}
+
+	gr_gk20a_enable_ctxsw(g);
+	mutex_unlock(&g->dbg_sessions_lock);
 }
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
-		bool verbose)
+		u32 hw_id, bool id_is_tsg,
+		bool verbose)
 {
 	unsigned long engine_id, i;
 	unsigned long _engine_ids = __engine_ids;
 	unsigned long engine_ids = 0;
 	u32 val;
+	u32 mmu_fault_engines = 0;
 
 	if (verbose)
 		gk20a_debug_dump(g->dev);
@@ -1181,7 +1261,6 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	g->ops.ltc.flush(g);
 
 	/* store faulted engines in advance */
-	g->fifo.mmu_fault_engines = 0;
 	for_each_set_bit(engine_id, &_engine_ids, 32) {
 		u32 ref_type;
 		u32 ref_id;
@@ -1196,11 +1275,10 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 			gk20a_fifo_get_faulty_id_type(g, i, &id, &type);
 			if (ref_type == type && ref_id == id) {
 				engine_ids |= BIT(i);
-				g->fifo.mmu_fault_engines |=
+				mmu_fault_engines |=
 					BIT(gk20a_engine_id_to_mmu_id(i));
 			}
 		}
-
 	}
 
 	/*
@@ -1214,7 +1292,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 		fifo_intr_0_sched_error_reset_f());
 
 	g->ops.fifo.trigger_mmu_fault(g, engine_ids);
-	gk20a_fifo_handle_mmu_fault(g);
+	gk20a_fifo_handle_mmu_fault(g, engine_ids, hw_id, id_is_tsg);
 
 	val = gk20a_readl(g, fifo_intr_en_0_r());
 	val |= fifo_intr_en_0_mmu_fault_f(1)
@@ -1222,25 +1300,32 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	gk20a_writel(g, fifo_intr_en_0_r(), val);
 }
 
+/* force reset channel and tsg (if it's part of one) */
 int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose)
 {
 	struct tsg_gk20a *tsg = NULL;
 	struct channel_gk20a *ch_tsg = NULL;
+	struct gk20a *g = ch->g;
 
 	if (gk20a_is_channel_marked_as_tsg(ch)) {
-		tsg = &ch->g->fifo.tsg[ch->hw_chid];
+		tsg = &g->fifo.tsg[ch->hw_chid];
 
 		mutex_lock(&tsg->ch_list_lock);
+
 		list_for_each_entry(ch_tsg, &tsg->ch_list, ch_entry) {
-			gk20a_set_error_notifier(ch_tsg,
-				NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
+			if (gk20a_channel_get(ch_tsg)) {
+				gk20a_set_error_notifier(ch_tsg,
+					NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
+				gk20a_channel_put(ch_tsg);
+			}
 		}
+
 		mutex_unlock(&tsg->ch_list_lock);
-		gk20a_fifo_recover_tsg(ch->g, ch->tsgid, verbose);
+		gk20a_fifo_recover_tsg(g, ch->tsgid, verbose);
 	} else {
 		gk20a_set_error_notifier(ch,
 			NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR);
-		gk20a_fifo_recover_ch(ch->g, ch->hw_chid, verbose);
+		gk20a_fifo_recover_ch(g, ch->hw_chid, verbose);
 	}
 
 	return 0;
@@ -1300,11 +1385,14 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 		struct channel_gk20a *ch = &f->channel[id];
 
 		if (non_chid) {
-			gk20a_fifo_recover(g, BIT(engine_id), true);
+			gk20a_fifo_recover(g, BIT(engine_id), id, true, true);
 			ret = true;
 			goto err;
 		}
 
+		if (!gk20a_channel_get(ch))
+			goto err;
+
 		if (gk20a_channel_update_and_check_timeout(ch,
 			GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) {
 			gk20a_set_error_notifier(ch,
@@ -1313,7 +1401,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 				"fifo sched ctxsw timeout error:"
 				"engine = %u, ch = %d", engine_id, id);
 			gk20a_gr_debug_dump(g->dev);
-			gk20a_fifo_recover(g, BIT(engine_id),
+			gk20a_fifo_recover(g, BIT(engine_id), id, false,
 				ch->timeout_debug_dump);
 			ret = true;
 		} else {
@@ -1324,6 +1412,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 				id);
 			ret = false;
 		}
+		gk20a_channel_put(ch);
 		return ret;
 	}
 
@@ -1336,7 +1425,7 @@ err:
 
 static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
 {
-	bool print_channel_reset_log = false, reset_engine = false;
+	bool print_channel_reset_log = false;
 	struct device *dev = dev_from_gk20a(g);
 	u32 handled = 0;
 
@@ -1367,8 +1456,8 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
 	}
 
 	if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) {
-		print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g);
-		reset_engine = true;
+		print_channel_reset_log =
+			gk20a_fifo_handle_mmu_fault(g, 0, ~(u32)0, false);
 		handled |= fifo_intr_0_mmu_fault_pending_f();
 	}
 
@@ -1452,9 +1541,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
 			== fifo_pbdma_status_id_type_chid_v()) {
 			struct channel_gk20a *ch = &f->channel[id];
 
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_PBDMA_ERROR);
-			gk20a_fifo_recover_ch(g, id, true);
+			if (gk20a_channel_get(ch)) {
+				gk20a_set_error_notifier(ch,
+					NVGPU_CHANNEL_PBDMA_ERROR);
+				gk20a_fifo_recover_ch(g, id, true);
+				gk20a_channel_put(ch);
+			}
 		} else if (fifo_pbdma_status_id_type_v(status)
 			== fifo_pbdma_status_id_type_tsgid_v()) {
 			struct tsg_gk20a *tsg = &f->tsg[id];
@@ -1462,8 +1554,11 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
 
 			mutex_lock(&tsg->ch_list_lock);
 			list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
-				gk20a_set_error_notifier(ch,
-					NVGPU_CHANNEL_PBDMA_ERROR);
+				if (gk20a_channel_get(ch)) {
+					gk20a_set_error_notifier(ch,
+						NVGPU_CHANNEL_PBDMA_ERROR);
+					gk20a_channel_put(ch);
+				}
 			}
 			mutex_unlock(&tsg->ch_list_lock);
 			gk20a_fifo_recover_tsg(g, id, true);
@@ -1559,6 +1654,8 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 		+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
 	u32 ret = 0;
 
+	gk20a_dbg_fn("%d", id);
+
 	/* issue preempt */
 	if (is_tsg)
 		gk20a_writel(g, fifo_preempt_r(),
@@ -1569,6 +1666,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 			fifo_preempt_chid_f(id) |
 			fifo_preempt_type_channel_f());
 
+	gk20a_dbg_fn("%d", id);
 	/* wait for preempt */
 	ret = -EBUSY;
 	do {
@@ -1583,6 +1681,7 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 	} while (time_before(jiffies, end_jiffies) ||
 			!tegra_platform_is_silicon());
 
+	gk20a_dbg_fn("%d", id);
 	if (ret) {
 		if (is_tsg) {
 			struct tsg_gk20a *tsg = &g->fifo.tsg[id];
@@ -1593,8 +1692,11 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 
 			mutex_lock(&tsg->ch_list_lock);
 			list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
+				if (!gk20a_channel_get(ch))
+					continue;
 				gk20a_set_error_notifier(ch,
 					NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				gk20a_channel_put(ch);
 			}
 			mutex_unlock(&tsg->ch_list_lock);
 			gk20a_fifo_recover_tsg(g, id, true);
@@ -1604,9 +1706,12 @@ static int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
 			gk20a_err(dev_from_gk20a(g),
 				"preempt channel %d timeout\n", id);
 
-			gk20a_set_error_notifier(ch,
-				NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
-			gk20a_fifo_recover_ch(g, id, true);
+			if (gk20a_channel_get(ch)) {
+				gk20a_set_error_notifier(ch,
+					NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				gk20a_fifo_recover_ch(g, id, true);
+				gk20a_channel_put(ch);
+			}
 		}
 	}
 
@@ -1790,7 +1895,9 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
 			(f->engine_info[i].runlist_id == runlist_id))
 			engines |= BIT(i);
 	}
-	gk20a_fifo_recover(g, engines, true);
+
+	if (engines)
+		gk20a_fifo_recover(g, engines, ~(u32)0, false, true);
 }
 
 static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)
@@ -1994,6 +2101,8 @@ int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid,
 	u32 mutex_ret;
 	u32 ret = 0;
 
+	gk20a_dbg_fn("");
+
 	runlist = &f->runlist_info[runlist_id];
 
 	mutex_lock(&runlist->mutex);
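
The recurring pattern in this patch is: take a reference with gk20a_channel_get() before touching a channel that may be freed concurrently, skip the channel if the get fails (it is already dead), and drop the reference with gk20a_channel_put() when done. A minimal stand-alone sketch of that idea follows; the struct and helper names here are simplified stand-ins for illustration only, not the real driver types.

```c
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct channel_gk20a: just enough state to
 * illustrate the get/put discipline applied throughout this patch. */
struct channel {
	int id;
	bool in_use;	/* false once the channel has been freed */
	int refcount;
};

/* Hypothetical analogue of gk20a_channel_get(): succeeds only while the
 * channel is alive, otherwise returns NULL so callers skip it. */
static struct channel *channel_get(struct channel *ch)
{
	if (!ch->in_use)
		return NULL;
	ch->refcount++;
	return ch;
}

/* Hypothetical analogue of gk20a_channel_put(): drops the reference. */
static void channel_put(struct channel *ch)
{
	ch->refcount--;
}

int main(void)
{
	struct channel channels[] = {
		{ .id = 0, .in_use = true },
		{ .id = 1, .in_use = false },	/* already freed */
		{ .id = 2, .in_use = true },
	};
	int i;

	for (i = 0; i < 3; i++) {
		struct channel *ch = channel_get(&channels[i]);

		/* only alive channels are handled, mirroring the loops above */
		if (!ch)
			continue;

		printf("handling channel %d\n", ch->id);
		channel_put(ch);
	}
	return 0;
}
```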