Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/fifo_gv11b.c')
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 469
1 file changed, 164 insertions, 305 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 1fe98c35..9c64675f 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -381,24 +381,17 @@ u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g)
 
 u32 gv11b_fifo_get_preempt_timeout(struct gk20a *g)
 {
-	/* if timeouts are enabled, using 3000ms timeout
-	 * for polling pdma/eng/runlist might kick in
-	 * timeout handler in the cases where preempt
-	 * is stuck. Use 1000ms timeout for polling when
-	 * timeouts are enabled */
-	return nvgpu_is_timeouts_enabled(g) ? PREEMPT_TIMEOUT_1000_MS :
-			g->gr_idle_timeout_default;
+	return gk20a_get_gr_idle_timeout(g);
 }
 
 static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
-			u32 pbdma_id)
+			u32 pbdma_id, unsigned int timeout_rc_type)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 pbdma_stat;
 	u32 chan_stat;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
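For context, the comment removed above explained the old timeout policy: when timeout handling is enabled, poll with a shorter 1000 ms budget so the preempt poll itself cannot outlast the timeout handler; otherwise fall back to the default gr idle timeout. A minimal standalone sketch of that selection policy, assuming a simple flag in place of nvgpu_is_timeouts_enabled() and placeholder constants (plain C, not the nvgpu API):

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_TIMEOUT_1000_MS    1000U
#define DEFAULT_GR_IDLE_TIMEOUT_MS 3000U  /* placeholder for gr_idle_timeout_default */

/* Placeholder for nvgpu_is_timeouts_enabled(); here it is just a flag. */
static bool timeouts_enabled = true;

static unsigned int get_preempt_poll_budget_ms(void)
{
	/* Old policy from the removed lines: shorter poll budget when
	 * timeout handling is enabled, default budget otherwise. */
	return timeouts_enabled ? PREEMPT_TIMEOUT_1000_MS
				: DEFAULT_GR_IDLE_TIMEOUT_MS;
}

int main(void)
{
	printf("preempt poll budget: %u ms\n", get_preempt_poll_budget_ms());
	return 0;
}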
@@ -407,14 +400,6 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 	nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
 	/* Verify that ch/tsg is no longer on the pbdma */
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt pbdma retries: %u",
-						loop_count);
-				break;
-			}
-			loop_count++;
-		}
 		/*
 		 * If the PBDMA has a stalling interrupt and receives a NACK,
 		 * the PBDMA won't save out until the STALLING interrupt is
@@ -467,24 +452,21 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-
-	if (ret)
-		nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
-				"tsgid: %u", pbdma_id, pbdma_stat, id);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+			"preempt timeout pbdma"));
 	return ret;
 }
 
 static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
-			u32 act_eng_id, u32 *reset_eng_bitmask)
+			u32 act_eng_id, u32 *reset_eng_bitmask,
+			unsigned int timeout_rc_type)
 {
 	struct nvgpu_timeout timeout;
 	unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
 	u32 eng_stat;
 	u32 ctx_stat;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
-	u32 eng_intr_pending;
+	bool stall_intr = false;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
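Both polling helpers touched above share the same wait pattern: check a status condition, and if it is not yet met, sleep with an exponentially growing delay capped at a maximum, until a millisecond budget expires. A self-contained userspace sketch of that pattern (POSIX C; the condition check is a stub standing in for the FIFO status register reads, and the constants are placeholders, not the nvgpu ones):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define POLL_DELAY_MIN_US 10UL    /* stands in for GR_IDLE_CHECK_DEFAULT */
#define POLL_DELAY_MAX_US 200UL   /* stands in for GR_IDLE_CHECK_MAX */

static bool condition_met(void)
{
	/* Stub: in the driver this is the pbdma/engine status decode. */
	static int calls;
	return ++calls > 5;
}

static long elapsed_ms(const struct timespec *start)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - start->tv_sec) * 1000L +
	       (now.tv_nsec - start->tv_nsec) / 1000000L;
}

/* Returns 0 on success, -1 if the budget (in ms) expired first. */
static int poll_with_backoff(long budget_ms)
{
	unsigned long delay = POLL_DELAY_MIN_US;
	struct timespec start;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		if (condition_met())
			return 0;
		usleep(delay);
		/* double the delay each iteration, but cap it */
		delay = delay << 1;
		if (delay > POLL_DELAY_MAX_US)
			delay = POLL_DELAY_MAX_US;
	} while (elapsed_ms(&start) < budget_ms);

	return -1;
}

int main(void)
{
	printf("poll result: %d\n", poll_with_backoff(1000));
	return 0;
}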
@@ -494,56 +476,20 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			act_eng_id);
 	/* Check if ch/tsg has saved off the engine or if ctxsw is hung */
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt eng retries: %u",
-						loop_count);
-				break;
-			}
-			loop_count++;
-		}
 		eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
 		ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
 
-		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
-				&eng_intr_pending)) {
-		/* From h/w team
-		 * Engine save can be blocked by eng stalling interrupts.
-		 * FIFO interrupts shouldn’t block an engine save from
-		 * finishing, but could block FIFO from reporting preempt done.
-		 * No immediate reason to reset the engine if FIFO interrupt is
-		 * pending.
-		 * The hub, priv_ring, and ltc interrupts could block context
-		 * switch (or memory), but doesn’t necessarily have to.
-		 * For Hub interrupts they just report access counters and page
-		 * faults. Neither of these necessarily block context switch
-		 * or preemption, but they could.
-		 * For example a page fault for graphics would prevent graphics
-		 * from saving out. An access counter interrupt is a
-		 * notification and has no effect.
-		 * SW should handle page faults though for preempt to complete.
-		 * PRI interrupt (due to a failed PRI transaction) will result
-		 * in ctxsw failure reported to HOST.
-		 * LTC interrupts are generally ECC related and if so,
-		 * certainly don’t block preemption/ctxsw but they could.
-		 * Bus interrupts shouldn’t have anything to do with preemption
-		 * state as they are part of the Host EXT pipe, though they may
-		 * exhibit a symptom that indicates that GPU is in a bad state.
-		 * To be completely fair, when an engine is preempting SW
-		 * really should just handle other interrupts as they come in.
-		 * It’s generally bad to just poll and wait on a preempt
-		 * to complete since there are many things in the GPU which may
-		 * cause a system to hang/stop responding.
-		 */
+		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) {
+			stall_intr = true;
 			nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
 				"stall intr set, "
-				"preemption might not finish");
+				"preemption will not finish");
 		}
 		if (ctx_stat ==
 			fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
-			if (eng_intr_pending) {
-				/* if eng intr, stop polling */
+			if (stall_intr) {
+				/* if stall intr stop polling */
 				*reset_eng_bitmask |= BIT(act_eng_id);
 				ret = 0;
 				break;
@@ -555,7 +501,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			fifo_engine_status_ctx_status_ctxsw_save_v()) {
 
 			if (id == fifo_engine_status_id_v(eng_stat)) {
-				if (eng_intr_pending) {
+				if (stall_intr ||
+					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -571,7 +518,9 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			fifo_engine_status_ctx_status_ctxsw_load_v()) {
 
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
-				if (eng_intr_pending) {
+
+				if (stall_intr ||
+					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -591,21 +540,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-
-	if (ret) {
-		/*
-		 * The reasons a preempt can fail are:
-		 * 1.Some other stalling interrupt is asserted preventing
-		 * channel or context save.
-		 * 2.The memory system hangs.
-		 * 3.The engine hangs during CTXSW.
-		 */
-		nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
-			act_eng_id, ctx_stat, id);
-		*reset_eng_bitmask |= BIT(act_eng_id);
-	}
-
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+			"preempt timeout eng"));
 	return ret;
 }
 
@@ -652,19 +588,29 @@ static void gv11b_reset_pbdma_faulted_tsg(struct tsg_gk20a *tsg)
 }
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
-			struct tsg_gk20a *tsg,
+			struct channel_gk20a *refch,
 			u32 faulted_pbdma, u32 faulted_engine)
 {
-	if (!tsg)
-		return;
+	struct tsg_gk20a *tsg;
 
 	nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
 			faulted_pbdma, faulted_engine);
 
-	if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
-		gv11b_reset_pbdma_faulted_tsg(tsg);
-	if (faulted_engine != FIFO_INVAL_ENGINE_ID)
-		gv11b_reset_eng_faulted_tsg(tsg);
+	if (!refch)
+		return;
+
+	if (gk20a_is_channel_marked_as_tsg(refch)) {
+		tsg = &g->fifo.tsg[refch->tsgid];
+		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+			gv11b_reset_pbdma_faulted_tsg(tsg);
+		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+			gv11b_reset_eng_faulted_tsg(tsg);
+	} else {
+		if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
+			gv11b_reset_pbdma_faulted_ch(g, refch->chid);
+		if (faulted_engine != FIFO_INVAL_ENGINE_ID)
+			gv11b_reset_eng_faulted_ch(g, refch->chid);
+	}
 }
 
 static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
@@ -674,7 +620,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 	u32 runlists_mask = 0;
 	struct fifo_gk20a *f = &g->fifo;
 	struct fifo_runlist_info_gk20a *runlist;
-	u32 rlid, pbdma_bitmask = 0;
+	u32 pbdma_bitmask = 0;
 
 	if (id_type != ID_TYPE_UNKNOWN) {
 		if (id_type == ID_TYPE_TSG)
@@ -689,31 +635,31 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
 		if (mmfault->faulted_pbdma != FIFO_INVAL_PBDMA_ID)
 			pbdma_bitmask = BIT(mmfault->faulted_pbdma);
 
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
+		for (id = 0; id < f->max_runlists; id++) {
 
-			runlist = &f->runlist_info[rlid];
+			runlist = &f->runlist_info[id];
 
 			if (runlist->eng_bitmask & act_eng_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 
 			if (runlist->pbdma_bitmask & pbdma_bitmask)
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 		}
 	}
 
 	if (id_type == ID_TYPE_UNKNOWN) {
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
+		for (id = 0; id < f->max_runlists; id++) {
 			if (act_eng_bitmask) {
 				/* eng ids are known */
-				runlist = &f->runlist_info[rlid];
+				runlist = &f->runlist_info[id];
 				if (runlist->eng_bitmask & act_eng_bitmask)
 					runlists_mask |=
-						fifo_sched_disable_runlist_m(rlid);
+						fifo_sched_disable_runlist_m(id);
 			} else {
 				runlists_mask |=
-					fifo_sched_disable_runlist_m(rlid);
+					fifo_sched_disable_runlist_m(id);
 			}
 		}
 	}
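gv11b_fifo_get_runlists_mask marks a runlist for disable/preempt whenever one of its engines or PBDMAs appears in the fault bitmasks. A small standalone sketch of that mask computation, assuming hypothetical per-runlist bitmasks in place of the fifo_runlist_info_gk20a data (plain C, not the driver types):

#include <stdio.h>

#define MAX_RUNLISTS 3U

struct runlist_info {
	unsigned int eng_bitmask;
	unsigned int pbdma_bitmask;
};

/* Hypothetical topology: which engines/PBDMAs each runlist serves. */
static const struct runlist_info runlists[MAX_RUNLISTS] = {
	{ .eng_bitmask = 0x1, .pbdma_bitmask = 0x1 },
	{ .eng_bitmask = 0x2, .pbdma_bitmask = 0x2 },
	{ .eng_bitmask = 0x4, .pbdma_bitmask = 0x4 },
};

static unsigned int get_runlists_mask(unsigned int act_eng_bitmask,
				      unsigned int pbdma_bitmask)
{
	unsigned int mask = 0, rlid;

	for (rlid = 0; rlid < MAX_RUNLISTS; rlid++) {
		/* select the runlist if it serves a faulted engine or PBDMA */
		if (runlists[rlid].eng_bitmask & act_eng_bitmask)
			mask |= 1U << rlid;
		if (runlists[rlid].pbdma_bitmask & pbdma_bitmask)
			mask |= 1U << rlid;
	}
	return mask;
}

int main(void)
{
	printf("runlists_mask = 0x%x\n", get_runlists_mask(0x2, 0x0));
	return 0;
}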
@@ -745,20 +691,10 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 	struct nvgpu_timeout timeout;
 	u32 delay = GR_IDLE_CHECK_DEFAULT;
 	int ret = -EBUSY;
-	unsigned int loop_count = 0;
 
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
 			NVGPU_TIMER_CPU_TIMER);
 	do {
-		if (!nvgpu_platform_is_silicon(g)) {
-			if (loop_count >= MAX_PRE_SI_RETRIES) {
-				nvgpu_err(g, "preempt runlist retries: %u",
-						loop_count);
-				break;
-			}
-			loop_count++;
-		}
-
 		if (!((gk20a_readl(g, fifo_runlist_preempt_r())) &
 				runlists_mask)) {
 			ret = 0;
@@ -768,16 +704,13 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired(&timeout));
-
-	if (ret)
-		nvgpu_err(g, "preempt runlist timeout, runlists_mask:0x%08x",
-				runlists_mask);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+			"runlist preempt timeout"));
 	return ret;
 }
 
 int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, unsigned int timeout_rc_type)
 {
 	struct fifo_gk20a *f = &g->fifo;
 	unsigned long runlist_served_pbdmas;
@@ -785,6 +718,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
+	int func_ret;
 	int ret = 0;
 	u32 tsgid;
 
@@ -801,14 +735,30 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
 
-	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
-		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
+	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
+
+		func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
+				timeout_rc_type);
+		if (func_ret != 0) {
+			nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
+			ret |= func_ret;
+		}
+	}
 
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
 
-	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
-		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
-			&f->runlist_info[runlist_id].reset_eng_bitmask);
+	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
+
+		func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+				&f->runlist_info[runlist_id].reset_eng_bitmask,
+				timeout_rc_type);
+
+		if (func_ret != 0) {
+			nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
+			ret |= func_ret;
+		}
+	}
+
 	return ret;
 }
 
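gv11b_fifo_is_preempt_pending walks every PBDMA and engine served by the runlist and ORs the individual poll results into one return value, so a single timed-out unit makes the whole call report failure. A minimal sketch of that aggregation, with stub poll functions standing in for the driver helpers (plain C):

#include <stdio.h>

/* Stubs standing in for the per-PBDMA and per-engine poll helpers;
 * they return 0 on success or a negative errno-style value on timeout. */
static int poll_pbdma(unsigned int pbdma_id) { return pbdma_id == 1 ? -16 : 0; }
static int poll_engine(unsigned int eng_id)  { (void)eng_id; return 0; }

static int is_preempt_pending(unsigned long served_pbdmas,
			      unsigned long served_engines)
{
	int ret = 0;
	unsigned int id;

	/* OR each result into ret: any failure makes ret non-zero */
	for (id = 0; id < 8; id++)
		if (served_pbdmas & (1UL << id))
			ret |= poll_pbdma(id);

	for (id = 0; id < 8; id++)
		if (served_engines & (1UL << id))
			ret |= poll_engine(id);

	return ret;
}

int main(void)
{
	printf("ret = %d\n", is_preempt_pending(0x3, 0x1));
	return 0;
}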
@@ -887,9 +837,6 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 
 	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_disable_tsg_sched(g, &f->tsg[tsgid]);
-
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	ret = __locked_fifo_preempt(g, tsgid, true);
@@ -897,9 +844,6 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	if (!mutex_ret)
 		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	/* WAR for Bug 2065990 */
-	gk20a_fifo_enable_tsg_sched(g, &f->tsg[tsgid]);
-
 	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
 
 	if (ret)
@@ -908,36 +852,44 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
 	return ret;
 }
 
-static void gv11b_fifo_locked_preempt_runlists(struct gk20a *g, u32 runlists_mask)
+static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask)
 {
 	int ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
-	u32 rlid;
+	u32 runlist_id;
+
+	nvgpu_log_fn(g, " ");
 
-	/* runlist_lock are locked by teardown and sched are disabled too */
-	nvgpu_log_fn(g, "preempt runlists_mask:0x%08x", runlists_mask);
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+		if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id))
+			nvgpu_mutex_acquire(&g->fifo.
+				runlist_info[runlist_id].runlist_lock);
+	}
 
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
 	ret = __locked_fifo_preempt_runlists(g, runlists_mask);
 
-	if (ret) {
-		/* if preempt timed out, reset engs served by runlists */
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
-			if (runlists_mask &
-				fifo_runlist_preempt_runlist_m(rlid))
-				g->fifo.runlist_info[rlid].reset_eng_bitmask =
-					g->fifo.runlist_info[rlid].eng_bitmask;
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
+		if (runlists_mask &
+			fifo_runlist_preempt_runlist_m(runlist_id)) {
+			/* during recovery reset engs served by this runlist */
+			g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
+				g->fifo.runlist_info[runlist_id].eng_bitmask;
+			nvgpu_mutex_release(&g->fifo.
+				runlist_info[runlist_id].runlist_lock);
 		}
 	}
 
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+	return ret;
 }
 
 static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-		unsigned int id_type)
+		unsigned int id_type, unsigned int timeout_rc_type)
 {
 	int ret;
 	struct fifo_gk20a *f = &g->fifo;
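The reworked gv11b_fifo_preempt_runlists (right-hand side of the hunk above) takes the runlist_lock of every runlist selected by the mask, issues the preempt, then marks each selected runlist's engines for reset and drops its lock on the way out. A pthread-based sketch of that lock-by-mask pattern, using hypothetical types rather than the nvgpu locking API (userspace C):

#include <pthread.h>
#include <stdio.h>

#define MAX_RUNLISTS 3U

struct runlist {
	pthread_mutex_t lock;
	unsigned int eng_bitmask;
	unsigned int reset_eng_bitmask;
};

static struct runlist runlists[MAX_RUNLISTS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0x1, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0x2, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0x4, 0 },
};

static void preempt_runlists(unsigned int runlists_mask)
{
	unsigned int rlid;

	/* Take the lock of every runlist named in the mask. */
	for (rlid = 0; rlid < MAX_RUNLISTS; rlid++)
		if (runlists_mask & (1U << rlid))
			pthread_mutex_lock(&runlists[rlid].lock);

	/* ... issue the runlist preempt and wait for it here ... */

	/* Mark served engines for reset and release the locks. */
	for (rlid = 0; rlid < MAX_RUNLISTS; rlid++) {
		if (runlists_mask & (1U << rlid)) {
			runlists[rlid].reset_eng_bitmask =
				runlists[rlid].eng_bitmask;
			pthread_mutex_unlock(&runlists[rlid].lock);
		}
	}
}

int main(void)
{
	preempt_runlists(0x5);
	printf("reset_eng_bitmask[0] = 0x%x\n", runlists[0].reset_eng_bitmask);
	return 0;
}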
@@ -951,97 +903,52 @@ static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
 	gk20a_fifo_issue_preempt(g, id, true);
 
 	/* wait for preempt */
-	ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
+	ret = g->ops.fifo.is_preempt_pending(g, id, id_type,
+					timeout_rc_type);
 
-	/* No recovery even if preempt timed out since
-	 * this is called from recovery path
-	 */
+	if (ret && (timeout_rc_type == PREEMPT_TIMEOUT_RC))
+		gk20a_fifo_preempt_timeout_rc(g, id, id_type);
 
 	return ret;
 }
 
 
 int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
-			unsigned int id_type)
+			unsigned int id_type, unsigned int timeout_rc_type)
 {
+	struct fifo_gk20a *f = &g->fifo;
 	u32 ret = 0;
 	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
 	u32 mutex_ret = 0;
+	u32 runlist_id;
 
-	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-	/*
-	 * This is called from teardown path only. runlist_lock
-	 * is already acquired before calling this function.
-	 */
-	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type);
-
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
-
-	return ret;
+	if (id_type == ID_TYPE_TSG)
+		runlist_id = f->tsg[id].runlist_id;
+	else if (id_type == ID_TYPE_CHANNEL)
+		runlist_id = f->channel[id].runlist_id;
+	else
+		return -EINVAL;
 
-}
+	if (runlist_id >= g->fifo.max_runlists) {
+		nvgpu_log_info(g, "runlist_id = %d", runlist_id);
+		return -EINVAL;
+	}
 
-static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
-			unsigned int rc_type,
-			u32 runlists_mask)
-{
-	struct tsg_gk20a *tsg = NULL;
-	u32 rlid, tsgid;
-	struct fifo_runlist_info_gk20a *runlist = NULL;
-	u32 token = PMU_INVALID_MUTEX_OWNER_ID;
-	u32 mutex_ret = 0;
-	bool add = false, wait_for_finish = false;
-	int err;
+	nvgpu_log_fn(g, "preempt id = %d, runlist_id = %d", id, runlist_id);
 
-	nvgpu_err(g, "runlist id unknown, abort active tsgs in runlists");
+	nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
 
-	/* runlist_lock are locked by teardown */
 	mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-	for (rlid = 0; rlid < g->fifo.max_runlists;
-						rlid++) {
-		if (!(runlists_mask & BIT(rlid)))
-			continue;
-		nvgpu_log(g, gpu_dbg_info, "abort runlist id %d",
-				rlid);
-		runlist = &g->fifo.runlist_info[rlid];
-
-		for_each_set_bit(tsgid, runlist->active_tsgs,
-			g->fifo.num_channels) {
-			nvgpu_log(g, gpu_dbg_info, "abort tsg id %d", tsgid);
-			tsg = &g->fifo.tsg[tsgid];
-			gk20a_disable_tsg(tsg);
-
-			/* assume all pbdma and eng faulted are set */
-			nvgpu_log(g, gpu_dbg_info, "reset pbdma and eng faulted");
-			gv11b_reset_pbdma_faulted_tsg(tsg);
-			gv11b_reset_eng_faulted_tsg(tsg);
+	ret = __locked_fifo_preempt_ch_tsg(g, id, id_type, timeout_rc_type);
 
-#ifdef CONFIG_GK20A_CTXSW_TRACE
-			gk20a_ctxsw_trace_tsg_reset(g, tsg);
-#endif
-			if (!g->fifo.deferred_reset_pending) {
-				if (rc_type == RC_TYPE_MMU_FAULT) {
-					gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
-					gk20a_fifo_error_tsg(g, tsg);
-				}
-			}
+	if (!mutex_ret)
+		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 
-			/* (chid == ~0 && !add) remove all act ch from runlist*/
-			err = gk20a_fifo_update_runlist_locked(g, rlid,
-					FIFO_INVAL_CHANNEL_ID, add, wait_for_finish);
-			if (err)
-				nvgpu_err(g, "runlist id %d is not cleaned up",
-					rlid);
-
-			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
+	nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
+
+	return ret;
 
-			nvgpu_log(g, gpu_dbg_info, "aborted tsg id %d", tsgid);
-		}
-	}
-	if (!mutex_ret)
-		nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
 }
 
 void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
@@ -1049,66 +956,10 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 			struct mmu_fault_info *mmfault)
 {
 	struct tsg_gk20a *tsg = NULL;
-	u32 runlists_mask, rlid;
+	struct channel_gk20a *refch = NULL;
+	u32 runlists_mask, runlist_id;
 	struct fifo_runlist_info_gk20a *runlist = NULL;
 	u32 engine_id, client_type = ~0;
-	struct fifo_gk20a *f = &g->fifo;
-	u32 runlist_id = FIFO_INVAL_RUNLIST_ID;
-	u32 num_runlists = 0;
-
-	nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
-	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
-		nvgpu_mutex_acquire(&f->runlist_info[rlid].
-			runlist_lock);
-
-	/* get runlist id and tsg */
-	if (id_type == ID_TYPE_TSG) {
-		if (id != FIFO_INVAL_TSG_ID) {
-			tsg = &g->fifo.tsg[id];
-			runlist_id = tsg->runlist_id;
-			if (runlist_id != FIFO_INVAL_RUNLIST_ID)
-				num_runlists++;
-			else
-				nvgpu_log_fn(g, "tsg runlist id is invalid");
-		} else {
-			nvgpu_log_fn(g, "id type is tsg but tsg id is inval");
-		}
-	} else {
-		/*
-		 * id type is unknown, get runlist_id if eng mask is such that
-		 * it corresponds to single runlist id. If eng mask corresponds
-		 * to multiple runlists, then abort all runlists
-		 */
-		for (rlid = 0; rlid < f->max_runlists; rlid++) {
-			if (act_eng_bitmask) {
-				/* eng ids are known */
-				runlist = &f->runlist_info[rlid];
-				if (runlist->eng_bitmask & act_eng_bitmask) {
-					runlist_id = rlid;
-					num_runlists++;
-				}
-			} else {
-				break;
-			}
-		}
-		if (num_runlists > 1 ) /* abort all runlists */
-			runlist_id = FIFO_INVAL_RUNLIST_ID;
-	}
-
-	/* if runlist_id is valid and there is only single runlist to be
-	 * aborted, release runlist lock that are not
-	 * needed for this recovery
-	 */
-	if (runlist_id != FIFO_INVAL_RUNLIST_ID && num_runlists == 1) {
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
-			if (rlid != runlist_id) {
-				nvgpu_log_fn(g, "release runlist_lock for "
-					"unused runlist id: %d", rlid);
-				nvgpu_mutex_release(&f->runlist_info[rlid].
-					runlist_lock);
-			}
-		}
-	}
 
 	nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
 			"act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
@@ -1117,7 +968,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 			id_type, rc_type, mmfault);
 
-	/* Disable runlist scheduler */
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
 
 	g->fifo.deferred_reset_pending = false;
@@ -1139,41 +989,41 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 	gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
 
+	/* Get tsg/ch */
 	if (rc_type == RC_TYPE_MMU_FAULT) {
 		gk20a_debug_dump(g);
+		refch = mmfault->refch;
 		client_type = mmfault->client_type;
-		gv11b_fifo_reset_pbdma_and_eng_faulted(g, tsg,
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch,
 				mmfault->faulted_pbdma,
 				mmfault->faulted_engine);
 	}
 
+	if (id_type == ID_TYPE_TSG) {
+		tsg = &g->fifo.tsg[id];
+	} else if (id_type == ID_TYPE_CHANNEL) {
+		if (refch == NULL)
+			refch = gk20a_channel_get(&g->fifo.channel[id]);
+	}
+	/* Disable tsg/ch */
 	if (tsg)
 		gk20a_disable_tsg(tsg);
+	else if (refch)
+		g->ops.fifo.disable_channel(refch);
 
-	/*
-	 * Even though TSG preempt timed out, the RC sequence would by design
-	 * require s/w to issue another preempt.
-	 * If recovery includes an ENGINE_RESET, to not have race conditions,
-	 * use RUNLIST_PREEMPT to kick all work off, and cancel any context
-	 * load which may be pending. This is also needed to make sure
-	 * that all PBDMAs serving the engine are not loaded when engine is
-	 * reset.
-	 */
-	if (tsg) {
-		int preempt_failed;
-
-		preempt_failed = g->ops.fifo.preempt_ch_tsg(g, id, id_type);
-		if (preempt_failed)
-			gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
+	/* Preempt tsg/ch */
+	if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) {
+		g->ops.fifo.preempt_ch_tsg(g, id, id_type,
+				PREEMPT_TIMEOUT_NORC);
 	} else {
-		gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
+		gv11b_fifo_preempt_runlists(g, runlists_mask);
 	}
 
 	/* check if engine reset should be deferred */
-	for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
+	for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) {
 
-		runlist = &g->fifo.runlist_info[rlid];
-		if ((runlists_mask & BIT(rlid)) &&
+		runlist = &g->fifo.runlist_info[runlist_id];
+		if ((runlists_mask & BIT(runlist_id)) &&
 			runlist->reset_eng_bitmask) {
 
 			unsigned long __reset_eng_bitmask =
@@ -1181,7 +1031,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 			for_each_set_bit(engine_id, &__reset_eng_bitmask,
 				g->fifo.max_engines) {
-				if (tsg &&
+				if ((refch || tsg) &&
 					gk20a_fifo_should_defer_engine_reset(g,
 					engine_id, client_type, false)) {
 
@@ -1213,9 +1063,13 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 		}
 	}
 
 #ifdef CONFIG_GK20A_CTXSW_TRACE
+	/* tsg and refch both could be valid for mmu fault. Check tsg first */
 	if (tsg)
 		gk20a_ctxsw_trace_tsg_reset(g, tsg);
+	else if (refch)
+		gk20a_ctxsw_trace_channel_reset(g, refch);
 #endif
+
 	if (tsg) {
 		if (g->fifo.deferred_reset_pending) {
 			gk20a_disable_tsg(tsg);
@@ -1225,9 +1079,26 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 
 			gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
 		}
+		if (refch)
+			gk20a_channel_put(refch);
+	} else if (refch) {
+		if (g->fifo.deferred_reset_pending) {
+			g->ops.fifo.disable_channel(refch);
+		} else {
+			if (rc_type == RC_TYPE_MMU_FAULT)
+				gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
+
+			gk20a_channel_abort(refch, false);
+		}
+		gk20a_channel_put(refch);
 	} else {
-		gv11b_fifo_locked_abort_runlist_active_tsgs(g, rc_type,
-				runlists_mask);
+		nvgpu_err(g, "id unknown, abort runlist");
+		for (runlist_id = 0; runlist_id < g->fifo.max_runlists;
+						runlist_id++) {
+			if (runlists_mask & BIT(runlist_id))
+				g->ops.fifo.update_runlist(g, runlist_id,
+					FIFO_INVAL_CHANNEL_ID, false, true);
+		}
 	}
 
 	gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED);
@@ -1235,18 +1106,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	/* It is safe to enable ELPG again. */
 	if (g->support_pmu && g->elpg_enabled)
 		nvgpu_pmu_enable_elpg(g);
-
-	/* release runlist_lock */
-	if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
-		nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",
-				runlist_id);
-		nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
-	} else {
-		nvgpu_log_fn(g, "release runlist_lock for all runlists");
-		for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
-			nvgpu_mutex_release(&f->runlist_info[rlid].
-				runlist_lock);
-	}
 }
 
 void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)