Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/fifo_gv11b.c')
-rw-r--r-- drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 469
1 file changed, 305 insertions(+), 164 deletions(-)
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 4edaaac1..f30f2ae1 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -387,17 +387,24 @@ u32 gv11b_fifo_intr_0_error_mask(struct gk20a *g)
387 387
388u32 gv11b_fifo_get_preempt_timeout(struct gk20a *g) 388u32 gv11b_fifo_get_preempt_timeout(struct gk20a *g)
389{ 389{
390 return gk20a_get_gr_idle_timeout(g); 390 /* if timeouts are enabled, using 3000ms timeout
391 * for polling pbdma/eng/runlist might kick in
392 * timeout handler in the cases where preempt
393 * is stuck. Use 1000ms timeout for polling when
394 * timeouts are enabled */
395 return nvgpu_is_timeouts_enabled(g) ? PREEMPT_TIMEOUT_1000_MS :
396 g->gr_idle_timeout_default;
391} 397}
392 398
393static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, 399static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
394 u32 pbdma_id, unsigned int timeout_rc_type) 400 u32 pbdma_id)
395{ 401{
396 struct nvgpu_timeout timeout; 402 struct nvgpu_timeout timeout;
397 unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */ 403 unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
398 u32 pbdma_stat; 404 u32 pbdma_stat;
399 u32 chan_stat; 405 u32 chan_stat;
400 int ret = -EBUSY; 406 int ret = -EBUSY;
407 unsigned int loop_count = 0;
401 408
402 /* timeout in milli seconds */ 409 /* timeout in milli seconds */
403 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), 410 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -406,6 +413,14 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
406 nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id); 413 nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
407 /* Verify that ch/tsg is no longer on the pbdma */ 414 /* Verify that ch/tsg is no longer on the pbdma */
408 do { 415 do {
416 if (!nvgpu_platform_is_silicon(g)) {
417 if (loop_count >= MAX_PRE_SI_RETRIES) {
418 nvgpu_err(g, "preempt pbdma retries: %u",
419 loop_count);
420 break;
421 }
422 loop_count++;
423 }
409 /* 424 /*
410 * If the PBDMA has a stalling interrupt and receives a NACK, 425 * If the PBDMA has a stalling interrupt and receives a NACK,
411 * the PBDMA won't save out until the STALLING interrupt is 426 * the PBDMA won't save out until the STALLING interrupt is
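
The pre-silicon guard added above bounds an otherwise time-based poll by iteration count, since wall-clock timeouts are not meaningful on simulation/emulation platforms. Below is a minimal standalone sketch of the same pattern in plain C; the cond/timeout_expired callbacks and the MAX_PRE_SI_RETRIES value are assumptions for illustration, not the nvgpu helpers.

#include <stdbool.h>
#include <stdio.h>

#define MAX_PRE_SI_RETRIES 200000U	/* assumed cap; the real constant lives in nvgpu headers */

/*
 * Poll until cond() holds. On silicon the caller-supplied timeout_expired()
 * bounds the loop; pre-silicon the loop is bounded by an iteration count
 * instead, because simulated time makes CPU timers unreliable.
 */
static int poll_bounded(bool silicon,
			bool (*cond)(void *), bool (*timeout_expired)(void *),
			void *data)
{
	unsigned int loop_count = 0;

	do {
		if (!silicon) {
			if (loop_count >= MAX_PRE_SI_RETRIES) {
				fprintf(stderr, "retries exhausted: %u\n", loop_count);
				return -1;
			}
			loop_count++;
		}

		if (cond(data))
			return 0;	/* condition met, stop polling */

	} while (!timeout_expired(data));

	return -1;			/* timed out on silicon */
}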
@@ -458,21 +473,24 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
458 nvgpu_usleep_range(delay, delay * 2); 473 nvgpu_usleep_range(delay, delay * 2);
459 delay = min_t(unsigned long, 474 delay = min_t(unsigned long,
460 delay << 1, GR_IDLE_CHECK_MAX); 475 delay << 1, GR_IDLE_CHECK_MAX);
461 } while (!nvgpu_timeout_expired_msg(&timeout, 476 } while (!nvgpu_timeout_expired(&timeout));
462 "preempt timeout pbdma")); 477
478 if (ret)
479 nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
480 "tsgid: %u", pbdma_id, pbdma_stat, id);
463 return ret; 481 return ret;
464} 482}
465 483
466static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, 484static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
467 u32 act_eng_id, u32 *reset_eng_bitmask, 485 u32 act_eng_id, u32 *reset_eng_bitmask)
468 unsigned int timeout_rc_type)
469{ 486{
470 struct nvgpu_timeout timeout; 487 struct nvgpu_timeout timeout;
471 unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */ 488 unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */
472 u32 eng_stat; 489 u32 eng_stat;
473 u32 ctx_stat; 490 u32 ctx_stat;
474 int ret = -EBUSY; 491 int ret = -EBUSY;
475 bool stall_intr = false; 492 unsigned int loop_count = 0;
493 u32 eng_intr_pending;
476 494
477 /* timeout in milli seconds */ 495 /* timeout in milli seconds */
478 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), 496 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
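
Both poll loops back off exponentially between register reads and, with this patch, report one consolidated error after the loop instead of logging from inside the timeout check. A sketch of that polling shape in plain C; nanosleep stands in for nvgpu_usleep_range and the delay constants are assumptions, not the real GR_IDLE_CHECK values.

#include <stdio.h>
#include <time.h>

#define IDLE_CHECK_DEFAULT_US 10UL	/* assumed initial delay */
#define IDLE_CHECK_MAX_US     200UL	/* assumed delay cap */

static void sleep_us(unsigned long us)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = (long)us * 1000L };

	nanosleep(&ts, NULL);
}

/* Poll cond() with exponential backoff until it holds or max_polls is reached. */
static int poll_backoff(int (*cond)(void *), void *data, unsigned int max_polls)
{
	unsigned long delay = IDLE_CHECK_DEFAULT_US;
	unsigned int i;

	for (i = 0; i < max_polls; i++) {
		if (cond(data))
			return 0;

		sleep_us(delay);
		/* double the wait each round, but never past the cap */
		delay <<= 1;
		if (delay > IDLE_CHECK_MAX_US)
			delay = IDLE_CHECK_MAX_US;
	}

	/* one consolidated error after the loop, as the patch now does */
	fprintf(stderr, "poll timed out after %u attempts\n", max_polls);
	return -1;
}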
@@ -482,20 +500,56 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
482 act_eng_id); 500 act_eng_id);
483 /* Check if ch/tsg has saved off the engine or if ctxsw is hung */ 501 /* Check if ch/tsg has saved off the engine or if ctxsw is hung */
484 do { 502 do {
503 if (!nvgpu_platform_is_silicon(g)) {
504 if (loop_count >= MAX_PRE_SI_RETRIES) {
505 nvgpu_err(g, "preempt eng retries: %u",
506 loop_count);
507 break;
508 }
509 loop_count++;
510 }
485 eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id)); 511 eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
486 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat); 512 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
487 513
488 if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) { 514 if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
489 stall_intr = true; 515 &eng_intr_pending)) {
516 /* From h/w team
517 * Engine save can be blocked by eng stalling interrupts.
518 * FIFO interrupts shouldn’t block an engine save from
519 * finishing, but could block FIFO from reporting preempt done.
520 * No immediate reason to reset the engine if FIFO interrupt is
521 * pending.
522 * The hub, priv_ring, and ltc interrupts could block context
523 * switch (or memory), but don’t necessarily have to.
524 * For Hub interrupts they just report access counters and page
525 * faults. Neither of these necessarily block context switch
526 * or preemption, but they could.
527 * For example a page fault for graphics would prevent graphics
528 * from saving out. An access counter interrupt is a
529 * notification and has no effect.
530 * SW should handle page faults though for preempt to complete.
531 * PRI interrupt (due to a failed PRI transaction) will result
532 * in ctxsw failure reported to HOST.
533 * LTC interrupts are generally ECC related and if so,
534 * certainly don’t block preemption/ctxsw but they could.
535 * Bus interrupts shouldn’t have anything to do with preemption
536 * state as they are part of the Host EXT pipe, though they may
537 * exhibit a symptom that indicates that GPU is in a bad state.
538 * To be completely fair, when an engine is preempting SW
539 * really should just handle other interrupts as they come in.
540 * It’s generally bad to just poll and wait on a preempt
541 * to complete since there are many things in the GPU which may
542 * cause a system to hang/stop responding.
543 */
490 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, 544 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
491 "stall intr set, " 545 "stall intr set, "
492 "preemption will not finish"); 546 "preemption might not finish");
493 } 547 }
494 if (ctx_stat == 548 if (ctx_stat ==
495 fifo_engine_status_ctx_status_ctxsw_switch_v()) { 549 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
496 /* Eng save hasn't started yet. Continue polling */ 550 /* Eng save hasn't started yet. Continue polling */
497 if (stall_intr) { 551 if (eng_intr_pending) {
498 /* if stall intr stop polling */ 552 /* if eng intr, stop polling */
499 *reset_eng_bitmask |= BIT(act_eng_id); 553 *reset_eng_bitmask |= BIT(act_eng_id);
500 ret = 0; 554 ret = 0;
501 break; 555 break;
@@ -507,8 +561,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
507 fifo_engine_status_ctx_status_ctxsw_save_v()) { 561 fifo_engine_status_ctx_status_ctxsw_save_v()) {
508 562
509 if (id == fifo_engine_status_id_v(eng_stat)) { 563 if (id == fifo_engine_status_id_v(eng_stat)) {
510 if (stall_intr || 564 if (eng_intr_pending) {
511 timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
512 /* preemption will not finish */ 565 /* preemption will not finish */
513 *reset_eng_bitmask |= BIT(act_eng_id); 566 *reset_eng_bitmask |= BIT(act_eng_id);
514 ret = 0; 567 ret = 0;
@@ -524,9 +577,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
524 fifo_engine_status_ctx_status_ctxsw_load_v()) { 577 fifo_engine_status_ctx_status_ctxsw_load_v()) {
525 578
526 if (id == fifo_engine_status_next_id_v(eng_stat)) { 579 if (id == fifo_engine_status_next_id_v(eng_stat)) {
527 580 if (eng_intr_pending) {
528 if (stall_intr ||
529 timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
530 /* preemption will not finish */ 581 /* preemption will not finish */
531 *reset_eng_bitmask |= BIT(act_eng_id); 582 *reset_eng_bitmask |= BIT(act_eng_id);
532 ret = 0; 583 ret = 0;
@@ -546,8 +597,21 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
546 nvgpu_usleep_range(delay, delay * 2); 597 nvgpu_usleep_range(delay, delay * 2);
547 delay = min_t(unsigned long, 598 delay = min_t(unsigned long,
548 delay << 1, GR_IDLE_CHECK_MAX); 599 delay << 1, GR_IDLE_CHECK_MAX);
549 } while (!nvgpu_timeout_expired_msg(&timeout, 600 } while (!nvgpu_timeout_expired(&timeout));
550 "preempt timeout eng")); 601
602 if (ret) {
603 /*
604 * The reasons a preempt can fail are:
605 * 1.Some other stalling interrupt is asserted preventing
606 * channel or context save.
607 * 2.The memory system hangs.
608 * 3.The engine hangs during CTXSW.
609 */
610 nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
611 act_eng_id, ctx_stat, id);
612 *reset_eng_bitmask |= BIT(act_eng_id);
613 }
614
551 return ret; 615 return ret;
552} 616}
553 617
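
The rewritten engine poll above no longer gives up on any stalling interrupt: it stops early only when an interrupt for that engine is pending, and in every give-up path it records the engine in reset_eng_bitmask for a later reset. A simplified decision function in plain C; the eng_state struct is invented here to stand in for the decoded fifo_engine_status fields.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

#define BIT32(i) (1U << (i))

/* Invented, simplified view of the engine status register for illustration. */
struct eng_state {
	bool ctxsw_in_progress;		/* save/load/switch in flight */
	bool eng_intr_pending;		/* stalling interrupt owned by this engine */
	bool saved_off;			/* context has left the engine */
};

/*
 * Returns 0 when polling can stop (either the save completed or the engine
 * was flagged for reset), -EBUSY when the caller should keep polling.
 */
static int check_engine(uint32_t eng_id, const struct eng_state *st,
			uint32_t *reset_eng_bitmask)
{
	if (st->ctxsw_in_progress && st->eng_intr_pending) {
		/* an engine interrupt can block the save: stop waiting and
		 * remember that this engine must be reset during recovery */
		*reset_eng_bitmask |= BIT32(eng_id);
		return 0;
	}

	if (st->saved_off)
		return 0;

	return -EBUSY;
}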
@@ -594,29 +658,19 @@ static void gv11b_reset_pbdma_faulted_tsg(struct tsg_gk20a *tsg)
594} 658}
595 659
596void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g, 660void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
597 struct channel_gk20a *refch, 661 struct tsg_gk20a *tsg,
598 u32 faulted_pbdma, u32 faulted_engine) 662 u32 faulted_pbdma, u32 faulted_engine)
599{ 663{
600 struct tsg_gk20a *tsg; 664 if (!tsg)
665 return;
601 666
602 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x", 667 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
603 faulted_pbdma, faulted_engine); 668 faulted_pbdma, faulted_engine);
604 669
605 if (!refch) 670 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
606 return; 671 gv11b_reset_pbdma_faulted_tsg(tsg);
607 672 if (faulted_engine != FIFO_INVAL_ENGINE_ID)
608 if (gk20a_is_channel_marked_as_tsg(refch)) { 673 gv11b_reset_eng_faulted_tsg(tsg);
609 tsg = &g->fifo.tsg[refch->tsgid];
610 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
611 gv11b_reset_pbdma_faulted_tsg(tsg);
612 if (faulted_engine != FIFO_INVAL_ENGINE_ID)
613 gv11b_reset_eng_faulted_tsg(tsg);
614 } else {
615 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
616 gv11b_reset_pbdma_faulted_ch(g, refch->chid);
617 if (faulted_engine != FIFO_INVAL_ENGINE_ID)
618 gv11b_reset_eng_faulted_ch(g, refch->chid);
619 }
620} 674}
621 675
622static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask, 676static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
@@ -626,7 +680,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
626 u32 runlists_mask = 0; 680 u32 runlists_mask = 0;
627 struct fifo_gk20a *f = &g->fifo; 681 struct fifo_gk20a *f = &g->fifo;
628 struct fifo_runlist_info_gk20a *runlist; 682 struct fifo_runlist_info_gk20a *runlist;
629 u32 pbdma_bitmask = 0; 683 u32 rlid, pbdma_bitmask = 0;
630 684
631 if (id_type != ID_TYPE_UNKNOWN) { 685 if (id_type != ID_TYPE_UNKNOWN) {
632 if (id_type == ID_TYPE_TSG) 686 if (id_type == ID_TYPE_TSG)
@@ -641,31 +695,31 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask,
641 if (mmfault->faulted_pbdma != FIFO_INVAL_PBDMA_ID) 695 if (mmfault->faulted_pbdma != FIFO_INVAL_PBDMA_ID)
642 pbdma_bitmask = BIT(mmfault->faulted_pbdma); 696 pbdma_bitmask = BIT(mmfault->faulted_pbdma);
643 697
644 for (id = 0; id < f->max_runlists; id++) { 698 for (rlid = 0; rlid < f->max_runlists; rlid++) {
645 699
646 runlist = &f->runlist_info[id]; 700 runlist = &f->runlist_info[rlid];
647 701
648 if (runlist->eng_bitmask & act_eng_bitmask) 702 if (runlist->eng_bitmask & act_eng_bitmask)
649 runlists_mask |= 703 runlists_mask |=
650 fifo_sched_disable_runlist_m(id); 704 fifo_sched_disable_runlist_m(rlid);
651 705
652 if (runlist->pbdma_bitmask & pbdma_bitmask) 706 if (runlist->pbdma_bitmask & pbdma_bitmask)
653 runlists_mask |= 707 runlists_mask |=
654 fifo_sched_disable_runlist_m(id); 708 fifo_sched_disable_runlist_m(rlid);
655 } 709 }
656 } 710 }
657 711
658 if (id_type == ID_TYPE_UNKNOWN) { 712 if (id_type == ID_TYPE_UNKNOWN) {
659 for (id = 0; id < f->max_runlists; id++) { 713 for (rlid = 0; rlid < f->max_runlists; rlid++) {
660 if (act_eng_bitmask) { 714 if (act_eng_bitmask) {
661 /* eng ids are known */ 715 /* eng ids are known */
662 runlist = &f->runlist_info[id]; 716 runlist = &f->runlist_info[rlid];
663 if (runlist->eng_bitmask & act_eng_bitmask) 717 if (runlist->eng_bitmask & act_eng_bitmask)
664 runlists_mask |= 718 runlists_mask |=
665 fifo_sched_disable_runlist_m(id); 719 fifo_sched_disable_runlist_m(rlid);
666 } else { 720 } else {
667 runlists_mask |= 721 runlists_mask |=
668 fifo_sched_disable_runlist_m(id); 722 fifo_sched_disable_runlist_m(rlid);
669 } 723 }
670 } 724 }
671 } 725 }
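
The mask built above selects every runlist that serves a faulting engine or the faulting PBDMA (and falls back to all runlists when nothing is known). The same selection logic, reduced to a self-contained function; the rl_info layout and field names are simplifications for illustration only.

#include <stdint.h>

#define BIT32(i) (1U << (i))

struct rl_info {
	uint32_t eng_bitmask;	/* engines served by this runlist */
	uint32_t pbdma_bitmask;	/* PBDMAs served by this runlist */
};

static uint32_t build_runlists_mask(const struct rl_info *rl, uint32_t max_runlists,
				    uint32_t act_eng_bitmask, uint32_t pbdma_bitmask)
{
	uint32_t rlid, mask = 0;

	for (rlid = 0; rlid < max_runlists; rlid++) {
		if ((rl[rlid].eng_bitmask & act_eng_bitmask) ||
		    (rl[rlid].pbdma_bitmask & pbdma_bitmask))
			mask |= BIT32(rlid);
	}
	return mask;
}

For example, with two runlists where runlist 0 serves engines 0-1 and runlist 1 serves engine 2, a fault on engine 2 (act_eng_bitmask = 0x4) yields mask 0x2.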
@@ -697,10 +751,20 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
697 struct nvgpu_timeout timeout; 751 struct nvgpu_timeout timeout;
698 u32 delay = GR_IDLE_CHECK_DEFAULT; 752 u32 delay = GR_IDLE_CHECK_DEFAULT;
699 int ret = -EBUSY; 753 int ret = -EBUSY;
754 unsigned int loop_count = 0;
700 755
701 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), 756 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
702 NVGPU_TIMER_CPU_TIMER); 757 NVGPU_TIMER_CPU_TIMER);
703 do { 758 do {
759 if (!nvgpu_platform_is_silicon(g)) {
760 if (loop_count >= MAX_PRE_SI_RETRIES) {
761 nvgpu_err(g, "preempt runlist retries: %u",
762 loop_count);
763 break;
764 }
765 loop_count++;
766 }
767
704 if (!((gk20a_readl(g, fifo_runlist_preempt_r())) & 768 if (!((gk20a_readl(g, fifo_runlist_preempt_r())) &
705 runlists_mask)) { 769 runlists_mask)) {
706 ret = 0; 770 ret = 0;
@@ -710,13 +774,16 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g,
710 nvgpu_usleep_range(delay, delay * 2); 774 nvgpu_usleep_range(delay, delay * 2);
711 delay = min_t(unsigned long, 775 delay = min_t(unsigned long,
712 delay << 1, GR_IDLE_CHECK_MAX); 776 delay << 1, GR_IDLE_CHECK_MAX);
713 } while (!nvgpu_timeout_expired_msg(&timeout, 777 } while (!nvgpu_timeout_expired(&timeout));
714 "runlist preempt timeout")); 778
779 if (ret)
780 nvgpu_err(g, "preempt runlist timeout, runlists_mask:0x%08x",
781 runlists_mask);
715 return ret; 782 return ret;
716} 783}
717 784
718int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, 785int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
719 unsigned int id_type, unsigned int timeout_rc_type) 786 unsigned int id_type)
720{ 787{
721 struct fifo_gk20a *f = &g->fifo; 788 struct fifo_gk20a *f = &g->fifo;
722 unsigned long runlist_served_pbdmas; 789 unsigned long runlist_served_pbdmas;
@@ -724,7 +791,6 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
724 u32 pbdma_id; 791 u32 pbdma_id;
725 u32 act_eng_id; 792 u32 act_eng_id;
726 u32 runlist_id; 793 u32 runlist_id;
727 int func_ret;
728 int ret = 0; 794 int ret = 0;
729 u32 tsgid; 795 u32 tsgid;
730 796
@@ -741,30 +807,14 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
741 runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask; 807 runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
742 runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask; 808 runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
743 809
744 for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) { 810 for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
745 811 ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id);
746 func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
747 timeout_rc_type);
748 if (func_ret != 0) {
749 nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
750 ret |= func_ret;
751 }
752 }
753 812
754 f->runlist_info[runlist_id].reset_eng_bitmask = 0; 813 f->runlist_info[runlist_id].reset_eng_bitmask = 0;
755 814
756 for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) { 815 for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
757 816 ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
758 func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id, 817 &f->runlist_info[runlist_id].reset_eng_bitmask);
759 &f->runlist_info[runlist_id].reset_eng_bitmask,
760 timeout_rc_type);
761
762 if (func_ret != 0) {
763 nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
764 ret |= func_ret;
765 }
766 }
767
768 return ret; 818 return ret;
769} 819}
770 820
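
Dropping the per-iteration func_ret bookkeeping works because the poll helpers return 0 or a negative errno, so OR-ing the results keeps the combined value non-zero if any single PBDMA or engine timed out; the exact negative value is no longer a meaningful errno, but callers only test for non-zero. A tiny self-contained illustration:

#include <errno.h>
#include <stdio.h>

/* stand-in for the per-unit poll; unit 1 "times out" in this example */
static int poll_unit(unsigned int unit)
{
	return (unit == 1) ? -EBUSY : 0;
}

int main(void)
{
	int ret = 0;
	unsigned int unit;

	for (unit = 0; unit < 3; unit++)
		ret |= poll_unit(unit);

	/* ret is non-zero because one unit failed; callers only check !ret */
	printf("combined result: %d\n", ret);
	return 0;
}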
@@ -848,6 +898,9 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
848 898
849 nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock); 899 nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock);
850 900
901 /* WAR for Bug 2065990 */
902 gk20a_fifo_disable_tsg_sched(g, &f->tsg[tsgid]);
903
851 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 904 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
852 905
853 ret = __locked_fifo_preempt(g, tsgid, true); 906 ret = __locked_fifo_preempt(g, tsgid, true);
@@ -855,6 +908,9 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
855 if (!mutex_ret) 908 if (!mutex_ret)
856 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 909 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
857 910
911 /* WAR for Bug 2065990 */
912 gk20a_fifo_enable_tsg_sched(g, &f->tsg[tsgid]);
913
858 nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock); 914 nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
859 915
860 if (ret) 916 if (ret)
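
The Bug 2065990 workaround brackets the preempt with a TSG scheduler disable/enable, and the enable runs even when the preempt itself fails. The shape of that bracket, sketched with stub helpers (the names are placeholders, not the gk20a API):

struct gpu;	/* opaque stand-ins for struct gk20a / struct tsg_gk20a */
struct tsg;

static void tsg_sched_disable(struct gpu *g, struct tsg *t) { (void)g; (void)t; }
static void tsg_sched_enable(struct gpu *g, struct tsg *t)  { (void)g; (void)t; }
static int  locked_preempt(struct gpu *g, struct tsg *t)    { (void)g; (void)t; return 0; }

static int preempt_with_sched_war(struct gpu *g, struct tsg *t)
{
	int ret;

	tsg_sched_disable(g, t);	/* WAR: keep the scheduler off the TSG */
	ret = locked_preempt(g, t);
	tsg_sched_enable(g, t);		/* always undo the WAR, success or failure */

	return ret;
}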
@@ -863,44 +919,36 @@ int gv11b_fifo_preempt_tsg(struct gk20a *g, u32 tsgid)
863 return ret; 919 return ret;
864} 920}
865 921
866static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask) 922static void gv11b_fifo_locked_preempt_runlists(struct gk20a *g, u32 runlists_mask)
867{ 923{
868 int ret = 0; 924 int ret = 0;
869 u32 token = PMU_INVALID_MUTEX_OWNER_ID; 925 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
870 u32 mutex_ret = 0; 926 u32 mutex_ret = 0;
871 u32 runlist_id; 927 u32 rlid;
872
873 nvgpu_log_fn(g, " ");
874 928
875 for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { 929 /* runlist_lock are locked by teardown and sched are disabled too */
876 if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id)) 930 nvgpu_log_fn(g, "preempt runlists_mask:0x%08x", runlists_mask);
877 nvgpu_mutex_acquire(&g->fifo.
878 runlist_info[runlist_id].runlist_lock);
879 }
880 931
881 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 932 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
882 933
883 ret = __locked_fifo_preempt_runlists(g, runlists_mask); 934 ret = __locked_fifo_preempt_runlists(g, runlists_mask);
884 935
885 if (!mutex_ret) 936 if (ret) {
886 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 937 /* if preempt timed out, reset engs served by runlists */
887 938 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
888 for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { 939 if (runlists_mask &
889 if (runlists_mask & 940 fifo_runlist_preempt_runlist_m(rlid))
890 fifo_runlist_preempt_runlist_m(runlist_id)) { 941 g->fifo.runlist_info[rlid].reset_eng_bitmask =
891 /* during recovery reset engs served by this runlist */ 942 g->fifo.runlist_info[rlid].eng_bitmask;
892 g->fifo.runlist_info[runlist_id].reset_eng_bitmask =
893 g->fifo.runlist_info[runlist_id].eng_bitmask;
894 nvgpu_mutex_release(&g->fifo.
895 runlist_info[runlist_id].runlist_lock);
896 } 943 }
897 } 944 }
898 945
899 return ret; 946 if (!mutex_ret)
947 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
900} 948}
901 949
902static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id, 950static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
903 unsigned int id_type, unsigned int timeout_rc_type) 951 unsigned int id_type)
904{ 952{
905 int ret; 953 int ret;
906 struct fifo_gk20a *f = &g->fifo; 954 struct fifo_gk20a *f = &g->fifo;
@@ -914,52 +962,97 @@ static int __locked_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
914 gk20a_fifo_issue_preempt(g, id, true); 962 gk20a_fifo_issue_preempt(g, id, true);
915 963
916 /* wait for preempt */ 964 /* wait for preempt */
917 ret = g->ops.fifo.is_preempt_pending(g, id, id_type, 965 ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
918 timeout_rc_type);
919 966
920 if (ret && (timeout_rc_type == PREEMPT_TIMEOUT_RC)) 967 /* No recovery even if preempt timed out since
921 gk20a_fifo_preempt_timeout_rc(g, id, id_type); 968 * this is called from recovery path
969 */
922 970
923 return ret; 971 return ret;
924} 972}
925 973
926 974
927int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id, 975int gv11b_fifo_preempt_ch_tsg(struct gk20a *g, u32 id,
928 unsigned int id_type, unsigned int timeout_rc_type) 976 unsigned int id_type)
929{ 977{
930 struct fifo_gk20a *f = &g->fifo;
931 u32 ret = 0; 978 u32 ret = 0;
932 u32 token = PMU_INVALID_MUTEX_OWNER_ID; 979 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
933 u32 mutex_ret = 0; 980 u32 mutex_ret = 0;
934 u32 runlist_id;
935 981
936 if (id_type == ID_TYPE_TSG) 982 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
937 runlist_id = f->tsg[id].runlist_id; 983 /*
938 else if (id_type == ID_TYPE_CHANNEL) 984 * This is called from teardown path only. runlist_lock
939 runlist_id = f->channel[id].runlist_id; 985 * is already acquired before calling this function.
940 else 986 */
941 return -EINVAL; 987 ret = __locked_fifo_preempt_ch_tsg(g, id, id_type);
942 988
943 if (runlist_id >= g->fifo.max_runlists) { 989 if (!mutex_ret)
944 nvgpu_log_info(g, "runlist_id = %d", runlist_id); 990 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
945 return -EINVAL;
946 }
947 991
948 nvgpu_log_fn(g, "preempt id = %d, runlist_id = %d", id, runlist_id); 992 return ret;
949 993
950 nvgpu_mutex_acquire(&f->runlist_info[runlist_id].runlist_lock); 994}
995
996static void gv11b_fifo_locked_abort_runlist_active_tsgs(struct gk20a *g,
997 unsigned int rc_type,
998 u32 runlists_mask)
999{
1000 struct tsg_gk20a *tsg = NULL;
1001 u32 rlid, tsgid;
1002 struct fifo_runlist_info_gk20a *runlist = NULL;
1003 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
1004 u32 mutex_ret = 0;
1005 bool add = false, wait_for_finish = false;
1006 int err;
951 1007
1008 nvgpu_err(g, "runlist id unknown, abort active tsgs in runlists");
1009
1010 /* runlist_lock are locked by teardown */
952 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 1011 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
953 1012
954 ret = __locked_fifo_preempt_ch_tsg(g, id, id_type, timeout_rc_type); 1013 for (rlid = 0; rlid < g->fifo.max_runlists;
1014 rlid++) {
1015 if (!(runlists_mask & BIT(rlid)))
1016 continue;
1017 nvgpu_log(g, gpu_dbg_info, "abort runlist id %d",
1018 rlid);
1019 runlist = &g->fifo.runlist_info[rlid];
1020
1021 for_each_set_bit(tsgid, runlist->active_tsgs,
1022 g->fifo.num_channels) {
1023 nvgpu_log(g, gpu_dbg_info, "abort tsg id %d", tsgid);
1024 tsg = &g->fifo.tsg[tsgid];
1025 gk20a_disable_tsg(tsg);
955 1026
956 if (!mutex_ret) 1027 /* assume all pbdma and eng faulted are set */
957 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); 1028 nvgpu_log(g, gpu_dbg_info, "reset pbdma and eng faulted");
1029 gv11b_reset_pbdma_faulted_tsg(tsg);
1030 gv11b_reset_eng_faulted_tsg(tsg);
958 1031
959 nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock); 1032#ifdef CONFIG_GK20A_CTXSW_TRACE
1033 gk20a_ctxsw_trace_tsg_reset(g, tsg);
1034#endif
1035 if (!g->fifo.deferred_reset_pending) {
1036 if (rc_type == RC_TYPE_MMU_FAULT) {
1037 gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
1038 gk20a_fifo_error_tsg(g, tsg);
1039 }
1040 }
960 1041
961 return ret; 1042 /* (chid == ~0 && !add) remove all act ch from runlist*/
1043 err = gk20a_fifo_update_runlist_locked(g, rlid,
1044 FIFO_INVAL_CHANNEL_ID, add, wait_for_finish);
1045 if (err)
1046 nvgpu_err(g, "runlist id %d is not cleaned up",
1047 rlid);
962 1048
1049 gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
1050
1051 nvgpu_log(g, gpu_dbg_info, "aborted tsg id %d", tsgid);
1052 }
1053 }
1054 if (!mutex_ret)
1055 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
963} 1056}
964 1057
965void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, 1058void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
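
gv11b_fifo_locked_abort_runlist_active_tsgs() walks every runlist selected by runlists_mask and visits each set bit in that runlist's active_tsgs bitmap. A plain-C equivalent of that double iteration, using a 32-bit active-TSG word as a simplification (the real bitmap is sized by num_channels and walked with for_each_set_bit()):

#include <stdint.h>
#include <stdio.h>

static void abort_active_tsgs(uint32_t runlists_mask,
			      const uint32_t *active_tsgs, uint32_t num_runlists)
{
	uint32_t rlid, tsgid;

	for (rlid = 0; rlid < num_runlists; rlid++) {
		if (!(runlists_mask & (1U << rlid)))
			continue;	/* runlist not part of this recovery */

		for (tsgid = 0; tsgid < 32; tsgid++) {
			if (!(active_tsgs[rlid] & (1U << tsgid)))
				continue;
			/* the real code disables the TSG, clears PBDMA/engine
			 * FAULTED state, removes its channels from the runlist
			 * and finally aborts it */
			printf("abort tsg %u on runlist %u\n", tsgid, rlid);
		}
	}
}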
@@ -967,10 +1060,66 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
967 struct mmu_fault_info *mmfault) 1060 struct mmu_fault_info *mmfault)
968{ 1061{
969 struct tsg_gk20a *tsg = NULL; 1062 struct tsg_gk20a *tsg = NULL;
970 struct channel_gk20a *refch = NULL; 1063 u32 runlists_mask, rlid;
971 u32 runlists_mask, runlist_id;
972 struct fifo_runlist_info_gk20a *runlist = NULL; 1064 struct fifo_runlist_info_gk20a *runlist = NULL;
973 u32 engine_id, client_type = ~0; 1065 u32 engine_id, client_type = ~0;
1066 struct fifo_gk20a *f = &g->fifo;
1067 u32 runlist_id = FIFO_INVAL_RUNLIST_ID;
1068 u32 num_runlists = 0;
1069
1070 nvgpu_log_fn(g, "acquire runlist_lock for all runlists");
1071 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
1072 nvgpu_mutex_acquire(&f->runlist_info[rlid].
1073 runlist_lock);
1074
1075 /* get runlist id and tsg */
1076 if (id_type == ID_TYPE_TSG) {
1077 if (id != FIFO_INVAL_TSG_ID) {
1078 tsg = &g->fifo.tsg[id];
1079 runlist_id = tsg->runlist_id;
1080 if (runlist_id != FIFO_INVAL_RUNLIST_ID)
1081 num_runlists++;
1082 else
1083 nvgpu_log_fn(g, "tsg runlist id is invalid");
1084 } else {
1085 nvgpu_log_fn(g, "id type is tsg but tsg id is inval");
1086 }
1087 } else {
1088 /*
1089 * id type is unknown, get runlist_id if eng mask is such that
1090 * it corresponds to single runlist id. If eng mask corresponds
1091 * to multiple runlists, then abort all runlists
1092 */
1093 for (rlid = 0; rlid < f->max_runlists; rlid++) {
1094 if (act_eng_bitmask) {
1095 /* eng ids are known */
1096 runlist = &f->runlist_info[rlid];
1097 if (runlist->eng_bitmask & act_eng_bitmask) {
1098 runlist_id = rlid;
1099 num_runlists++;
1100 }
1101 } else {
1102 break;
1103 }
1104 }
1105 if (num_runlists > 1 ) /* abort all runlists */
1106 runlist_id = FIFO_INVAL_RUNLIST_ID;
1107 }
1108
1109 /* if runlist_id is valid and there is only a single runlist to be
1110 * aborted, release the runlist locks that are not
1111 * needed for this recovery
1112 */
1113 if (runlist_id != FIFO_INVAL_RUNLIST_ID && num_runlists == 1) {
1114 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
1115 if (rlid != runlist_id) {
1116 nvgpu_log_fn(g, "release runlist_lock for "
1117 "unused runlist id: %d", rlid);
1118 nvgpu_mutex_release(&f->runlist_info[rlid].
1119 runlist_lock);
1120 }
1121 }
1122 }
974 1123
975 nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, " 1124 nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, "
976 "act_eng_bitmask = 0x%x, mmfault ptr = 0x%p", 1125 "act_eng_bitmask = 0x%x, mmfault ptr = 0x%p",
@@ -979,6 +1128,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
979 runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, 1128 runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
980 id_type, rc_type, mmfault); 1129 id_type, rc_type, mmfault);
981 1130
1131 /* Disable runlist scheduler */
982 gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED); 1132 gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_DISABLED);
983 1133
984 g->fifo.deferred_reset_pending = false; 1134 g->fifo.deferred_reset_pending = false;
@@ -1000,41 +1150,41 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
1000 1150
1001 gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); 1151 gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN);
1002 1152
1003 /* Get tsg/ch */
1004 if (rc_type == RC_TYPE_MMU_FAULT) { 1153 if (rc_type == RC_TYPE_MMU_FAULT) {
1005 gk20a_debug_dump(g); 1154 gk20a_debug_dump(g);
1006 refch = mmfault->refch;
1007 client_type = mmfault->client_type; 1155 client_type = mmfault->client_type;
1008 gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch, 1156 gv11b_fifo_reset_pbdma_and_eng_faulted(g, tsg,
1009 mmfault->faulted_pbdma, 1157 mmfault->faulted_pbdma,
1010 mmfault->faulted_engine); 1158 mmfault->faulted_engine);
1011 } 1159 }
1012 1160
1013 if (id_type == ID_TYPE_TSG) {
1014 tsg = &g->fifo.tsg[id];
1015 } else if (id_type == ID_TYPE_CHANNEL) {
1016 if (refch == NULL)
1017 refch = gk20a_channel_get(&g->fifo.channel[id]);
1018 }
1019 /* Disable tsg/ch */
1020 if (tsg) 1161 if (tsg)
1021 gk20a_disable_tsg(tsg); 1162 gk20a_disable_tsg(tsg);
1022 else if (refch)
1023 g->ops.fifo.disable_channel(refch);
1024 1163
1025 /* Preempt tsg/ch */ 1164 /*
1026 if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) { 1165 * Even though TSG preempt timed out, the RC sequence would by design
1027 g->ops.fifo.preempt_ch_tsg(g, id, id_type, 1166 * require s/w to issue another preempt.
1028 PREEMPT_TIMEOUT_NORC); 1167 * If recovery includes an ENGINE_RESET, to not have race conditions,
1168 * use RUNLIST_PREEMPT to kick all work off, and cancel any context
1169 * load which may be pending. This is also needed to make sure
1170 * that all PBDMAs serving the engine are not loaded when engine is
1171 * reset.
1172 */
1173 if (tsg) {
1174 int preempt_failed;
1175
1176 preempt_failed = g->ops.fifo.preempt_ch_tsg(g, id, id_type);
1177 if (preempt_failed)
1178 gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
1029 } else { 1179 } else {
1030 gv11b_fifo_preempt_runlists(g, runlists_mask); 1180 gv11b_fifo_locked_preempt_runlists(g, runlists_mask);
1031 } 1181 }
1032 1182
1033 /* check if engine reset should be deferred */ 1183 /* check if engine reset should be deferred */
1034 for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { 1184 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
1035 1185
1036 runlist = &g->fifo.runlist_info[runlist_id]; 1186 runlist = &g->fifo.runlist_info[rlid];
1037 if ((runlists_mask & BIT(runlist_id)) && 1187 if ((runlists_mask & BIT(rlid)) &&
1038 runlist->reset_eng_bitmask) { 1188 runlist->reset_eng_bitmask) {
1039 1189
1040 unsigned long __reset_eng_bitmask = 1190 unsigned long __reset_eng_bitmask =
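
The comment above captures the key behavioural change: a timed-out TSG preempt is no longer escalated to per-channel recovery here; instead, when an engine reset is coming, the code falls back to RUNLIST_PREEMPT so no PBDMA is left loaded with the context. A sketch of that escalation order, using stub helpers rather than the fifo ops table:

#include <errno.h>
#include <stdbool.h>

/* stubs standing in for g->ops.fifo.preempt_ch_tsg() and the runlist preempt */
static int preempt_tsg_stub(unsigned int tsgid) { (void)tsgid; return -EBUSY; }
static void preempt_runlists_stub(unsigned int runlists_mask) { (void)runlists_mask; }

static void recover_preempt(bool have_tsg, unsigned int tsgid,
			    unsigned int runlists_mask)
{
	if (have_tsg) {
		/* targeted preempt first; escalate only if it timed out */
		if (preempt_tsg_stub(tsgid) != 0)
			preempt_runlists_stub(runlists_mask);
	} else {
		/* id unknown: preempt every runlist in the mask */
		preempt_runlists_stub(runlists_mask);
	}
}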
@@ -1042,7 +1192,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
1042 1192
1043 for_each_set_bit(engine_id, &__reset_eng_bitmask, 1193 for_each_set_bit(engine_id, &__reset_eng_bitmask,
1044 g->fifo.max_engines) { 1194 g->fifo.max_engines) {
1045 if ((refch || tsg) && 1195 if (tsg &&
1046 gk20a_fifo_should_defer_engine_reset(g, 1196 gk20a_fifo_should_defer_engine_reset(g,
1047 engine_id, client_type, false)) { 1197 engine_id, client_type, false)) {
1048 1198
@@ -1074,13 +1224,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
1074 } 1224 }
1075 1225
1076#ifdef CONFIG_GK20A_CTXSW_TRACE 1226#ifdef CONFIG_GK20A_CTXSW_TRACE
1077 /* tsg and refch both could be valid for mmu fault. Check tsg first */
1078 if (tsg) 1227 if (tsg)
1079 gk20a_ctxsw_trace_tsg_reset(g, tsg); 1228 gk20a_ctxsw_trace_tsg_reset(g, tsg);
1080 else if (refch)
1081 gk20a_ctxsw_trace_channel_reset(g, refch);
1082#endif 1229#endif
1083
1084 if (tsg) { 1230 if (tsg) {
1085 if (g->fifo.deferred_reset_pending) { 1231 if (g->fifo.deferred_reset_pending) {
1086 gk20a_disable_tsg(tsg); 1232 gk20a_disable_tsg(tsg);
@@ -1090,26 +1236,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
1090 1236
1091 gk20a_fifo_abort_tsg(g, tsg->tsgid, false); 1237 gk20a_fifo_abort_tsg(g, tsg->tsgid, false);
1092 } 1238 }
1093 if (refch)
1094 gk20a_channel_put(refch);
1095 } else if (refch) {
1096 if (g->fifo.deferred_reset_pending) {
1097 g->ops.fifo.disable_channel(refch);
1098 } else {
1099 if (rc_type == RC_TYPE_MMU_FAULT)
1100 gk20a_fifo_set_ctx_mmu_error_ch(g, refch);
1101
1102 gk20a_channel_abort(refch, false);
1103 }
1104 gk20a_channel_put(refch);
1105 } else { 1239 } else {
1106 nvgpu_err(g, "id unknown, abort runlist"); 1240 gv11b_fifo_locked_abort_runlist_active_tsgs(g, rc_type,
1107 for (runlist_id = 0; runlist_id < g->fifo.max_runlists; 1241 runlists_mask);
1108 runlist_id++) {
1109 if (runlists_mask & BIT(runlist_id))
1110 g->ops.fifo.update_runlist(g, runlist_id,
1111 FIFO_INVAL_CHANNEL_ID, false, true);
1112 }
1113 } 1242 }
1114 1243
1115 gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED); 1244 gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED);
@@ -1117,6 +1246,18 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
1117 /* It is safe to enable ELPG again. */ 1246 /* It is safe to enable ELPG again. */
1118 if (g->support_pmu && g->elpg_enabled) 1247 if (g->support_pmu && g->elpg_enabled)
1119 nvgpu_pmu_enable_elpg(g); 1248 nvgpu_pmu_enable_elpg(g);
1249
1250 /* release runlist_lock */
1251 if (runlist_id != FIFO_INVAL_RUNLIST_ID) {
1252 nvgpu_log_fn(g, "release runlist_lock runlist_id = %d",
1253 runlist_id);
1254 nvgpu_mutex_release(&f->runlist_info[runlist_id].runlist_lock);
1255 } else {
1256 nvgpu_log_fn(g, "release runlist_lock for all runlists");
1257 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++)
1258 nvgpu_mutex_release(&f->runlist_info[rlid].
1259 runlist_lock);
1260 }
1120} 1261}
1121 1262
1122void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f) 1263void gv11b_fifo_init_pbdma_intr_descs(struct fifo_gk20a *f)