summary refs log tree commit diff stats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h        2
-rw-r--r--  drivers/gpu/nvgpu/gv100/mc_gv100.c    16
-rw-r--r--  drivers/gpu/nvgpu/gv100/mc_gv100.h     3
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fifo_gv11b.c 113
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fifo_gv11b.h   2
-rw-r--r--  drivers/gpu/nvgpu/gv11b/mc_gv11b.c    16
-rw-r--r--  drivers/gpu/nvgpu/gv11b/mc_gv11b.h     3
7 files changed, 101 insertions, 54 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 9061236e..25146b8b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -1109,7 +1109,7 @@ struct gpu_ops {
1109 bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr); 1109 bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr);
1110 bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr); 1110 bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr);
1111 bool (*is_stall_and_eng_intr_pending)(struct gk20a *g, 1111 bool (*is_stall_and_eng_intr_pending)(struct gk20a *g,
1112 u32 act_eng_id); 1112 u32 act_eng_id, u32 *eng_intr_pending);
1113 u32 (*intr_stall)(struct gk20a *g); 1113 u32 (*intr_stall)(struct gk20a *g);
1114 void (*intr_stall_pause)(struct gk20a *g); 1114 void (*intr_stall_pause)(struct gk20a *g);
1115 void (*intr_stall_resume)(struct gk20a *g); 1115 void (*intr_stall_resume)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.c b/drivers/gpu/nvgpu/gv100/mc_gv100.c
index 7ed9e6da..2d84a3a8 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.c
@@ -72,15 +72,14 @@ bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0)
72 return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false); 72 return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false);
73} 73}
74 74
75bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) 75bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
76 u32 *eng_intr_pending)
76{ 77{
77 u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0)); 78 u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
78 u32 stall_intr, eng_intr_mask; 79 u32 stall_intr, eng_intr_mask;
79 80
80 eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id); 81 eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
81 if ((mc_intr_0 & eng_intr_mask) != 0U) { 82 *eng_intr_pending = mc_intr_0 & eng_intr_mask;
82 return true;
83 }
84 83
85 stall_intr = mc_intr_pfifo_pending_f() | 84 stall_intr = mc_intr_pfifo_pending_f() |
86 mc_intr_hub_pending_f() | 85 mc_intr_hub_pending_f() |
@@ -88,9 +87,10 @@ bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
88 mc_intr_pbus_pending_f() | 87 mc_intr_pbus_pending_f() |
89 mc_intr_ltc_pending_f() | 88 mc_intr_ltc_pending_f() |
90 mc_intr_nvlink_pending_f(); 89 mc_intr_nvlink_pending_f();
91 if ((mc_intr_0 & stall_intr) != 0U) {
92 return true;
93 }
94 90
95 return false; 91 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
92 "mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
93 mc_intr_0 & stall_intr, *eng_intr_pending);
94
95 return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
96} 96}
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.h b/drivers/gpu/nvgpu/gv100/mc_gv100.h
index 4aff4a36..e9069258 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.h
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.h
@@ -26,5 +26,6 @@ struct gk20a;
26 26
27void mc_gv100_intr_enable(struct gk20a *g); 27void mc_gv100_intr_enable(struct gk20a *g);
28bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0); 28bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0);
29bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id); 29bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
30 u32 *eng_intr_pending);
30#endif 31#endif
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 7e0ce4c6..13d498a7 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -392,6 +392,7 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
392 u32 pbdma_stat; 392 u32 pbdma_stat;
393 u32 chan_stat; 393 u32 chan_stat;
394 int ret = -EBUSY; 394 int ret = -EBUSY;
395 unsigned int loop_count = 0;
395 396
396 /* timeout in milli seconds */ 397 /* timeout in milli seconds */
397 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), 398 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -400,6 +401,14 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
400 nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id); 401 nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
401 /* Verify that ch/tsg is no longer on the pbdma */ 402 /* Verify that ch/tsg is no longer on the pbdma */
402 do { 403 do {
404 if (!nvgpu_platform_is_silicon(g)) {
405 if (loop_count >= MAX_PRE_SI_RETRIES) {
406 nvgpu_err(g, "preempt pbdma retries: %u",
407 loop_count);
408 break;
409 }
410 loop_count++;
411 }
403 /* 412 /*
404 * If the PBDMA has a stalling interrupt and receives a NACK, 413 * If the PBDMA has a stalling interrupt and receives a NACK,
405 * the PBDMA won't save out until the STALLING interrupt is 414 * the PBDMA won't save out until the STALLING interrupt is
@@ -452,8 +461,11 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
452 nvgpu_usleep_range(delay, delay * 2); 461 nvgpu_usleep_range(delay, delay * 2);
453 delay = min_t(unsigned long, 462 delay = min_t(unsigned long,
454 delay << 1, GR_IDLE_CHECK_MAX); 463 delay << 1, GR_IDLE_CHECK_MAX);
455 } while (!nvgpu_timeout_expired_msg(&timeout, 464 } while (!nvgpu_timeout_expired(&timeout));
456 "preempt timeout pbdma")); 465
466 if (ret)
467 nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
468 "tsgid: %u", pbdma_id, pbdma_stat, id);
457 return ret; 469 return ret;
458} 470}
459 471
@@ -466,7 +478,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
466 u32 eng_stat; 478 u32 eng_stat;
467 u32 ctx_stat; 479 u32 ctx_stat;
468 int ret = -EBUSY; 480 int ret = -EBUSY;
469 bool stall_intr = false; 481 unsigned int loop_count = 0;
482 u32 eng_intr_pending;
470 483
471 /* timeout in milli seconds */ 484 /* timeout in milli seconds */
472 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), 485 nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -476,20 +489,56 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
476 act_eng_id); 489 act_eng_id);
477 /* Check if ch/tsg has saved off the engine or if ctxsw is hung */ 490 /* Check if ch/tsg has saved off the engine or if ctxsw is hung */
478 do { 491 do {
492 if (!nvgpu_platform_is_silicon(g)) {
493 if (loop_count >= MAX_PRE_SI_RETRIES) {
494 nvgpu_err(g, "preempt eng retries: %u",
495 loop_count);
496 break;
497 }
498 loop_count++;
499 }
479 eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id)); 500 eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
480 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat); 501 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
481 502
482 if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) { 503 if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
483 stall_intr = true; 504 &eng_intr_pending)) {
505 /* From h/w team
506 * Engine save can be blocked by eng stalling interrupts.
507 * FIFO interrupts shouldn’t block an engine save from
508 * finishing, but could block FIFO from reporting preempt done.
509 * No immediate reason to reset the engine if FIFO interrupt is
510 * pending.
511 * The hub, priv_ring, and ltc interrupts could block context
512 * switch (or memory), but doesn’t necessarily have to.
513 * For Hub interrupts they just report access counters and page
514 * faults. Neither of these necessarily block context switch
515 * or preemption, but they could.
516 * For example a page fault for graphics would prevent graphics
517 * from saving out. An access counter interrupt is a
518 * notification and has no effect.
519 * SW should handle page faults though for preempt to complete.
520 * PRI interrupt (due to a failed PRI transaction) will result
521 * in ctxsw failure reported to HOST.
522 * LTC interrupts are generally ECC related and if so,
523 * certainly don’t block preemption/ctxsw but they could.
524 * Bus interrupts shouldn’t have anything to do with preemption
525 * state as they are part of the Host EXT pipe, though they may
526 * exhibit a symptom that indicates that GPU is in a bad state.
527 * To be completely fair, when an engine is preempting SW
528 * really should just handle other interrupts as they come in.
529 * It’s generally bad to just poll and wait on a preempt
530 * to complete since there are many things in the GPU which may
531 * cause a system to hang/stop responding.
532 */
484 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, 533 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
485 "stall intr set, " 534 "stall intr set, "
486 "preemption will not finish"); 535 "preemption might not finish");
487 } 536 }
488 if (ctx_stat == 537 if (ctx_stat ==
489 fifo_engine_status_ctx_status_ctxsw_switch_v()) { 538 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
490 /* Eng save hasn't started yet. Continue polling */ 539 /* Eng save hasn't started yet. Continue polling */
491 if (stall_intr) { 540 if (eng_intr_pending) {
492 /* if stall intr stop polling */ 541 /* if eng intr, stop polling */
493 *reset_eng_bitmask |= BIT(act_eng_id); 542 *reset_eng_bitmask |= BIT(act_eng_id);
494 ret = 0; 543 ret = 0;
495 break; 544 break;
@@ -501,8 +550,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
501 fifo_engine_status_ctx_status_ctxsw_save_v()) { 550 fifo_engine_status_ctx_status_ctxsw_save_v()) {
502 551
503 if (id == fifo_engine_status_id_v(eng_stat)) { 552 if (id == fifo_engine_status_id_v(eng_stat)) {
504 if (stall_intr || 553 if (eng_intr_pending) {
505 timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
506 /* preemption will not finish */ 554 /* preemption will not finish */
507 *reset_eng_bitmask |= BIT(act_eng_id); 555 *reset_eng_bitmask |= BIT(act_eng_id);
508 ret = 0; 556 ret = 0;
@@ -518,9 +566,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
518 fifo_engine_status_ctx_status_ctxsw_load_v()) { 566 fifo_engine_status_ctx_status_ctxsw_load_v()) {
519 567
520 if (id == fifo_engine_status_next_id_v(eng_stat)) { 568 if (id == fifo_engine_status_next_id_v(eng_stat)) {
521 569 if (eng_intr_pending) {
522 if (stall_intr ||
523 timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
524 /* preemption will not finish */ 570 /* preemption will not finish */
525 *reset_eng_bitmask |= BIT(act_eng_id); 571 *reset_eng_bitmask |= BIT(act_eng_id);
526 ret = 0; 572 ret = 0;
@@ -540,8 +586,21 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
540 nvgpu_usleep_range(delay, delay * 2); 586 nvgpu_usleep_range(delay, delay * 2);
541 delay = min_t(unsigned long, 587 delay = min_t(unsigned long,
542 delay << 1, GR_IDLE_CHECK_MAX); 588 delay << 1, GR_IDLE_CHECK_MAX);
543 } while (!nvgpu_timeout_expired_msg(&timeout, 589 } while (!nvgpu_timeout_expired(&timeout));
544 "preempt timeout eng")); 590
591 if (ret) {
592 /*
593 * The reasons a preempt can fail are:
594 * 1.Some other stalling interrupt is asserted preventing
595 * channel or context save.
596 * 2.The memory system hangs.
597 * 3.The engine hangs during CTXSW.
598 */
599 nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
600 act_eng_id, ctx_stat, id);
601 *reset_eng_bitmask |= BIT(act_eng_id);
602 }
603
545 return ret; 604 return ret;
546} 605}
547 606
@@ -718,7 +777,6 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
718 u32 pbdma_id; 777 u32 pbdma_id;
719 u32 act_eng_id; 778 u32 act_eng_id;
720 u32 runlist_id; 779 u32 runlist_id;
721 int func_ret;
722 int ret = 0; 780 int ret = 0;
723 u32 tsgid; 781 u32 tsgid;
724 782
@@ -735,30 +793,15 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
735 runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask; 793 runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
736 runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask; 794 runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
737 795
738 for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) { 796 for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
739 797 ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
740 func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
741 timeout_rc_type); 798 timeout_rc_type);
742 if (func_ret != 0) {
743 nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
744 ret |= func_ret;
745 }
746 }
747
748 f->runlist_info[runlist_id].reset_eng_bitmask = 0; 799 f->runlist_info[runlist_id].reset_eng_bitmask = 0;
749 800
750 for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) { 801 for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
751 802 ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
752 func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
753 &f->runlist_info[runlist_id].reset_eng_bitmask, 803 &f->runlist_info[runlist_id].reset_eng_bitmask,
754 timeout_rc_type); 804 timeout_rc_type);
755
756 if (func_ret != 0) {
757 nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
758 ret |= func_ret;
759 }
760 }
761
762 return ret; 805 return ret;
763} 806}
764 807
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index 1ae3c93e..5ff16453 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -50,6 +50,8 @@
50 50
51#define CHANNEL_INFO_VEID0 0 51#define CHANNEL_INFO_VEID0 0
52 52
53#define MAX_PRE_SI_RETRIES 200000 /* 1G/500KHz * 100 */
54
53struct gpu_ops; 55struct gpu_ops;
54 56
55void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g, 57void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
index 31600828..dbeb0645 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
@@ -71,24 +71,24 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0)
71 return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false); 71 return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false);
72} 72}
73 73
74bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) 74bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
75 u32 *eng_intr_pending)
75{ 76{
76 u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0)); 77 u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
77 u32 stall_intr, eng_intr_mask; 78 u32 stall_intr, eng_intr_mask;
78 79
79 eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id); 80 eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
80 if ((mc_intr_0 & eng_intr_mask) != 0U) { 81 *eng_intr_pending = mc_intr_0 & eng_intr_mask;
81 return true;
82 }
83 82
84 stall_intr = mc_intr_pfifo_pending_f() | 83 stall_intr = mc_intr_pfifo_pending_f() |
85 mc_intr_hub_pending_f() | 84 mc_intr_hub_pending_f() |
86 mc_intr_priv_ring_pending_f() | 85 mc_intr_priv_ring_pending_f() |
87 mc_intr_pbus_pending_f() | 86 mc_intr_pbus_pending_f() |
88 mc_intr_ltc_pending_f(); 87 mc_intr_ltc_pending_f();
89 if ((mc_intr_0 & stall_intr) != 0U) {
90 return true;
91 }
92 88
93 return false; 89 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
90 "mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
91 mc_intr_0 & stall_intr, *eng_intr_pending);
92
93 return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
94} 94}
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
index eb9d0e4e..faa4d38d 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
@@ -26,5 +26,6 @@ struct gk20a;
26 26
27void mc_gv11b_intr_enable(struct gk20a *g); 27void mc_gv11b_intr_enable(struct gk20a *g);
28bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0); 28bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0);
29bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id); 29bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
30 u32 *eng_intr_pending);
30#endif 31#endif