author	Seema Khowala <seemaj@nvidia.com>	2018-04-12 19:09:43 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2018-06-24 12:53:20 -0400
commit	1407133b7e1b27a92ee8c116009541904d2ff691 (patch)
tree	5aeb8c87c89b52e682101ab7678b3e0da8e7ea05 /drivers
parent	797dde3e32647df3b616cea67f4defae59d38b3f (diff)
gpu: nvgpu: gv11b: do not poll preempt done if eng intr pending
- While polling for eng preempt done, reset the engine only if an eng
  stall intr is pending. Also stop polling for eng preempt done if an
  eng intr is pending.
- Add max retries on pre-si platforms for the poll pbdma and eng
  preempt done polling loops.

Bug 2125776
Bug 2108544
Bug 2105322
Bug 2092051
Bug 2048824
Bug 2043838
Bug 2039587
Bug 2028993
Bug 2029245
Bug 2065990
Bug 1945121
Bug 200401707
Bug 200393631
Bug 200327596

Change-Id: I66b07be9647f141bd03801f83e3cda797e88272f
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1694137
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
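In outline, the poll loops change as sketched below: a pending engine interrupt now ends the poll early and marks the engine for reset, and pre-silicon runs are bounded by an explicit retry cap. This is a simplified sketch of the control flow only, using names taken from the diff below; it is not a drop-in replacement for gv11b_fifo_poll_eng_ctx_status:

	do {
		if (!nvgpu_platform_is_silicon(g)) {
			/* pre-si clocks are far slower; bound the loop */
			if (loop_count++ >= MAX_PRE_SI_RETRIES)
				break;
		}
		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
				&eng_intr_pending) &&
				eng_intr_pending != 0U) {
			/* eng intr pending: preempt cannot complete, so
			 * stop polling and schedule the engine for reset */
			*reset_eng_bitmask |= BIT(act_eng_id);
			ret = 0;
			break;
		}
		/* ... otherwise check fifo_engine_status_r() and keep
		 * polling with exponential backoff ... */
	} while (!nvgpu_timeout_expired(&timeout));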
Diffstat (limited to 'drivers')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gk20a.h	2
-rw-r--r--	drivers/gpu/nvgpu/gv100/mc_gv100.c	16
-rw-r--r--	drivers/gpu/nvgpu/gv100/mc_gv100.h	3
-rw-r--r--	drivers/gpu/nvgpu/gv11b/fifo_gv11b.c	113
-rw-r--r--	drivers/gpu/nvgpu/gv11b/fifo_gv11b.h	2
-rw-r--r--	drivers/gpu/nvgpu/gv11b/mc_gv11b.c	16
-rw-r--r--	drivers/gpu/nvgpu/gv11b/mc_gv11b.h	3
7 files changed, 101 insertions, 54 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 9061236e..25146b8b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -1109,7 +1109,7 @@ struct gpu_ops {
 		bool (*is_intr_hub_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_intr_nvlink_pending)(struct gk20a *g, u32 mc_intr);
 		bool (*is_stall_and_eng_intr_pending)(struct gk20a *g,
-			u32 act_eng_id);
+			u32 act_eng_id, u32 *eng_intr_pending);
 		u32 (*intr_stall)(struct gk20a *g);
 		void (*intr_stall_pause)(struct gk20a *g);
 		void (*intr_stall_resume)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.c b/drivers/gpu/nvgpu/gv100/mc_gv100.c
index 7ed9e6da..2d84a3a8 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.c
@@ -72,15 +72,14 @@ bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_nvlink_pending_f()) != 0U) ? true : false);
 }
 
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	if ((mc_intr_0 & eng_intr_mask) != 0U) {
-		return true;
-	}
+	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
 
 	stall_intr = mc_intr_pfifo_pending_f() |
 			mc_intr_hub_pending_f() |
@@ -88,9 +87,10 @@ bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
 			mc_intr_pbus_pending_f() |
 			mc_intr_ltc_pending_f() |
 			mc_intr_nvlink_pending_f();
-	if ((mc_intr_0 & stall_intr) != 0U) {
-		return true;
-	}
 
-	return false;
+	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
+		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
+		mc_intr_0 & stall_intr, *eng_intr_pending);
+
+	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
 }
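With this change the helper reports two things at once: the boolean return covers any pending stall or engine interrupt, while *eng_intr_pending carries only the engine's own interrupt bits. A hedged caller sketch of the new contract (the control flow here is illustrative, not lifted from the driver):

	u32 eng_intr_pending;

	if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
			&eng_intr_pending)) {
		/* some stalling interrupt is pending; preempt may stall */
		if (eng_intr_pending != 0U) {
			/* the engine itself is interrupting: stop polling
			 * and reset the engine instead of waiting */
		}
	}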
diff --git a/drivers/gpu/nvgpu/gv100/mc_gv100.h b/drivers/gpu/nvgpu/gv100/mc_gv100.h
index 4aff4a36..e9069258 100644
--- a/drivers/gpu/nvgpu/gv100/mc_gv100.h
+++ b/drivers/gpu/nvgpu/gv100/mc_gv100.h
@@ -26,5 +26,6 @@ struct gk20a;
 
 void mc_gv100_intr_enable(struct gk20a *g);
 bool gv100_mc_is_intr_nvlink_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
+bool gv100_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending);
 #endif
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 7e0ce4c6..13d498a7 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -392,6 +392,7 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 	u32 pbdma_stat;
 	u32 chan_stat;
 	int ret = -EBUSY;
+	unsigned int loop_count = 0;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -400,6 +401,14 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 	nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id);
 	/* Verify that ch/tsg is no longer on the pbdma */
 	do {
+		if (!nvgpu_platform_is_silicon(g)) {
+			if (loop_count >= MAX_PRE_SI_RETRIES) {
+				nvgpu_err(g, "preempt pbdma retries: %u",
+					loop_count);
+				break;
+			}
+			loop_count++;
+		}
 		/*
 		 * If the PBDMA has a stalling interrupt and receives a NACK,
 		 * the PBDMA won't save out until the STALLING interrupt is
@@ -452,8 +461,11 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired_msg(&timeout,
-				"preempt timeout pbdma"));
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	if (ret)
+		nvgpu_err(g, "preempt timeout pbdma: %u pbdma_stat: %u "
+				"tsgid: %u", pbdma_id, pbdma_stat, id);
 	return ret;
 }
 
@@ -466,7 +478,8 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 	u32 eng_stat;
 	u32 ctx_stat;
 	int ret = -EBUSY;
-	bool stall_intr = false;
+	unsigned int loop_count = 0;
+	u32 eng_intr_pending;
 
 	/* timeout in milli seconds */
 	nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g),
@@ -476,20 +489,56 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			act_eng_id);
 	/* Check if ch/tsg has saved off the engine or if ctxsw is hung */
 	do {
+		if (!nvgpu_platform_is_silicon(g)) {
+			if (loop_count >= MAX_PRE_SI_RETRIES) {
+				nvgpu_err(g, "preempt eng retries: %u",
+					loop_count);
+				break;
+			}
+			loop_count++;
+		}
 		eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id));
 		ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
 
-		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id)) {
-			stall_intr = true;
+		if (g->ops.mc.is_stall_and_eng_intr_pending(g, act_eng_id,
+				&eng_intr_pending)) {
+			/* From h/w team:
+			 * Engine save can be blocked by eng stalling
+			 * interrupts.
+			 * FIFO interrupts shouldn't block an engine save from
+			 * finishing, but could block FIFO from reporting
+			 * preempt done. There is no immediate reason to reset
+			 * the engine if a FIFO interrupt is pending.
+			 * The hub, priv_ring, and ltc interrupts could block
+			 * context switch (or memory), but don't necessarily
+			 * have to.
+			 * Hub interrupts just report access counters and page
+			 * faults. Neither of these necessarily blocks context
+			 * switch or preemption, but they could. For example,
+			 * a page fault for graphics would prevent graphics
+			 * from saving out, while an access counter interrupt
+			 * is a notification and has no effect. SW should
+			 * still handle page faults for the preempt to
+			 * complete.
+			 * A PRI interrupt (due to a failed PRI transaction)
+			 * will result in a ctxsw failure reported to HOST.
+			 * LTC interrupts are generally ECC related and, if
+			 * so, certainly don't block preemption/ctxsw, but
+			 * they could.
+			 * Bus interrupts shouldn't have anything to do with
+			 * preemption state, as they are part of the Host EXT
+			 * pipe, though they may exhibit a symptom that
+			 * indicates the GPU is in a bad state.
+			 * To be completely fair, when an engine is
+			 * preempting, SW really should just handle other
+			 * interrupts as they come in. It's generally bad to
+			 * just poll and wait on a preempt to complete, since
+			 * there are many things in the GPU which may cause
+			 * the system to hang/stop responding.
+			 */
 			nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
 				"stall intr set, "
-				"preemption will not finish");
+				"preemption might not finish");
 		}
 		if (ctx_stat ==
 			 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
 			/* Eng save hasn't started yet. Continue polling */
-			if (stall_intr) {
-				/* if stall intr stop polling */
+			if (eng_intr_pending) {
+				/* if eng intr, stop polling */
 				*reset_eng_bitmask |= BIT(act_eng_id);
 				ret = 0;
 				break;
@@ -501,8 +550,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			 fifo_engine_status_ctx_status_ctxsw_save_v()) {
 
 			if (id == fifo_engine_status_id_v(eng_stat)) {
-				if (stall_intr ||
-					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
+				if (eng_intr_pending) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -518,9 +566,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 			 fifo_engine_status_ctx_status_ctxsw_load_v()) {
 
 			if (id == fifo_engine_status_next_id_v(eng_stat)) {
-
-				if (stall_intr ||
-					timeout_rc_type == PREEMPT_TIMEOUT_NORC) {
+				if (eng_intr_pending) {
 					/* preemption will not finish */
 					*reset_eng_bitmask |= BIT(act_eng_id);
 					ret = 0;
@@ -540,8 +586,21 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id,
 		nvgpu_usleep_range(delay, delay * 2);
 		delay = min_t(unsigned long,
 				delay << 1, GR_IDLE_CHECK_MAX);
-	} while (!nvgpu_timeout_expired_msg(&timeout,
-				"preempt timeout eng"));
+	} while (!nvgpu_timeout_expired(&timeout));
+
+	if (ret) {
+		/*
+		 * The reasons a preempt can fail are:
+		 * 1. Some other stalling interrupt is asserted, preventing
+		 *    the channel or context from saving.
+		 * 2. The memory system hangs.
+		 * 3. The engine hangs during CTXSW.
+		 */
+		nvgpu_err(g, "preempt timeout eng: %u ctx_stat: %u tsgid: %u",
+			act_eng_id, ctx_stat, id);
+		*reset_eng_bitmask |= BIT(act_eng_id);
+	}
+
 	return ret;
 }
 
@@ -718,7 +777,6 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	u32 pbdma_id;
 	u32 act_eng_id;
 	u32 runlist_id;
-	int func_ret;
 	int ret = 0;
 	u32 tsgid;
 
@@ -735,30 +793,15 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id,
 	runlist_served_pbdmas = f->runlist_info[runlist_id].pbdma_bitmask;
 	runlist_served_engines = f->runlist_info[runlist_id].eng_bitmask;
 
-	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma) {
-
-		func_ret = gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
+	for_each_set_bit(pbdma_id, &runlist_served_pbdmas, f->num_pbdma)
+		ret |= gv11b_fifo_poll_pbdma_chan_status(g, tsgid, pbdma_id,
 				timeout_rc_type);
-		if (func_ret != 0) {
-			nvgpu_log_info(g, "preempt timeout pbdma %d", pbdma_id);
-			ret |= func_ret;
-		}
-	}
-
 	f->runlist_info[runlist_id].reset_eng_bitmask = 0;
 
-	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) {
-
-		func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
+	for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines)
+		ret |= gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id,
 			&f->runlist_info[runlist_id].reset_eng_bitmask,
 			timeout_rc_type);
-
-		if (func_ret != 0) {
-			nvgpu_log_info(g, "preempt timeout engine %d", act_eng_id);
-			ret |= func_ret;
-		}
-	}
-
 	return ret;
 }
 
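Since both pollers now return only 0 or -EBUSY, folding their results with ret |= leaves ret at exactly 0 (all preempts completed) or -EBUSY (at least one poll timed out); the per-unit logging moved into the pollers themselves, next to the status values they report. A minimal sketch of the aggregation, using hypothetical pollers that follow the same 0/-EBUSY convention:

	int ret = 0;

	/* 0 | 0 == 0, 0 | -EBUSY == -EBUSY, -EBUSY | -EBUSY == -EBUSY */
	ret |= poll_one_pbdma(g);	/* hypothetical: returns 0 or -EBUSY */
	ret |= poll_one_engine(g);	/* hypothetical: returns 0 or -EBUSY */
	return ret;			/* nonzero iff any poll timed out */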
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
index 1ae3c93e..5ff16453 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h
@@ -50,6 +50,8 @@
 
 #define CHANNEL_INFO_VEID0		0
 
+#define MAX_PRE_SI_RETRIES	200000	/* 1G/500KHz * 100 */
+
 struct gpu_ops;
 
 void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
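The retry budget follows from the clock-ratio estimate in its comment: pre-silicon platforms run at roughly 500 kHz against ~1 GHz silicon, so allowing about 100 silicon-scale polling intervals works out to (1 GHz / 500 kHz) * 100 = 2000 * 100 = 200000 iterations.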
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
index 31600828..dbeb0645 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c
@@ -71,24 +71,24 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0)
 	return (((mc_intr_0 & mc_intr_hub_pending_f()) != 0U) ? true : false);
 }
 
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id)
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending)
 {
 	u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0));
 	u32 stall_intr, eng_intr_mask;
 
 	eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id);
-	if ((mc_intr_0 & eng_intr_mask) != 0U) {
-		return true;
-	}
+	*eng_intr_pending = mc_intr_0 & eng_intr_mask;
 
 	stall_intr = mc_intr_pfifo_pending_f() |
 			mc_intr_hub_pending_f() |
 			mc_intr_priv_ring_pending_f() |
 			mc_intr_pbus_pending_f() |
 			mc_intr_ltc_pending_f();
-	if ((mc_intr_0 & stall_intr) != 0U) {
-		return true;
-	}
 
-	return false;
+	nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
+		"mc_intr_0 = 0x%08x, eng_intr = 0x%08x",
+		mc_intr_0 & stall_intr, *eng_intr_pending);
+
+	return (mc_intr_0 & (eng_intr_mask | stall_intr)) != 0U;
 }
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
index eb9d0e4e..faa4d38d 100644
--- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h
@@ -26,5 +26,6 @@ struct gk20a;
 
 void mc_gv11b_intr_enable(struct gk20a *g);
 bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0);
-bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id);
+bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id,
+					u32 *eng_intr_pending);
 #endif