diff options
author | Seema Khowala <seemaj@nvidia.com> | 2017-09-22 18:07:13 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-11-15 05:05:16 -0500 |
commit | 72b51a129fda4a89f226aad7c99f062977a07189 (patch) | |
tree | e8027fc8c70bb45453f6897a4e5b15400e92ef7f | |
parent | f1c962daae5fdb231a8c8b0202d96c1c4e242ef1 (diff) |
gpu: nvgpu: gv11b: detect stall intr during preemption
Check for interrupts or hangs while waiting for the preempt to complete.
During pbdma/eng preempt done polling, any stalling interrupts relating
to the runlist must be detected and handled in order for the preemption
to complete.
When a PBDMA fault or CE fault occurs, the PBDMA will save out
automatically. The TSG related to the context in which the fault occurred
will not be scheduled again until the fault is handled.
In the case of some other issue requiring the engine to be reset, the TSG
will need to be manually preempted.
In all cases, a PBDMA interrupt may occur prior to the PBDMA being able to
switch out. SW must handle these interrupts according to the relevant handling
procedure before the PBDMA preempt can complete.
Opt for an engine reset instead of waiting for the preemption to finish when
any stall interrupt is pending during engine context preempt completion.
Bug 200277163
Bug 1945121
Change-Id: Icaef79e3046d82987b8486d15cbfc8365aa26f2e
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1522914
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: David Martinez Nieto <dmartineznie@nvidia.com>
Tested-by: David Martinez Nieto <dmartineznie@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 89 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/mc_gv11b.c | 20 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/mc_gv11b.h | 1 |
3 files changed, 64 insertions, 46 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index a3cb9292..f87c6dea 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include "fifo_gv11b.h" | 56 | #include "fifo_gv11b.h" |
57 | #include "subctx_gv11b.h" | 57 | #include "subctx_gv11b.h" |
58 | #include "gr_gv11b.h" | 58 | #include "gr_gv11b.h" |
59 | #include "mc_gv11b.h" | ||
59 | 60 | ||
60 | #define PBDMA_SUBDEVICE_ID 1 | 61 | #define PBDMA_SUBDEVICE_ID 1 |
61 | 62 | ||
@@ -393,45 +394,35 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, | |||
393 | u32 pbdma_id, unsigned int timeout_rc_type) | 394 | u32 pbdma_id, unsigned int timeout_rc_type) |
394 | { | 395 | { |
395 | struct nvgpu_timeout timeout; | 396 | struct nvgpu_timeout timeout; |
396 | unsigned long delay = GR_IDLE_CHECK_DEFAULT; | 397 | unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */ |
397 | u32 pbdma_stat; | 398 | u32 pbdma_stat; |
398 | u32 chan_stat; | 399 | u32 chan_stat; |
399 | int ret = -EBUSY; | 400 | int ret = -EBUSY; |
400 | 401 | ||
401 | /* | 402 | /* timeout in milli seconds */ |
402 | * If the PBDMA has a stalling interrupt and receives a NACK, the PBDMA | ||
403 | * won't save out until the STALLING interrupt is cleared. Note that | ||
404 | * the stalling interrupt need not be directly addressed, as simply | ||
405 | * clearing of the interrupt bit will be sufficient to allow the PBDMA | ||
406 | * to save out. If the stalling interrupt was due to a SW method or | ||
407 | * another deterministic failure, the PBDMA will assert it when the | ||
408 | * channel is reloaded/resumed. Note that the fault will still be | ||
409 | * reported to SW. | ||
410 | */ | ||
411 | |||
412 | if (timeout_rc_type == PREEMPT_TIMEOUT_NORC) { | ||
413 | /* called from recovery */ | ||
414 | u32 pbdma_intr_0, pbdma_intr_1; | ||
415 | |||
416 | pbdma_intr_0 = gk20a_readl(g, pbdma_intr_0_r(pbdma_id)); | ||
417 | pbdma_intr_1 = gk20a_readl(g, pbdma_intr_1_r(pbdma_id)); | ||
418 | |||
419 | if (pbdma_intr_0) | ||
420 | gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0); | ||
421 | if (pbdma_intr_1) | ||
422 | gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1); | ||
423 | } | ||
424 | |||
425 | nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), | 403 | nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), |
426 | NVGPU_TIMER_CPU_TIMER); | 404 | NVGPU_TIMER_CPU_TIMER); |
427 | 405 | ||
406 | nvgpu_log(g, gpu_dbg_info, "wait preempt pbdma %d", pbdma_id); | ||
428 | /* Verify that ch/tsg is no longer on the pbdma */ | 407 | /* Verify that ch/tsg is no longer on the pbdma */ |
429 | do { | 408 | do { |
409 | /* | ||
410 | * If the PBDMA has a stalling interrupt and receives a NACK, | ||
411 | * the PBDMA won't save out until the STALLING interrupt is | ||
412 | * cleared. Stalling interrupt need not be directly addressed, | ||
413 | * as simply clearing of the interrupt bit will be sufficient | ||
414 | * to allow the PBDMA to save out. If the stalling interrupt | ||
415 | * was due to a SW method or another deterministic failure, | ||
416 | * the PBDMA will assert it when the channel is reloaded | ||
417 | * or resumed. Note that the fault will still be | ||
418 | * reported to SW. | ||
419 | */ | ||
420 | |||
421 | gk20a_fifo_handle_pbdma_intr(g, &g->fifo, pbdma_id, RC_NO); | ||
422 | |||
430 | pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(pbdma_id)); | 423 | pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(pbdma_id)); |
431 | chan_stat = fifo_pbdma_status_chan_status_v(pbdma_stat); | 424 | chan_stat = fifo_pbdma_status_chan_status_v(pbdma_stat); |
432 | 425 | ||
433 | gk20a_dbg_info("wait preempt pbdma"); | ||
434 | |||
435 | if (chan_stat == | 426 | if (chan_stat == |
436 | fifo_pbdma_status_chan_status_valid_v() || | 427 | fifo_pbdma_status_chan_status_valid_v() || |
437 | chan_stat == | 428 | chan_stat == |
@@ -473,26 +464,36 @@ static int gv11b_fifo_poll_pbdma_chan_status(struct gk20a *g, u32 id, | |||
473 | } | 464 | } |
474 | 465 | ||
475 | static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, | 466 | static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, |
476 | u32 engine_idx, u32 *reset_eng_bitmask, | 467 | u32 act_eng_id, u32 *reset_eng_bitmask, |
477 | unsigned int timeout_rc_type) | 468 | unsigned int timeout_rc_type) |
478 | { | 469 | { |
479 | struct nvgpu_timeout timeout; | 470 | struct nvgpu_timeout timeout; |
480 | unsigned long delay = GR_IDLE_CHECK_DEFAULT; | 471 | unsigned long delay = GR_IDLE_CHECK_DEFAULT; /* in micro seconds */ |
481 | u32 eng_stat; | 472 | u32 eng_stat; |
482 | u32 ctx_stat; | 473 | u32 ctx_stat; |
483 | int ret = -EBUSY; | 474 | int ret = -EBUSY; |
475 | bool stall_intr = false; | ||
484 | 476 | ||
477 | /* timeout in milli seconds */ | ||
485 | nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), | 478 | nvgpu_timeout_init(g, &timeout, g->ops.fifo.get_preempt_timeout(g), |
486 | NVGPU_TIMER_CPU_TIMER); | 479 | NVGPU_TIMER_CPU_TIMER); |
487 | 480 | ||
481 | nvgpu_log(g, gpu_dbg_info, "wait preempt act engine id: %u", | ||
482 | act_eng_id); | ||
488 | /* Check if ch/tsg has saved off the engine or if ctxsw is hung */ | 483 | /* Check if ch/tsg has saved off the engine or if ctxsw is hung */ |
489 | do { | 484 | do { |
490 | eng_stat = gk20a_readl(g, fifo_engine_status_r(engine_idx)); | 485 | eng_stat = gk20a_readl(g, fifo_engine_status_r(act_eng_id)); |
491 | ctx_stat = fifo_engine_status_ctx_status_v(eng_stat); | 486 | ctx_stat = fifo_engine_status_ctx_status_v(eng_stat); |
492 | 487 | ||
488 | if (gv11b_mc_is_stall_and_eng_intr_pending(g, act_eng_id)) { | ||
489 | stall_intr = true; | ||
490 | nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr, | ||
491 | "stall intr set, " | ||
492 | "preemption will not finish"); | ||
493 | } | ||
493 | if (ctx_stat == | 494 | if (ctx_stat == |
494 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { | 495 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { |
495 | gk20a_dbg_info("engine save hasn't started yet"); | 496 | /* Eng save hasn't started yet. Continue polling */ |
496 | 497 | ||
497 | } else if (ctx_stat == | 498 | } else if (ctx_stat == |
498 | fifo_engine_status_ctx_status_valid_v() || | 499 | fifo_engine_status_ctx_status_valid_v() || |
@@ -500,14 +501,12 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, | |||
500 | fifo_engine_status_ctx_status_ctxsw_save_v()) { | 501 | fifo_engine_status_ctx_status_ctxsw_save_v()) { |
501 | 502 | ||
502 | if (id == fifo_engine_status_id_v(eng_stat)) { | 503 | if (id == fifo_engine_status_id_v(eng_stat)) { |
503 | if (timeout_rc_type == PREEMPT_TIMEOUT_NORC) { | 504 | if (stall_intr || |
504 | /* called from recovery, eng seems to be hung */ | 505 | timeout_rc_type == PREEMPT_TIMEOUT_NORC) { |
505 | *reset_eng_bitmask |= BIT(engine_idx); | 506 | /* preemption will not finish */ |
507 | *reset_eng_bitmask |= BIT(act_eng_id); | ||
506 | ret = 0; | 508 | ret = 0; |
507 | break; | 509 | break; |
508 | } else { | ||
509 | gk20a_dbg_info("wait preempt engine. " | ||
510 | "ctx_status (valid/save)=%u", ctx_stat); | ||
511 | } | 510 | } |
512 | } else { | 511 | } else { |
513 | /* context is not running on the engine */ | 512 | /* context is not running on the engine */ |
@@ -520,14 +519,12 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, | |||
520 | 519 | ||
521 | if (id == fifo_engine_status_next_id_v(eng_stat)) { | 520 | if (id == fifo_engine_status_next_id_v(eng_stat)) { |
522 | 521 | ||
523 | if (timeout_rc_type == PREEMPT_TIMEOUT_NORC) { | 522 | if (stall_intr || |
524 | /* called from recovery, eng seems to be hung */ | 523 | timeout_rc_type == PREEMPT_TIMEOUT_NORC) { |
525 | *reset_eng_bitmask |= BIT(engine_idx); | 524 | /* preemption will not finish */ |
525 | *reset_eng_bitmask |= BIT(act_eng_id); | ||
526 | ret = 0; | 526 | ret = 0; |
527 | break; | 527 | break; |
528 | } else { | ||
529 | gk20a_dbg_info("wait preempt engine. " | ||
530 | "ctx_status (load)=%u", ctx_stat); | ||
531 | } | 528 | } |
532 | } else { | 529 | } else { |
533 | /* context is not running on the engine */ | 530 | /* context is not running on the engine */ |
@@ -540,7 +537,7 @@ static int gv11b_fifo_poll_eng_ctx_status(struct gk20a *g, u32 id, | |||
540 | ret = 0; | 537 | ret = 0; |
541 | break; | 538 | break; |
542 | } | 539 | } |
543 | usleep_range(delay, delay * 2); | 540 | nvgpu_usleep_range(delay, delay * 2); |
544 | delay = min_t(unsigned long, | 541 | delay = min_t(unsigned long, |
545 | delay << 1, GR_IDLE_CHECK_MAX); | 542 | delay << 1, GR_IDLE_CHECK_MAX); |
546 | } while (!nvgpu_timeout_expired_msg(&timeout, | 543 | } while (!nvgpu_timeout_expired_msg(&timeout, |
@@ -712,7 +709,7 @@ static int gv11b_fifo_poll_runlist_preempt_pending(struct gk20a *g, | |||
712 | break; | 709 | break; |
713 | } | 710 | } |
714 | 711 | ||
715 | usleep_range(delay, delay * 2); | 712 | nvgpu_usleep_range(delay, delay * 2); |
716 | delay = min_t(unsigned long, | 713 | delay = min_t(unsigned long, |
717 | delay << 1, GR_IDLE_CHECK_MAX); | 714 | delay << 1, GR_IDLE_CHECK_MAX); |
718 | } while (!nvgpu_timeout_expired_msg(&timeout, | 715 | } while (!nvgpu_timeout_expired_msg(&timeout, |
@@ -758,7 +755,7 @@ int gv11b_fifo_is_preempt_pending(struct gk20a *g, u32 id, | |||
758 | 755 | ||
759 | f->runlist_info[runlist_id].reset_eng_bitmask = 0; | 756 | f->runlist_info[runlist_id].reset_eng_bitmask = 0; |
760 | 757 | ||
761 | for_each_set_bit(act_eng_id, &runlist_served_engines, f->num_engines) { | 758 | for_each_set_bit(act_eng_id, &runlist_served_engines, f->max_engines) { |
762 | 759 | ||
763 | func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id, | 760 | func_ret = gv11b_fifo_poll_eng_ctx_status(g, tsgid, act_eng_id, |
764 | &f->runlist_info[runlist_id].reset_eng_bitmask, | 761 | &f->runlist_info[runlist_id].reset_eng_bitmask, |
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c index dab304e2..74c5c4d6 100644 --- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.c | |||
@@ -70,3 +70,23 @@ bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0) | |||
70 | { | 70 | { |
71 | return ((mc_intr_0 & mc_intr_hub_pending_f()) ? true : false); | 71 | return ((mc_intr_0 & mc_intr_hub_pending_f()) ? true : false); |
72 | } | 72 | } |
73 | |||
74 | bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id) | ||
75 | { | ||
76 | u32 mc_intr_0 = gk20a_readl(g, mc_intr_r(0)); | ||
77 | u32 stall_intr, eng_intr_mask; | ||
78 | |||
79 | eng_intr_mask = gk20a_fifo_act_eng_interrupt_mask(g, act_eng_id); | ||
80 | if (mc_intr_0 & eng_intr_mask) | ||
81 | return true; | ||
82 | |||
83 | stall_intr = mc_intr_pfifo_pending_f() | | ||
84 | mc_intr_hub_pending_f() | | ||
85 | mc_intr_priv_ring_pending_f() | | ||
86 | mc_intr_pbus_pending_f() | | ||
87 | mc_intr_ltc_pending_f(); | ||
88 | if (mc_intr_0 & stall_intr) | ||
89 | return true; | ||
90 | |||
91 | return false; | ||
92 | } | ||
diff --git a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h index de193a84..eb9d0e4e 100644 --- a/drivers/gpu/nvgpu/gv11b/mc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/mc_gv11b.h | |||
@@ -26,4 +26,5 @@ struct gk20a; | |||
26 | 26 | ||
27 | void mc_gv11b_intr_enable(struct gk20a *g); | 27 | void mc_gv11b_intr_enable(struct gk20a *g); |
28 | bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0); | 28 | bool gv11b_mc_is_intr_hub_pending(struct gk20a *g, u32 mc_intr_0); |
29 | bool gv11b_mc_is_stall_and_eng_intr_pending(struct gk20a *g, u32 act_eng_id); | ||
29 | #endif | 30 | #endif |