summaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
authorDavid Nieto <dmartineznie@nvidia.com>2017-09-18 23:31:28 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2017-09-22 18:44:25 -0400
commit7134e9e852116f86745cd23312bbfba34100bf6d (patch)
tree763a9db89290450a37ad19d7f57acfa66ce33526 /drivers/gpu
parentf6fcecfc6f437a6d24aa113f75e43cb6dbbd5e0f (diff)
gpu: nvgpu: prevent crash during unbind
This change solves crashes during unbind that were introduced in the driver during the OS unification refactoring due to lack of coverage of the remove() function. The fixes during remove are: (1) Prevent NULL dereference on GPUs with secure boot (2) Prevent NULL dereferences when fecs_trace is not enabled (3) Added PRAMIN blocker during driver removal if HW is no longer accessible (4) Prevent double free of debugfs nodes as they are handled on the debugfs_remove_recursive() call (5) quiesce() can now be called without checking whether the HW accessible flag is set (6) added function to free irq so no IRQ association is left on the driver after it is removed (7) prevent NULL dereference on nvgpu_thread_stop() if the thread is already stopped JIRA: EVLR-1739 Change-Id: I787d38f202d5267a6b34815f23e1bc88110e8455 Signed-off-by: David Nieto <dmartineznie@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1563005 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug.c2
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_allocator.c2
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_clk.c1
-rw-r--r--drivers/gpu/nvgpu/common/linux/debug_pmu.c1
-rw-r--r--drivers/gpu/nvgpu/common/linux/module.c55
-rw-r--r--drivers/gpu/nvgpu/common/linux/module.h1
-rw-r--r--drivers/gpu/nvgpu/common/linux/pci.c13
-rw-r--r--drivers/gpu/nvgpu/common/linux/thread.c6
-rw-r--r--drivers/gpu/nvgpu/common/pmu/pmu_fw.c3
-rw-r--r--drivers/gpu/nvgpu/common/pramin.c9
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c37
-rw-r--r--drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c33
12 files changed, 98 insertions, 65 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/debug.c b/drivers/gpu/nvgpu/common/linux/debug.c
index 5750800f..08d0e679 100644
--- a/drivers/gpu/nvgpu/common/linux/debug.c
+++ b/drivers/gpu/nvgpu/common/linux/debug.c
@@ -409,5 +409,5 @@ void gk20a_debug_deinit(struct gk20a *g)
409 gk20a_fifo_debugfs_deinit(g); 409 gk20a_fifo_debugfs_deinit(g);
410 410
411 debugfs_remove_recursive(l->debugfs); 411 debugfs_remove_recursive(l->debugfs);
412 debugfs_remove_recursive(l->debugfs_alias); 412 debugfs_remove(l->debugfs_alias);
413} 413}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_allocator.c b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
index 91ae0512..d63a9030 100644
--- a/drivers/gpu/nvgpu/common/linux/debug_allocator.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_allocator.c
@@ -55,8 +55,6 @@ void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a)
55 55
56void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a) 56void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a)
57{ 57{
58 if (!IS_ERR_OR_NULL(a->debugfs_entry))
59 debugfs_remove(a->debugfs_entry);
60} 58}
61 59
62void nvgpu_alloc_debugfs_init(struct gk20a *g) 60void nvgpu_alloc_debugfs_init(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/common/linux/debug_clk.c b/drivers/gpu/nvgpu/common/linux/debug_clk.c
index b265ca69..81839de7 100644
--- a/drivers/gpu/nvgpu/common/linux/debug_clk.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_clk.c
@@ -267,6 +267,5 @@ int gm20b_clk_init_debugfs(struct gk20a *g)
267 267
268err_out: 268err_out:
269 pr_err("%s: Failed to make debugfs node\n", __func__); 269 pr_err("%s: Failed to make debugfs node\n", __func__);
270 debugfs_remove_recursive(l->debugfs);
271 return -ENOMEM; 270 return -ENOMEM;
272} 271}
diff --git a/drivers/gpu/nvgpu/common/linux/debug_pmu.c b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
index 191fcb0e..ec997e28 100644
--- a/drivers/gpu/nvgpu/common/linux/debug_pmu.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_pmu.c
@@ -477,6 +477,5 @@ int gk20a_pmu_debugfs_init(struct gk20a *g)
477 return 0; 477 return 0;
478err_out: 478err_out:
479 pr_err("%s: Failed to make debugfs node\n", __func__); 479 pr_err("%s: Failed to make debugfs node\n", __func__);
480 debugfs_remove_recursive(l->debugfs);
481 return -ENOMEM; 480 return -ENOMEM;
482} 481}
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index 46b89ad0..c474f36a 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -226,9 +226,12 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
226 * After this point, gk20a interrupts should not get 226 * After this point, gk20a interrupts should not get
227 * serviced. 227 * serviced.
228 */ 228 */
229 disable_irq(g->irq_stall); 229 if (g->irqs_enabled) {
230 if (g->irq_stall != g->irq_nonstall) 230 disable_irq(g->irq_stall);
231 disable_irq(g->irq_nonstall); 231 if (g->irq_stall != g->irq_nonstall)
232 disable_irq(g->irq_nonstall);
233 g->irqs_enabled = 0;
234 }
232 235
233 /* Decrement platform power refcount */ 236 /* Decrement platform power refcount */
234 if (platform->idle) 237 if (platform->idle)
@@ -641,6 +644,18 @@ static int gk20a_pm_unrailgate(struct device *dev)
641} 644}
642 645
643/* 646/*
647 * Remove association of the driver with OS interrupt handler
648 */
649void nvgpu_free_irq(struct gk20a *g)
650{
651 struct device *dev = dev_from_gk20a(g);
652
653 devm_free_irq(dev, g->irq_stall, g);
654 if (g->irq_stall != g->irq_nonstall)
655 devm_free_irq(dev, g->irq_nonstall, g);
656}
657
658/*
644 * Idle the GPU in preparation of shutdown/remove. 659 * Idle the GPU in preparation of shutdown/remove.
645 * gk20a_driver_start_unload() does not idle the GPU, but instead changes the SW 660 * gk20a_driver_start_unload() does not idle the GPU, but instead changes the SW
646 * state to prevent further activity on the driver SW side. 661 * state to prevent further activity on the driver SW side.
@@ -651,24 +666,27 @@ int nvgpu_quiesce(struct gk20a *g)
651 int err; 666 int err;
652 struct device *dev = dev_from_gk20a(g); 667 struct device *dev = dev_from_gk20a(g);
653 668
654 err = gk20a_wait_for_idle(g); 669 if (g->power_on) {
655 if (err) { 670 err = gk20a_wait_for_idle(g);
656 nvgpu_err(g, "failed to idle GPU, err=%d", err); 671 if (err) {
657 return err; 672 nvgpu_err(g, "failed to idle GPU, err=%d", err);
658 } 673 return err;
674 }
659 675
660 err = gk20a_fifo_disable_all_engine_activity(g, true); 676 err = gk20a_fifo_disable_all_engine_activity(g, true);
661 if (err) { 677 if (err) {
662 nvgpu_err(g, "failed to disable engine activity, err=%d", 678 nvgpu_err(g,
663 err); 679 "failed to disable engine activity, err=%d",
680 err);
664 return err; 681 return err;
665 } 682 }
666 683
667 err = gk20a_fifo_wait_engine_idle(g); 684 err = gk20a_fifo_wait_engine_idle(g);
668 if (err) { 685 if (err) {
669 nvgpu_err(g, "failed to idle engines, err=%d", 686 nvgpu_err(g, "failed to idle engines, err=%d",
670 err); 687 err);
671 return err; 688 return err;
689 }
672 } 690 }
673 691
674 if (gk20a_gpu_is_virtual(dev)) 692 if (gk20a_gpu_is_virtual(dev))
@@ -679,6 +697,7 @@ int nvgpu_quiesce(struct gk20a *g)
679 if (err) 697 if (err)
680 nvgpu_err(g, "failed to prepare for poweroff, err=%d", 698 nvgpu_err(g, "failed to prepare for poweroff, err=%d",
681 err); 699 err);
700
682 return err; 701 return err;
683} 702}
684 703
diff --git a/drivers/gpu/nvgpu/common/linux/module.h b/drivers/gpu/nvgpu/common/linux/module.h
index def98288..55a3b692 100644
--- a/drivers/gpu/nvgpu/common/linux/module.h
+++ b/drivers/gpu/nvgpu/common/linux/module.h
@@ -21,6 +21,7 @@ void gk20a_remove_support(struct gk20a *g);
21void gk20a_driver_start_unload(struct gk20a *g); 21void gk20a_driver_start_unload(struct gk20a *g);
22int nvgpu_quiesce(struct gk20a *g); 22int nvgpu_quiesce(struct gk20a *g);
23int nvgpu_remove(struct device *dev, struct class *class); 23int nvgpu_remove(struct device *dev, struct class *class);
24void nvgpu_free_irq(struct gk20a *g);
24 25
25extern struct class nvgpu_class; 26extern struct class nvgpu_class;
26 27
diff --git a/drivers/gpu/nvgpu/common/linux/pci.c b/drivers/gpu/nvgpu/common/linux/pci.c
index f1d12367..1a7d1842 100644
--- a/drivers/gpu/nvgpu/common/linux/pci.c
+++ b/drivers/gpu/nvgpu/common/linux/pci.c
@@ -521,13 +521,12 @@ static void nvgpu_pci_remove(struct pci_dev *pdev)
521 if (gk20a_gpu_is_virtual(dev)) 521 if (gk20a_gpu_is_virtual(dev))
522 return; 522 return;
523 523
524 /* only idle the GPU if the GPU is powered on */ 524 gk20a_driver_start_unload(g);
525 if (g->power_on) { 525 err = nvgpu_quiesce(g);
526 gk20a_driver_start_unload(g); 526 /* TODO: handle failure to idle */
527 err = nvgpu_quiesce(g); 527 WARN(err, "gpu failed to idle during driver removal");
528 /* TODO: handle failure to idle */ 528
529 WARN(err, "gpu failed to idle during driver removal"); 529 nvgpu_free_irq(g);
530 }
531 530
532 nvgpu_remove(dev, &nvgpu_pci_class); 531 nvgpu_remove(dev, &nvgpu_pci_class);
533 532
diff --git a/drivers/gpu/nvgpu/common/linux/thread.c b/drivers/gpu/nvgpu/common/linux/thread.c
index fe3906eb..92c556f2 100644
--- a/drivers/gpu/nvgpu/common/linux/thread.c
+++ b/drivers/gpu/nvgpu/common/linux/thread.c
@@ -46,8 +46,10 @@ int nvgpu_thread_create(struct nvgpu_thread *thread,
46 46
47void nvgpu_thread_stop(struct nvgpu_thread *thread) 47void nvgpu_thread_stop(struct nvgpu_thread *thread)
48{ 48{
49 kthread_stop(thread->task); 49 if (thread->task) {
50 thread->task = NULL; 50 kthread_stop(thread->task);
51 thread->task = NULL;
52 }
51}; 53};
52 54
53bool nvgpu_thread_should_stop(struct nvgpu_thread *thread) 55bool nvgpu_thread_should_stop(struct nvgpu_thread *thread)
diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_fw.c b/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
index 077a1bf8..5fd8121d 100644
--- a/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
+++ b/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
@@ -2223,7 +2223,8 @@ static void nvgpu_remove_pmu_support(struct nvgpu_pmu *pmu)
2223 if (nvgpu_alloc_initialized(&pmu->dmem)) 2223 if (nvgpu_alloc_initialized(&pmu->dmem))
2224 nvgpu_alloc_destroy(&pmu->dmem); 2224 nvgpu_alloc_destroy(&pmu->dmem);
2225 2225
2226 nvgpu_release_firmware(g, pmu->fw); 2226 if (pmu->fw)
2227 nvgpu_release_firmware(g, pmu->fw);
2227 2228
2228 nvgpu_mutex_destroy(&pmu->elpg_mutex); 2229 nvgpu_mutex_destroy(&pmu->elpg_mutex);
2229 nvgpu_mutex_destroy(&pmu->pg_mutex); 2230 nvgpu_mutex_destroy(&pmu->pg_mutex);
diff --git a/drivers/gpu/nvgpu/common/pramin.c b/drivers/gpu/nvgpu/common/pramin.c
index ae9c9b1f..7955e6c2 100644
--- a/drivers/gpu/nvgpu/common/pramin.c
+++ b/drivers/gpu/nvgpu/common/pramin.c
@@ -16,6 +16,7 @@
16 16
17#include <nvgpu/pramin.h> 17#include <nvgpu/pramin.h>
18#include <nvgpu/page_allocator.h> 18#include <nvgpu/page_allocator.h>
19#include <nvgpu/enabled.h>
19 20
20#include "gk20a/gk20a.h" 21#include "gk20a/gk20a.h"
21 22
@@ -88,6 +89,14 @@ void nvgpu_pramin_access_batched(struct gk20a *g, struct nvgpu_mem *mem,
88 void *sgl; 89 void *sgl;
89 u32 byteoff, start_reg, until_end, n; 90 u32 byteoff, start_reg, until_end, n;
90 91
92 /*
93 * TODO: Vidmem is not accesible through pramin on shutdown path.
94 * driver should be refactored to prevent this from happening, but for
95 * now it is ok just to ignore the writes
96 */
97 if (!g->regs && nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
98 return;
99
91 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl); 100 alloc = get_vidmem_page_alloc(mem->priv.sgt->sgl);
92 sgt = &alloc->sgt; 101 sgt = &alloc->sgt;
93 for (sgl = sgt->sgl; sgl; sgl = nvgpu_sgt_get_next(sgt, sgl)) { 102 for (sgl = sgt->sgl; sgl; sgl = nvgpu_sgt_get_next(sgt, sgl)) {
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 0b8422a6..ea69d7cb 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -465,21 +465,30 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
465 465
466 trace_gk20a_free_channel(ch->chid); 466 trace_gk20a_free_channel(ch->chid);
467 467
468 /* abort channel and remove from runlist */ 468 /*
469 if (gk20a_is_channel_marked_as_tsg(ch)) { 469 * Disable channel/TSG and unbind here. This should not be executed if
470 err = g->ops.fifo.tsg_unbind_channel(ch); 470 * HW access is not available during shutdown/removal path as it will
471 if (err) 471 * trigger a timeout
472 nvgpu_err(g, "failed to unbind channel %d from TSG", ch->chid); 472 */
473 /* 473 if (!nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
474 * Channel is not a part of TSG this point onwards 474 /* abort channel and remove from runlist */
475 * So stash its status and use it whenever necessary 475 if (gk20a_is_channel_marked_as_tsg(ch)) {
476 * e.g. while releasing gr_ctx in g->ops.gr.free_channel_ctx() 476 err = g->ops.fifo.tsg_unbind_channel(ch);
477 */ 477 if (err)
478 was_tsg = true; 478 nvgpu_err(g,
479 } else { 479 "failed to unbind channel %d from TSG",
480 gk20a_disable_channel(ch); 480 ch->chid);
481 /*
482 * Channel is not a part of TSG this point onwards
483 * So stash its status and use it whenever necessary
484 * e.g. while releasing gr_ctx in
485 * g->ops.gr.free_channel_ctx()
486 */
487 was_tsg = true;
488 } else {
489 gk20a_disable_channel(ch);
490 }
481 } 491 }
482
483 /* wait until there's only our ref to the channel */ 492 /* wait until there's only our ref to the channel */
484 if (!force) 493 if (!force)
485 gk20a_wait_until_counter_is_N( 494 gk20a_wait_until_counter_is_N(
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
index fea3b0fa..71cba9ec 100644
--- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -67,6 +67,7 @@ struct gk20a_fecs_trace {
67 struct nvgpu_mutex hash_lock; 67 struct nvgpu_mutex hash_lock;
68 struct nvgpu_mutex poll_lock; 68 struct nvgpu_mutex poll_lock;
69 struct nvgpu_thread poll_task; 69 struct nvgpu_thread poll_task;
70 bool init;
70}; 71};
71 72
72#ifdef CONFIG_GK20A_CTXSW_TRACE 73#ifdef CONFIG_GK20A_CTXSW_TRACE
@@ -547,23 +548,12 @@ static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
547 &gk20a_fecs_trace_debugfs_ring_fops); 548 &gk20a_fecs_trace_debugfs_ring_fops);
548} 549}
549 550
550static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
551{
552 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
553
554 debugfs_remove_recursive(l->debugfs);
555}
556
557#else 551#else
558 552
559static void gk20a_fecs_trace_debugfs_init(struct gk20a *g) 553static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
560{ 554{
561} 555}
562 556
563static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
564{
565}
566
567#endif /* CONFIG_DEBUG_FS */ 557#endif /* CONFIG_DEBUG_FS */
568 558
569int gk20a_fecs_trace_init(struct gk20a *g) 559int gk20a_fecs_trace_init(struct gk20a *g)
@@ -598,6 +588,9 @@ int gk20a_fecs_trace_init(struct gk20a *g)
598 NVGPU_GPU_FLAGS_SUPPORT_FECS_CTXSW_TRACE; 588 NVGPU_GPU_FLAGS_SUPPORT_FECS_CTXSW_TRACE;
599 589
600 gk20a_fecs_trace_debugfs_init(g); 590 gk20a_fecs_trace_debugfs_init(g);
591
592 trace->init = true;
593
601 return 0; 594 return 0;
602 595
603clean_hash_lock: 596clean_hash_lock:
@@ -682,15 +675,17 @@ int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
682{ 675{
683 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch); 676 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
684 677
685 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, 678 if (g->fecs_trace) {
679 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
686 "ch=%p context_ptr=%x", ch, context_ptr); 680 "ch=%p context_ptr=%x", ch, context_ptr);
687 681
688 if (g->ops.fecs_trace.is_enabled(g)) { 682 if (g->ops.fecs_trace.is_enabled(g)) {
689 if (g->ops.fecs_trace.flush) 683 if (g->ops.fecs_trace.flush)
690 g->ops.fecs_trace.flush(g); 684 g->ops.fecs_trace.flush(g);
691 gk20a_fecs_trace_poll(g); 685 gk20a_fecs_trace_poll(g);
686 }
687 gk20a_fecs_trace_hash_del(g, context_ptr);
692 } 688 }
693 gk20a_fecs_trace_hash_del(g, context_ptr);
694 return 0; 689 return 0;
695} 690}
696 691
@@ -709,7 +704,9 @@ int gk20a_fecs_trace_deinit(struct gk20a *g)
709{ 704{
710 struct gk20a_fecs_trace *trace = g->fecs_trace; 705 struct gk20a_fecs_trace *trace = g->fecs_trace;
711 706
712 gk20a_fecs_trace_debugfs_cleanup(g); 707 if (!trace->init)
708 return 0;
709
713 nvgpu_thread_stop(&trace->poll_task); 710 nvgpu_thread_stop(&trace->poll_task);
714 gk20a_fecs_trace_free_ring(g); 711 gk20a_fecs_trace_free_ring(g);
715 gk20a_fecs_trace_free_hash_table(g); 712 gk20a_fecs_trace_free_hash_table(g);