diff options
author | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-02-05 13:05:56 -0500 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-04-04 21:05:22 -0400 |
commit | 24ddf71b9009291b829e6c30eb1b22e8838f7367 (patch) | |
tree | 0a2816a1f813c17c32c51390f1d9311f7e95cc4d /drivers/gpu | |
parent | 5b6e8995b2a3d399a8cc7fd249301122053666e1 (diff) |
gpu: nvgpu: Use busy looping on memory ops
Use busy looping (udelay() instead of usleep_range()) when polling for
completion of L2 and TLB maintenance operations. This speeds them up by
an order of magnitude.
Also add trace points to measure the performance of memory ops and
interrupt processing.
Change-Id: Ic4a8525d3d946b2b8f57b4b8ddcfc61605619399
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/681640
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 3 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mc_gk20a.c | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 31 |
3 files changed, 37 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 9e19fa53..75775d57 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/dma-mapping.h> | 26 | #include <linux/dma-mapping.h> |
27 | #include <linux/firmware.h> | 27 | #include <linux/firmware.h> |
28 | #include <linux/nvhost.h> | 28 | #include <linux/nvhost.h> |
29 | #include <trace/events/gk20a.h> | ||
29 | 30 | ||
30 | #include "gk20a.h" | 31 | #include "gk20a.h" |
31 | #include "kind_gk20a.h" | 32 | #include "kind_gk20a.h" |
@@ -4998,6 +4999,8 @@ static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr, | |||
4998 | { | 4999 | { |
4999 | gk20a_dbg_fn(""); | 5000 | gk20a_dbg_fn(""); |
5000 | 5001 | ||
5002 | trace_gr_gk20a_handle_sw_method(g->dev->name); | ||
5003 | |||
5001 | if (class_num == KEPLER_COMPUTE_A) { | 5004 | if (class_num == KEPLER_COMPUTE_A) { |
5002 | switch (offset << 2) { | 5005 | switch (offset << 2) { |
5003 | case NVA0C0_SET_SHADER_EXCEPTIONS: | 5006 | case NVA0C0_SET_SHADER_EXCEPTIONS: |
diff --git a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c index 9f9e756b..86fea3a1 100644 --- a/drivers/gpu/nvgpu/gk20a/mc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mc_gk20a.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/types.h> | 16 | #include <linux/types.h> |
17 | #include <trace/events/gk20a.h> | ||
17 | 18 | ||
18 | #include "gk20a.h" | 19 | #include "gk20a.h" |
19 | #include "mc_gk20a.h" | 20 | #include "mc_gk20a.h" |
@@ -23,6 +24,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g) | |||
23 | { | 24 | { |
24 | u32 mc_intr_0; | 25 | u32 mc_intr_0; |
25 | 26 | ||
27 | trace_mc_gk20a_intr_stall(g->dev->name); | ||
28 | |||
26 | if (!g->power_on) | 29 | if (!g->power_on) |
27 | return IRQ_NONE; | 30 | return IRQ_NONE; |
28 | 31 | ||
@@ -37,6 +40,8 @@ irqreturn_t mc_gk20a_isr_stall(struct gk20a *g) | |||
37 | /* flush previous write */ | 40 | /* flush previous write */ |
38 | gk20a_readl(g, mc_intr_en_0_r()); | 41 | gk20a_readl(g, mc_intr_en_0_r()); |
39 | 42 | ||
43 | trace_mc_gk20a_intr_stall_done(g->dev->name); | ||
44 | |||
40 | return IRQ_WAKE_THREAD; | 45 | return IRQ_WAKE_THREAD; |
41 | } | 46 | } |
42 | 47 | ||
@@ -67,6 +72,8 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) | |||
67 | 72 | ||
68 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); | 73 | gk20a_dbg(gpu_dbg_intr, "interrupt thread launched"); |
69 | 74 | ||
75 | trace_mc_gk20a_intr_thread_stall(g->dev->name); | ||
76 | |||
70 | mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); | 77 | mc_intr_0 = gk20a_readl(g, mc_intr_0_r()); |
71 | 78 | ||
72 | gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); | 79 | gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0); |
@@ -92,6 +99,8 @@ irqreturn_t mc_gk20a_intr_thread_stall(struct gk20a *g) | |||
92 | /* flush previous write */ | 99 | /* flush previous write */ |
93 | gk20a_readl(g, mc_intr_en_0_r()); | 100 | gk20a_readl(g, mc_intr_en_0_r()); |
94 | 101 | ||
102 | trace_mc_gk20a_intr_thread_stall_done(g->dev->name); | ||
103 | |||
95 | return IRQ_HANDLED; | 104 | return IRQ_HANDLED; |
96 | } | 105 | } |
97 | 106 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 2874567c..798b6405 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
28 | #include <linux/dma-buf.h> | 28 | #include <linux/dma-buf.h> |
29 | #include <uapi/linux/nvgpu.h> | 29 | #include <uapi/linux/nvgpu.h> |
30 | #include <trace/events/gk20a.h> | ||
30 | 31 | ||
31 | #include "gk20a.h" | 32 | #include "gk20a.h" |
32 | #include "mm_gk20a.h" | 33 | #include "mm_gk20a.h" |
@@ -2816,6 +2817,9 @@ int gk20a_mm_fb_flush(struct gk20a *g) | |||
2816 | /* Make sure all previous writes are committed to the L2. There's no | 2817 | /* Make sure all previous writes are committed to the L2. There's no |
2817 | guarantee that writes are to DRAM. This will be a sysmembar internal | 2818 | guarantee that writes are to DRAM. This will be a sysmembar internal |
2818 | to the L2. */ | 2819 | to the L2. */ |
2820 | |||
2821 | trace_gk20a_mm_fb_flush(g->dev->name); | ||
2822 | |||
2819 | gk20a_writel(g, flush_fb_flush_r(), | 2823 | gk20a_writel(g, flush_fb_flush_r(), |
2820 | flush_fb_flush_pending_busy_f()); | 2824 | flush_fb_flush_pending_busy_f()); |
2821 | 2825 | ||
@@ -2828,7 +2832,7 @@ int gk20a_mm_fb_flush(struct gk20a *g) | |||
2828 | flush_fb_flush_pending_busy_v()) { | 2832 | flush_fb_flush_pending_busy_v()) { |
2829 | gk20a_dbg_info("fb_flush 0x%x", data); | 2833 | gk20a_dbg_info("fb_flush 0x%x", data); |
2830 | retry--; | 2834 | retry--; |
2831 | usleep_range(20, 40); | 2835 | udelay(5); |
2832 | } else | 2836 | } else |
2833 | break; | 2837 | break; |
2834 | } while (retry >= 0 || !tegra_platform_is_silicon()); | 2838 | } while (retry >= 0 || !tegra_platform_is_silicon()); |
@@ -2839,6 +2843,8 @@ int gk20a_mm_fb_flush(struct gk20a *g) | |||
2839 | ret = -EBUSY; | 2843 | ret = -EBUSY; |
2840 | } | 2844 | } |
2841 | 2845 | ||
2846 | trace_gk20a_mm_fb_flush_done(g->dev->name); | ||
2847 | |||
2842 | mutex_unlock(&mm->l2_op_lock); | 2848 | mutex_unlock(&mm->l2_op_lock); |
2843 | 2849 | ||
2844 | return ret; | 2850 | return ret; |
@@ -2849,6 +2855,8 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g) | |||
2849 | u32 data; | 2855 | u32 data; |
2850 | s32 retry = 200; | 2856 | s32 retry = 200; |
2851 | 2857 | ||
2858 | trace_gk20a_mm_l2_invalidate(g->dev->name); | ||
2859 | |||
2852 | /* Invalidate any clean lines from the L2 so subsequent reads go to | 2860 | /* Invalidate any clean lines from the L2 so subsequent reads go to |
2853 | DRAM. Dirty lines are not affected by this operation. */ | 2861 | DRAM. Dirty lines are not affected by this operation. */ |
2854 | gk20a_writel(g, flush_l2_system_invalidate_r(), | 2862 | gk20a_writel(g, flush_l2_system_invalidate_r(), |
@@ -2864,7 +2872,7 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g) | |||
2864 | gk20a_dbg_info("l2_system_invalidate 0x%x", | 2872 | gk20a_dbg_info("l2_system_invalidate 0x%x", |
2865 | data); | 2873 | data); |
2866 | retry--; | 2874 | retry--; |
2867 | usleep_range(20, 40); | 2875 | udelay(5); |
2868 | } else | 2876 | } else |
2869 | break; | 2877 | break; |
2870 | } while (retry >= 0 || !tegra_platform_is_silicon()); | 2878 | } while (retry >= 0 || !tegra_platform_is_silicon()); |
@@ -2872,6 +2880,8 @@ static void gk20a_mm_l2_invalidate_locked(struct gk20a *g) | |||
2872 | if (retry < 0) | 2880 | if (retry < 0) |
2873 | gk20a_warn(dev_from_gk20a(g), | 2881 | gk20a_warn(dev_from_gk20a(g), |
2874 | "l2_system_invalidate too many retries"); | 2882 | "l2_system_invalidate too many retries"); |
2883 | |||
2884 | trace_gk20a_mm_l2_invalidate_done(g->dev->name); | ||
2875 | } | 2885 | } |
2876 | 2886 | ||
2877 | void gk20a_mm_l2_invalidate(struct gk20a *g) | 2887 | void gk20a_mm_l2_invalidate(struct gk20a *g) |
@@ -2900,6 +2910,8 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate) | |||
2900 | 2910 | ||
2901 | mutex_lock(&mm->l2_op_lock); | 2911 | mutex_lock(&mm->l2_op_lock); |
2902 | 2912 | ||
2913 | trace_gk20a_mm_l2_flush(g->dev->name); | ||
2914 | |||
2903 | /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2 | 2915 | /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2 |
2904 | as clean, so subsequent reads might hit in the L2. */ | 2916 | as clean, so subsequent reads might hit in the L2. */ |
2905 | gk20a_writel(g, flush_l2_flush_dirty_r(), | 2917 | gk20a_writel(g, flush_l2_flush_dirty_r(), |
@@ -2914,7 +2926,7 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate) | |||
2914 | flush_l2_flush_dirty_pending_busy_v()) { | 2926 | flush_l2_flush_dirty_pending_busy_v()) { |
2915 | gk20a_dbg_info("l2_flush_dirty 0x%x", data); | 2927 | gk20a_dbg_info("l2_flush_dirty 0x%x", data); |
2916 | retry--; | 2928 | retry--; |
2917 | usleep_range(20, 40); | 2929 | udelay(5); |
2918 | } else | 2930 | } else |
2919 | break; | 2931 | break; |
2920 | } while (retry >= 0 || !tegra_platform_is_silicon()); | 2932 | } while (retry >= 0 || !tegra_platform_is_silicon()); |
@@ -2923,6 +2935,8 @@ void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate) | |||
2923 | gk20a_warn(dev_from_gk20a(g), | 2935 | gk20a_warn(dev_from_gk20a(g), |
2924 | "l2_flush_dirty too many retries"); | 2936 | "l2_flush_dirty too many retries"); |
2925 | 2937 | ||
2938 | trace_gk20a_mm_l2_flush_done(g->dev->name); | ||
2939 | |||
2926 | if (invalidate) | 2940 | if (invalidate) |
2927 | gk20a_mm_l2_invalidate_locked(g); | 2941 | gk20a_mm_l2_invalidate_locked(g); |
2928 | 2942 | ||
@@ -2964,7 +2978,7 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm) | |||
2964 | u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->mm->g, | 2978 | u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->mm->g, |
2965 | vm->pdes.sgt->sgl) >> 12); | 2979 | vm->pdes.sgt->sgl) >> 12); |
2966 | u32 data; | 2980 | u32 data; |
2967 | s32 retry = 200; | 2981 | s32 retry = 2000; |
2968 | static DEFINE_MUTEX(tlb_lock); | 2982 | static DEFINE_MUTEX(tlb_lock); |
2969 | 2983 | ||
2970 | gk20a_dbg_fn(""); | 2984 | gk20a_dbg_fn(""); |
@@ -2986,11 +3000,14 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm) | |||
2986 | } | 3000 | } |
2987 | 3001 | ||
2988 | mutex_lock(&tlb_lock); | 3002 | mutex_lock(&tlb_lock); |
3003 | |||
3004 | trace_gk20a_mm_tlb_invalidate(g->dev->name); | ||
3005 | |||
2989 | do { | 3006 | do { |
2990 | data = gk20a_readl(g, fb_mmu_ctrl_r()); | 3007 | data = gk20a_readl(g, fb_mmu_ctrl_r()); |
2991 | if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0) | 3008 | if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0) |
2992 | break; | 3009 | break; |
2993 | usleep_range(20, 40); | 3010 | udelay(2); |
2994 | retry--; | 3011 | retry--; |
2995 | } while (retry >= 0 || !tegra_platform_is_silicon()); | 3012 | } while (retry >= 0 || !tegra_platform_is_silicon()); |
2996 | 3013 | ||
@@ -3014,13 +3031,15 @@ void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm) | |||
3014 | fb_mmu_ctrl_pri_fifo_empty_false_f()) | 3031 | fb_mmu_ctrl_pri_fifo_empty_false_f()) |
3015 | break; | 3032 | break; |
3016 | retry--; | 3033 | retry--; |
3017 | usleep_range(20, 40); | 3034 | udelay(2); |
3018 | } while (retry >= 0 || !tegra_platform_is_silicon()); | 3035 | } while (retry >= 0 || !tegra_platform_is_silicon()); |
3019 | 3036 | ||
3020 | if (retry < 0) | 3037 | if (retry < 0) |
3021 | gk20a_warn(dev_from_gk20a(g), | 3038 | gk20a_warn(dev_from_gk20a(g), |
3022 | "mmu invalidate too many retries"); | 3039 | "mmu invalidate too many retries"); |
3023 | 3040 | ||
3041 | trace_gk20a_mm_tlb_invalidate_done(g->dev->name); | ||
3042 | |||
3024 | out: | 3043 | out: |
3025 | mutex_unlock(&tlb_lock); | 3044 | mutex_unlock(&tlb_lock); |
3026 | vm->tlb_dirty = false; | 3045 | vm->tlb_dirty = false; |