diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2016-07-28 05:07:18 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2016-09-01 12:10:20 -0400 |
commit | f79639f61858c377cf1f3facfc0ce631f787f0e6 (patch) | |
tree | 188d4033c93fcf0e5b819c074dc436b9b36f448e /drivers/gpu/nvgpu | |
parent | aa7f4bf251ee6346bf300f3793002eb4a7f05562 (diff) |
gpu: nvgpu: clear whole vidmem on first allocation
We currently clear vidmem pages in gk20a_gmmu_alloc_attr_vid_at()
i.e. allocation path for each buffer
But since buffer allocation path could be latency critical,
clear whole vidmem first and before first User allocation
in gk20a_vidmem_buf_alloc()
And then clear buffer pages while releasing the buffer
In this way, we can ensure that vidmem pages are already cleared
during buffer allocation path
At a later stage, clearing of pages can be removed from free path
and moved to a separate worker as well
At this point, first allocation has overhead of clearing whole
vidmem which takes about 380 ms and this should improve once
clocks are raised.
Also, this is a one-time latency, and subsequent allocations
should not have any overhead for clearing at all
Add API gk20a_vidmem_clear_all() to clear whole vidmem
We have WPR buffers allocated during boot up and
at fixed address in vidmem.
To prevent overwriting to these buffers in gk20a_vidmem_clear_all(),
clear whole vidmem except for the bootstrap allocator carveout
Add new API gk20a_gmmu_clear_vidmem_mem() to clear one mem_desc
Jira DNVGPU-84
Change-Id: I5661700585c6241a6a1ddeb5b7c068d3d2aed4b3
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1194301
(cherry picked from commit 950ab61a04290ea405968d8b0d03e3bd044ce83d)
Reviewed-on: http://git-master/r/1193158
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 179 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 5 |
2 files changed, 136 insertions, 48 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index a5158e7c..65157ccd 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -774,6 +774,70 @@ static void gk20a_init_pramin(struct mm_gk20a *mm) | |||
774 | mm->force_pramin = GK20A_FORCE_PRAMIN_DEFAULT; | 774 | mm->force_pramin = GK20A_FORCE_PRAMIN_DEFAULT; |
775 | } | 775 | } |
776 | 776 | ||
777 | #if defined(CONFIG_GK20A_VIDMEM) | ||
778 | static int gk20a_vidmem_clear_all(struct gk20a *g) | ||
779 | { | ||
780 | struct mm_gk20a *mm = &g->mm; | ||
781 | struct gk20a_fence *gk20a_fence_out = NULL; | ||
782 | u64 region2_base = 0; | ||
783 | int err = 0; | ||
784 | |||
785 | if (mm->vidmem.ce_ctx_id == ~0) | ||
786 | return -EINVAL; | ||
787 | |||
788 | err = gk20a_ce_execute_ops(g->dev, | ||
789 | mm->vidmem.ce_ctx_id, | ||
790 | 0, | ||
791 | mm->vidmem.base, | ||
792 | mm->vidmem.bootstrap_base - mm->vidmem.base, | ||
793 | 0x00000000, | ||
794 | NVGPU_CE_DST_LOCATION_LOCAL_FB, | ||
795 | NVGPU_CE_MEMSET, | ||
796 | NULL, | ||
797 | 0, | ||
798 | NULL); | ||
799 | if (err) { | ||
800 | gk20a_err(g->dev, | ||
801 | "Failed to clear vidmem region 1 : %d", err); | ||
802 | return err; | ||
803 | } | ||
804 | |||
805 | region2_base = mm->vidmem.bootstrap_base + mm->vidmem.bootstrap_size; | ||
806 | |||
807 | err = gk20a_ce_execute_ops(g->dev, | ||
808 | mm->vidmem.ce_ctx_id, | ||
809 | 0, | ||
810 | region2_base, | ||
811 | mm->vidmem.size - region2_base, | ||
812 | 0x00000000, | ||
813 | NVGPU_CE_DST_LOCATION_LOCAL_FB, | ||
814 | NVGPU_CE_MEMSET, | ||
815 | NULL, | ||
816 | 0, | ||
817 | &gk20a_fence_out); | ||
818 | if (err) { | ||
819 | gk20a_err(g->dev, | ||
820 | "Failed to clear vidmem region 2 : %d", err); | ||
821 | return err; | ||
822 | } | ||
823 | |||
824 | if (gk20a_fence_out) { | ||
825 | err = gk20a_fence_wait(gk20a_fence_out, | ||
826 | gk20a_get_gr_idle_timeout(g)); | ||
827 | gk20a_fence_put(gk20a_fence_out); | ||
828 | if (err) { | ||
829 | gk20a_err(g->dev, | ||
830 | "fence wait failed for CE execute ops"); | ||
831 | return err; | ||
832 | } | ||
833 | } | ||
834 | |||
835 | mm->vidmem.cleared = true; | ||
836 | |||
837 | return 0; | ||
838 | } | ||
839 | #endif | ||
840 | |||
777 | static int gk20a_init_vidmem(struct mm_gk20a *mm) | 841 | static int gk20a_init_vidmem(struct mm_gk20a *mm) |
778 | { | 842 | { |
779 | #if defined(CONFIG_GK20A_VIDMEM) | 843 | #if defined(CONFIG_GK20A_VIDMEM) |
@@ -813,7 +877,10 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm) | |||
813 | gk20a_alloc_fixed(&g->mm.vidmem.allocator, | 877 | gk20a_alloc_fixed(&g->mm.vidmem.allocator, |
814 | bootstrap_base, bootstrap_size); | 878 | bootstrap_base, bootstrap_size); |
815 | 879 | ||
816 | mm->vidmem.size = size; | 880 | mm->vidmem.base = base; |
881 | mm->vidmem.size = size - base; | ||
882 | mm->vidmem.bootstrap_base = bootstrap_base; | ||
883 | mm->vidmem.bootstrap_size = bootstrap_size; | ||
817 | 884 | ||
818 | gk20a_dbg_info("registered vidmem: %zu MB", size / SZ_1M); | 885 | gk20a_dbg_info("registered vidmem: %zu MB", size / SZ_1M); |
819 | 886 | ||
@@ -2027,7 +2094,7 @@ int gk20a_vidmem_buf_alloc(struct gk20a *g, size_t bytes) | |||
2027 | { | 2094 | { |
2028 | #if defined(CONFIG_GK20A_VIDMEM) | 2095 | #if defined(CONFIG_GK20A_VIDMEM) |
2029 | struct gk20a_vidmem_buf *buf; | 2096 | struct gk20a_vidmem_buf *buf; |
2030 | int err, fd; | 2097 | int err = 0, fd; |
2031 | 2098 | ||
2032 | gk20a_dbg_fn(""); | 2099 | gk20a_dbg_fn(""); |
2033 | 2100 | ||
@@ -2037,6 +2104,14 @@ int gk20a_vidmem_buf_alloc(struct gk20a *g, size_t bytes) | |||
2037 | 2104 | ||
2038 | buf->g = g; | 2105 | buf->g = g; |
2039 | 2106 | ||
2107 | if (!g->mm.vidmem.cleared) { | ||
2108 | err = gk20a_vidmem_clear_all(g); | ||
2109 | if (err) { | ||
2110 | gk20a_err(g->dev, "failed to clear whole vidmem"); | ||
2111 | goto err_kfree; | ||
2112 | } | ||
2113 | } | ||
2114 | |||
2040 | err = gk20a_gmmu_alloc_vid(g, bytes, &buf->mem); | 2115 | err = gk20a_gmmu_alloc_vid(g, bytes, &buf->mem); |
2041 | if (err) | 2116 | if (err) |
2042 | goto err_kfree; | 2117 | goto err_kfree; |
@@ -2743,6 +2818,59 @@ static void gk20a_gmmu_free_attr_sys(struct gk20a *g, enum dma_attr attr, | |||
2743 | mem->aperture = APERTURE_INVALID; | 2818 | mem->aperture = APERTURE_INVALID; |
2744 | } | 2819 | } |
2745 | 2820 | ||
2821 | #if defined(CONFIG_GK20A_VIDMEM) | ||
2822 | static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct mem_desc *mem) | ||
2823 | { | ||
2824 | struct gk20a_fence *gk20a_fence_out = NULL; | ||
2825 | struct gk20a_fence *gk20a_last_fence = NULL; | ||
2826 | struct gk20a_page_alloc *alloc = NULL; | ||
2827 | struct page_alloc_chunk *chunk = NULL; | ||
2828 | int err = 0; | ||
2829 | |||
2830 | if (g->mm.vidmem.ce_ctx_id == ~0) | ||
2831 | return -EINVAL; | ||
2832 | |||
2833 | alloc = (struct gk20a_page_alloc *) | ||
2834 | g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0); | ||
2835 | |||
2836 | list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) { | ||
2837 | if (gk20a_last_fence) | ||
2838 | gk20a_fence_put(gk20a_last_fence); | ||
2839 | |||
2840 | err = gk20a_ce_execute_ops(g->dev, | ||
2841 | g->mm.vidmem.ce_ctx_id, | ||
2842 | 0, | ||
2843 | chunk->base, | ||
2844 | chunk->length, | ||
2845 | 0x00000000, | ||
2846 | NVGPU_CE_DST_LOCATION_LOCAL_FB, | ||
2847 | NVGPU_CE_MEMSET, | ||
2848 | NULL, | ||
2849 | 0, | ||
2850 | &gk20a_fence_out); | ||
2851 | |||
2852 | if (err) { | ||
2853 | gk20a_err(g->dev, | ||
2854 | "Failed gk20a_ce_execute_ops[%d]", err); | ||
2855 | return err; | ||
2856 | } | ||
2857 | |||
2858 | gk20a_last_fence = gk20a_fence_out; | ||
2859 | } | ||
2860 | |||
2861 | if (gk20a_last_fence) { | ||
2862 | err = gk20a_fence_wait(gk20a_last_fence, | ||
2863 | gk20a_get_gr_idle_timeout(g)); | ||
2864 | gk20a_fence_put(gk20a_last_fence); | ||
2865 | if (err) | ||
2866 | gk20a_err(g->dev, | ||
2867 | "fence wait failed for CE execute ops"); | ||
2868 | } | ||
2869 | |||
2870 | return err; | ||
2871 | } | ||
2872 | #endif | ||
2873 | |||
2746 | int gk20a_gmmu_alloc_vid(struct gk20a *g, size_t size, struct mem_desc *mem) | 2874 | int gk20a_gmmu_alloc_vid(struct gk20a *g, size_t size, struct mem_desc *mem) |
2747 | { | 2875 | { |
2748 | return gk20a_gmmu_alloc_attr_vid(g, 0, size, mem); | 2876 | return gk20a_gmmu_alloc_attr_vid(g, 0, size, mem); |
@@ -2803,56 +2931,10 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr, | |||
2803 | mem->size = size; | 2931 | mem->size = size; |
2804 | mem->aperture = APERTURE_VIDMEM; | 2932 | mem->aperture = APERTURE_VIDMEM; |
2805 | 2933 | ||
2806 | if (g->mm.vidmem.ce_ctx_id != ~0) { | ||
2807 | struct gk20a_fence *gk20a_fence_out = NULL; | ||
2808 | struct gk20a_fence *gk20a_last_fence = NULL; | ||
2809 | struct gk20a_page_alloc *alloc = NULL; | ||
2810 | struct page_alloc_chunk *chunk = NULL; | ||
2811 | |||
2812 | alloc = (struct gk20a_page_alloc *) | ||
2813 | g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0); | ||
2814 | |||
2815 | list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) { | ||
2816 | if (gk20a_last_fence) | ||
2817 | gk20a_fence_put(gk20a_last_fence); | ||
2818 | |||
2819 | err = gk20a_ce_execute_ops(g->dev, | ||
2820 | g->mm.vidmem.ce_ctx_id, | ||
2821 | 0, | ||
2822 | chunk->base, | ||
2823 | chunk->length, | ||
2824 | 0x00000000, | ||
2825 | NVGPU_CE_DST_LOCATION_LOCAL_FB, | ||
2826 | NVGPU_CE_MEMSET, | ||
2827 | NULL, | ||
2828 | 0, | ||
2829 | &gk20a_fence_out); | ||
2830 | |||
2831 | if (err) { | ||
2832 | gk20a_err(g->dev, | ||
2833 | "Failed gk20a_ce_execute_ops[%d]", err); | ||
2834 | goto fail_free_table; | ||
2835 | } | ||
2836 | |||
2837 | gk20a_last_fence = gk20a_fence_out; | ||
2838 | } | ||
2839 | |||
2840 | if (gk20a_last_fence) { | ||
2841 | err = gk20a_fence_wait(gk20a_last_fence, | ||
2842 | gk20a_get_gr_idle_timeout(g)); | ||
2843 | gk20a_fence_put(gk20a_last_fence); | ||
2844 | if (err) | ||
2845 | gk20a_err(g->dev, | ||
2846 | "Failed to get the fence_out from CE execute ops"); | ||
2847 | } | ||
2848 | } | ||
2849 | |||
2850 | gk20a_dbg_fn("done at 0x%llx size %zu", addr, size); | 2934 | gk20a_dbg_fn("done at 0x%llx size %zu", addr, size); |
2851 | 2935 | ||
2852 | return 0; | 2936 | return 0; |
2853 | 2937 | ||
2854 | fail_free_table: | ||
2855 | sg_free_table(mem->sgt); | ||
2856 | fail_kfree: | 2938 | fail_kfree: |
2857 | kfree(mem->sgt); | 2939 | kfree(mem->sgt); |
2858 | fail_physfree: | 2940 | fail_physfree: |
@@ -2867,6 +2949,7 @@ static void gk20a_gmmu_free_attr_vid(struct gk20a *g, enum dma_attr attr, | |||
2867 | struct mem_desc *mem) | 2949 | struct mem_desc *mem) |
2868 | { | 2950 | { |
2869 | #if defined(CONFIG_GK20A_VIDMEM) | 2951 | #if defined(CONFIG_GK20A_VIDMEM) |
2952 | gk20a_gmmu_clear_vidmem_mem(g, mem); | ||
2870 | gk20a_free(&g->mm.vidmem.allocator, sg_dma_address(mem->sgt->sgl)); | 2953 | gk20a_free(&g->mm.vidmem.allocator, sg_dma_address(mem->sgt->sgl)); |
2871 | gk20a_free_sgtable(&mem->sgt); | 2954 | gk20a_free_sgtable(&mem->sgt); |
2872 | mem->size = 0; | 2955 | mem->size = 0; |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index e4d7d741..c6360955 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -405,8 +405,13 @@ struct mm_gk20a { | |||
405 | 405 | ||
406 | struct { | 406 | struct { |
407 | size_t size; | 407 | size_t size; |
408 | u64 base; | ||
409 | size_t bootstrap_size; | ||
410 | u64 bootstrap_base; | ||
411 | |||
408 | struct gk20a_allocator allocator; | 412 | struct gk20a_allocator allocator; |
409 | struct gk20a_allocator bootstrap_allocator; | 413 | struct gk20a_allocator bootstrap_allocator; |
414 | |||
410 | u32 ce_ctx_id; | 415 | u32 ce_ctx_id; |
411 | bool cleared; | 416 | bool cleared; |
412 | } vidmem; | 417 | } vidmem; |