author    Sami Kiminki <skiminki@nvidia.com>    2015-04-20 11:12:22 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>    2015-06-30 11:35:23 -0400
commit    e7ba93fefbc4df9663302d240f9fbd5967a75a3c (patch)
tree      e38de3af69153d860d9cb666fb30be262321b198 /drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent    ae7b988b0d8767cfbc2cffe4c7ec8757e4dd94a6 (diff)
gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation
Add batch support for mapping and unmapping. Batching essentially
turns some per-map/unmap overhead into per-batch overhead, namely the
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. Batching with size 64 has been measured to yield a >20x
speed-up in low-level fixed-address mapping microbenchmarks.
(A usage sketch follows the diffstat below.)
Bug 1614735
Bug 1623949
Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c  111
1 file changed, 87 insertions, 24 deletions
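For orientation, here is a rough sketch of how a caller is expected to drive the new batch API, inferred from the signatures introduced in the diff below. The helper name, the offsets array, and num_unmaps are illustrative only and are not part of this patch; the gk20a_busy()/gk20a_idle() batching mentioned in the commit message presumably happens in the ioctl layer, outside this file.

/* Illustrative only: unmap several buffers under one batch. */
static void example_unmap_many(struct vm_gk20a *vm, u64 *offsets, int num_unmaps)
{
        struct vm_gk20a_mapping_batch batch;
        int i;

        /* begin the batch; L2 flushes and TLB invalidates are deferred */
        gk20a_vm_mapping_batch_start(&batch);

        for (i = 0; i < num_unmaps; i++)
                gk20a_vm_unmap_buffer(vm, offsets[i], &batch);

        /* one TLB invalidate (and at most one L2 flush) covers the whole batch */
        gk20a_vm_mapping_batch_finish(vm, &batch);
}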
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 197e2b81..f3512f90 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -787,7 +787,34 @@ static void gk20a_vm_unmap_locked_kref(struct kref *ref)
 {
        struct mapped_buffer_node *mapped_buffer =
                container_of(ref, struct mapped_buffer_node, ref);
-       gk20a_vm_unmap_locked(mapped_buffer);
+       gk20a_vm_unmap_locked(mapped_buffer, mapped_buffer->vm->kref_put_batch);
+}
+
+void gk20a_vm_mapping_batch_start(struct vm_gk20a_mapping_batch *mapping_batch)
+{
+       memset(mapping_batch, 0, sizeof(*mapping_batch));
+       mapping_batch->gpu_l2_flushed = false;
+       mapping_batch->need_tlb_invalidate = false;
+}
+
+void gk20a_vm_mapping_batch_finish_locked(
+       struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *mapping_batch)
+{
+       /* hanging kref_put batch pointer? */
+       WARN_ON(vm->kref_put_batch == mapping_batch);
+
+       if (mapping_batch->need_tlb_invalidate) {
+               struct gk20a *g = gk20a_from_vm(vm);
+               g->ops.mm.tlb_invalidate(vm);
+       }
+}
+
+void gk20a_vm_mapping_batch_finish(struct vm_gk20a *vm,
+                                  struct vm_gk20a_mapping_batch *mapping_batch)
+{
+       mutex_lock(&vm->update_gmmu_lock);
+       gk20a_vm_mapping_batch_finish_locked(vm, mapping_batch);
+       mutex_unlock(&vm->update_gmmu_lock);
 }
 
 void gk20a_vm_put_buffers(struct vm_gk20a *vm,
@@ -795,19 +822,25 @@ void gk20a_vm_put_buffers(struct vm_gk20a *vm,
                          int num_buffers)
 {
        int i;
+       struct vm_gk20a_mapping_batch batch;
 
        mutex_lock(&vm->update_gmmu_lock);
+       gk20a_vm_mapping_batch_start(&batch);
+       vm->kref_put_batch = &batch;
 
        for (i = 0; i < num_buffers; ++i)
                kref_put(&mapped_buffers[i]->ref,
                         gk20a_vm_unmap_locked_kref);
 
+       vm->kref_put_batch = NULL;
+       gk20a_vm_mapping_batch_finish_locked(vm, &batch);
        mutex_unlock(&vm->update_gmmu_lock);
 
        nvgpu_free(mapped_buffers);
 }
 
-static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
+static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset,
+                               struct vm_gk20a_mapping_batch *batch)
 {
        struct device *d = dev_from_vm(vm);
        int retries = 10000; /* 50 ms */
@@ -840,7 +873,10 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
        mapped_buffer->user_mapped--;
        if (mapped_buffer->user_mapped == 0)
                vm->num_user_mapped_buffers--;
+
+       vm->kref_put_batch = batch;
        kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
+       vm->kref_put_batch = NULL;
 
        mutex_unlock(&vm->update_gmmu_lock);
 }
@@ -1131,7 +1167,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
                        u32 flags,
                        int rw_flag,
                        bool clear_ctags,
-                       bool sparse)
+                       bool sparse,
+                       struct vm_gk20a_mapping_batch *batch)
 {
        int err = 0;
        bool allocated = false;
@@ -1177,7 +1214,10 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
                goto fail_validate;
        }
 
-       g->ops.mm.tlb_invalidate(vm);
+       if (!batch)
+               g->ops.mm.tlb_invalidate(vm);
+       else
+               batch->need_tlb_invalidate = true;
 
        return map_offset;
 fail_validate:
@@ -1194,7 +1234,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
                        int pgsz_idx,
                        bool va_allocated,
                        int rw_flag,
-                       bool sparse)
+                       bool sparse,
+                       struct vm_gk20a_mapping_batch *batch)
 {
        int err = 0;
        struct gk20a *g = gk20a_from_vm(vm);
@@ -1230,9 +1271,16 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
         * for gmmu ptes. note the positioning of this relative to any smmu
         * unmapping (below). */
 
-       gk20a_mm_l2_flush(g, true);
-
-       g->ops.mm.tlb_invalidate(vm);
+       if (!batch) {
+               gk20a_mm_l2_flush(g, true);
+               g->ops.mm.tlb_invalidate(vm);
+       } else {
+               if (!batch->gpu_l2_flushed) {
+                       gk20a_mm_l2_flush(g, true);
+                       batch->gpu_l2_flushed = true;
+               }
+               batch->need_tlb_invalidate = true;
+       }
 }
 
 static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
@@ -1308,7 +1356,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
                        bool user_mapped,
                        int rw_flag,
                        u64 buffer_offset,
-                       u64 mapping_size)
+                       u64 mapping_size,
+                       struct vm_gk20a_mapping_batch *batch)
 {
        struct gk20a *g = gk20a_from_vm(vm);
        struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
@@ -1509,7 +1558,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
                                        bfr.ctag_offset,
                                        flags, rw_flag,
                                        clear_ctags,
-                                       false);
+                                       false,
+                                       batch);
        if (!map_offset)
                goto clean_up;
 
@@ -1727,8 +1777,9 @@ int gk20a_vm_map_compbits(struct vm_gk20a *vm,
                        0, /* ctag_offset */
                        NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
                        gk20a_mem_flag_read_only,
-                       false,
-                       false);
+                       false, /* clear_ctags */
+                       false, /* sparse */
+                       NULL); /* mapping_batch handle */
 
        if (!mapped_buffer->ctag_map_win_addr) {
                mutex_unlock(&vm->update_gmmu_lock);
@@ -1764,7 +1815,10 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
                                0, /* page size index = 0 i.e. SZ_4K */
                                0, /* kind */
                                0, /* ctag_offset */
-                               flags, rw_flag, false, false);
+                               flags, rw_flag,
+                               false, /* clear_ctags */
+                               false, /* sparse */
+                               NULL); /* mapping_batch handle */
        mutex_unlock(&vm->update_gmmu_lock);
        if (!vaddr) {
                gk20a_err(dev_from_vm(vm), "failed to allocate va space");
@@ -1930,7 +1984,8 @@ void gk20a_gmmu_unmap(struct vm_gk20a *vm,
                        0, /* page size 4K */
                        true, /*va_allocated */
                        rw_flag,
-                       false);
+                       false,
+                       NULL);
        mutex_unlock(&vm->update_gmmu_lock);
 }
 
@@ -2378,7 +2433,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 }
 
 /* NOTE! mapped_buffers lock must be held */
-void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
+void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer,
+                          struct vm_gk20a_mapping_batch *batch)
 {
        struct vm_gk20a *vm = mapped_buffer->vm;
        struct gk20a *g = vm->mm->g;
@@ -2392,7 +2448,8 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
                        0, /* page size 4k */
                        true, /* va allocated */
                        gk20a_mem_flag_none,
-                       false); /* not sparse */
+                       false, /* not sparse */
+                       batch); /* batch handle */
        }
 
        g->ops.mm.gmmu_unmap(vm,
@@ -2402,7 +2459,8 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
                        mapped_buffer->va_allocated,
                        gk20a_mem_flag_none,
                        mapped_buffer->va_node ?
-                         mapped_buffer->va_node->sparse : false);
+                         mapped_buffer->va_node->sparse : false,
+                       batch);
 
        gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
                  vm_aspace_id(vm),
@@ -2479,7 +2537,7 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
        while (node) {
                mapped_buffer =
                        container_of(node, struct mapped_buffer_node, node);
-               gk20a_vm_unmap_locked(mapped_buffer);
+               gk20a_vm_unmap_locked(mapped_buffer, NULL);
                node = rb_first(&vm->mapped_buffers);
        }
 
@@ -2776,7 +2834,8 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
                                         args->flags,
                                         gk20a_mem_flag_none,
                                         false,
-                                        true);
+                                        true,
+                                        NULL);
                if (!map_offset) {
                        mutex_unlock(&vm->update_gmmu_lock);
                        gk20a_bfree(vma, vaddr_start);
@@ -2841,7 +2900,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
                        va_node->pgsz_idx,
                        true,
                        gk20a_mem_flag_none,
-                       true);
+                       true,
+                       NULL);
                kfree(va_node);
        }
        mutex_unlock(&vm->update_gmmu_lock);
@@ -2960,7 +3020,8 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
                        u32 flags, /*NVGPU_AS_MAP_BUFFER_FLAGS_*/
                        int kind,
                        u64 buffer_offset,
-                       u64 mapping_size)
+                       u64 mapping_size,
+                       struct vm_gk20a_mapping_batch *batch)
 {
        int err = 0;
        struct dma_buf *dmabuf;
@@ -2986,7 +3047,8 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
                        flags, kind, NULL, true,
                        gk20a_mem_flag_none,
                        buffer_offset,
-                       mapping_size);
+                       mapping_size,
+                       batch);
 
        *offset_align = ret_va;
        if (!ret_va) {
@@ -2997,11 +3059,12 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
        return err;
 }
 
-int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset)
+int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset,
+                         struct vm_gk20a_mapping_batch *batch)
 {
        gk20a_dbg_fn("");
 
-       gk20a_vm_unmap_user(vm, offset);
+       gk20a_vm_unmap_user(vm, offset, batch);
        return 0;
 }
 
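The batch descriptor itself is declared in mm_gk20a.h, which is outside this diffstat filter. Judging from the fields touched above, it is essentially a pair of flags, roughly as sketched here (the real definition may carry additional members), together with a struct vm_gk20a_mapping_batch *kref_put_batch pointer added to struct vm_gk20a:

/* Sketch inferred from this patch; see mm_gk20a.h for the real definition. */
struct vm_gk20a_mapping_batch {
        bool gpu_l2_flushed;        /* set once the GPU L2 has been flushed for this batch */
        bool need_tlb_invalidate;   /* set when a deferred TLB invalidate is pending */
};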