author    Sami Kiminki <skiminki@nvidia.com>      2015-04-20 11:12:22 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com> 2015-06-30 11:35:23 -0400
commit    e7ba93fefbc4df9663302d240f9fbd5967a75a3c (patch)
tree      e38de3af69153d860d9cb666fb30be262321b198 /drivers/gpu
parent    ae7b988b0d8767cfbc2cffe4c7ec8757e4dd94a6 (diff)
gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation
Add batch support for mapping and unmapping. Batching essentially helps
transform some per-map/unmap overhead to per-batch overhead, namely
gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB invalidates.
Batching with size 64 has been measured to yield >20x speed-up in
low-level fixed-address mapping microbenchmarks.

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
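For illustration only (not part of this commit): a minimal userspace sketch of driving the new NVGPU_AS_IOCTL_MAP_BUFFER_BATCH ioctl. The struct and field names (maps, unmaps, num_maps, num_unmaps) follow the kernel handler added in as_gk20a.c below; the exact uapi header and struct layout are assumed, and the helper map_batch() is hypothetical.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>   /* assumed uapi header providing the nvgpu_as_* structs */

/* Hypothetical helper: submit 'num_maps' map requests in one batch ioctl.
 * Per the commit message, the per-call overhead (gk20a_busy()/gk20a_idle(),
 * GPU L2 flush, GPU TLB invalidate) is then paid once per batch rather than
 * once per map/unmap. */
static int map_batch(int as_fd, struct nvgpu_as_map_buffer_ex_args *maps,
		     uint32_t num_maps)
{
	struct nvgpu_as_map_buffer_batch_args batch;

	memset(&batch, 0, sizeof(batch));
	batch.maps = (uintptr_t)maps;   /* user pointer carried as an integer, as in the handler */
	batch.num_maps = num_maps;      /* must not exceed map_buffer_batch_limit (256) */
	batch.unmaps = 0;               /* no unmaps in this batch */
	batch.num_unmaps = 0;

	return ioctl(as_fd, NVGPU_AS_IOCTL_MAP_BUFFER_BATCH, &batch);
}

On failure, the handler truncates num_maps/num_unmaps to the number of operations actually completed, so a caller can resume or roll back from that point.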
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c       |  91
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.c      |   3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c  |   7
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c          |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h          |   6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c       | 111
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h       |  42
-rw-r--r--  drivers/gpu/nvgpu/vgpu/mm_vgpu.c         |   8
8 files changed, 227 insertions(+), 43 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index eb18fa65..feb22ea8 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -151,8 +151,8 @@ static int gk20a_as_ioctl_map_buffer_ex(
 			&args->offset, args->flags,
 			args->kind,
 			args->buffer_offset,
-			args->mapping_size
-			);
+			args->mapping_size,
+			NULL);
 }
 
 static int gk20a_as_ioctl_map_buffer(
@@ -163,7 +163,7 @@ static int gk20a_as_ioctl_map_buffer(
 	return gk20a_vm_map_buffer(as_share->vm, args->dmabuf_fd,
 				   &args->o_a.offset,
 				   args->flags, NV_KIND_DEFAULT,
-				   0, 0);
+				   0, 0, NULL);
 	/* args->o_a.offset will be set if !err */
 }
 
@@ -172,7 +172,86 @@ static int gk20a_as_ioctl_unmap_buffer(
 		struct nvgpu_as_unmap_buffer_args *args)
 {
 	gk20a_dbg_fn("");
-	return gk20a_vm_unmap_buffer(as_share->vm, args->offset);
+	return gk20a_vm_unmap_buffer(as_share->vm, args->offset, NULL);
+}
+
+static int gk20a_as_ioctl_map_buffer_batch(
+	struct gk20a_as_share *as_share,
+	struct nvgpu_as_map_buffer_batch_args *args)
+{
+	struct gk20a *g = as_share->vm->mm->g;
+	u32 i;
+	int err = 0;
+
+	struct nvgpu_as_unmap_buffer_args __user *user_unmap_args =
+		(struct nvgpu_as_unmap_buffer_args __user *)(uintptr_t)
+		args->unmaps;
+	struct nvgpu_as_map_buffer_ex_args __user *user_map_args =
+		(struct nvgpu_as_map_buffer_ex_args __user *)(uintptr_t)
+		args->maps;
+
+	struct vm_gk20a_mapping_batch batch;
+
+	gk20a_dbg_fn("");
+
+	if (args->num_unmaps > g->gpu_characteristics.map_buffer_batch_limit ||
+	    args->num_maps > g->gpu_characteristics.map_buffer_batch_limit)
+		return -EINVAL;
+
+	gk20a_vm_mapping_batch_start(&batch);
+
+	for (i = 0; i < args->num_unmaps; ++i) {
+		struct nvgpu_as_unmap_buffer_args unmap_args;
+
+		if (copy_from_user(&unmap_args, &user_unmap_args[i],
+				   sizeof(unmap_args))) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = gk20a_vm_unmap_buffer(as_share->vm, unmap_args.offset,
+					    &batch);
+		if (err)
+			break;
+	}
+
+	if (err) {
+		gk20a_vm_mapping_batch_finish(as_share->vm, &batch);
+
+		args->num_unmaps = i;
+		args->num_maps = 0;
+		return err;
+	}
+
+	for (i = 0; i < args->num_maps; ++i) {
+		struct nvgpu_as_map_buffer_ex_args map_args;
+		memset(&map_args, 0, sizeof(map_args));
+
+		if (copy_from_user(&map_args, &user_map_args[i],
+				   sizeof(map_args))) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = gk20a_vm_map_buffer(
+			as_share->vm, map_args.dmabuf_fd,
+			&map_args.offset, map_args.flags,
+			map_args.kind,
+			map_args.buffer_offset,
+			map_args.mapping_size,
+			&batch);
+		if (err)
+			break;
+	}
+
+	gk20a_vm_mapping_batch_finish(as_share->vm, &batch);
+
+	if (err)
+		args->num_maps = i;
+	/* note: args->num_unmaps will be unmodified, which is ok
+	 * since all unmaps are done */
+
+	return err;
 }
 
 static int gk20a_as_ioctl_get_va_regions(
@@ -360,6 +439,10 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = gk20a_as_ioctl_map_buffer_compbits(as_share,
 			(struct nvgpu_as_map_buffer_compbits_args *)buf);
 		break;
+	case NVGPU_AS_IOCTL_MAP_BUFFER_BATCH:
+		err = gk20a_as_ioctl_map_buffer_batch(as_share,
+			(struct nvgpu_as_map_buffer_batch_args *)buf);
+		break;
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index b4fdfb44..7f212eca 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -952,7 +952,8 @@ __releases(&cde_app->mutex)
 				     NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 				     compbits_kind, NULL, true,
 				     gk20a_mem_flag_none,
-				     map_offset, map_size);
+				     map_offset, map_size,
+				     NULL);
 	if (!map_vaddr) {
 		dma_buf_put(compbits_buf);
 		err = -EINVAL;
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
index 217f0056..1e247859 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -828,7 +828,8 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 			0,
 			0,
 			0,
-			args->mapping_size);
+			args->mapping_size,
+			NULL);
 	if (err)
 		return err;
 
@@ -839,7 +840,7 @@ static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
 	virt_addr_hi = u64_hi32(args->offset);
 	/* but check anyway */
 	if (args->offset + virt_size > SZ_4G) {
-		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 		return -EINVAL;
 	}
 
@@ -881,7 +882,7 @@ static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
 			perf_pmasys_mem_block_valid_false_f() |
 			perf_pmasys_mem_block_target_f(0));
 
-	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset, NULL);
 
 	return 0;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index d5208e0d..5a25eecf 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -2033,6 +2033,8 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
 	gpu->cbc_cache_line_size = g->gr.cacheline_size;
 	gpu->cbc_comptags_per_line = g->gr.comptags_per_cacheline;
 
+	gpu->map_buffer_batch_limit = 256;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 9a183e44..fc2ed643 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -349,14 +349,16 @@ struct gpu_ops {
 				u32 flags,
 				int rw_flag,
 				bool clear_ctags,
-				bool sparse);
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch);
 		void (*gmmu_unmap)(struct vm_gk20a *vm,
 				u64 vaddr,
 				u64 size,
 				int pgsz_idx,
 				bool va_allocated,
 				int rw_flag,
-				bool sparse);
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch);
 		void (*vm_remove)(struct vm_gk20a *vm);
 		int (*vm_alloc_share)(struct gk20a_as_share *as_share,
 				      u32 flags);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 197e2b81..f3512f90 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -787,7 +787,34 @@ static void gk20a_vm_unmap_locked_kref(struct kref *ref)
 {
 	struct mapped_buffer_node *mapped_buffer =
 		container_of(ref, struct mapped_buffer_node, ref);
-	gk20a_vm_unmap_locked(mapped_buffer);
+	gk20a_vm_unmap_locked(mapped_buffer, mapped_buffer->vm->kref_put_batch);
+}
+
+void gk20a_vm_mapping_batch_start(struct vm_gk20a_mapping_batch *mapping_batch)
+{
+	memset(mapping_batch, 0, sizeof(*mapping_batch));
+	mapping_batch->gpu_l2_flushed = false;
+	mapping_batch->need_tlb_invalidate = false;
+}
+
+void gk20a_vm_mapping_batch_finish_locked(
+	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *mapping_batch)
+{
+	/* hanging kref_put batch pointer? */
+	WARN_ON(vm->kref_put_batch == mapping_batch);
+
+	if (mapping_batch->need_tlb_invalidate) {
+		struct gk20a *g = gk20a_from_vm(vm);
+		g->ops.mm.tlb_invalidate(vm);
+	}
+}
+
+void gk20a_vm_mapping_batch_finish(struct vm_gk20a *vm,
+				   struct vm_gk20a_mapping_batch *mapping_batch)
+{
+	mutex_lock(&vm->update_gmmu_lock);
+	gk20a_vm_mapping_batch_finish_locked(vm, mapping_batch);
+	mutex_unlock(&vm->update_gmmu_lock);
 }
 
 void gk20a_vm_put_buffers(struct vm_gk20a *vm,
@@ -795,19 +822,25 @@ void gk20a_vm_put_buffers(struct vm_gk20a *vm,
 			  int num_buffers)
 {
 	int i;
+	struct vm_gk20a_mapping_batch batch;
 
 	mutex_lock(&vm->update_gmmu_lock);
+	gk20a_vm_mapping_batch_start(&batch);
+	vm->kref_put_batch = &batch;
 
 	for (i = 0; i < num_buffers; ++i)
 		kref_put(&mapped_buffers[i]->ref,
 			 gk20a_vm_unmap_locked_kref);
 
+	vm->kref_put_batch = NULL;
+	gk20a_vm_mapping_batch_finish_locked(vm, &batch);
 	mutex_unlock(&vm->update_gmmu_lock);
 
 	nvgpu_free(mapped_buffers);
 }
 
-static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
+static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset,
+				struct vm_gk20a_mapping_batch *batch)
 {
 	struct device *d = dev_from_vm(vm);
 	int retries = 10000; /* 50 ms */
@@ -840,7 +873,10 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
 	mapped_buffer->user_mapped--;
 	if (mapped_buffer->user_mapped == 0)
 		vm->num_user_mapped_buffers--;
+
+	vm->kref_put_batch = batch;
 	kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
+	vm->kref_put_batch = NULL;
 
 	mutex_unlock(&vm->update_gmmu_lock);
 }
@@ -1131,7 +1167,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 			u32 flags,
 			int rw_flag,
 			bool clear_ctags,
-			bool sparse)
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch)
 {
 	int err = 0;
 	bool allocated = false;
@@ -1177,7 +1214,10 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 		goto fail_validate;
 	}
 
-	g->ops.mm.tlb_invalidate(vm);
+	if (!batch)
+		g->ops.mm.tlb_invalidate(vm);
+	else
+		batch->need_tlb_invalidate = true;
 
 	return map_offset;
 fail_validate:
@@ -1194,7 +1234,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
 			int pgsz_idx,
 			bool va_allocated,
 			int rw_flag,
-			bool sparse)
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch)
 {
 	int err = 0;
 	struct gk20a *g = gk20a_from_vm(vm);
@@ -1230,9 +1271,16 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
 	 * for gmmu ptes. note the positioning of this relative to any smmu
 	 * unmapping (below). */
 
-	gk20a_mm_l2_flush(g, true);
-
-	g->ops.mm.tlb_invalidate(vm);
+	if (!batch) {
+		gk20a_mm_l2_flush(g, true);
+		g->ops.mm.tlb_invalidate(vm);
+	} else {
+		if (!batch->gpu_l2_flushed) {
+			gk20a_mm_l2_flush(g, true);
+			batch->gpu_l2_flushed = true;
+		}
+		batch->need_tlb_invalidate = true;
+	}
 }
 
 static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
@@ -1308,7 +1356,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 		bool user_mapped,
 		int rw_flag,
 		u64 buffer_offset,
-		u64 mapping_size)
+		u64 mapping_size,
+		struct vm_gk20a_mapping_batch *batch)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
@@ -1509,7 +1558,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 					bfr.ctag_offset,
 					flags, rw_flag,
 					clear_ctags,
-					false);
+					false,
+					batch);
 	if (!map_offset)
 		goto clean_up;
 
@@ -1727,8 +1777,9 @@ int gk20a_vm_map_compbits(struct vm_gk20a *vm,
 			0, /* ctag_offset */
 			NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 			gk20a_mem_flag_read_only,
-			false,
-			false);
+			false, /* clear_ctags */
+			false, /* sparse */
+			NULL); /* mapping_batch handle */
 
 	if (!mapped_buffer->ctag_map_win_addr) {
 		mutex_unlock(&vm->update_gmmu_lock);
@@ -1764,7 +1815,10 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
 				0, /* page size index = 0 i.e. SZ_4K */
 				0, /* kind */
 				0, /* ctag_offset */
-				flags, rw_flag, false, false);
+				flags, rw_flag,
+				false, /* clear_ctags */
+				false, /* sparse */
+				NULL); /* mapping_batch handle */
 	mutex_unlock(&vm->update_gmmu_lock);
 	if (!vaddr) {
 		gk20a_err(dev_from_vm(vm), "failed to allocate va space");
@@ -1930,7 +1984,8 @@ void gk20a_gmmu_unmap(struct vm_gk20a *vm,
 			0, /* page size 4K */
 			true, /*va_allocated */
 			rw_flag,
-			false);
+			false,
+			NULL);
 	mutex_unlock(&vm->update_gmmu_lock);
 }
 
@@ -2378,7 +2433,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 }
 
 /* NOTE! mapped_buffers lock must be held */
-void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
+void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer,
+			   struct vm_gk20a_mapping_batch *batch)
 {
 	struct vm_gk20a *vm = mapped_buffer->vm;
 	struct gk20a *g = vm->mm->g;
@@ -2392,7 +2448,8 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
 				0, /* page size 4k */
 				true, /* va allocated */
 				gk20a_mem_flag_none,
-				false); /* not sparse */
+				false, /* not sparse */
+				batch); /* batch handle */
 	}
 
 	g->ops.mm.gmmu_unmap(vm,
@@ -2402,7 +2459,8 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
 		mapped_buffer->va_allocated,
 		gk20a_mem_flag_none,
 		mapped_buffer->va_node ?
-		  mapped_buffer->va_node->sparse : false);
+		  mapped_buffer->va_node->sparse : false,
+		batch);
 
 	gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
 		  vm_aspace_id(vm),
@@ -2479,7 +2537,7 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
 	while (node) {
 		mapped_buffer =
 			container_of(node, struct mapped_buffer_node, node);
-		gk20a_vm_unmap_locked(mapped_buffer);
+		gk20a_vm_unmap_locked(mapped_buffer, NULL);
 		node = rb_first(&vm->mapped_buffers);
 	}
 
@@ -2776,7 +2834,8 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
 					args->flags,
 					gk20a_mem_flag_none,
 					false,
-					true);
+					true,
+					NULL);
 	if (!map_offset) {
 		mutex_unlock(&vm->update_gmmu_lock);
 		gk20a_bfree(vma, vaddr_start);
@@ -2841,7 +2900,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 				va_node->pgsz_idx,
 				true,
 				gk20a_mem_flag_none,
-				true);
+				true,
+				NULL);
 		kfree(va_node);
 	}
 	mutex_unlock(&vm->update_gmmu_lock);
@@ -2960,7 +3020,8 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 			u32 flags, /*NVGPU_AS_MAP_BUFFER_FLAGS_*/
 			int kind,
 			u64 buffer_offset,
-			u64 mapping_size)
+			u64 mapping_size,
+			struct vm_gk20a_mapping_batch *batch)
 {
 	int err = 0;
 	struct dma_buf *dmabuf;
@@ -2986,7 +3047,8 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 			flags, kind, NULL, true,
 			gk20a_mem_flag_none,
 			buffer_offset,
-			mapping_size);
+			mapping_size,
+			batch);
 
 	*offset_align = ret_va;
 	if (!ret_va) {
@@ -2997,11 +3059,12 @@ int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 	return err;
 }
 
-int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset)
+int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset,
+			  struct vm_gk20a_mapping_batch *batch)
 {
 	gk20a_dbg_fn("");
 
-	gk20a_vm_unmap_user(vm, offset);
+	gk20a_vm_unmap_user(vm, offset, batch);
 	return 0;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 1e97e859..ee99c821 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -225,6 +225,13 @@ struct gk20a_mmu_level {
 	size_t entry_size;
 };
 
+/* map/unmap batch state */
+struct vm_gk20a_mapping_batch
+{
+	bool gpu_l2_flushed;
+	bool need_tlb_invalidate;
+};
+
 struct vm_gk20a {
 	struct mm_gk20a *mm;
 	struct gk20a_as_share *as_share; /* as_share this represents */
@@ -257,6 +264,10 @@ struct vm_gk20a {
 	u64 handle;
 #endif
 	u32 gmmu_page_sizes[gmmu_nr_page_sizes];
+
+	/* if non-NULL, kref_put will use this batch when
+	   unmapping. Must hold vm->update_gmmu_lock. */
+	struct vm_gk20a_mapping_batch *kref_put_batch;
 };
 
 struct gk20a;
@@ -486,7 +497,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 			u32 flags,
 			int rw_flag,
 			bool clear_ctags,
-			bool sparse);
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch);
 
 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
 		u64 vaddr,
@@ -499,7 +511,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
 			int pgsz_idx,
 			bool va_allocated,
 			int rw_flag,
-			bool sparse);
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch);
 
 struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
 void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
@@ -514,7 +527,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 		bool user_mapped,
 		int rw_flag,
 		u64 buffer_offset,
-		u64 mapping_size);
+		u64 mapping_size,
+		struct vm_gk20a_mapping_batch *mapping_batch);
 
 int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
 			       u64 mapping_gva,
@@ -532,7 +546,8 @@ int gk20a_vm_map_compbits(struct vm_gk20a *vm,
 /* unmap handle from kernel */
 void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset);
 
-void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
+void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer,
+			   struct vm_gk20a_mapping_batch *batch);
 
 /* get reference to all currently mapped buffers */
 int gk20a_vm_get_buffers(struct vm_gk20a *vm,
@@ -576,13 +591,25 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 			struct nvgpu_as_free_space_args *args);
 int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
 			  struct channel_gk20a *ch);
+
+/* batching eliminates redundant cache flushes and invalidates */
+void gk20a_vm_mapping_batch_start(struct vm_gk20a_mapping_batch *batch);
+void gk20a_vm_mapping_batch_finish(
+	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch);
+/* called when holding vm->update_gmmu_lock */
+void gk20a_vm_mapping_batch_finish_locked(
+	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch);
+
+
+/* Note: batch may be NULL if map op is not part of a batch */
 int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 			int dmabuf_fd,
 			u64 *offset_align,
 			u32 flags, /* NVGPU_AS_MAP_BUFFER_FLAGS_ */
 			int kind,
 			u64 buffer_offset,
-			u64 mapping_size);
+			u64 mapping_size,
+			struct vm_gk20a_mapping_batch *batch);
 
 int gk20a_init_vm(struct mm_gk20a *mm,
 		  struct vm_gk20a *vm,
@@ -592,7 +619,10 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 		  bool big_pages,
 		  char *name);
 void gk20a_deinit_vm(struct vm_gk20a *vm);
-int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset);
+
+/* Note: batch may be NULL if unmap op is not part of a batch */
+int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset,
+			  struct vm_gk20a_mapping_batch *batch);
 void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
 			struct gk20a_comptags *comptags);
 dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr);
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 855aac0d..be1fa47d 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -66,7 +66,8 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
 				u32 flags,
 				int rw_flag,
 				bool clear_ctags,
-				bool sparse)
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch)
 {
 	int err = 0;
 	struct device *d = dev_from_vm(vm);
@@ -130,7 +131,8 @@ static void vgpu_locked_gmmu_unmap(struct vm_gk20a *vm,
 				int pgsz_idx,
 				bool va_allocated,
 				int rw_flag,
-				bool sparse)
+				bool sparse,
+				struct vm_gk20a_mapping_batch *batch)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
@@ -182,7 +184,7 @@ static void vgpu_vm_remove_support(struct vm_gk20a *vm)
 	while (node) {
 		mapped_buffer =
 			container_of(node, struct mapped_buffer_node, node);
-		gk20a_vm_unmap_locked(mapped_buffer);
+		gk20a_vm_unmap_locked(mapped_buffer, NULL);
 		node = rb_first(&vm->mapped_buffers);
 	}
 