From e7ba93fefbc4df9663302d240f9fbd5967a75a3c Mon Sep 17 00:00:00 2001
From: Sami Kiminki
Date: Mon, 20 Apr 2015 18:12:22 +0300
Subject: gpu: nvgpu: Initial MAP_BUFFER_BATCH implementation

Add batch support for mapping and unmapping. Batching essentially
helps transform some per-map/unmap overhead to per-batch overhead,
namely gk20a_busy()/gk20a_idle() calls, GPU L2 flushes, and GPU TLB
invalidates. Batching with size 64 has been measured to yield >20x
speed-up in low-level fixed-address mapping microbenchmarks.

Bug 1614735
Bug 1623949

Change-Id: Ie22b9caea5a7c3fc68a968d1b7f8488dfce72085
Signed-off-by: Sami Kiminki
Reviewed-on: http://git-master/r/733231
(cherry picked from commit de4a7cfb93e8228a4a0c6a2815755a8df4531c91)
Reviewed-on: http://git-master/r/763812
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 42 ++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 1e97e859..ee99c821 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -225,6 +225,13 @@ struct gk20a_mmu_level {
 	size_t entry_size;
 };
 
+/* map/unmap batch state */
+struct vm_gk20a_mapping_batch
+{
+	bool gpu_l2_flushed;
+	bool need_tlb_invalidate;
+};
+
 struct vm_gk20a {
 	struct mm_gk20a *mm;
 	struct gk20a_as_share *as_share; /* as_share this represents */
@@ -257,6 +264,10 @@ struct vm_gk20a {
 	u64 handle;
 #endif
 	u32 gmmu_page_sizes[gmmu_nr_page_sizes];
+
+	/* if non-NULL, kref_put will use this batch when
+	   unmapping. Must hold vm->update_gmmu_lock. */
+	struct vm_gk20a_mapping_batch *kref_put_batch;
 };
 
 struct gk20a;
@@ -486,7 +497,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
 			u32 flags,
 			int rw_flag,
 			bool clear_ctags,
-			bool sparse);
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch);
 
 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
 		u64 vaddr,
@@ -499,7 +511,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
 			int pgsz_idx,
 			bool va_allocated,
 			int rw_flag,
-			bool sparse);
+			bool sparse,
+			struct vm_gk20a_mapping_batch *batch);
 
 struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
 void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
@@ -514,7 +527,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 		bool user_mapped,
 		int rw_flag,
 		u64 buffer_offset,
-		u64 mapping_size);
+		u64 mapping_size,
+		struct vm_gk20a_mapping_batch *mapping_batch);
 
 int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
 			       u64 mapping_gva,
@@ -532,7 +546,8 @@ int gk20a_vm_map_compbits(struct vm_gk20a *vm,
 
 /* unmap handle from kernel */
 void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset);
-void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
+void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer,
+			   struct vm_gk20a_mapping_batch *batch);
 
 /* get reference to all currently mapped buffers */
 int gk20a_vm_get_buffers(struct vm_gk20a *vm,
@@ -576,13 +591,25 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
 			struct nvgpu_as_free_space_args *args);
 int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
 			  struct channel_gk20a *ch);
+
+/* batching eliminates redundant cache flushes and invalidates */
+void gk20a_vm_mapping_batch_start(struct vm_gk20a_mapping_batch *batch);
+void gk20a_vm_mapping_batch_finish(
+	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch);
+/* called when holding vm->update_gmmu_lock */
+void gk20a_vm_mapping_batch_finish_locked(
+	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch);
+
+
+/* Note: batch may be NULL if map op is not part of a batch */
 int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 			int dmabuf_fd,
 			u64 *offset_align,
 			u32 flags, /* NVGPU_AS_MAP_BUFFER_FLAGS_ */
 			int kind,
 			u64 buffer_offset,
-			u64 mapping_size);
+			u64 mapping_size,
+			struct vm_gk20a_mapping_batch *batch);
 
 int gk20a_init_vm(struct mm_gk20a *mm,
 		struct vm_gk20a *vm,
@@ -592,7 +619,10 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 		bool big_pages,
 		char *name);
 void gk20a_deinit_vm(struct vm_gk20a *vm);
-int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset);
+
+/* Note: batch may be NULL if unmap op is not part of a batch */
+int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset,
+			  struct vm_gk20a_mapping_batch *batch);
 void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
 			struct gk20a_comptags *comptags);
 dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr);
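
For orientation only, not part of the patch: the sketch below shows how a
kernel-side caller might use the batch API declared above, so that the GPU
L2 flush and TLB invalidate are deferred to gk20a_vm_mapping_batch_finish()
instead of being issued per map. Only the function signatures come from this
header; the helper name map_fds_batched() and the flags/kind/offset/size
values passed to gk20a_vm_map_buffer() are illustrative assumptions.

/* Hypothetical usage sketch -- illustration only, not part of this change.
 * Maps several dma-buf fds into one VM with a single deferred TLB
 * invalidate / L2 flush via the batch API added above.
 */
#include "mm_gk20a.h"

static int map_fds_batched(struct vm_gk20a *vm,
			   const int *dmabuf_fds, u64 *gpu_vas, int n)
{
	struct vm_gk20a_mapping_batch batch;
	int i, err = 0;

	/* reset per-batch state (gpu_l2_flushed, need_tlb_invalidate) */
	gk20a_vm_mapping_batch_start(&batch);

	for (i = 0; i < n; i++) {
		gpu_vas[i] = 0; /* assumed: 0 lets the allocator pick the GPU VA */
		err = gk20a_vm_map_buffer(vm, dmabuf_fds[i], &gpu_vas[i],
					  0 /* flags */, 0 /* kind */,
					  0 /* buffer_offset */,
					  0 /* mapping_size: assumed whole buffer */,
					  &batch);
		if (err)
			break;
	}

	/* one flush/invalidate for the whole batch instead of one per map */
	gk20a_vm_mapping_batch_finish(vm, &batch);

	return err;
}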