author     Christian König <christian.koenig@amd.com>    2016-08-05 07:56:35 -0400
committer  Alex Deucher <alexander.deucher@amd.com>      2016-08-10 14:05:58 -0400
commit     92696dd52e58b5caaee1dc027cf14f327f91d2e1 (patch)
tree       08656072d379f5ab1fefbe8e04601e626438dd4e /drivers/gpu/drm/amd
parent     27c5f36fe138e29d63eea7d1445bda1ca64921d9 (diff)
drm/amdgpu: flip frag_ptes and update_ptes
We can add the fragment params before we split the update for the page tables.
That should save a few CPU cycles for larger updates.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   166
1 file changed, 79 insertions(+), 87 deletions(-)
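
In effect, the diff below inverts the nesting of the two helpers: fragment handling now runs once over the whole mapped range, and the per-page-table split happens underneath it, so the fragment parameters are computed before the update is split up. A rough sketch of the call flow before and after this patch (arguments elided, C-comment form, only the routines touched by the diff):

/*
 * Before: amdgpu_vm_bo_update_mapping()
 *           -> amdgpu_vm_update_ptes()       walk the page tables
 *                -> amdgpu_vm_frag_ptes()    fragment handling per PTE block
 *                     -> amdgpu_vm_update_pages()
 *
 * After:  amdgpu_vm_bo_update_mapping()
 *           -> amdgpu_vm_frag_ptes()         fragment handling for the whole range
 *                -> amdgpu_vm_update_ptes()  walk the page tables
 *                     -> amdgpu_vm_update_pages()
 */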
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fd7901c1320f..d2796bb4004b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -702,83 +702,6 @@ error_free:
 }
 
 /**
- * amdgpu_vm_frag_ptes - add fragment information to PTEs
- *
- * @params: see amdgpu_pte_update_params definition
- * @pe_start: first PTE to handle
- * @pe_end: last PTE to handle
- * @addr: addr those PTEs should point to
- * @flags: hw mapping flags
- */
-static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
-                                uint64_t pe_start, uint64_t pe_end,
-                                uint64_t addr, uint32_t flags)
-{
-        /**
-         * The MC L1 TLB supports variable sized pages, based on a fragment
-         * field in the PTE. When this field is set to a non-zero value, page
-         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
-         * flags are considered valid for all PTEs within the fragment range
-         * and corresponding mappings are assumed to be physically contiguous.
-         *
-         * The L1 TLB can store a single PTE for the whole fragment,
-         * significantly increasing the space available for translation
-         * caching. This leads to large improvements in throughput when the
-         * TLB is under pressure.
-         *
-         * The L2 TLB distributes small and large fragments into two
-         * asymmetric partitions. The large fragment cache is significantly
-         * larger. Thus, we try to use large fragments wherever possible.
-         * Userspace can support this by aligning virtual base address and
-         * allocation size to the fragment size.
-         */
-
-        /* SI and newer are optimized for 64KB */
-        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
-        uint64_t frag_align = 0x80;
-
-        uint64_t frag_start = ALIGN(pe_start, frag_align);
-        uint64_t frag_end = pe_end & ~(frag_align - 1);
-
-        unsigned count;
-
-        /* Abort early if there isn't anything to do */
-        if (pe_start == pe_end)
-                return;
-
-        /* system pages are non continuously */
-        if (params->src || params->pages_addr ||
-            !(flags & AMDGPU_PTE_VALID) || (frag_start >= frag_end)) {
-
-                count = (pe_end - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                return;
-        }
-
-        /* handle the 4K area at the beginning */
-        if (pe_start != frag_start) {
-                count = (frag_start - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-        }
-
-        /* handle the area in the middle */
-        count = (frag_end - frag_start) / 8;
-        amdgpu_vm_update_pages(params, frag_start, addr, count,
-                               AMDGPU_GPU_PAGE_SIZE, flags | frag_flags);
-
-        /* handle the 4K area at the end */
-        if (frag_end != pe_end) {
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-                count = (pe_end - frag_end) / 8;
-                amdgpu_vm_update_pages(params, frag_end, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-        }
-}
-
-/**
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @params: see amdgpu_pte_update_params definition
@@ -797,7 +720,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 {
         const uint64_t mask = AMDGPU_VM_PTE_COUNT - 1;
 
-        uint64_t cur_pe_start, cur_pe_end, cur_dst;
+        uint64_t cur_pe_start, cur_nptes, cur_dst;
         uint64_t addr; /* next GPU address to be updated */
         uint64_t pt_idx;
         struct amdgpu_bo *pt;
@@ -816,7 +739,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 
         cur_pe_start = amdgpu_bo_gpu_offset(pt);
         cur_pe_start += (addr & mask) * 8;
-        cur_pe_end = cur_pe_start + 8 * nptes;
+        cur_nptes = nptes;
         cur_dst = dst;
 
         /* for next ptb*/
@@ -836,18 +759,19 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 next_pe_start = amdgpu_bo_gpu_offset(pt);
                 next_pe_start += (addr & mask) * 8;
 
-                if (cur_pe_end == next_pe_start) {
+                if ((cur_pe_start + 8 * cur_nptes) == next_pe_start) {
                         /* The next ptb is consecutive to current ptb.
-                         * Don't call amdgpu_vm_frag_ptes now.
+                         * Don't call amdgpu_vm_update_pages now.
                          * Will update two ptbs together in future.
                          */
-                        cur_pe_end += 8 * nptes;
+                        cur_nptes += nptes;
                 } else {
-                        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end,
-                                            cur_dst, flags);
+                        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst,
+                                               cur_nptes, AMDGPU_GPU_PAGE_SIZE,
+                                               flags);
 
                         cur_pe_start = next_pe_start;
-                        cur_pe_end = next_pe_start + 8 * nptes;
+                        cur_nptes = nptes;
                         cur_dst = dst;
                 }
 
@@ -856,7 +780,75 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 dst += nptes * AMDGPU_GPU_PAGE_SIZE;
         }
 
-        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end, cur_dst, flags);
+        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst, cur_nptes,
+                               AMDGPU_GPU_PAGE_SIZE, flags);
+}
+
+/*
+ * amdgpu_vm_frag_ptes - add fragment information to PTEs
+ *
+ * @params: see amdgpu_pte_update_params definition
+ * @vm: requested vm
+ * @start: first PTE to handle
+ * @end: last PTE to handle
+ * @dst: addr those PTEs should point to
+ * @flags: hw mapping flags
+ */
+static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
+                                struct amdgpu_vm *vm,
+                                uint64_t start, uint64_t end,
+                                uint64_t dst, uint32_t flags)
+{
+        /**
+         * The MC L1 TLB supports variable sized pages, based on a fragment
+         * field in the PTE. When this field is set to a non-zero value, page
+         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+         * flags are considered valid for all PTEs within the fragment range
+         * and corresponding mappings are assumed to be physically contiguous.
+         *
+         * The L1 TLB can store a single PTE for the whole fragment,
+         * significantly increasing the space available for translation
+         * caching. This leads to large improvements in throughput when the
+         * TLB is under pressure.
+         *
+         * The L2 TLB distributes small and large fragments into two
+         * asymmetric partitions. The large fragment cache is significantly
+         * larger. Thus, we try to use large fragments wherever possible.
+         * Userspace can support this by aligning virtual base address and
+         * allocation size to the fragment size.
+         */
+
+        /* SI and newer are optimized for 64KB */
+        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
+        uint64_t frag_align = 1 << AMDGPU_LOG2_PAGES_PER_FRAG;
+
+        uint64_t frag_start = ALIGN(start, frag_align);
+        uint64_t frag_end = end & ~(frag_align - 1);
+
+        /* system pages are non continuously */
+        if (params->src || params->pages_addr || !(flags & AMDGPU_PTE_VALID) ||
+            (frag_start >= frag_end)) {
+
+                amdgpu_vm_update_ptes(params, vm, start, end, dst, flags);
+                return;
+        }
+
+        /* handle the 4K area at the beginning */
+        if (start != frag_start) {
+                amdgpu_vm_update_ptes(params, vm, start, frag_start,
+                                      dst, flags);
+                dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
+        }
+
+        /* handle the area in the middle */
+        amdgpu_vm_update_ptes(params, vm, frag_start, frag_end, dst,
+                              flags | frag_flags);
+
+        /* handle the 4K area at the end */
+        if (frag_end != end) {
+                dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
+                amdgpu_vm_update_ptes(params, vm, frag_end, end, dst, flags);
+        }
 }
 
 /**
@@ -953,7 +945,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
         if (r)
                 goto error_free;
 
-        amdgpu_vm_update_ptes(&params, vm, start, last + 1, addr, flags);
+        amdgpu_vm_frag_ptes(&params, vm, start, last + 1, addr, flags);
 
         amdgpu_ring_pad_ib(ring, params.ib);
         WARN_ON(params.ib->length_dw > ndw);
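
As a footnote to the fragment handling above: a stand-alone sketch of the frag_start/frag_end arithmetic, assuming AMDGPU_LOG2_PAGES_PER_FRAG is 4 (16 pages, i.e. the 64KB optimum named in the code comment; the constant's actual value is not part of this patch). Illustration only, not driver code:

#include <stdint.h>
#include <stdio.h>

/* assumed: AMDGPU_LOG2_PAGES_PER_FRAG == 4, so one fragment spans 16 4KB pages */
#define LOG2_PAGES_PER_FRAG 4
#define FRAG_ALIGN (1ULL << LOG2_PAGES_PER_FRAG)

int main(void)
{
        /* PTE indices (4KB pages) covered by an example mapping */
        uint64_t start = 3, end = 40;

        /* same masking as ALIGN(start, frag_align) and end & ~(frag_align - 1) */
        uint64_t frag_start = (start + FRAG_ALIGN - 1) & ~(FRAG_ALIGN - 1);
        uint64_t frag_end = end & ~(FRAG_ALIGN - 1);

        printf("4KB head   : PTEs %llu..%llu\n",
               (unsigned long long)start, (unsigned long long)frag_start);
        printf("64KB middle: PTEs %llu..%llu, L1 TLB granularity 1 << (12 + %d)\n",
               (unsigned long long)frag_start, (unsigned long long)frag_end,
               LOG2_PAGES_PER_FRAG);
        printf("4KB tail   : PTEs %llu..%llu\n",
               (unsigned long long)frag_end, (unsigned long long)end);
        return 0;
}

With these example numbers the 64KB-aligned middle (PTEs 16..32) is written with AMDGPU_PTE_FRAG(4) set, while the unaligned head (3..16) and tail (32..40) fall back to plain 4KB PTEs, mirroring the three amdgpu_vm_update_ptes calls in the new amdgpu_vm_frag_ptes.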