about summary refs log tree commit diff stats
path: root/drivers/gpu/drm/radeon/radeon_vm.c
diff options
context:
space:
mode:
authorChristian König <christian.koenig@amd.com>2014-05-10 06:17:55 -0400
committerAlex Deucher <alexander.deucher@amd.com>2014-06-02 10:25:02 -0400
commitec3dbbcbd7a6ee165ca7eeafec8dbc733901ab2f (patch)
treef0a715cec425e8815870874892cce639b25eda1f /drivers/gpu/drm/radeon/radeon_vm.c
parent831719d62f692e28699a7acd7b441c6f0c01b6f7 (diff)
drm/radeon: add large PTE support for NI, SI and CIK v5
This patch implements support for VRAM page table entry compression. PTE construction is enhanced to identify physically contiguous page ranges and mark them in the PTE fragment field. L1/L2 TLB support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly improving TLB utilization for VRAM allocations. Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS on default settings at 1920x1200 resolution with vsync disabled. See main comment in radeon_vm.c for a technical description. v2 (chk): rebased and simplified. v3 (chk): add missing hw setup v4 (chk): rebased on current drm-fixes-3.15 v5 (chk): fix comments and commit text Signed-off-by: Jay Cornwall <jay@jcornwall.me> Signed-off-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/radeon/radeon_vm.c')
-rw-r--r--drivers/gpu/drm/radeon/radeon_vm.c91
1 files changed, 84 insertions, 7 deletions
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
index 2aae6ce49d32..f8d5b65932e5 100644
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev,
658} 658}
659 659
660/** 660/**
661 * radeon_vm_frag_ptes - add fragment information to PTEs
662 *
663 * @rdev: radeon_device pointer
664 * @ib: IB for the update
665 * @pe_start: first PTE to handle
666 * @pe_end: last PTE to handle
667 * @addr: addr those PTEs should point to
668 * @flags: hw mapping flags
669 *
670 * Global and local mutex must be locked!
671 */
672static void radeon_vm_frag_ptes(struct radeon_device *rdev,
673 struct radeon_ib *ib,
674 uint64_t pe_start, uint64_t pe_end,
675 uint64_t addr, uint32_t flags)
676{
677 /**
678 * The MC L1 TLB supports variable sized pages, based on a fragment
679 * field in the PTE. When this field is set to a non-zero value, page
680 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
681 * flags are considered valid for all PTEs within the fragment range
682 * and corresponding mappings are assumed to be physically contiguous.
683 *
684 * The L1 TLB can store a single PTE for the whole fragment,
685 * significantly increasing the space available for translation
686 * caching. This leads to large improvements in throughput when the
687 * TLB is under pressure.
688 *
689 * The L2 TLB distributes small and large fragments into two
690 * asymmetric partitions. The large fragment cache is significantly
691 * larger. Thus, we try to use large fragments wherever possible.
692 * Userspace can support this by aligning virtual base address and
693 * allocation size to the fragment size.
694 */
695
696 /* NI is optimized for 256KB fragments, SI and newer for 64KB */
697 uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
698 R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
699 uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
700
701 uint64_t frag_start = ALIGN(pe_start, frag_align);
702 uint64_t frag_end = pe_end & ~(frag_align - 1);
703
704 unsigned count;
705
706 /* system pages are non continuously */
707 if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
708 (frag_start >= frag_end)) {
709
710 count = (pe_end - pe_start) / 8;
711 radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
712 RADEON_GPU_PAGE_SIZE, flags);
713 return;
714 }
715
716 /* handle the 4K area at the beginning */
717 if (pe_start != frag_start) {
718 count = (frag_start - pe_start) / 8;
719 radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
720 RADEON_GPU_PAGE_SIZE, flags);
721 addr += RADEON_GPU_PAGE_SIZE * count;
722 }
723
724 /* handle the area in the middle */
725 count = (frag_end - frag_start) / 8;
726 radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
727 RADEON_GPU_PAGE_SIZE, flags | frag_flags);
728
729 /* handle the 4K area at the end */
730 if (frag_end != pe_end) {
731 addr += RADEON_GPU_PAGE_SIZE * count;
732 count = (pe_end - frag_end) / 8;
733 radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
734 RADEON_GPU_PAGE_SIZE, flags);
735 }
736}
737
738/**
661 * radeon_vm_update_ptes - make sure that page tables are valid 739 * radeon_vm_update_ptes - make sure that page tables are valid
662 * 740 *
663 * @rdev: radeon_device pointer 741 * @rdev: radeon_device pointer
@@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
703 if ((last_pte + 8 * count) != pte) { 781 if ((last_pte + 8 * count) != pte) {
704 782
705 if (count) { 783 if (count) {
706 radeon_asic_vm_set_page(rdev, ib, last_pte, 784 radeon_vm_frag_ptes(rdev, ib, last_pte,
707 last_dst, count, 785 last_pte + 8 * count,
708 RADEON_GPU_PAGE_SIZE, 786 last_dst, flags);
709 flags);
710 } 787 }
711 788
712 count = nptes; 789 count = nptes;
@@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
721 } 798 }
722 799
723 if (count) { 800 if (count) {
724 radeon_asic_vm_set_page(rdev, ib, last_pte, 801 radeon_vm_frag_ptes(rdev, ib, last_pte,
725 last_dst, count, 802 last_pte + 8 * count,
726 RADEON_GPU_PAGE_SIZE, flags); 803 last_dst, flags);
727 } 804 }
728} 805}
729 806