about summary refs log tree commit diff stats
path: root/drivers/gpu/drm/radeon/radeon_vm.c
diff options
context:
space:
mode:
authorChristian König <christian.koenig@amd.com>2014-05-10 06:17:55 -0400
committerAlex Deucher <alexander.deucher@amd.com>2014-06-02 10:25:02 -0400
commitec3dbbcbd7a6ee165ca7eeafec8dbc733901ab2f (patch)
treef0a715cec425e8815870874892cce639b25eda1f /drivers/gpu/drm/radeon/radeon_vm.c
parent831719d62f692e28699a7acd7b441c6f0c01b6f7 (diff)
drm/radeon: add large PTE support for NI, SI and CIK v5
This patch implements support for VRAM page table entry compression. PTE construction is enhanced to identify physically contiguous page ranges and mark them in the PTE fragment field. L1/L2 TLB support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly improving TLB utilization for VRAM allocations. Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS on default settings at 1920x1200 resolution with vsync disabled. See main comment in radeon_vm.c for a technical description. v2 (chk): rebased and simplified. v3 (chk): add missing hw setup v4 (chk): rebased on current drm-fixes-3.15 v5 (chk): fix comments and commit text Signed-off-by: Jay Cornwall <jay@jcornwall.me> Signed-off-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/radeon/radeon_vm.c')
-rw-r--r--drivers/gpu/drm/radeon/radeon_vm.c91
1 files changed, 84 insertions, 7 deletions
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
index 2aae6ce49d32..f8d5b65932e5 100644
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev,
658} 658}
659 659
660/** 660/**
661 * radeon_vm_frag_ptes - add fragment information to PTEs
662 *
663 * @rdev: radeon_device pointer
664 * @ib: IB for the update
665 * @pe_start: first PTE to handle
666 * @pe_end: last PTE to handle
667 * @addr: addr those PTEs should point to
668 * @flags: hw mapping flags
669 *
670 * Global and local mutex must be locked!
671 */
672static void radeon_vm_frag_ptes(struct radeon_device *rdev,
673 struct radeon_ib *ib,
674 uint64_t pe_start, uint64_t pe_end,
675 uint64_t addr, uint32_t flags)
676{
677 /**
678 * The MC L1 TLB supports variable sized pages, based on a fragment
679 * field in the PTE. When this field is set to a non-zero value, page
680 * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
681 * flags are considered valid for all PTEs within the fragment range
682 * and corresponding mappings are assumed to be physically contiguous.
683 *
684 * The L1 TLB can store a single PTE for the whole fragment,
685 * significantly increasing the space available for translation
686 * caching. This leads to large improvements in throughput when the
687 * TLB is under pressure.
688 *
689 * The L2 TLB distributes small and large fragments into two
690 * asymmetric partitions. The large fragment cache is significantly
691 * larger. Thus, we try to use large fragments wherever possible.
692 * Userspace can support this by aligning virtual base address and
693 * allocation size to the fragment size.
694 */
695
696 /* NI is optimized for 256KB fragments, SI and newer for 64KB */
697 uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
698 R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
699 uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
700
701 uint64_t frag_start = ALIGN(pe_start, frag_align);
702 uint64_t frag_end = pe_end & ~(frag_align - 1);
703
704 unsigned count;
705
706 /* system pages are non continuously */
707 if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
708 (frag_start >= frag_end)) {
709
710 count = (pe_end - pe_start) / 8;
711 radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
712 RADEON_GPU_PAGE_SIZE, flags);
713 return;
714 }
715
716 /* handle the 4K area at the beginning */
717 if (pe_start != frag_start) {
718 count = (frag_start - pe_start) / 8;
719 radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
720 RADEON_GPU_PAGE_SIZE, flags);
721 addr += RADEON_GPU_PAGE_SIZE * count;
722 }
723
724 /* handle the area in the middle */
725 count = (frag_end - frag_start) / 8;
726 radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
727 RADEON_GPU_PAGE_SIZE, flags | frag_flags);
728
729 /* handle the 4K area at the end */
730 if (frag_end != pe_end) {
731 addr += RADEON_GPU_PAGE_SIZE * count;
732 count = (pe_end - frag_end) / 8;
733 radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
734 RADEON_GPU_PAGE_SIZE, flags);
735 }
736}
737
738/**
661 * radeon_vm_update_ptes - make sure that page tables are valid 739 * radeon_vm_update_ptes - make sure that page tables are valid
662 * 740 *
663 * @rdev: radeon_device pointer 741 * @rdev: radeon_device pointer
@@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
703 if ((last_pte + 8 * count) != pte) { 781 if ((last_pte + 8 * count) != pte) {
704 782
705 if (count) { 783 if (count) {
706 radeon_asic_vm_set_page(rdev, ib, last_pte, 784 radeon_vm_frag_ptes(rdev, ib, last_pte,
707 last_dst, count, 785 last_pte + 8 * count,
708 RADEON_GPU_PAGE_SIZE, 786 last_dst, flags);
709 flags);
710 } 787 }
711 788
712 count = nptes; 789 count = nptes;
@@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
721 } 798 }
722 799
723 if (count) { 800 if (count) {
724 radeon_asic_vm_set_page(rdev, ib, last_pte, 801 radeon_vm_frag_ptes(rdev, ib, last_pte,
725 last_dst, count, 802 last_pte + 8 * count,
726 RADEON_GPU_PAGE_SIZE, flags); 803 last_dst, flags);
727 } 804 }
728} 805}
729 806