author     Christian König <christian.koenig@amd.com>    2016-08-05 07:56:35 -0400
committer  Alex Deucher <alexander.deucher@amd.com>      2016-08-10 14:05:58 -0400
commit     92696dd52e58b5caaee1dc027cf14f327f91d2e1 (patch)
tree       08656072d379f5ab1fefbe8e04601e626438dd4e /drivers/gpu/drm/amd
parent     27c5f36fe138e29d63eea7d1445bda1ca64921d9 (diff)
drm/amdgpu: flip frag_ptes and update_ptes
We can add the fragment params before we split the update for the page tables.
That should save a few CPU cycles for larger updates.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   166
1 file changed, 79 insertions(+), 87 deletions(-)
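
In effect, the diff below inverts the nesting of the two helpers: fragment handling now runs once over the whole mapped range, and the per-page-table split happens underneath it, so the fragment parameters are computed before the update is split up. A rough sketch of the call flow before and after this patch (arguments elided, C-comment form, only the routines touched by the diff):

/*
 * Before: amdgpu_vm_bo_update_mapping()
 *           -> amdgpu_vm_update_ptes()       walk the page tables
 *                -> amdgpu_vm_frag_ptes()    fragment handling per PTE block
 *                     -> amdgpu_vm_update_pages()
 *
 * After:  amdgpu_vm_bo_update_mapping()
 *           -> amdgpu_vm_frag_ptes()         fragment handling for the whole range
 *                -> amdgpu_vm_update_ptes()  walk the page tables
 *                     -> amdgpu_vm_update_pages()
 */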
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fd7901c1320f..d2796bb4004b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -702,83 +702,6 @@ error_free:
 }
 
 /**
- * amdgpu_vm_frag_ptes - add fragment information to PTEs
- *
- * @params: see amdgpu_pte_update_params definition
- * @pe_start: first PTE to handle
- * @pe_end: last PTE to handle
- * @addr: addr those PTEs should point to
- * @flags: hw mapping flags
- */
-static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
-                                uint64_t pe_start, uint64_t pe_end,
-                                uint64_t addr, uint32_t flags)
-{
-        /**
-         * The MC L1 TLB supports variable sized pages, based on a fragment
-         * field in the PTE. When this field is set to a non-zero value, page
-         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
-         * flags are considered valid for all PTEs within the fragment range
-         * and corresponding mappings are assumed to be physically contiguous.
-         *
-         * The L1 TLB can store a single PTE for the whole fragment,
-         * significantly increasing the space available for translation
-         * caching. This leads to large improvements in throughput when the
-         * TLB is under pressure.
-         *
-         * The L2 TLB distributes small and large fragments into two
-         * asymmetric partitions. The large fragment cache is significantly
-         * larger. Thus, we try to use large fragments wherever possible.
-         * Userspace can support this by aligning virtual base address and
-         * allocation size to the fragment size.
-         */
-
-        /* SI and newer are optimized for 64KB */
-        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
-        uint64_t frag_align = 0x80;
-
-        uint64_t frag_start = ALIGN(pe_start, frag_align);
-        uint64_t frag_end = pe_end & ~(frag_align - 1);
-
-        unsigned count;
-
-        /* Abort early if there isn't anything to do */
-        if (pe_start == pe_end)
-                return;
-
-        /* system pages are non continuously */
-        if (params->src || params->pages_addr ||
-            !(flags & AMDGPU_PTE_VALID) || (frag_start >= frag_end)) {
-
-                count = (pe_end - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                return;
-        }
-
-        /* handle the 4K area at the beginning */
-        if (pe_start != frag_start) {
-                count = (frag_start - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-        }
-
-        /* handle the area in the middle */
-        count = (frag_end - frag_start) / 8;
-        amdgpu_vm_update_pages(params, frag_start, addr, count,
-                               AMDGPU_GPU_PAGE_SIZE, flags | frag_flags);
-
-        /* handle the 4K area at the end */
-        if (frag_end != pe_end) {
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-                count = (pe_end - frag_end) / 8;
-                amdgpu_vm_update_pages(params, frag_end, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-        }
-}
-
-/**
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @params: see amdgpu_pte_update_params definition
@@ -797,7 +720,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 {
         const uint64_t mask = AMDGPU_VM_PTE_COUNT - 1;
 
-        uint64_t cur_pe_start, cur_pe_end, cur_dst;
+        uint64_t cur_pe_start, cur_nptes, cur_dst;
         uint64_t addr; /* next GPU address to be updated */
         uint64_t pt_idx;
         struct amdgpu_bo *pt;
@@ -816,7 +739,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 
         cur_pe_start = amdgpu_bo_gpu_offset(pt);
         cur_pe_start += (addr & mask) * 8;
-        cur_pe_end = cur_pe_start + 8 * nptes;
+        cur_nptes = nptes;
         cur_dst = dst;
 
         /* for next ptb*/
@@ -836,18 +759,19 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 next_pe_start = amdgpu_bo_gpu_offset(pt);
                 next_pe_start += (addr & mask) * 8;
 
-                if (cur_pe_end == next_pe_start) {
+                if ((cur_pe_start + 8 * cur_nptes) == next_pe_start) {
                         /* The next ptb is consecutive to current ptb.
-                         * Don't call amdgpu_vm_frag_ptes now.
+                         * Don't call amdgpu_vm_update_pages now.
                          * Will update two ptbs together in future.
                          */
-                        cur_pe_end += 8 * nptes;
+                        cur_nptes += nptes;
                 } else {
-                        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end,
-                                            cur_dst, flags);
+                        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst,
+                                               cur_nptes, AMDGPU_GPU_PAGE_SIZE,
+                                               flags);
 
                         cur_pe_start = next_pe_start;
-                        cur_pe_end = next_pe_start + 8 * nptes;
+                        cur_nptes = nptes;
                         cur_dst = dst;
                 }
 
@@ -856,7 +780,75 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 dst += nptes * AMDGPU_GPU_PAGE_SIZE;
         }
 
-        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end, cur_dst, flags);
+        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst, cur_nptes,
+                               AMDGPU_GPU_PAGE_SIZE, flags);
+}
+
+/*
+ * amdgpu_vm_frag_ptes - add fragment information to PTEs
+ *
+ * @params: see amdgpu_pte_update_params definition
+ * @vm: requested vm
+ * @start: first PTE to handle
+ * @end: last PTE to handle
+ * @dst: addr those PTEs should point to
+ * @flags: hw mapping flags
+ */
+static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
+                                struct amdgpu_vm *vm,
+                                uint64_t start, uint64_t end,
+                                uint64_t dst, uint32_t flags)
+{
+        /**
+         * The MC L1 TLB supports variable sized pages, based on a fragment
+         * field in the PTE. When this field is set to a non-zero value, page
+         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+         * flags are considered valid for all PTEs within the fragment range
+         * and corresponding mappings are assumed to be physically contiguous.
+         *
+         * The L1 TLB can store a single PTE for the whole fragment,
+         * significantly increasing the space available for translation
+         * caching. This leads to large improvements in throughput when the
+         * TLB is under pressure.
+         *
+         * The L2 TLB distributes small and large fragments into two
+         * asymmetric partitions. The large fragment cache is significantly
+         * larger. Thus, we try to use large fragments wherever possible.
+         * Userspace can support this by aligning virtual base address and
+         * allocation size to the fragment size.
+         */
+
+        /* SI and newer are optimized for 64KB */
+        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
+        uint64_t frag_align = 1 << AMDGPU_LOG2_PAGES_PER_FRAG;
+
+        uint64_t frag_start = ALIGN(start, frag_align);
+        uint64_t frag_end = end & ~(frag_align - 1);
+
+        /* system pages are non continuously */
+        if (params->src || params->pages_addr || !(flags & AMDGPU_PTE_VALID) ||
+            (frag_start >= frag_end)) {
+
+                amdgpu_vm_update_ptes(params, vm, start, end, dst, flags);
+                return;
+        }
+
+        /* handle the 4K area at the beginning */
+        if (start != frag_start) {
+                amdgpu_vm_update_ptes(params, vm, start, frag_start,
+                                      dst, flags);
+                dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
+        }
+
+        /* handle the area in the middle */
+        amdgpu_vm_update_ptes(params, vm, frag_start, frag_end, dst,
+                              flags | frag_flags);
+
+        /* handle the 4K area at the end */
+        if (frag_end != end) {
+                dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
+                amdgpu_vm_update_ptes(params, vm, frag_end, end, dst, flags);
+        }
 }
 
 /**
@@ -953,7 +945,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
         if (r)
                 goto error_free;
 
-        amdgpu_vm_update_ptes(&params, vm, start, last + 1, addr, flags);
+        amdgpu_vm_frag_ptes(&params, vm, start, last + 1, addr, flags);
 
         amdgpu_ring_pad_ib(ring, params.ib);
         WARN_ON(params.ib->length_dw > ndw);
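
As a footnote to the fragment handling above: a stand-alone sketch of the frag_start/frag_end arithmetic, assuming AMDGPU_LOG2_PAGES_PER_FRAG is 4 (16 pages, i.e. the 64KB optimum named in the code comment; the constant's actual value is not part of this patch). Illustration only, not driver code:

#include <stdint.h>
#include <stdio.h>

/* assumed: AMDGPU_LOG2_PAGES_PER_FRAG == 4, so one fragment spans 16 4KB pages */
#define LOG2_PAGES_PER_FRAG 4
#define FRAG_ALIGN (1ULL << LOG2_PAGES_PER_FRAG)

int main(void)
{
        /* PTE indices (4KB pages) covered by an example mapping */
        uint64_t start = 3, end = 40;

        /* same masking as ALIGN(start, frag_align) and end & ~(frag_align - 1) */
        uint64_t frag_start = (start + FRAG_ALIGN - 1) & ~(FRAG_ALIGN - 1);
        uint64_t frag_end = end & ~(FRAG_ALIGN - 1);

        printf("4KB head   : PTEs %llu..%llu\n",
               (unsigned long long)start, (unsigned long long)frag_start);
        printf("64KB middle: PTEs %llu..%llu, L1 TLB granularity 1 << (12 + %d)\n",
               (unsigned long long)frag_start, (unsigned long long)frag_end,
               LOG2_PAGES_PER_FRAG);
        printf("4KB tail   : PTEs %llu..%llu\n",
               (unsigned long long)frag_end, (unsigned long long)end);
        return 0;
}

With these example numbers the 64KB-aligned middle (PTEs 16..32) is written with AMDGPU_PTE_FRAG(4) set, while the unaligned head (3..16) and tail (32..40) fall back to plain 4KB PTEs, mirroring the three amdgpu_vm_update_ptes calls in the new amdgpu_vm_frag_ptes.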