author     Christian König <christian.koenig@amd.com>    2016-08-05 07:56:35 -0400
committer  Alex Deucher <alexander.deucher@amd.com>       2016-08-10 14:05:58 -0400
commit     92696dd52e58b5caaee1dc027cf14f327f91d2e1
tree       08656072d379f5ab1fefbe8e04601e626438dd4e /drivers/gpu/drm/amd
parent     27c5f36fe138e29d63eea7d1445bda1ca64921d9
drm/amdgpu: flip frag_ptes and update_pts
We can now add the fragment parameters before the update is split across the individual page tables. That should save a few CPU cycles for larger updates.
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 166
1 file changed, 79 insertions, 87 deletions
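
Editor's note: the patch flips the nesting of the two helpers. amdgpu_vm_frag_ptes() becomes the entry point called from amdgpu_vm_bo_update_mapping(); it splits the requested range once into an unaligned head, a fragment-aligned middle and an unaligned tail, and hands each piece to amdgpu_vm_update_ptes(), which walks the page table BOs and emits the writes. Previously the fragment logic instead ran inside every per-page-table window. Below is a minimal sketch of the new ordering, with simplified signatures and stand-in names (update_ptes, frag_ptes, the constants and the flag values in main are illustrative assumptions, not the driver's exact code); the real driver also falls back to plain 4KB updates for system pages and invalid mappings.

/* Illustrative sketch only: fragment splitting happens once per mapping,
 * page-table walking runs once per resulting piece. */
#include <stdint.h>

#define GPU_PAGE_SIZE       4096ULL
#define LOG2_PAGES_PER_FRAG 4          /* assumed value, 64KB fragments */

static void update_ptes(uint64_t start, uint64_t end,
                        uint64_t dst, uint32_t flags)
{
        /* stand-in for amdgpu_vm_update_ptes(): split [start, end) per
         * page table BO and emit the actual PTE writes */
}

/* stand-in for amdgpu_vm_frag_ptes(): decide the fragment flags first */
static void frag_ptes(uint64_t start, uint64_t end, uint64_t dst,
                      uint32_t flags, uint32_t frag_flags)
{
        uint64_t align = 1ULL << LOG2_PAGES_PER_FRAG;
        uint64_t frag_start = (start + align - 1) & ~(align - 1);
        uint64_t frag_end = end & ~(align - 1);

        if (frag_start >= frag_end) {           /* too small for a fragment */
                update_ptes(start, end, dst, flags);
                return;
        }

        if (start != frag_start) {              /* unaligned head: 4KB PTEs */
                update_ptes(start, frag_start, dst, flags);
                dst += (frag_start - start) * GPU_PAGE_SIZE;
        }

        /* aligned middle: these PTEs carry the fragment bits */
        update_ptes(frag_start, frag_end, dst, flags | frag_flags);
        dst += (frag_end - frag_start) * GPU_PAGE_SIZE;

        if (frag_end != end)                    /* unaligned tail: 4KB PTEs */
                update_ptes(frag_end, end, dst, flags);
}

int main(void)
{
        /* hypothetical values: map pages [3, 40) to some destination */
        frag_ptes(3, 40, 0x100000, 0x1 /* "valid" */, 0x70 /* frag bits */);
        return 0;
}

Because the head/middle/tail split now happens once per mapping rather than once per contiguous PTE window, the fragment arithmetic is no longer repeated for every page table touched by a large update, which is where the saved CPU cycles come from.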
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index fd7901c1320f..d2796bb4004b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -702,83 +702,6 @@ error_free:
 }
 
 /**
- * amdgpu_vm_frag_ptes - add fragment information to PTEs
- *
- * @params: see amdgpu_pte_update_params definition
- * @pe_start: first PTE to handle
- * @pe_end: last PTE to handle
- * @addr: addr those PTEs should point to
- * @flags: hw mapping flags
- */
-static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
-                                uint64_t pe_start, uint64_t pe_end,
-                                uint64_t addr, uint32_t flags)
-{
-        /**
-         * The MC L1 TLB supports variable sized pages, based on a fragment
-         * field in the PTE. When this field is set to a non-zero value, page
-         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
-         * flags are considered valid for all PTEs within the fragment range
-         * and corresponding mappings are assumed to be physically contiguous.
-         *
-         * The L1 TLB can store a single PTE for the whole fragment,
-         * significantly increasing the space available for translation
-         * caching. This leads to large improvements in throughput when the
-         * TLB is under pressure.
-         *
-         * The L2 TLB distributes small and large fragments into two
-         * asymmetric partitions. The large fragment cache is significantly
-         * larger. Thus, we try to use large fragments wherever possible.
-         * Userspace can support this by aligning virtual base address and
-         * allocation size to the fragment size.
-         */
-
-        /* SI and newer are optimized for 64KB */
-        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
-        uint64_t frag_align = 0x80;
-
-        uint64_t frag_start = ALIGN(pe_start, frag_align);
-        uint64_t frag_end = pe_end & ~(frag_align - 1);
-
-        unsigned count;
-
-        /* Abort early if there isn't anything to do */
-        if (pe_start == pe_end)
-                return;
-
-        /* system pages are non continuously */
-        if (params->src || params->pages_addr ||
-            !(flags & AMDGPU_PTE_VALID) || (frag_start >= frag_end)) {
-
-                count = (pe_end - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                return;
-        }
-
-        /* handle the 4K area at the beginning */
-        if (pe_start != frag_start) {
-                count = (frag_start - pe_start) / 8;
-                amdgpu_vm_update_pages(params, pe_start, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-        }
-
-        /* handle the area in the middle */
-        count = (frag_end - frag_start) / 8;
-        amdgpu_vm_update_pages(params, frag_start, addr, count,
-                               AMDGPU_GPU_PAGE_SIZE, flags | frag_flags);
-
-        /* handle the 4K area at the end */
-        if (frag_end != pe_end) {
-                addr += AMDGPU_GPU_PAGE_SIZE * count;
-                count = (pe_end - frag_end) / 8;
-                amdgpu_vm_update_pages(params, frag_end, addr, count,
-                                       AMDGPU_GPU_PAGE_SIZE, flags);
-        }
-}
-
-/**
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @params: see amdgpu_pte_update_params definition
@@ -797,7 +720,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 {
         const uint64_t mask = AMDGPU_VM_PTE_COUNT - 1;
 
-        uint64_t cur_pe_start, cur_pe_end, cur_dst;
+        uint64_t cur_pe_start, cur_nptes, cur_dst;
         uint64_t addr; /* next GPU address to be updated */
         uint64_t pt_idx;
         struct amdgpu_bo *pt;
@@ -816,7 +739,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 
         cur_pe_start = amdgpu_bo_gpu_offset(pt);
         cur_pe_start += (addr & mask) * 8;
-        cur_pe_end = cur_pe_start + 8 * nptes;
+        cur_nptes = nptes;
         cur_dst = dst;
 
         /* for next ptb*/
@@ -836,18 +759,19 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 next_pe_start = amdgpu_bo_gpu_offset(pt);
                 next_pe_start += (addr & mask) * 8;
 
-                if (cur_pe_end == next_pe_start) {
+                if ((cur_pe_start + 8 * cur_nptes) == next_pe_start) {
                         /* The next ptb is consecutive to current ptb.
-                         * Don't call amdgpu_vm_frag_ptes now.
+                         * Don't call amdgpu_vm_update_pages now.
                          * Will update two ptbs together in future.
                         */
-                        cur_pe_end += 8 * nptes;
+                        cur_nptes += nptes;
                 } else {
-                        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end,
-                                            cur_dst, flags);
+                        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst,
+                                               cur_nptes, AMDGPU_GPU_PAGE_SIZE,
+                                               flags);
 
                         cur_pe_start = next_pe_start;
-                        cur_pe_end = next_pe_start + 8 * nptes;
+                        cur_nptes = nptes;
                         cur_dst = dst;
                 }
 
@@ -856,7 +780,75 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
                 dst += nptes * AMDGPU_GPU_PAGE_SIZE;
         }
 
-        amdgpu_vm_frag_ptes(params, cur_pe_start, cur_pe_end, cur_dst, flags);
+        amdgpu_vm_update_pages(params, cur_pe_start, cur_dst, cur_nptes,
+                               AMDGPU_GPU_PAGE_SIZE, flags);
+}
+
+/*
+ * amdgpu_vm_frag_ptes - add fragment information to PTEs
+ *
+ * @params: see amdgpu_pte_update_params definition
+ * @vm: requested vm
+ * @start: first PTE to handle
+ * @end: last PTE to handle
+ * @dst: addr those PTEs should point to
+ * @flags: hw mapping flags
+ */
+static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params *params,
+                                struct amdgpu_vm *vm,
+                                uint64_t start, uint64_t end,
+                                uint64_t dst, uint32_t flags)
+{
+        /**
+         * The MC L1 TLB supports variable sized pages, based on a fragment
+         * field in the PTE. When this field is set to a non-zero value, page
+         * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+         * flags are considered valid for all PTEs within the fragment range
+         * and corresponding mappings are assumed to be physically contiguous.
+         *
+         * The L1 TLB can store a single PTE for the whole fragment,
+         * significantly increasing the space available for translation
+         * caching. This leads to large improvements in throughput when the
+         * TLB is under pressure.
+         *
+         * The L2 TLB distributes small and large fragments into two
+         * asymmetric partitions. The large fragment cache is significantly
+         * larger. Thus, we try to use large fragments wherever possible.
+         * Userspace can support this by aligning virtual base address and
+         * allocation size to the fragment size.
+         */
+
+        /* SI and newer are optimized for 64KB */
+        uint64_t frag_flags = AMDGPU_PTE_FRAG(AMDGPU_LOG2_PAGES_PER_FRAG);
+        uint64_t frag_align = 1 << AMDGPU_LOG2_PAGES_PER_FRAG;
+
+        uint64_t frag_start = ALIGN(start, frag_align);
+        uint64_t frag_end = end & ~(frag_align - 1);
+
+        /* system pages are non continuously */
+        if (params->src || params->pages_addr || !(flags & AMDGPU_PTE_VALID) ||
+            (frag_start >= frag_end)) {
+
+                amdgpu_vm_update_ptes(params, vm, start, end, dst, flags);
+                return;
+        }
+
+        /* handle the 4K area at the beginning */
+        if (start != frag_start) {
+                amdgpu_vm_update_ptes(params, vm, start, frag_start,
+                                      dst, flags);
+                dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
+        }
+
+        /* handle the area in the middle */
+        amdgpu_vm_update_ptes(params, vm, frag_start, frag_end, dst,
+                              flags | frag_flags);
+
+        /* handle the 4K area at the end */
+        if (frag_end != end) {
+                dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
+                amdgpu_vm_update_ptes(params, vm, frag_end, end, dst, flags);
+        }
 }
 
 /**
@@ -953,7 +945,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
         if (r)
                 goto error_free;
 
-        amdgpu_vm_update_ptes(&params, vm, start, last + 1, addr, flags);
+        amdgpu_vm_frag_ptes(&params, vm, start, last + 1, addr, flags);
 
         amdgpu_ring_pad_ib(ring, params.ib);
         WARN_ON(params.ib->length_dw > ndw);
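
Editor's note: one detail worth calling out in the move is that the fragment alignment changes units along with the interface. The old helper worked on PTE byte addresses and aligned to 0x80 bytes (16 eight-byte PTEs), while the new one works on GPU page indices and aligns to 1 << AMDGPU_LOG2_PAGES_PER_FRAG pages. Assuming AMDGPU_LOG2_PAGES_PER_FRAG is 4, which matches the 64KB fragments the in-code comment describes, both express the same fragment size. A small self-contained check of that arithmetic, plus an example split (illustrative only, the constant value is an assumption):

#include <assert.h>
#include <stdint.h>

#define LOG2_PAGES_PER_FRAG 4   /* assumed value of AMDGPU_LOG2_PAGES_PER_FRAG */

int main(void)
{
        /* old code: pe_start/pe_end are byte addresses of 8-byte PTEs */
        uint64_t old_align = 0x80;                         /* 0x80 / 8 = 16 PTEs */
        /* new code: start/end are GPU page indices */
        uint64_t new_align = 1ULL << LOG2_PAGES_PER_FRAG;  /* 16 pages */

        assert(old_align / 8 == new_align);                /* same 16-page span */
        assert(new_align * 4096 == 64 * 1024);             /* i.e. 64KB */

        /* Example: mapping pages [3, 40) splits into
         *   head   [3, 16)  -> plain 4KB PTEs
         *   middle [16, 32) -> PTEs with the 64KB fragment field set
         *   tail   [32, 40) -> plain 4KB PTEs
         */
        uint64_t start = 3, end = 40;
        uint64_t frag_start = (start + new_align - 1) & ~(new_align - 1);
        uint64_t frag_end = end & ~(new_align - 1);

        assert(frag_start == 16 && frag_end == 32);
        return 0;
}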