diff options
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
| -rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 299 |
1 files changed, 174 insertions, 125 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 4210549ac95e..834ca8eb38f2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
| @@ -21,6 +21,9 @@ | |||
| 21 | #include <asm/pgalloc.h> | 21 | #include <asm/pgalloc.h> |
| 22 | #include <asm/tlb.h> | 22 | #include <asm/tlb.h> |
| 23 | #include <asm/setup.h> | 23 | #include <asm/setup.h> |
| 24 | #include <asm/hugetlb.h> | ||
| 25 | |||
| 26 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 24 | 27 | ||
| 25 | #define PAGE_SHIFT_64K 16 | 28 | #define PAGE_SHIFT_64K 16 |
| 26 | #define PAGE_SHIFT_16M 24 | 29 | #define PAGE_SHIFT_16M 24 |
| @@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd) | |||
| 100 | } | 103 | } |
| 101 | #endif | 104 | #endif |
| 102 | 105 | ||
| 103 | /* | ||
| 104 | * We have 4 cases for pgds and pmds: | ||
| 105 | * (1) invalid (all zeroes) | ||
| 106 | * (2) pointer to next table, as normal; bottom 6 bits == 0 | ||
| 107 | * (3) leaf pte for huge page, bottom two bits != 00 | ||
| 108 | * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table | ||
| 109 | */ | ||
| 110 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
| 111 | { | ||
| 112 | pgd_t *pg; | ||
| 113 | pud_t *pu; | ||
| 114 | pmd_t *pm; | ||
| 115 | pte_t *ret_pte; | ||
| 116 | hugepd_t *hpdp = NULL; | ||
| 117 | unsigned pdshift = PGDIR_SHIFT; | ||
| 118 | |||
| 119 | if (shift) | ||
| 120 | *shift = 0; | ||
| 121 | |||
| 122 | pg = pgdir + pgd_index(ea); | ||
| 123 | |||
| 124 | if (pgd_huge(*pg)) { | ||
| 125 | ret_pte = (pte_t *) pg; | ||
| 126 | goto out; | ||
| 127 | } else if (is_hugepd(pg)) | ||
| 128 | hpdp = (hugepd_t *)pg; | ||
| 129 | else if (!pgd_none(*pg)) { | ||
| 130 | pdshift = PUD_SHIFT; | ||
| 131 | pu = pud_offset(pg, ea); | ||
| 132 | |||
| 133 | if (pud_huge(*pu)) { | ||
| 134 | ret_pte = (pte_t *) pu; | ||
| 135 | goto out; | ||
| 136 | } else if (is_hugepd(pu)) | ||
| 137 | hpdp = (hugepd_t *)pu; | ||
| 138 | else if (!pud_none(*pu)) { | ||
| 139 | pdshift = PMD_SHIFT; | ||
| 140 | pm = pmd_offset(pu, ea); | ||
| 141 | |||
| 142 | if (pmd_huge(*pm)) { | ||
| 143 | ret_pte = (pte_t *) pm; | ||
| 144 | goto out; | ||
| 145 | } else if (is_hugepd(pm)) | ||
| 146 | hpdp = (hugepd_t *)pm; | ||
| 147 | else if (!pmd_none(*pm)) | ||
| 148 | return pte_offset_kernel(pm, ea); | ||
| 149 | } | ||
| 150 | } | ||
| 151 | if (!hpdp) | ||
| 152 | return NULL; | ||
| 153 | |||
| 154 | ret_pte = hugepte_offset(hpdp, ea, pdshift); | ||
| 155 | pdshift = hugepd_shift(*hpdp); | ||
| 156 | out: | ||
| 157 | if (shift) | ||
| 158 | *shift = pdshift; | ||
| 159 | return ret_pte; | ||
| 160 | } | ||
| 161 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); | ||
| 162 | |||
| 163 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | 106 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) |
| 164 | { | 107 | { |
| 108 | /* Only called for hugetlbfs pages, hence can ignore THP */ | ||
| 165 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); | 109 | return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); |
| 166 | } | 110 | } |
| 167 | 111 | ||
| @@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |||
| 736 | struct page *page; | 680 | struct page *page; |
| 737 | unsigned shift; | 681 | unsigned shift; |
| 738 | unsigned long mask; | 682 | unsigned long mask; |
| 739 | 683 | /* | |
| 684 | * Transparent hugepages are handled by generic code. We can skip them | ||
| 685 | * here. | ||
| 686 | */ | ||
| 740 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); | 687 | ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); |
| 741 | 688 | ||
| 742 | /* Verify it is a huge page else bail. */ | 689 | /* Verify it is a huge page else bail. */ |
| 743 | if (!ptep || !shift) | 690 | if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) |
| 744 | return ERR_PTR(-EINVAL); | 691 | return ERR_PTR(-EINVAL); |
| 745 | 692 | ||
| 746 | mask = (1UL << shift) - 1; | 693 | mask = (1UL << shift) - 1; |
| @@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
| 759 | return NULL; | 706 | return NULL; |
| 760 | } | 707 | } |
| 761 | 708 | ||
| 762 | int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
| 763 | unsigned long end, int write, struct page **pages, int *nr) | ||
| 764 | { | ||
| 765 | unsigned long mask; | ||
| 766 | unsigned long pte_end; | ||
| 767 | struct page *head, *page, *tail; | ||
| 768 | pte_t pte; | ||
| 769 | int refs; | ||
| 770 | |||
| 771 | pte_end = (addr + sz) & ~(sz-1); | ||
| 772 | if (pte_end < end) | ||
| 773 | end = pte_end; | ||
| 774 | |||
| 775 | pte = *ptep; | ||
| 776 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
| 777 | if (write) | ||
| 778 | mask |= _PAGE_RW; | ||
| 779 | |||
| 780 | if ((pte_val(pte) & mask) != mask) | ||
| 781 | return 0; | ||
| 782 | |||
| 783 | /* hugepages are never "special" */ | ||
| 784 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
| 785 | |||
| 786 | refs = 0; | ||
| 787 | head = pte_page(pte); | ||
| 788 | |||
| 789 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
| 790 | tail = page; | ||
| 791 | do { | ||
| 792 | VM_BUG_ON(compound_head(page) != head); | ||
| 793 | pages[*nr] = page; | ||
| 794 | (*nr)++; | ||
| 795 | page++; | ||
| 796 | refs++; | ||
| 797 | } while (addr += PAGE_SIZE, addr != end); | ||
| 798 | |||
| 799 | if (!page_cache_add_speculative(head, refs)) { | ||
| 800 | *nr -= refs; | ||
| 801 | return 0; | ||
| 802 | } | ||
| 803 | |||
| 804 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
| 805 | /* Could be optimized better */ | ||
| 806 | *nr -= refs; | ||
| 807 | while (refs--) | ||
| 808 | put_page(head); | ||
| 809 | return 0; | ||
| 810 | } | ||
| 811 | |||
| 812 | /* | ||
| 813 | * Any tail page need their mapcount reference taken before we | ||
| 814 | * return. | ||
| 815 | */ | ||
| 816 | while (refs--) { | ||
| 817 | if (PageTail(tail)) | ||
| 818 | get_huge_page_tail(tail); | ||
| 819 | tail++; | ||
| 820 | } | ||
| 821 | |||
| 822 | return 1; | ||
| 823 | } | ||
| 824 | |||
| 825 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, | 709 | static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, |
| 826 | unsigned long sz) | 710 | unsigned long sz) |
| 827 | { | 711 | { |
| @@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page) | |||
| 1038 | } | 922 | } |
| 1039 | } | 923 | } |
| 1040 | } | 924 | } |
| 925 | |||
| 926 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
| 927 | |||
| 928 | /* | ||
| 929 | * We have 4 cases for pgds and pmds: | ||
| 930 | * (1) invalid (all zeroes) | ||
| 931 | * (2) pointer to next table, as normal; bottom 6 bits == 0 | ||
| 932 | * (3) leaf pte for huge page, bottom two bits != 00 | ||
| 933 | * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table | ||
| 934 | * | ||
| 935 | * So long as we atomically load page table pointers we are safe against teardown; | ||
| 936 | * we can follow the address down to the page and take a ref on it. | ||
| 937 | */ | ||
| 938 | |||
| 939 | pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) | ||
| 940 | { | ||
| 941 | pgd_t pgd, *pgdp; | ||
| 942 | pud_t pud, *pudp; | ||
| 943 | pmd_t pmd, *pmdp; | ||
| 944 | pte_t *ret_pte; | ||
| 945 | hugepd_t *hpdp = NULL; | ||
| 946 | unsigned pdshift = PGDIR_SHIFT; | ||
| 947 | |||
| 948 | if (shift) | ||
| 949 | *shift = 0; | ||
| 950 | |||
| 951 | pgdp = pgdir + pgd_index(ea); | ||
| 952 | pgd = ACCESS_ONCE(*pgdp); | ||
| 953 | /* | ||
| 954 | * Always operate on the local stack value. This makes sure the | ||
| 955 | * value doesn't get updated by a parallel THP split/collapse, | ||
| 956 | * page fault or a page unmap. The returned pte_t * is still not | ||
| 957 | * stable, so it should be checked there for the above conditions. | ||
| 958 | */ | ||
| 959 | if (pgd_none(pgd)) | ||
| 960 | return NULL; | ||
| 961 | else if (pgd_huge(pgd)) { | ||
| 962 | ret_pte = (pte_t *) pgdp; | ||
| 963 | goto out; | ||
| 964 | } else if (is_hugepd(&pgd)) | ||
| 965 | hpdp = (hugepd_t *)&pgd; | ||
| 966 | else { | ||
| 967 | /* | ||
| 968 | * Even if we end up with an unmap, the pgtable will not | ||
| 969 | * be freed, because we do an rcu free and here we have | ||
| 970 | * irqs disabled. | ||
| 971 | */ | ||
| 972 | pdshift = PUD_SHIFT; | ||
| 973 | pudp = pud_offset(&pgd, ea); | ||
| 974 | pud = ACCESS_ONCE(*pudp); | ||
| 975 | |||
| 976 | if (pud_none(pud)) | ||
| 977 | return NULL; | ||
| 978 | else if (pud_huge(pud)) { | ||
| 979 | ret_pte = (pte_t *) pudp; | ||
| 980 | goto out; | ||
| 981 | } else if (is_hugepd(&pud)) | ||
| 982 | hpdp = (hugepd_t *)&pud; | ||
| 983 | else { | ||
| 984 | pdshift = PMD_SHIFT; | ||
| 985 | pmdp = pmd_offset(&pud, ea); | ||
| 986 | pmd = ACCESS_ONCE(*pmdp); | ||
| 987 | /* | ||
| 988 | * A hugepage collapse is captured by pmd_none, because | ||
| 989 | * it marks the pmd none and does an hpte invalidate. | ||
| 990 | * | ||
| 991 | * A hugepage split is captured by pmd_trans_splitting, | ||
| 992 | * because we mark the pmd trans splitting and do an | ||
| 993 | * hpte invalidate. | ||
| 994 | * | ||
| 995 | */ | ||
| 996 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
| 997 | return NULL; | ||
| 998 | |||
| 999 | if (pmd_huge(pmd) || pmd_large(pmd)) { | ||
| 1000 | ret_pte = (pte_t *) pmdp; | ||
| 1001 | goto out; | ||
| 1002 | } else if (is_hugepd(&pmd)) | ||
| 1003 | hpdp = (hugepd_t *)&pmd; | ||
| 1004 | else | ||
| 1005 | return pte_offset_kernel(&pmd, ea); | ||
| 1006 | } | ||
| 1007 | } | ||
| 1008 | if (!hpdp) | ||
| 1009 | return NULL; | ||
| 1010 | |||
| 1011 | ret_pte = hugepte_offset(hpdp, ea, pdshift); | ||
| 1012 | pdshift = hugepd_shift(*hpdp); | ||
| 1013 | out: | ||
| 1014 | if (shift) | ||
| 1015 | *shift = pdshift; | ||
| 1016 | return ret_pte; | ||
| 1017 | } | ||
| 1018 | EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); | ||
| 1019 | |||
| 1020 | int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | ||
| 1021 | unsigned long end, int write, struct page **pages, int *nr) | ||
| 1022 | { | ||
| 1023 | unsigned long mask; | ||
| 1024 | unsigned long pte_end; | ||
| 1025 | struct page *head, *page, *tail; | ||
| 1026 | pte_t pte; | ||
| 1027 | int refs; | ||
| 1028 | |||
| 1029 | pte_end = (addr + sz) & ~(sz-1); | ||
| 1030 | if (pte_end < end) | ||
| 1031 | end = pte_end; | ||
| 1032 | |||
| 1033 | pte = ACCESS_ONCE(*ptep); | ||
| 1034 | mask = _PAGE_PRESENT | _PAGE_USER; | ||
| 1035 | if (write) | ||
| 1036 | mask |= _PAGE_RW; | ||
| 1037 | |||
| 1038 | if ((pte_val(pte) & mask) != mask) | ||
| 1039 | return 0; | ||
| 1040 | |||
| 1041 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
| 1042 | /* | ||
| 1043 | * check for splitting here | ||
| 1044 | */ | ||
| 1045 | if (pmd_trans_splitting(pte_pmd(pte))) | ||
| 1046 | return 0; | ||
| 1047 | #endif | ||
| 1048 | |||
| 1049 | /* hugepages are never "special" */ | ||
| 1050 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
| 1051 | |||
| 1052 | refs = 0; | ||
| 1053 | head = pte_page(pte); | ||
| 1054 | |||
| 1055 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | ||
| 1056 | tail = page; | ||
| 1057 | do { | ||
| 1058 | VM_BUG_ON(compound_head(page) != head); | ||
| 1059 | pages[*nr] = page; | ||
| 1060 | (*nr)++; | ||
| 1061 | page++; | ||
| 1062 | refs++; | ||
| 1063 | } while (addr += PAGE_SIZE, addr != end); | ||
| 1064 | |||
| 1065 | if (!page_cache_add_speculative(head, refs)) { | ||
| 1066 | *nr -= refs; | ||
| 1067 | return 0; | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
| 1071 | /* Could be optimized better */ | ||
| 1072 | *nr -= refs; | ||
| 1073 | while (refs--) | ||
| 1074 | put_page(head); | ||
| 1075 | return 0; | ||
| 1076 | } | ||
| 1077 | |||
| 1078 | /* | ||
| 1079 | * Any tail pages need their mapcount reference taken before we | ||
| 1080 | * return. | ||
| 1081 | */ | ||
| 1082 | while (refs--) { | ||
| 1083 | if (PageTail(tail)) | ||
| 1084 | get_huge_page_tail(tail); | ||
| 1085 | tail++; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | return 1; | ||
| 1089 | } | ||
