author    Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>    2013-06-20 05:00:22 -0400
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>    2013-06-21 02:01:56 -0400
commit    0ac52dd7666d5c0d0147d73a8e4b1d1ffd81cdf3 (patch)
tree      124824b5deed8f2ee0c0c56305283496099a4774 /arch/powerpc/mm
parent    6d492ecc6489113968ec269be1cf88942d4a5d29 (diff)
powerpc: Make linux pagetable walk safe with THP enabled
We need irqs disabled to handle all the possible parallel updates to
the Linux page table without holding locks.

Events that we are interested in while walking page tables are:
1) Page fault
2) unmap
3) THP split
4) THP collapse

A) local_irq_disabled:
------------------------
1) page fault:
A none to valid transition via page fault is not an issue because we
would either see a none or a valid entry. If it is none, we error out
of the page table walk. We may need to use on-stack values when
checking the type of page table elements, because if we do

	if (!is_hugepd()) {
		if (!pmd_none()) {
			if (pmd_bad()) {

we could hit that bad condition because the pmd got converted to a
hugepd after the !is_hugepd check via a hugetlb fault. The right way
is to check for pmd_none higher up or to use the on-stack value.

2) A valid to none conversion via unmap:
We can safely walk the upper level tables, because we don't remove
page table entries until an RCU grace period has elapsed. So even if
we followed a wrong pointer, the pointer stays valid until the grace
period. A returned PTE pointer needs to be atomically checked for
_PAGE_PRESENT and _PAGE_BUSY: a valid pointer could still become none
later. To prevent a racing pte_clear we set _PAGE_BUSY.

3) THP split:
A valid transparent hugepage is converted to normal pages. Before we
split, pmdp_splitting_flush sets _PAGE_SPLITTING in the hugepage PTE,
so when walking the page table we need to check for
pmd_trans_splitting and handle it. The returned pte also needs to be
checked for _PAGE_SPLITTING before setting _PAGE_BUSY, similar to
_PAGE_PRESENT. We save the PTE value on the stack and check for the
flag in that local value; if the flag is not set, we can safely
operate on the local pte value and atomically set _PAGE_BUSY.

4) THP collapse:
A normal page gets converted to a hugepage. In the collapse path, we
mark the pmd none early (pmdp_clear_flush). With irqs disabled, if we
are already walking the page table we will see the pmd_none and won't
continue. If we see a valid PMD, we should still check for
_PAGE_PRESENT before setting _PAGE_BUSY, to make sure we didn't
collapse the PTE to a huge PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
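[Editorial note: the following is a minimal caller-side sketch, not
part of this commit, showing the lookup discipline the message
describes. It assumes the 3.11-era powerpc helpers visible in the
diff below (find_linux_pte_or_hugepte, ACCESS_ONCE, _PAGE_BUSY,
_PAGE_SPLITTING with CONFIG_TRANSPARENT_HUGEPAGE); the function name
is hypothetical.]

	/*
	 * Illustrative sketch only: walk with irqs disabled and test
	 * a snapshot of the PTE, never *ptep directly.
	 */
	static pte_t example_lookup(struct mm_struct *mm, unsigned long ea)
	{
		pte_t *ptep, pte;
		unsigned long flags;
		unsigned int shift;

		/* Rule A: with irqs off, RCU page-table freeing cannot run under us */
		local_irq_save(flags);
		ptep = find_linux_pte_or_hugepte(mm->pgd, ea, &shift);
		if (!ptep) {
			/* none entry: a parallel fault may fill it later (event 1) */
			local_irq_restore(flags);
			return __pte(0);
		}
		/*
		 * The pointer stays valid until the grace period (event 2),
		 * but the value does not: snapshot it and test the on-stack copy.
		 */
		pte = ACCESS_ONCE(*ptep);
		if (!(pte_val(pte) & _PAGE_PRESENT) ||	/* unmap/collapse: events 2, 4 */
		    (pte_val(pte) & (_PAGE_BUSY | _PAGE_SPLITTING)))	/* split: event 3 */
			pte = __pte(0);			/* caller should retry */
		local_irq_restore(flags);
		return pte;
	}

The hash_preload change below follows exactly this shape: the walk is
moved under local_irq_save, and error paths jump to out_exit instead
of returning with irqs still disabled.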
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c     27
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c    3
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c       72
-rw-r--r--  arch/powerpc/mm/mem.c                4
4 files changed, 68 insertions(+), 38 deletions(-)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a81e866e7b1..845231643987 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1180,13 +1180,25 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
+		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
+
 	/*
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
 	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
 	if (!ptep)
-		return;
+		goto out_exit;
 
 	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
@@ -1197,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-	if (!vsid)
-		return;
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
 	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
@@ -1230,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 			   mm->context.user_psize,
 			   mm->context.user_psize,
 			   pte_val(*ptep));
-
+out_exit:
 	local_irq_restore(flags);
 }
 
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 3c22fa307b9b..34de9e0cdc34 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -37,6 +37,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		/* If PMD busy, retry the access */
 		if (unlikely(old_pmd & _PAGE_BUSY))
 			return 0;
+		/* If PMD is trans splitting retry the access */
+		if (unlikely(old_pmd & _PAGE_SPLITTING))
+			return 0;
 		/* If PMD permissions don't match, take page fault */
 		if (unlikely(access & ~old_pmd))
 			return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8add58061003..e9e6882231da 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -925,12 +925,16 @@ void flush_dcache_icache_hugepage(struct page *page)
  * (2) pointer to next table, as normal; bottom 6 bits == 0
  * (3) leaf pte for huge page, bottom two bits != 00
  * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the page and take a ref on it.
  */
+
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
+	pgd_t pgd, *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
 	pte_t *ret_pte;
 	hugepd_t *hpdp = NULL;
 	unsigned pdshift = PGDIR_SHIFT;
@@ -938,34 +942,42 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 	if (shift)
 		*shift = 0;
 
-	pg = pgdir + pgd_index(ea);
-
+	pgdp = pgdir + pgd_index(ea);
+	pgd  = ACCESS_ONCE(*pgdp);
 	/*
-	 * we should first check for none. That takes care of a
-	 * a parallel hugetlb or THP pagefault moving none entries
-	 * to respective types.
+	 * Always operate on the local stack value. This makes sure the
+	 * value doesn't get updated by a parallel THP split/collapse,
+	 * page fault or page unmap. The returned pte_t * is still not
+	 * stable, so it must be checked there for the above conditions.
 	 */
-	if (pgd_none(*pg))
+	if (pgd_none(pgd))
 		return NULL;
-	else if (pgd_huge(*pg)) {
-		ret_pte = (pte_t *) pg;
+	else if (pgd_huge(pgd)) {
+		ret_pte = (pte_t *) pgdp;
 		goto out;
-	} else if (is_hugepd(pg))
-		hpdp = (hugepd_t *)pg;
+	} else if (is_hugepd(&pgd))
+		hpdp = (hugepd_t *)&pgd;
 	else {
+		/*
+		 * Even if we end up with an unmap, the pgtable will not
+		 * be freed, because we do an rcu free and here we have
+		 * irqs disabled
+		 */
 		pdshift = PUD_SHIFT;
-		pu = pud_offset(pg, ea);
+		pudp = pud_offset(&pgd, ea);
+		pud  = ACCESS_ONCE(*pudp);
 
-		if (pud_none(*pu))
+		if (pud_none(pud))
 			return NULL;
-		else if (pud_huge(*pu)) {
-			ret_pte = (pte_t *) pu;
+		else if (pud_huge(pud)) {
+			ret_pte = (pte_t *) pudp;
 			goto out;
-		} else if (is_hugepd(pu))
-			hpdp = (hugepd_t *)pu;
+		} else if (is_hugepd(&pud))
+			hpdp = (hugepd_t *)&pud;
 		else {
 			pdshift = PMD_SHIFT;
-			pm = pmd_offset(pu, ea);
+			pmdp = pmd_offset(&pud, ea);
+			pmd  = ACCESS_ONCE(*pmdp);
 			/*
 			 * A hugepage collapse is captured by pmd_none, because
 			 * it mark the pmd none and do a hpte invalidate.
@@ -975,16 +987,16 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			 * hpte invalidate
 			 *
 			 */
-			if (pmd_none(*pm) || pmd_trans_splitting(*pm))
+			if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 				return NULL;
 
-			if (pmd_huge(*pm) || pmd_large(*pm)) {
-				ret_pte = (pte_t *) pm;
+			if (pmd_huge(pmd) || pmd_large(pmd)) {
+				ret_pte = (pte_t *) pmdp;
 				goto out;
-			} else if (is_hugepd(pm))
-				hpdp = (hugepd_t *)pm;
+			} else if (is_hugepd(&pmd))
+				hpdp = (hugepd_t *)&pmd;
 			else
-				return pte_offset_kernel(pm, ea);
+				return pte_offset_kernel(&pmd, ea);
 		}
 	}
 	if (!hpdp)
@@ -1020,6 +1032,14 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		if ((pte_val(pte) & mask) != mask)
 			return 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		/*
+		 * check for splitting here
+		 */
+		if (pmd_trans_splitting(pte_pmd(pte)))
+			return 0;
+#endif
+
 		/* hugepages are never "special" */
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0988a26e0413..ccd49f9503a9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -508,6 +508,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */