-rw-r--r--  arch/Kconfig                               3
-rw-r--r--  arch/x86/Kconfig                           1
-rw-r--r--  arch/x86/include/asm/paravirt.h           11
-rw-r--r--  arch/x86/include/asm/paravirt_types.h      2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h     17
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h     30
-rw-r--r--  arch/x86/include/asm/pgtable.h           140
-rw-r--r--  arch/x86/include/asm/pgtable_64.h         15
-rw-r--r--  arch/x86/kernel/paravirt.c                 1
-rw-r--r--  arch/x86/mm/pgtable.c                     31
-rw-r--r--  include/asm-generic/pgtable.h             80
-rw-r--r--  include/asm-generic/tlb.h                 14
-rw-r--r--  include/linux/huge_mm.h                   83
-rw-r--r--  include/linux/mm.h                        30
-rw-r--r--  include/linux/mmu_notifier.h              14
-rw-r--r--  include/linux/pfn_t.h                     12
-rw-r--r--  mm/gup.c                                   7
-rw-r--r--  mm/huge_memory.c                         249
-rw-r--r--  mm/memory.c                               88
-rw-r--r--  mm/pagewalk.c                             20
-rw-r--r--  mm/pgtable-generic.c                      14
21 files changed, 844 insertions, 18 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index f761142976e5..d0012add6b19 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -571,6 +571,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
571config HAVE_ARCH_TRANSPARENT_HUGEPAGE 571config HAVE_ARCH_TRANSPARENT_HUGEPAGE
572 bool 572 bool
573 573
574config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
575 bool
576
574config HAVE_ARCH_HUGE_VMAP 577config HAVE_ARCH_HUGE_VMAP
575 bool 578 bool
576 579
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 874c1238dffd..33007aa74111 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,7 @@ config X86
109 select HAVE_ARCH_SECCOMP_FILTER 109 select HAVE_ARCH_SECCOMP_FILTER
110 select HAVE_ARCH_TRACEHOOK 110 select HAVE_ARCH_TRACEHOOK
111 select HAVE_ARCH_TRANSPARENT_HUGEPAGE 111 select HAVE_ARCH_TRANSPARENT_HUGEPAGE
112 select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
112 select HAVE_ARCH_VMAP_STACK if X86_64 113 select HAVE_ARCH_VMAP_STACK if X86_64
113 select HAVE_ARCH_WITHIN_STACK_FRAMES 114 select HAVE_ARCH_WITHIN_STACK_FRAMES
114 select HAVE_CC_STACKPROTECTOR 115 select HAVE_CC_STACKPROTECTOR
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f75fbfe550f2..0489884fdc44 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
475 native_pmd_val(pmd)); 475 native_pmd_val(pmd));
476} 476}
477 477
478static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
479 pud_t *pudp, pud_t pud)
480{
481 if (sizeof(pudval_t) > sizeof(long))
482 /* 5 arg words */
483 pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
484 else
485 PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
486 native_pud_val(pud));
487}
488
478static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 489static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
479{ 490{
480 pmdval_t val = native_pmd_val(pmd); 491 pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bb2de45a60f2..b060f962d581 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -249,6 +249,8 @@ struct pv_mmu_ops {
249 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 249 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
250 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, 250 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
251 pmd_t *pmdp, pmd_t pmdval); 251 pmd_t *pmdp, pmd_t pmdval);
252 void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
253 pud_t *pudp, pud_t pudval);
252 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 254 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
253 pte_t *ptep); 255 pte_t *ptep);
254 256
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index fd74a11959de..a8b96e708c2b 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
21 *pmdp = pmd; 21 *pmdp = pmd;
22} 22}
23 23
24static inline void native_set_pud(pud_t *pudp, pud_t pud)
25{
26}
27
24static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 28static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
25{ 29{
26 native_set_pte(ptep, pte); 30 native_set_pte(ptep, pte);
@@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp)
31 native_set_pmd(pmdp, __pmd(0)); 35 native_set_pmd(pmdp, __pmd(0));
32} 36}
33 37
38static inline void native_pud_clear(pud_t *pudp)
39{
40}
41
34static inline void native_pte_clear(struct mm_struct *mm, 42static inline void native_pte_clear(struct mm_struct *mm,
35 unsigned long addr, pte_t *xp) 43 unsigned long addr, pte_t *xp)
36{ 44{
@@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 63#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif 64#endif
57 65
66#ifdef CONFIG_SMP
67static inline pud_t native_pudp_get_and_clear(pud_t *xp)
68{
69 return __pud(xchg((pudval_t *)xp, 0));
70}
71#else
72#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
73#endif
74
58/* Bit manipulation helper on pte/pgoff entry */ 75/* Bit manipulation helper on pte/pgoff entry */
59static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, 76static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
60 unsigned long mask, unsigned int leftshift) 77 unsigned long mask, unsigned int leftshift)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index cdaa58c9b39e..8f50fb3f04e1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -121,6 +121,12 @@ static inline void native_pmd_clear(pmd_t *pmd)
121 *(tmp + 1) = 0; 121 *(tmp + 1) = 0;
122} 122}
123 123
124#ifndef CONFIG_SMP
125static inline void native_pud_clear(pud_t *pudp)
126{
127}
128#endif
129
124static inline void pud_clear(pud_t *pudp) 130static inline void pud_clear(pud_t *pudp)
125{ 131{
126 set_pud(pudp, __pud(0)); 132 set_pud(pudp, __pud(0));
@@ -176,6 +182,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
176#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 182#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
177#endif 183#endif
178 184
185#ifdef CONFIG_SMP
186union split_pud {
187 struct {
188 u32 pud_low;
189 u32 pud_high;
190 };
191 pud_t pud;
192};
193
194static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
195{
196 union split_pud res, *orig = (union split_pud *)pudp;
197
198 /* xchg acts as a barrier before setting of the high bits */
199 res.pud_low = xchg(&orig->pud_low, 0);
200 res.pud_high = orig->pud_high;
201 orig->pud_high = 0;
202
203 return res.pud;
204}
205#else
206#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
207#endif
208
179/* Encode and de-code a swap entry */ 209/* Encode and de-code a swap entry */
180#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) 210#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
181#define __swp_type(x) (((x).val) & 0x1f) 211#define __swp_type(x) (((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb436efa..1cfb36b8c024 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
46#define set_pte(ptep, pte) native_set_pte(ptep, pte) 46#define set_pte(ptep, pte) native_set_pte(ptep, pte)
47#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 47#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
48#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) 48#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
49#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud)
49 50
50#define set_pte_atomic(ptep, pte) \ 51#define set_pte_atomic(ptep, pte) \
51 native_set_pte_atomic(ptep, pte) 52 native_set_pte_atomic(ptep, pte)
@@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd)
128 return pmd_flags(pmd) & _PAGE_ACCESSED; 129 return pmd_flags(pmd) & _PAGE_ACCESSED;
129} 130}
130 131
132static inline int pud_dirty(pud_t pud)
133{
134 return pud_flags(pud) & _PAGE_DIRTY;
135}
136
137static inline int pud_young(pud_t pud)
138{
139 return pud_flags(pud) & _PAGE_ACCESSED;
140}
141
131static inline int pte_write(pte_t pte) 142static inline int pte_write(pte_t pte)
132{ 143{
133 return pte_flags(pte) & _PAGE_RW; 144 return pte_flags(pte) & _PAGE_RW;
@@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd)
181 return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; 192 return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
182} 193}
183 194
195#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
196static inline int pud_trans_huge(pud_t pud)
197{
198 return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
199}
200#endif
201
184#define has_transparent_hugepage has_transparent_hugepage 202#define has_transparent_hugepage has_transparent_hugepage
185static inline int has_transparent_hugepage(void) 203static inline int has_transparent_hugepage(void)
186{ 204{
@@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd)
192{ 210{
193 return !!(pmd_val(pmd) & _PAGE_DEVMAP); 211 return !!(pmd_val(pmd) & _PAGE_DEVMAP);
194} 212}
213
214#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
215static inline int pud_devmap(pud_t pud)
216{
217 return !!(pud_val(pud) & _PAGE_DEVMAP);
218}
219#else
220static inline int pud_devmap(pud_t pud)
221{
222 return 0;
223}
224#endif
195#endif 225#endif
196#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 226#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
197 227
@@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
333 return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); 363 return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
334} 364}
335 365
366static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
367{
368 pudval_t v = native_pud_val(pud);
369
370 return __pud(v | set);
371}
372
373static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
374{
375 pudval_t v = native_pud_val(pud);
376
377 return __pud(v & ~clear);
378}
379
380static inline pud_t pud_mkold(pud_t pud)
381{
382 return pud_clear_flags(pud, _PAGE_ACCESSED);
383}
384
385static inline pud_t pud_mkclean(pud_t pud)
386{
387 return pud_clear_flags(pud, _PAGE_DIRTY);
388}
389
390static inline pud_t pud_wrprotect(pud_t pud)
391{
392 return pud_clear_flags(pud, _PAGE_RW);
393}
394
395static inline pud_t pud_mkdirty(pud_t pud)
396{
397 return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
398}
399
400static inline pud_t pud_mkdevmap(pud_t pud)
401{
402 return pud_set_flags(pud, _PAGE_DEVMAP);
403}
404
405static inline pud_t pud_mkhuge(pud_t pud)
406{
407 return pud_set_flags(pud, _PAGE_PSE);
408}
409
410static inline pud_t pud_mkyoung(pud_t pud)
411{
412 return pud_set_flags(pud, _PAGE_ACCESSED);
413}
414
415static inline pud_t pud_mkwrite(pud_t pud)
416{
417 return pud_set_flags(pud, _PAGE_RW);
418}
419
420static inline pud_t pud_mknotpresent(pud_t pud)
421{
422 return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
423}
424
336#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 425#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
337static inline int pte_soft_dirty(pte_t pte) 426static inline int pte_soft_dirty(pte_t pte)
338{ 427{
@@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd)
344 return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; 433 return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
345} 434}
346 435
436static inline int pud_soft_dirty(pud_t pud)
437{
438 return pud_flags(pud) & _PAGE_SOFT_DIRTY;
439}
440
347static inline pte_t pte_mksoft_dirty(pte_t pte) 441static inline pte_t pte_mksoft_dirty(pte_t pte)
348{ 442{
349 return pte_set_flags(pte, _PAGE_SOFT_DIRTY); 443 return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
@@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
354 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); 448 return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
355} 449}
356 450
451static inline pud_t pud_mksoft_dirty(pud_t pud)
452{
453 return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
454}
455
357static inline pte_t pte_clear_soft_dirty(pte_t pte) 456static inline pte_t pte_clear_soft_dirty(pte_t pte)
358{ 457{
359 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); 458 return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
364 return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); 463 return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
365} 464}
366 465
466static inline pud_t pud_clear_soft_dirty(pud_t pud)
467{
468 return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
469}
470
367#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 471#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
368 472
369/* 473/*
@@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
392 massage_pgprot(pgprot)); 496 massage_pgprot(pgprot));
393} 497}
394 498
499static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
500{
501 return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
502 massage_pgprot(pgprot));
503}
504
395static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 505static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
396{ 506{
397 pteval_t val = pte_val(pte); 507 pteval_t val = pte_val(pte);
@@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
771 return res; 881 return res;
772} 882}
773 883
884static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
885{
886 pud_t res = *pudp;
887
888 native_pud_clear(pudp);
889 return res;
890}
891
774static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 892static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
775 pte_t *ptep , pte_t pte) 893 pte_t *ptep , pte_t pte)
776{ 894{
@@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
783 native_set_pmd(pmdp, pmd); 901 native_set_pmd(pmdp, pmd);
784} 902}
785 903
904static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
905 pud_t *pudp, pud_t pud)
906{
907 native_set_pud(pudp, pud);
908}
909
786#ifndef CONFIG_PARAVIRT 910#ifndef CONFIG_PARAVIRT
787/* 911/*
788 * Rules for using pte_update - it must be called after any PTE update which 912 * Rules for using pte_update - it must be called after any PTE update which
@@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
861extern int pmdp_set_access_flags(struct vm_area_struct *vma, 985extern int pmdp_set_access_flags(struct vm_area_struct *vma,
862 unsigned long address, pmd_t *pmdp, 986 unsigned long address, pmd_t *pmdp,
863 pmd_t entry, int dirty); 987 pmd_t entry, int dirty);
988extern int pudp_set_access_flags(struct vm_area_struct *vma,
989 unsigned long address, pud_t *pudp,
990 pud_t entry, int dirty);
864 991
865#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG 992#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
866extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, 993extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
867 unsigned long addr, pmd_t *pmdp); 994 unsigned long addr, pmd_t *pmdp);
995extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
996 unsigned long addr, pud_t *pudp);
868 997
869#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH 998#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
870extern int pmdp_clear_flush_young(struct vm_area_struct *vma, 999extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
@@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long
884 return native_pmdp_get_and_clear(pmdp); 1013 return native_pmdp_get_and_clear(pmdp);
885} 1014}
886 1015
1016#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
1017static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
1018 unsigned long addr, pud_t *pudp)
1019{
1020 return native_pudp_get_and_clear(pudp);
1021}
1022
887#define __HAVE_ARCH_PMDP_SET_WRPROTECT 1023#define __HAVE_ARCH_PMDP_SET_WRPROTECT
888static inline void pmdp_set_wrprotect(struct mm_struct *mm, 1024static inline void pmdp_set_wrprotect(struct mm_struct *mm,
889 unsigned long addr, pmd_t *pmdp) 1025 unsigned long addr, pmd_t *pmdp)
@@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
932 unsigned long addr, pmd_t *pmd) 1068 unsigned long addr, pmd_t *pmd)
933{ 1069{
934} 1070}
1071static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
1072 unsigned long addr, pud_t *pud)
1073{
1074}
935 1075
936#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 1076#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
937static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 1077static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
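
The pud_* helpers added above mirror their pmd_* counterparts and are meant to be composed when constructing a huge PUD entry. A minimal sketch only (the helper name is invented; the composition follows what insert_pfn_pud() in mm/huge_memory.c, later in this patch, does), assuming a PUD-aligned pfn and a pgprot_t are already known:

static pud_t example_make_devmap_pud(unsigned long pfn, pgprot_t prot, bool write)
{
	/* 1GB (PSE) entry for the given pfn and protection bits */
	pud_t entry = pud_mkhuge(pfn_pud(pfn, prot));

	/* mark as device memory so GUP takes the devmap path */
	entry = pud_mkdevmap(entry);

	if (write)
		entry = pud_mkwrite(pud_mkdirty(pud_mkyoung(entry)));

	return entry;
}
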
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 62b775926045..73c7ccc38912 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud)
106 native_set_pud(pud, native_make_pud(0)); 106 native_set_pud(pud, native_make_pud(0));
107} 107}
108 108
109static inline pud_t native_pudp_get_and_clear(pud_t *xp)
110{
111#ifdef CONFIG_SMP
112 return native_make_pud(xchg(&xp->pud, 0));
113#else
114 /* native_local_pudp_get_and_clear,
115 * but duplicated because of cyclic dependency
116 */
117 pud_t ret = *xp;
118
119 native_pud_clear(xp);
120 return ret;
121#endif
122}
123
109static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) 124static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
110{ 125{
111 *pgdp = pgd; 126 *pgdp = pgd;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a1bfba0f7234..4797e87b0fb6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
425 .pmd_clear = native_pmd_clear, 425 .pmd_clear = native_pmd_clear,
426#endif 426#endif
427 .set_pud = native_set_pud, 427 .set_pud = native_set_pud,
428 .set_pud_at = native_set_pud_at,
428 429
429 .pmd_val = PTE_IDENT, 430 .pmd_val = PTE_IDENT,
430 .make_pmd = PTE_IDENT, 431 .make_pmd = PTE_IDENT,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5af4e67..6cbdff26bb96 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
445 445
446 return changed; 446 return changed;
447} 447}
448
449int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
450 pud_t *pudp, pud_t entry, int dirty)
451{
452 int changed = !pud_same(*pudp, entry);
453
454 VM_BUG_ON(address & ~HPAGE_PUD_MASK);
455
456 if (changed && dirty) {
457 *pudp = entry;
458 /*
459 * We had a write-protection fault here and changed the pud
460 * to to more permissive. No need to flush the TLB for that,
461 * #PF is architecturally guaranteed to do that and in the
462 * worst-case we'll generate a spurious fault.
463 */
464 }
465
466 return changed;
467}
448#endif 468#endif
449 469
450int ptep_test_and_clear_young(struct vm_area_struct *vma, 470int ptep_test_and_clear_young(struct vm_area_struct *vma,
@@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
474 494
475 return ret; 495 return ret;
476} 496}
497int pudp_test_and_clear_young(struct vm_area_struct *vma,
498 unsigned long addr, pud_t *pudp)
499{
500 int ret = 0;
501
502 if (pud_young(*pudp))
503 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
504 (unsigned long *)pudp);
505
506 return ret;
507}
477#endif 508#endif
478 509
479int ptep_clear_flush_young(struct vm_area_struct *vma, 510int ptep_clear_flush_young(struct vm_area_struct *vma,
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 18af2bcefe6a..a0aba0f9c57b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma,
36extern int pmdp_set_access_flags(struct vm_area_struct *vma, 36extern int pmdp_set_access_flags(struct vm_area_struct *vma,
37 unsigned long address, pmd_t *pmdp, 37 unsigned long address, pmd_t *pmdp,
38 pmd_t entry, int dirty); 38 pmd_t entry, int dirty);
39extern int pudp_set_access_flags(struct vm_area_struct *vma,
40 unsigned long address, pud_t *pudp,
41 pud_t entry, int dirty);
39#else 42#else
40static inline int pmdp_set_access_flags(struct vm_area_struct *vma, 43static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
41 unsigned long address, pmd_t *pmdp, 44 unsigned long address, pmd_t *pmdp,
@@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
44 BUILD_BUG(); 47 BUILD_BUG();
45 return 0; 48 return 0;
46} 49}
50static inline int pudp_set_access_flags(struct vm_area_struct *vma,
51 unsigned long address, pud_t *pudp,
52 pud_t entry, int dirty)
53{
54 BUILD_BUG();
55 return 0;
56}
47#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 57#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
48#endif 58#endif
49 59
@@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
121} 131}
122#endif 132#endif
123 133
124#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE 134#ifdef CONFIG_TRANSPARENT_HUGEPAGE
135#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
126static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 136static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
127 unsigned long address, 137 unsigned long address,
128 pmd_t *pmdp) 138 pmd_t *pmdp)
@@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
131 pmd_clear(pmdp); 141 pmd_clear(pmdp);
132 return pmd; 142 return pmd;
133} 143}
144#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
145#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
146static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
147 unsigned long address,
148 pud_t *pudp)
149{
150 pud_t pud = *pudp;
151
152 pud_clear(pudp);
153 return pud;
154}
155#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
134#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 156#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
135#endif
136 157
137#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
138#ifdef CONFIG_TRANSPARENT_HUGEPAGE 158#ifdef CONFIG_TRANSPARENT_HUGEPAGE
159#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
139static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, 160static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
140 unsigned long address, pmd_t *pmdp, 161 unsigned long address, pmd_t *pmdp,
141 int full) 162 int full)
142{ 163{
143 return pmdp_huge_get_and_clear(mm, address, pmdp); 164 return pmdp_huge_get_and_clear(mm, address, pmdp);
144} 165}
145#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
146#endif 166#endif
147 167
168#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
169static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
170 unsigned long address, pud_t *pudp,
171 int full)
172{
173 return pudp_huge_get_and_clear(mm, address, pudp);
174}
175#endif
176#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
177
148#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL 178#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
149static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, 179static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
150 unsigned long address, pte_t *ptep, 180 unsigned long address, pte_t *ptep,
@@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
181extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, 211extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
182 unsigned long address, 212 unsigned long address,
183 pmd_t *pmdp); 213 pmd_t *pmdp);
214extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
215 unsigned long address,
216 pud_t *pudp);
184#endif 217#endif
185 218
186#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT 219#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
208} 241}
209#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 242#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
210#endif 243#endif
244#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
245#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
246static inline void pudp_set_wrprotect(struct mm_struct *mm,
247 unsigned long address, pud_t *pudp)
248{
249 pud_t old_pud = *pudp;
250
251 set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
252}
253#else
254static inline void pudp_set_wrprotect(struct mm_struct *mm,
255 unsigned long address, pud_t *pudp)
256{
257 BUILD_BUG();
258}
259#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
260#endif
211 261
212#ifndef pmdp_collapse_flush 262#ifndef pmdp_collapse_flush
213#ifdef CONFIG_TRANSPARENT_HUGEPAGE 263#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
273{ 323{
274 return pmd_val(pmd_a) == pmd_val(pmd_b); 324 return pmd_val(pmd_a) == pmd_val(pmd_b);
275} 325}
326
327static inline int pud_same(pud_t pud_a, pud_t pud_b)
328{
329 return pud_val(pud_a) == pud_val(pud_b);
330}
276#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 331#else /* CONFIG_TRANSPARENT_HUGEPAGE */
277static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 332static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
278{ 333{
279 BUILD_BUG(); 334 BUILD_BUG();
280 return 0; 335 return 0;
281} 336}
337
338static inline int pud_same(pud_t pud_a, pud_t pud_b)
339{
340 BUILD_BUG();
341 return 0;
342}
282#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 343#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
283#endif 344#endif
284 345
@@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd)
640#endif /* __HAVE_ARCH_PMD_WRITE */ 701#endif /* __HAVE_ARCH_PMD_WRITE */
641#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 702#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
642 703
704#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
705 (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
706 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
707static inline int pud_trans_huge(pud_t pud)
708{
709 return 0;
710}
711#endif
712
643#ifndef pmd_read_atomic 713#ifndef pmd_read_atomic
644static inline pmd_t pmd_read_atomic(pmd_t *pmdp) 714static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
645{ 715{
@@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd)
785 * e.g. see arch/arc: flush_pmd_tlb_range 855 * e.g. see arch/arc: flush_pmd_tlb_range
786 */ 856 */
787#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) 857#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
858#define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
788#else 859#else
789#define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() 860#define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG()
861#define flush_pud_tlb_range(vma, addr, end) BUILD_BUG()
790#endif 862#endif
791#endif 863#endif
792 864
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7eed8cf3130a..4329bc6ef04b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
232 __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ 232 __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
233 } while (0) 233 } while (0)
234 234
235/**
236 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
237 * invalidation. This is a nop so far, because only x86 needs it.
238 */
239#ifndef __tlb_remove_pud_tlb_entry
240#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
241#endif
242
243#define tlb_remove_pud_tlb_entry(tlb, pudp, address) \
244 do { \
245 __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \
246 __tlb_remove_pud_tlb_entry(tlb, pudp, address); \
247 } while (0)
248
235/* 249/*
236 * For things like page tables caches (ie caching addresses "inside" the 250 * For things like page tables caches (ie caching addresses "inside" the
237 * page tables, like x86 does), for legacy reasons, flushing an 251 * page tables, like x86 does), for legacy reasons, flushing an
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0029e786205..a3762d49ba39 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
7 struct vm_area_struct *vma); 7 struct vm_area_struct *vma);
8extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); 8extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
9extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
11 struct vm_area_struct *vma);
12
13#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
14extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
15#else
16static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
17{
18}
19#endif
20
9extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); 21extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 22extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
11 unsigned long addr, 23 unsigned long addr,
@@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
17extern int zap_huge_pmd(struct mmu_gather *tlb, 29extern int zap_huge_pmd(struct mmu_gather *tlb,
18 struct vm_area_struct *vma, 30 struct vm_area_struct *vma,
19 pmd_t *pmd, unsigned long addr); 31 pmd_t *pmd, unsigned long addr);
32extern int zap_huge_pud(struct mmu_gather *tlb,
33 struct vm_area_struct *vma,
34 pud_t *pud, unsigned long addr);
20extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 35extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
21 unsigned long addr, unsigned long end, 36 unsigned long addr, unsigned long end,
22 unsigned char *vec); 37 unsigned char *vec);
@@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
26extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 41extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
27 unsigned long addr, pgprot_t newprot, 42 unsigned long addr, pgprot_t newprot,
28 int prot_numa); 43 int prot_numa);
29int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, 44int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
30 pfn_t pfn, bool write); 45 pmd_t *pmd, pfn_t pfn, bool write);
46int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
47 pud_t *pud, pfn_t pfn, bool write);
31enum transparent_hugepage_flag { 48enum transparent_hugepage_flag {
32 TRANSPARENT_HUGEPAGE_FLAG, 49 TRANSPARENT_HUGEPAGE_FLAG,
33 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 50 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr;
58#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) 75#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
59 76
60#ifdef CONFIG_TRANSPARENT_HUGEPAGE 77#ifdef CONFIG_TRANSPARENT_HUGEPAGE
61struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
62 pmd_t *pmd, int flags);
63
64#define HPAGE_PMD_SHIFT PMD_SHIFT 78#define HPAGE_PMD_SHIFT PMD_SHIFT
65#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) 79#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
66#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) 80#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
67 81
82#define HPAGE_PUD_SHIFT PUD_SHIFT
83#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT)
84#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1))
85
68extern bool is_vma_temporary_stack(struct vm_area_struct *vma); 86extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
69 87
70#define transparent_hugepage_enabled(__vma) \ 88#define transparent_hugepage_enabled(__vma) \
@@ -118,6 +136,17 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
118void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 136void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
119 bool freeze, struct page *page); 137 bool freeze, struct page *page);
120 138
139void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
140 unsigned long address);
141
142#define split_huge_pud(__vma, __pud, __address) \
143 do { \
144 pud_t *____pud = (__pud); \
145 if (pud_trans_huge(*____pud) \
146 || pud_devmap(*____pud)) \
147 __split_huge_pud(__vma, __pud, __address); \
148 } while (0)
149
121extern int hugepage_madvise(struct vm_area_struct *vma, 150extern int hugepage_madvise(struct vm_area_struct *vma,
122 unsigned long *vm_flags, int advice); 151 unsigned long *vm_flags, int advice);
123extern void vma_adjust_trans_huge(struct vm_area_struct *vma, 152extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -126,6 +155,8 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
126 long adjust_next); 155 long adjust_next);
127extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, 156extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
128 struct vm_area_struct *vma); 157 struct vm_area_struct *vma);
158extern spinlock_t *__pud_trans_huge_lock(pud_t *pud,
159 struct vm_area_struct *vma);
129/* mmap_sem must be held on entry */ 160/* mmap_sem must be held on entry */
130static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, 161static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
131 struct vm_area_struct *vma) 162 struct vm_area_struct *vma)
@@ -136,6 +167,15 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
136 else 167 else
137 return NULL; 168 return NULL;
138} 169}
170static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
171 struct vm_area_struct *vma)
172{
173 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
174 if (pud_trans_huge(*pud) || pud_devmap(*pud))
175 return __pud_trans_huge_lock(pud, vma);
176 else
177 return NULL;
178}
139static inline int hpage_nr_pages(struct page *page) 179static inline int hpage_nr_pages(struct page *page)
140{ 180{
141 if (unlikely(PageTransHuge(page))) 181 if (unlikely(PageTransHuge(page)))
@@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page)
143 return 1; 183 return 1;
144} 184}
145 185
186struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
187 pmd_t *pmd, int flags);
188struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
189 pud_t *pud, int flags);
190
146extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); 191extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
147 192
148extern struct page *huge_zero_page; 193extern struct page *huge_zero_page;
@@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
157 return is_huge_zero_page(pmd_page(pmd)); 202 return is_huge_zero_page(pmd_page(pmd));
158} 203}
159 204
205static inline bool is_huge_zero_pud(pud_t pud)
206{
207 return false;
208}
209
160struct page *mm_get_huge_zero_page(struct mm_struct *mm); 210struct page *mm_get_huge_zero_page(struct mm_struct *mm);
161void mm_put_huge_zero_page(struct mm_struct *mm); 211void mm_put_huge_zero_page(struct mm_struct *mm);
162 212
@@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm);
167#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) 217#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
168#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) 218#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
169 219
220#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
221#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
222#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
223
170#define hpage_nr_pages(x) 1 224#define hpage_nr_pages(x) 1
171 225
172#define transparent_hugepage_enabled(__vma) 0 226#define transparent_hugepage_enabled(__vma) 0
@@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
195static inline void split_huge_pmd_address(struct vm_area_struct *vma, 249static inline void split_huge_pmd_address(struct vm_area_struct *vma,
196 unsigned long address, bool freeze, struct page *page) {} 250 unsigned long address, bool freeze, struct page *page) {}
197 251
252#define split_huge_pud(__vma, __pmd, __address) \
253 do { } while (0)
254
198static inline int hugepage_madvise(struct vm_area_struct *vma, 255static inline int hugepage_madvise(struct vm_area_struct *vma,
199 unsigned long *vm_flags, int advice) 256 unsigned long *vm_flags, int advice)
200{ 257{
@@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
212{ 269{
213 return NULL; 270 return NULL;
214} 271}
272static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
273 struct vm_area_struct *vma)
274{
275 return NULL;
276}
215 277
216static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) 278static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
217{ 279{
@@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page)
223 return false; 285 return false;
224} 286}
225 287
288static inline bool is_huge_zero_pud(pud_t pud)
289{
290 return false;
291}
292
226static inline void mm_put_huge_zero_page(struct mm_struct *mm) 293static inline void mm_put_huge_zero_page(struct mm_struct *mm)
227{ 294{
228 return; 295 return;
@@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
233{ 300{
234 return NULL; 301 return NULL;
235} 302}
303
304static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
305 unsigned long addr, pud_t *pud, int flags)
306{
307 return NULL;
308}
236#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 309#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
237 310
238#endif /* _LINUX_HUGE_MM_H */ 311#endif /* _LINUX_HUGE_MM_H */
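
pud_trans_huge_lock() above follows the same contract as pmd_trans_huge_lock(): it returns the held page table lock when the PUD is a transparent huge or devmap entry, and NULL otherwise (mmap_sem must already be held, as the VM_BUG_ON_VMA in the header checks). A minimal sketch of the expected calling pattern; the function name is hypothetical, not part of the patch:

static bool example_check_huge_pud(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return false;	/* not huge: fall back to the PMD/PTE levels */

	/* *pud is stable while ptl is held */
	spin_unlock(ptl);
	return true;
}

Callers that cannot deal with a huge entry are instead expected to call split_huge_pud(vma, pud, addr), as the mm/pagewalk.c change below does.
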
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 035a688e5472..d8b75d7d6a9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -424,6 +424,10 @@ static inline int pmd_devmap(pmd_t pmd)
424{ 424{
425 return 0; 425 return 0;
426} 426}
427static inline int pud_devmap(pud_t pud)
428{
429 return 0;
430}
427#endif 431#endif
428 432
429/* 433/*
@@ -1199,6 +1203,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1199 1203
1200/** 1204/**
1201 * mm_walk - callbacks for walk_page_range 1205 * mm_walk - callbacks for walk_page_range
1206 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
1207 * this handler should only handle pud_trans_huge() puds.
1208 * the pmd_entry or pte_entry callbacks will be used for
1209 * regular PUDs.
1202 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry 1210 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
1203 * this handler is required to be able to handle 1211 * this handler is required to be able to handle
1204 * pmd_trans_huge() pmds. They may simply choose to 1212 * pmd_trans_huge() pmds. They may simply choose to
@@ -1218,6 +1226,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1218 * (see the comment on walk_page_range() for more details) 1226 * (see the comment on walk_page_range() for more details)
1219 */ 1227 */
1220struct mm_walk { 1228struct mm_walk {
1229 int (*pud_entry)(pud_t *pud, unsigned long addr,
1230 unsigned long next, struct mm_walk *walk);
1221 int (*pmd_entry)(pmd_t *pmd, unsigned long addr, 1231 int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
1222 unsigned long next, struct mm_walk *walk); 1232 unsigned long next, struct mm_walk *walk);
1223 int (*pte_entry)(pte_t *pte, unsigned long addr, 1233 int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1801,8 +1811,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
1801 return ptl; 1811 return ptl;
1802} 1812}
1803 1813
1804extern void __init pagecache_init(void); 1814/*
1815 * No scalability reason to split PUD locks yet, but follow the same pattern
1816 * as the PMD locks to make it easier if we decide to. The VM should not be
1817 * considered ready to switch to split PUD locks yet; there may be places
1818 * which need to be converted from page_table_lock.
1819 */
1820static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
1821{
1822 return &mm->page_table_lock;
1823}
1824
1825static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
1826{
1827 spinlock_t *ptl = pud_lockptr(mm, pud);
1828
1829 spin_lock(ptl);
1830 return ptl;
1831}
1805 1832
1833extern void __init pagecache_init(void);
1806extern void free_area_init(unsigned long * zones_size); 1834extern void free_area_init(unsigned long * zones_size);
1807extern void free_area_init_node(int nid, unsigned long * zones_size, 1835extern void free_area_init_node(int nid, unsigned long * zones_size,
1808 unsigned long zone_start_pfn, unsigned long *zholes_size); 1836 unsigned long zone_start_pfn, unsigned long *zholes_size);
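
With the new ->pud_entry callback in struct mm_walk, a page table walker can now see huge PUDs directly; per the comment above it is only invoked for pud_trans_huge()/devmap entries, under the lock taken by pud_trans_huge_lock(). A hedged sketch of such a walker (the callback and counter are made up for illustration; walk_page_range() expects mmap_sem to be held):

static int example_count_huge_pud(pud_t *pud, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pud_trans_huge(*pud) || pud_devmap(*pud))
		(*count)++;
	return 0;
}

static unsigned long example_count_huge_puds(struct mm_struct *mm,
					     unsigned long start,
					     unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pud_entry	= example_count_huge_pud,
		.mm		= mm,
		.private	= &count,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return count;
}
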
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a1a210d59961..51891fb0d3ce 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
381 ___pmd; \ 381 ___pmd; \
382}) 382})
383 383
384#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \
385({ \
386 unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \
387 struct mm_struct *___mm = (__vma)->vm_mm; \
388 pud_t ___pud; \
389 \
390 ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \
391 mmu_notifier_invalidate_range(___mm, ___haddr, \
392 ___haddr + HPAGE_PUD_SIZE); \
393 \
394 ___pud; \
395})
396
384#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ 397#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
385({ \ 398({ \
386 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ 399 unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
@@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
475#define pmdp_clear_young_notify pmdp_test_and_clear_young 488#define pmdp_clear_young_notify pmdp_test_and_clear_young
476#define ptep_clear_flush_notify ptep_clear_flush 489#define ptep_clear_flush_notify ptep_clear_flush
477#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush 490#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
491#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
478#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear 492#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
479#define set_pte_at_notify set_pte_at 493#define set_pte_at_notify set_pte_at
480 494
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 033fc7bbcefa..a49b3259cad7 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
90{ 90{
91 return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); 91 return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
92} 92}
93
94#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
95static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
96{
97 return pfn_pud(pfn_t_to_pfn(pfn), pgprot);
98}
99#endif
93#endif 100#endif
94 101
95#ifdef __HAVE_ARCH_PTE_DEVMAP 102#ifdef __HAVE_ARCH_PTE_DEVMAP
@@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn)
106} 113}
107pte_t pte_mkdevmap(pte_t pte); 114pte_t pte_mkdevmap(pte_t pte);
108pmd_t pmd_mkdevmap(pmd_t pmd); 115pmd_t pmd_mkdevmap(pmd_t pmd);
116#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
117 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
118pud_t pud_mkdevmap(pud_t pud);
109#endif 119#endif
120#endif /* __HAVE_ARCH_PTE_DEVMAP */
121
110#endif /* _LINUX_PFN_T_H_ */ 122#endif /* _LINUX_PFN_T_H_ */
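
pfn_t_pud() only yields a devmap-capable entry when the pfn_t carries the PFN_DEV and PFN_MAP flags; vmf_insert_pfn_pud() in mm/huge_memory.c below enforces this with BUG_ON(!pfn_t_devmap(pfn)). As a one-line sketch, a caller backed by devm_memremap_pages()-style memory would typically build the pfn_t like this (phys is an assumed physical address):

	pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);	/* pfn_t_devmap(pfn) is now true */
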
diff --git a/mm/gup.c b/mm/gup.c
index 40abe4c90383..1e67461b2733 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
253 return page; 253 return page;
254 return no_page_table(vma, flags); 254 return no_page_table(vma, flags);
255 } 255 }
256 if (pud_devmap(*pud)) {
257 ptl = pud_lock(mm, pud);
258 page = follow_devmap_pud(vma, address, pud, flags);
259 spin_unlock(ptl);
260 if (page)
261 return page;
262 }
256 if (unlikely(pud_bad(*pud))) 263 if (unlikely(pud_bad(*pud)))
257 return no_page_table(vma, flags); 264 return no_page_table(vma, flags);
258 265
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9ecc2aeadfc..85742ac5b32e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
757} 757}
758EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); 758EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
759 759
760#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
761static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
762{
763 if (likely(vma->vm_flags & VM_WRITE))
764 pud = pud_mkwrite(pud);
765 return pud;
766}
767
768static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
769 pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
770{
771 struct mm_struct *mm = vma->vm_mm;
772 pud_t entry;
773 spinlock_t *ptl;
774
775 ptl = pud_lock(mm, pud);
776 entry = pud_mkhuge(pfn_t_pud(pfn, prot));
777 if (pfn_t_devmap(pfn))
778 entry = pud_mkdevmap(entry);
779 if (write) {
780 entry = pud_mkyoung(pud_mkdirty(entry));
781 entry = maybe_pud_mkwrite(entry, vma);
782 }
783 set_pud_at(mm, addr, pud, entry);
784 update_mmu_cache_pud(vma, addr, pud);
785 spin_unlock(ptl);
786}
787
788int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
789 pud_t *pud, pfn_t pfn, bool write)
790{
791 pgprot_t pgprot = vma->vm_page_prot;
792 /*
793 * If we had pud_special, we could avoid all these restrictions,
794 * but we need to be consistent with PTEs and architectures that
795 * can't support a 'special' bit.
796 */
797 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
798 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
799 (VM_PFNMAP|VM_MIXEDMAP));
800 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
801 BUG_ON(!pfn_t_devmap(pfn));
802
803 if (addr < vma->vm_start || addr >= vma->vm_end)
804 return VM_FAULT_SIGBUS;
805
806 track_pfn_insert(vma, &pgprot, pfn);
807
808 insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
809 return VM_FAULT_NOPAGE;
810}
811EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
812#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
813
760static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 814static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
761 pmd_t *pmd) 815 pmd_t *pmd)
762{ 816{
@@ -887,6 +941,123 @@ out:
887 return ret; 941 return ret;
888} 942}
889 943
944#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
945static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
946 pud_t *pud)
947{
948 pud_t _pud;
949
950 /*
951 * We should set the dirty bit only for FOLL_WRITE but for now
952 * the dirty bit in the pud is meaningless. And if the dirty
953 * bit will become meaningful and we'll only set it with
954 * FOLL_WRITE, an atomic set_bit will be required on the pud to
955 * set the young bit, instead of the current set_pud_at.
956 */
957 _pud = pud_mkyoung(pud_mkdirty(*pud));
958 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
959 pud, _pud, 1))
960 update_mmu_cache_pud(vma, addr, pud);
961}
962
963struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
964 pud_t *pud, int flags)
965{
966 unsigned long pfn = pud_pfn(*pud);
967 struct mm_struct *mm = vma->vm_mm;
968 struct dev_pagemap *pgmap;
969 struct page *page;
970
971 assert_spin_locked(pud_lockptr(mm, pud));
972
973 if (flags & FOLL_WRITE && !pud_write(*pud))
974 return NULL;
975
976 if (pud_present(*pud) && pud_devmap(*pud))
977 /* pass */;
978 else
979 return NULL;
980
981 if (flags & FOLL_TOUCH)
982 touch_pud(vma, addr, pud);
983
984 /*
985 * device mapped pages can only be returned if the
986 * caller will manage the page reference count.
987 */
988 if (!(flags & FOLL_GET))
989 return ERR_PTR(-EEXIST);
990
991 pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
992 pgmap = get_dev_pagemap(pfn, NULL);
993 if (!pgmap)
994 return ERR_PTR(-EFAULT);
995 page = pfn_to_page(pfn);
996 get_page(page);
997 put_dev_pagemap(pgmap);
998
999 return page;
1000}
1001
1002int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1003 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1004 struct vm_area_struct *vma)
1005{
1006 spinlock_t *dst_ptl, *src_ptl;
1007 pud_t pud;
1008 int ret;
1009
1010 dst_ptl = pud_lock(dst_mm, dst_pud);
1011 src_ptl = pud_lockptr(src_mm, src_pud);
1012 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1013
1014 ret = -EAGAIN;
1015 pud = *src_pud;
1016 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1017 goto out_unlock;
1018
1019 /*
1020 * When page table lock is held, the huge zero pud should not be
1021 * under splitting since we don't split the page itself, only pud to
1022 * a page table.
1023 */
1024 if (is_huge_zero_pud(pud)) {
1025 /* No huge zero pud yet */
1026 }
1027
1028 pudp_set_wrprotect(src_mm, addr, src_pud);
1029 pud = pud_mkold(pud_wrprotect(pud));
1030 set_pud_at(dst_mm, addr, dst_pud, pud);
1031
1032 ret = 0;
1033out_unlock:
1034 spin_unlock(src_ptl);
1035 spin_unlock(dst_ptl);
1036 return ret;
1037}
1038
1039void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1040{
1041 pud_t entry;
1042 unsigned long haddr;
1043 bool write = vmf->flags & FAULT_FLAG_WRITE;
1044
1045 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1046 if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1047 goto unlock;
1048
1049 entry = pud_mkyoung(orig_pud);
1050 if (write)
1051 entry = pud_mkdirty(entry);
1052 haddr = vmf->address & HPAGE_PUD_MASK;
1053 if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
1054 update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
1055
1056unlock:
1057 spin_unlock(vmf->ptl);
1058}
1059#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1060
890void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) 1061void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
891{ 1062{
892 pmd_t entry; 1063 pmd_t entry;
@@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1601 return NULL; 1772 return NULL;
1602} 1773}
1603 1774
1775/*
1776 * Returns the page table lock pointer if a given pud maps a thp, NULL otherwise.
1777 *
1778 * Note that if it returns a lock pointer, this routine returns without unlocking
1779 * the page table lock, so callers must unlock it.
1780 */
1781spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
1782{
1783 spinlock_t *ptl;
1784
1785 ptl = pud_lock(vma->vm_mm, pud);
1786 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
1787 return ptl;
1788 spin_unlock(ptl);
1789 return NULL;
1790}
1791
1792#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1793int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
1794 pud_t *pud, unsigned long addr)
1795{
1796 pud_t orig_pud;
1797 spinlock_t *ptl;
1798
1799 ptl = __pud_trans_huge_lock(pud, vma);
1800 if (!ptl)
1801 return 0;
1802 /*
1803 * For architectures like ppc64 we look at deposited pgtable
1804 * when calling pudp_huge_get_and_clear. So do the
1805 * pgtable_trans_huge_withdraw after finishing pudp related
1806 * operations.
1807 */
1808 orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
1809 tlb->fullmm);
1810 tlb_remove_pud_tlb_entry(tlb, pud, addr);
1811 if (vma_is_dax(vma)) {
1812 spin_unlock(ptl);
1813 /* No zero page support yet */
1814 } else {
1815 /* No support for anonymous PUD pages yet */
1816 BUG();
1817 }
1818 return 1;
1819}
1820
1821static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
1822 unsigned long haddr)
1823{
1824 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
1825 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
1826 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
1827 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
1828
1829 count_vm_event(THP_SPLIT_PMD);
1830
1831 pudp_huge_clear_flush_notify(vma, haddr, pud);
1832}
1833
1834void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
1835 unsigned long address)
1836{
1837 spinlock_t *ptl;
1838 struct mm_struct *mm = vma->vm_mm;
1839 unsigned long haddr = address & HPAGE_PUD_MASK;
1840
1841 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
1842 ptl = pud_lock(mm, pud);
1843 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
1844 goto out;
1845 __split_huge_pud_locked(vma, pud, haddr);
1846
1847out:
1848 spin_unlock(ptl);
1849 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
1850}
1851#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1852
1604static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 1853static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
1605 unsigned long haddr, pmd_t *pmd) 1854 unsigned long haddr, pmd_t *pmd)
1606{ 1855{
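
vmf_insert_pfn_pud() added above is the hook a device-DAX style ->huge_fault() handler would use to install a 1GB devmap mapping; anonymous PUD THP is explicitly left unsupported. A rough, hypothetical sketch of such a handler (example_lookup_pfn() is invented for illustration; only the vmf_insert_pfn_pud() call and the vmf fields reflect this series):

/* hypothetical driver helper: resolve a PUD-aligned address to a PFN_DEV|PFN_MAP pfn */
static int example_lookup_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t *pfn);

static int example_huge_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	pfn_t pfn;

	if (!(vmf->flags & FAULT_FLAG_SIZE_PUD))
		return VM_FAULT_FALLBACK;	/* let the core retry at PMD/PTE size */

	if (example_lookup_pfn(vma, vmf->address & HPAGE_PUD_MASK, &pfn))
		return VM_FAULT_SIGBUS;

	return vmf_insert_pfn_pud(vma, vmf->address, vmf->pud, pfn, write);
}

The dispatch added to mm/memory.c below only calls ->huge_fault() for non-anonymous VMAs, so a handler like the above is the only way a huge PUD gets installed by this series.
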
diff --git a/mm/memory.c b/mm/memory.c
index e721e8eba570..41e2a2d4b2a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
1001 next = pmd_addr_end(addr, end); 1001 next = pmd_addr_end(addr, end);
1002 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { 1002 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
1003 int err; 1003 int err;
1004 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 1004 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
1005 err = copy_huge_pmd(dst_mm, src_mm, 1005 err = copy_huge_pmd(dst_mm, src_mm,
1006 dst_pmd, src_pmd, addr, vma); 1006 dst_pmd, src_pmd, addr, vma);
1007 if (err == -ENOMEM) 1007 if (err == -ENOMEM)
@@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
1032 src_pud = pud_offset(src_pgd, addr); 1032 src_pud = pud_offset(src_pgd, addr);
1033 do { 1033 do {
1034 next = pud_addr_end(addr, end); 1034 next = pud_addr_end(addr, end);
1035 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1036 int err;
1037
1038 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
1039 err = copy_huge_pud(dst_mm, src_mm,
1040 dst_pud, src_pud, addr, vma);
1041 if (err == -ENOMEM)
1042 return -ENOMEM;
1043 if (!err)
1044 continue;
1045 /* fall through */
1046 }
1035 if (pud_none_or_clear_bad(src_pud)) 1047 if (pud_none_or_clear_bad(src_pud))
1036 continue; 1048 continue;
1037 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 1049 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1263 pud = pud_offset(pgd, addr); 1275 pud = pud_offset(pgd, addr);
1264 do { 1276 do {
1265 next = pud_addr_end(addr, end); 1277 next = pud_addr_end(addr, end);
1278 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1279 if (next - addr != HPAGE_PUD_SIZE) {
1280 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1281 split_huge_pud(vma, pud, addr);
1282 } else if (zap_huge_pud(tlb, vma, pud, addr))
1283 goto next;
1284 /* fall through */
1285 }
1266 if (pud_none_or_clear_bad(pud)) 1286 if (pud_none_or_clear_bad(pud))
1267 continue; 1287 continue;
1268 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1288 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1289next:
1290 cond_resched();
1269 } while (pud++, addr = next, addr != end); 1291 } while (pud++, addr = next, addr != end);
1270 1292
1271 return addr; 1293 return addr;
@@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
3490 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); 3512 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3491} 3513}
3492 3514
3515static int create_huge_pud(struct vm_fault *vmf)
3516{
3517#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3518 /* No support for anonymous transparent PUD pages yet */
3519 if (vma_is_anonymous(vmf->vma))
3520 return VM_FAULT_FALLBACK;
3521 if (vmf->vma->vm_ops->huge_fault)
3522 return vmf->vma->vm_ops->huge_fault(vmf);
3523#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3524 return VM_FAULT_FALLBACK;
3525}
3526
3527static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3528{
3529#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3530 /* No support for anonymous transparent PUD pages yet */
3531 if (vma_is_anonymous(vmf->vma))
3532 return VM_FAULT_FALLBACK;
3533 if (vmf->vma->vm_ops->huge_fault)
3534 return vmf->vma->vm_ops->huge_fault(vmf);
3535#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3536 return VM_FAULT_FALLBACK;
3537}
3538
3493/* 3539/*
3494 * These routines also need to handle stuff like marking pages dirty 3540 * These routines also need to handle stuff like marking pages dirty
3495 * and/or accessed for architectures that don't do it in hardware (most 3541 * and/or accessed for architectures that don't do it in hardware (most
@@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3605 }; 3651 };
3606 struct mm_struct *mm = vma->vm_mm; 3652 struct mm_struct *mm = vma->vm_mm;
3607 pgd_t *pgd; 3653 pgd_t *pgd;
3608 pud_t *pud;
3609 int ret; 3654 int ret;
3610 3655
3611 pgd = pgd_offset(mm, address); 3656 pgd = pgd_offset(mm, address);
3612 pud = pud_alloc(mm, pgd, address); 3657
3613 if (!pud) 3658 vmf.pud = pud_alloc(mm, pgd, address);
3659 if (!vmf.pud)
3614 return VM_FAULT_OOM; 3660 return VM_FAULT_OOM;
3615 vmf.pmd = pmd_alloc(mm, pud, address); 3661 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
3662 vmf.flags |= FAULT_FLAG_SIZE_PUD;
3663 ret = create_huge_pud(&vmf);
3664 if (!(ret & VM_FAULT_FALLBACK))
3665 return ret;
3666 } else {
3667 pud_t orig_pud = *vmf.pud;
3668
3669 barrier();
3670 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3671 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3672
3673 vmf.flags |= FAULT_FLAG_SIZE_PUD;
3674
3675 /* NUMA case for anonymous PUDs would go here */
3676
3677 if (dirty && !pud_write(orig_pud)) {
3678 ret = wp_huge_pud(&vmf, orig_pud);
3679 if (!(ret & VM_FAULT_FALLBACK))
3680 return ret;
3681 } else {
3682 huge_pud_set_accessed(&vmf, orig_pud);
3683 return 0;
3684 }
3685 }
3686 }
3687
3688 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3616 if (!vmf.pmd) 3689 if (!vmf.pmd)
3617 return VM_FAULT_OOM; 3690 return VM_FAULT_OOM;
3618 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { 3691 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
@@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3743 */ 3816 */
3744int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3817int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3745{ 3818{
3819 spinlock_t *ptl;
3746 pmd_t *new = pmd_alloc_one(mm, address); 3820 pmd_t *new = pmd_alloc_one(mm, address);
3747 if (!new) 3821 if (!new)
3748 return -ENOMEM; 3822 return -ENOMEM;
3749 3823
3750 smp_wmb(); /* See comment in __pte_alloc */ 3824 smp_wmb(); /* See comment in __pte_alloc */
3751 3825
3752 spin_lock(&mm->page_table_lock); 3826 ptl = pud_lock(mm, pud);
3753#ifndef __ARCH_HAS_4LEVEL_HACK 3827#ifndef __ARCH_HAS_4LEVEL_HACK
3754 if (!pud_present(*pud)) { 3828 if (!pud_present(*pud)) {
3755 mm_inc_nr_pmds(mm); 3829 mm_inc_nr_pmds(mm);
@@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3763 } else /* Another has populated it */ 3837 } else /* Another has populated it */
3764 pmd_free(mm, new); 3838 pmd_free(mm, new);
3765#endif /* __ARCH_HAS_4LEVEL_HACK */ 3839#endif /* __ARCH_HAS_4LEVEL_HACK */
3766 spin_unlock(&mm->page_table_lock); 3840 spin_unlock(ptl);
3767 return 0; 3841 return 0;
3768} 3842}
3769#endif /* __PAGETABLE_PMD_FOLDED */ 3843#endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 207244489a68..03761577ae86 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
78 78
79 pud = pud_offset(pgd, addr); 79 pud = pud_offset(pgd, addr);
80 do { 80 do {
81 again:
81 next = pud_addr_end(addr, end); 82 next = pud_addr_end(addr, end);
82 if (pud_none_or_clear_bad(pud)) { 83 if (pud_none(*pud) || !walk->vma) {
83 if (walk->pte_hole) 84 if (walk->pte_hole)
84 err = walk->pte_hole(addr, next, walk); 85 err = walk->pte_hole(addr, next, walk);
85 if (err) 86 if (err)
86 break; 87 break;
87 continue; 88 continue;
88 } 89 }
90
91 if (walk->pud_entry) {
92 spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
93
94 if (ptl) {
95 err = walk->pud_entry(pud, addr, next, walk);
96 spin_unlock(ptl);
97 if (err)
98 break;
99 continue;
100 }
101 }
102
103 split_huge_pud(walk->vma, pud, addr);
104 if (pud_none(*pud))
105 goto again;
106
89 if (walk->pmd_entry || walk->pte_entry) 107 if (walk->pmd_entry || walk->pte_entry)
90 err = walk_pmd_range(pud, addr, next, walk); 108 err = walk_pmd_range(pud, addr, next, walk);
91 if (err) 109 if (err)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 71c5f9109f2a..4ed5908c65b0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
123 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 123 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
124 return pmd; 124 return pmd;
125} 125}
126
127#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
128pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
129 pud_t *pudp)
130{
131 pud_t pud;
132
133 VM_BUG_ON(address & ~HPAGE_PUD_MASK);
134 VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
135 pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
136 flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
137 return pud;
138}
139#endif
126#endif 140#endif
127 141
128#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 142#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT