author		Matthew Wilcox <willy@linux.intel.com>		2017-02-24 17:57:02 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-24 20:46:54 -0500
commit		a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517
tree		54d78e89c63e519cb9e00fdab9efbf3189ef2f5e
parent		a2d581675d485eb7188f521f36efc114639a3096
mm, x86: add support for PUD-sized transparent hugepages
The current transparent hugepage code only supports PMDs.  This patch adds
support for transparent use of PUDs with DAX.  It does not include support
for anonymous pages.  x86 support code is also added.

Most of this patch simply parallels the work that was done for huge PMDs.
The only major difference is how the new ->pud_entry method in mm_walk
works.  The ->pmd_entry method replaces the ->pte_entry method, whereas the
->pud_entry method works along with either ->pmd_entry or ->pte_entry.  The
pagewalk code takes care of locking the PUD before calling ->pud_entry, so
handlers do not need to worry whether the PUD is stable.

[dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()]
Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com
[dave.jiang@intel.com: native_pud_clear missing on i386 build]
Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alexander Kapshuk <alexander.kapshuk@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
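To make the ->pud_entry contract above concrete, here is a minimal, hypothetical
sketch (not part of this patch; the walker name and counter are invented for
illustration) of a pagewalk user that counts huge PUD mappings while leaving
regular PUDs to the existing ->pmd_entry/->pte_entry path:

/*
 * Hypothetical example: count PUD-sized mappings in a range using the
 * new ->pud_entry callback.  The pagewalk code only invokes ->pud_entry
 * for PUDs it could take the PUD lock on, and it holds that lock across
 * the call, so *pud is stable here; ordinary PUDs are still walked via
 * ->pmd_entry/->pte_entry.
 */
static int count_pud_entry(pud_t *pud, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pud_trans_huge(*pud) || pud_devmap(*pud))
		(*count)++;

	return 0;	/* a nonzero return would abort the walk */
}

/* Caller must hold mm->mmap_sem. */
static unsigned long count_huge_puds(struct mm_struct *mm,
				     unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pud_entry	= count_pud_entry,
		.mm		= mm,
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count;
}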
-rw-r--r--  arch/Kconfig                              3
-rw-r--r--  arch/x86/Kconfig                          1
-rw-r--r--  arch/x86/include/asm/paravirt.h          11
-rw-r--r--  arch/x86/include/asm/paravirt_types.h     2
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h    17
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h    30
-rw-r--r--  arch/x86/include/asm/pgtable.h          140
-rw-r--r--  arch/x86/include/asm/pgtable_64.h        15
-rw-r--r--  arch/x86/kernel/paravirt.c                1
-rw-r--r--  arch/x86/mm/pgtable.c                    31
-rw-r--r--  include/asm-generic/pgtable.h            80
-rw-r--r--  include/asm-generic/tlb.h                14
-rw-r--r--  include/linux/huge_mm.h                  83
-rw-r--r--  include/linux/mm.h                       30
-rw-r--r--  include/linux/mmu_notifier.h             14
-rw-r--r--  include/linux/pfn_t.h                    12
-rw-r--r--  mm/gup.c                                  7
-rw-r--r--  mm/huge_memory.c                        249
-rw-r--r--  mm/memory.c                              88
-rw-r--r--  mm/pagewalk.c                            20
-rw-r--r--  mm/pgtable-generic.c                     14
21 files changed, 844 insertions, 18 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index f761142976e5..d0012add6b19 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -571,6 +571,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	bool
 
+config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	bool
+
 config HAVE_ARCH_HUGE_VMAP
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 874c1238dffd..33007aa74111 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,7 @@ config X86
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
 	select HAVE_CC_STACKPROTECTOR
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f75fbfe550f2..0489884fdc44 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			    native_pmd_val(pmd));
 }
 
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+			      pud_t *pudp, pud_t pud)
+{
+	if (sizeof(pudval_t) > sizeof(long))
+		/* 5 arg words */
+		pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
+	else
+		PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
+			    native_pud_val(pud));
+}
+
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
 	pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index bb2de45a60f2..b060f962d581 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -249,6 +249,8 @@ struct pv_mmu_ops {
 	void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
 	void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
 			   pmd_t *pmdp, pmd_t pmdval);
+	void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
+			   pud_t *pudp, pud_t pudval);
 	void (*pte_update)(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep);
 
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index fd74a11959de..a8b96e708c2b 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
 	*pmdp = pmd;
 }
 
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+}
+
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	native_set_pte(ptep, pte);
@@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp)
 	native_set_pmd(pmdp, __pmd(0));
 }
 
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+
 static inline void native_pte_clear(struct mm_struct *mm,
 				    unsigned long addr, pte_t *xp)
 {
@@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+	return __pud(xchg((pudval_t *)xp, 0));
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
 /* Bit manipulation helper on pte/pgoff entry */
 static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
 				      unsigned long mask, unsigned int leftshift)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index cdaa58c9b39e..8f50fb3f04e1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -121,6 +121,12 @@ static inline void native_pmd_clear(pmd_t *pmd)
 	*(tmp + 1) = 0;
 }
 
+#ifndef CONFIG_SMP
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+#endif
+
 static inline void pud_clear(pud_t *pudp)
 {
 	set_pud(pudp, __pud(0));
@@ -176,6 +182,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifdef CONFIG_SMP
+union split_pud {
+	struct {
+		u32 pud_low;
+		u32 pud_high;
+	};
+	pud_t pud;
+};
+
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+	union split_pud res, *orig = (union split_pud *)pudp;
+
+	/* xchg acts as a barrier before setting of the high bits */
+	res.pud_low = xchg(&orig->pud_low, 0);
+	res.pud_high = orig->pud_high;
+	orig->pud_high = 0;
+
+	return res.pud;
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb436efa..1cfb36b8c024 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #define set_pte(ptep, pte)		native_set_pte(ptep, pte)
 #define set_pte_at(mm, addr, ptep, pte)	native_set_pte_at(mm, addr, ptep, pte)
 #define set_pmd_at(mm, addr, pmdp, pmd)	native_set_pmd_at(mm, addr, pmdp, pmd)
+#define set_pud_at(mm, addr, pudp, pud)	native_set_pud_at(mm, addr, pudp, pud)
 
 #define set_pte_atomic(ptep, pte)					\
 	native_set_pte_atomic(ptep, pte)
@@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_ACCESSED;
 }
 
+static inline int pud_dirty(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_DIRTY;
+}
+
+static inline int pud_young(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_ACCESSED;
+}
+
 static inline int pte_write(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_RW;
@@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd)
 	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_trans_huge(pud_t pud)
+{
+	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+}
+#endif
+
 #define has_transparent_hugepage has_transparent_hugepage
 static inline int has_transparent_hugepage(void)
 {
@@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
 }
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_devmap(pud_t pud)
+{
+	return !!(pud_val(pud) & _PAGE_DEVMAP);
+}
+#else
+static inline int pud_devmap(pud_t pud)
+{
+	return 0;
+}
+#endif
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return __pud(v | set);
+}
+
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+	pudval_t v = native_pud_val(pud);
+
+	return __pud(v & ~clear);
+}
+
+static inline pud_t pud_mkold(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_DIRTY);
+}
+
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_RW);
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_DEVMAP);
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_PSE);
+}
+
+static inline pud_t pud_mkyoung(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_RW);
+}
+
+static inline pud_t pud_mknotpresent(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline int pte_soft_dirty(pte_t pte)
 {
@@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
 }
 
+static inline int pud_soft_dirty(pud_t pud)
+{
+	return pud_flags(pud) & _PAGE_SOFT_DIRTY;
+}
+
 static inline pte_t pte_mksoft_dirty(pte_t pte)
 {
 	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
@@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
+static inline pud_t pud_mksoft_dirty(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
 static inline pte_t pte_clear_soft_dirty(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
@@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
+static inline pud_t pud_clear_soft_dirty(pud_t pud)
+{
+	return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 /*
@@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 		     massage_pgprot(pgprot));
 }
 
+static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
+{
+	return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
+		     massage_pgprot(pgprot));
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	pteval_t val = pte_val(pte);
@@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
 	return res;
 }
 
+static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
+{
+	pud_t res = *pudp;
+
+	native_pud_clear(pudp);
+	return res;
+}
+
 static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
 				     pte_t *ptep , pte_t pte)
 {
@@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	native_set_pmd(pmdp, pmd);
 }
 
+static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
+				     pud_t *pudp, pud_t pud)
+{
+	native_set_pud(pudp, pud);
+}
+
 #ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
@@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
 				 pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pud_t *pudp,
+				 pud_t entry, int dirty);
 
 #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 				     unsigned long addr, pmd_t *pmdp);
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+				     unsigned long addr, pud_t *pudp);
 
 #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
@@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long
 	return native_pmdp_get_and_clear(pmdp);
 }
 
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+					unsigned long addr, pud_t *pudp)
+{
+	return native_pudp_get_and_clear(pudp);
+}
+
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 				      unsigned long addr, pmd_t *pmdp)
@@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmd)
 {
 }
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+		unsigned long addr, pud_t *pud)
+{
+}
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 62b775926045..73c7ccc38912 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud)
 	native_set_pud(pud, native_make_pud(0));
 }
 
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+#ifdef CONFIG_SMP
+	return native_make_pud(xchg(&xp->pud, 0));
+#else
+	/* native_local_pudp_get_and_clear,
+	 * but duplicated because of cyclic dependency
+	 */
+	pud_t ret = *xp;
+
+	native_pud_clear(xp);
+	return ret;
+#endif
+}
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 	*pgdp = pgd;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index a1bfba0f7234..4797e87b0fb6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 	.pmd_clear = native_pmd_clear,
 #endif
 	.set_pud = native_set_pud,
+	.set_pud_at = native_set_pud_at,
 
 	.pmd_val = PTE_IDENT,
 	.make_pmd = PTE_IDENT,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5af4e67..6cbdff26bb96 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 	return changed;
 }
+
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pud_t *pudp, pud_t entry, int dirty)
+{
+	int changed = !pud_same(*pudp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+
+	if (changed && dirty) {
+		*pudp = entry;
+		/*
+		 * We had a write-protection fault here and changed the pud
+		 * to to more permissive. No need to flush the TLB for that,
+		 * #PF is architecturally guaranteed to do that and in the
+		 * worst-case we'll generate a spurious fault.
+		 */
+	}
+
+	return changed;
+}
 #endif
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
@@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 
 	return ret;
 }
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pud_t *pudp)
+{
+	int ret = 0;
+
+	if (pud_young(*pudp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pudp);
+
+	return ret;
+}
 #endif
 
 int ptep_clear_flush_young(struct vm_area_struct *vma,
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 18af2bcefe6a..a0aba0f9c57b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma,
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
 				 pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pud_t *pudp,
+				 pud_t entry, int dirty);
 #else
 static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmdp,
@@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 	BUILD_BUG();
 	return 0;
 }
+static inline int pudp_set_access_flags(struct vm_area_struct *vma,
+					unsigned long address, pud_t *pudp,
+					pud_t entry, int dirty)
+{
+	BUILD_BUG();
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
@@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
 					    pmd_t *pmdp)
@@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	pmd_clear(pmdp);
 	return pmd;
 }
+#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address,
+					    pud_t *pudp)
+{
+	pud_t pud = *pudp;
+
+	pud_clear(pudp);
+	return pud;
+}
+#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#endif
 
-#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp,
 					    int full)
 {
 	return pmdp_huge_get_and_clear(mm, address, pmdp);
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
+static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long address, pud_t *pudp,
+					    int full)
+{
+	return pudp_huge_get_and_clear(mm, address, pudp);
+}
+#endif
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 					    unsigned long address, pte_t *ptep,
@@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 				   unsigned long address,
 				   pmd_t *pmdp);
+extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
+				   unsigned long address,
+				   pud_t *pudp);
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
@@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline void pudp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pud_t *pudp)
+{
+	pud_t old_pud = *pudp;
+
+	set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
+}
+#else
+static inline void pudp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pud_t *pudp)
+{
+	BUILD_BUG();
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#endif
 
 #ifndef pmdp_collapse_flush
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
 	return pmd_val(pmd_a) == pmd_val(pmd_b);
 }
+
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+	return pud_val(pud_a) == pud_val(pud_b);
+}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
 	BUILD_BUG();
 	return 0;
 }
+
+static inline int pud_same(pud_t pud_a, pud_t pud_b)
+{
+	BUILD_BUG();
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
@@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
+	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+static inline int pud_trans_huge(pud_t pud)
+{
+	return 0;
+}
+#endif
+
 #ifndef pmd_read_atomic
 static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 {
@@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd)
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
 #define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
+#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
 #else
 #define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
+#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
 #endif
 #endif
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7eed8cf3130a..4329bc6ef04b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
 	} while (0)
 
+/**
+ * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
+ * invalidation. This is a nop so far, because only x86 needs it.
+ */
+#ifndef __tlb_remove_pud_tlb_entry
+#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
+#endif
+
+#define tlb_remove_pud_tlb_entry(tlb, pudp, address)			\
+	do {								\
+		__tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE);	\
+		__tlb_remove_pud_tlb_entry(tlb, pudp, address);		\
+	} while (0)
+
 /*
  * For things like page tables caches (ie caching addresses "inside" the
  * page tables, like x86 does), for legacy reasons, flushing an
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0029e786205..a3762d49ba39 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 			 struct vm_area_struct *vma);
 extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+			 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+			 struct vm_area_struct *vma);
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
+#else
+static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+}
+#endif
+
 extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
 extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
@@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
 extern int zap_huge_pmd(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, unsigned long addr);
+extern int zap_huge_pud(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			pud_t *pud, unsigned long addr);
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
@@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot,
 			int prot_numa);
-int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
-			pfn_t pfn, bool write);
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmd, pfn_t pfn, bool write);
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+			pud_t *pud, pfn_t pfn, bool write);
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags);
-
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
 
+#define HPAGE_PUD_SHIFT PUD_SHIFT
+#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
+
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 #define transparent_hugepage_enabled(__vma)				\
@@ -118,6 +136,17 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		bool freeze, struct page *page);
 
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long address);
+
+#define split_huge_pud(__vma, __pud, __address)				\
+	do {								\
+		pud_t *____pud = (__pud);				\
+		if (pud_trans_huge(*____pud)				\
+					|| pud_devmap(*____pud))	\
+			__split_huge_pud(__vma, __pud, __address);	\
+	}  while (0)
+
 extern int hugepage_madvise(struct vm_area_struct *vma,
 			    unsigned long *vm_flags, int advice);
 extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -126,6 +155,8 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    long adjust_next);
 extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma);
+extern spinlock_t *__pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma);
 /* mmap_sem must be held on entry */
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
@@ -136,6 +167,15 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 	else
 		return NULL;
 }
+static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma)
+{
+	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
+	if (pud_trans_huge(*pud) || pud_devmap(*pud))
+		return __pud_trans_huge_lock(pud, vma);
+	else
+		return NULL;
+}
 static inline int hpage_nr_pages(struct page *page)
 {
 	if (unlikely(PageTransHuge(page)))
@@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page)
 	return 1;
 }
 
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, int flags);
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, int flags);
+
 extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 extern struct page *huge_zero_page;
@@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+	return false;
+}
+
 struct page *mm_get_huge_zero_page(struct mm_struct *mm);
 void mm_put_huge_zero_page(struct mm_struct *mm);
 
@@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm);
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
+#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
+
 #define hpage_nr_pages(x) 1
 
 #define transparent_hugepage_enabled(__vma) 0
@@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 static inline void split_huge_pmd_address(struct vm_area_struct *vma,
 		unsigned long address, bool freeze, struct page *page) {}
 
+#define split_huge_pud(__vma, __pmd, __address)	\
+	do { } while (0)
+
 static inline int hugepage_madvise(struct vm_area_struct *vma,
 			   unsigned long *vm_flags, int advice)
 {
@@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 {
 	return NULL;
 }
+static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
+		struct vm_area_struct *vma)
+{
+	return NULL;
+}
 
 static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
@@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
+static inline bool is_huge_zero_pud(pud_t pud)
+{
+	return false;
+}
+
 static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
 	return;
@@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 {
 	return NULL;
 }
+
+static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
+	unsigned long addr, pud_t *pud, int flags)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 035a688e5472..d8b75d7d6a9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -424,6 +424,10 @@ static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
 }
+static inline int pud_devmap(pud_t pud)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -1199,6 +1203,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 
 /**
  * mm_walk - callbacks for walk_page_range
+ * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
+ *	       this handler should only handle pud_trans_huge() puds.
+ *	       the pmd_entry or pte_entry callbacks will be used for
+ *	       regular PUDs.
  * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
  *	       this handler is required to be able to handle
  *	       pmd_trans_huge() pmds.  They may simply choose to
@@ -1218,6 +1226,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  *             (see the comment on walk_page_range() for more details)
  */
 struct mm_walk {
+	int (*pud_entry)(pud_t *pud, unsigned long addr,
+			 unsigned long next, struct mm_walk *walk);
 	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1801,8 +1811,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
 	return ptl;
 }
 
-extern void __init pagecache_init(void);
+/*
+ * No scalability reason to split PUD locks yet, but follow the same pattern
+ * as the PMD locks to make it easier if we decide to.  The VM should not be
+ * considered ready to switch to split PUD locks yet; there may be places
+ * which need to be converted from page_table_lock.
+ */
+static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
+{
+	return &mm->page_table_lock;
+}
+
+static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
+{
+	spinlock_t *ptl = pud_lockptr(mm, pud);
+
+	spin_lock(ptl);
+	return ptl;
+}
 
+extern void __init pagecache_init(void);
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a1a210d59961..51891fb0d3ce 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pmd;								\
 })
 
+#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)		\
+({									\
+	unsigned long ___haddr = __haddr & HPAGE_PUD_MASK;		\
+	struct mm_struct *___mm = (__vma)->vm_mm;			\
+	pud_t ___pud;							\
+									\
+	___pud = pudp_huge_clear_flush(__vma, __haddr, __pud);		\
+	mmu_notifier_invalidate_range(___mm, ___haddr,			\
+				      ___haddr + HPAGE_PUD_SIZE);	\
+									\
+	___pud;								\
+})
+
 #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
@@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 033fc7bbcefa..a49b3259cad7 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
 {
 	return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
 }
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
+{
+	return pfn_pud(pfn_t_to_pfn(pfn), pgprot);
+}
+#endif
 #endif
 
 #ifdef __HAVE_ARCH_PTE_DEVMAP
@@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn)
 }
 pte_t pte_mkdevmap(pte_t pte);
 pmd_t pmd_mkdevmap(pmd_t pmd);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+pud_t pud_mkdevmap(pud_t pud);
 #endif
+#endif /* __HAVE_ARCH_PTE_DEVMAP */
+
 #endif /* _LINUX_PFN_T_H_ */
diff --git a/mm/gup.c b/mm/gup.c
index 40abe4c90383..1e67461b2733 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			return page;
 		return no_page_table(vma, flags);
 	}
+	if (pud_devmap(*pud)) {
+		ptl = pud_lock(mm, pud);
+		page = follow_devmap_pud(vma, address, pud, flags);
+		spin_unlock(ptl);
+		if (page)
+			return page;
+	}
 	if (unlikely(pud_bad(*pud)))
 		return no_page_table(vma, flags);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9ecc2aeadfc..85742ac5b32e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_flags & VM_WRITE))
+		pud = pud_mkwrite(pud);
+	return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t entry;
+	spinlock_t *ptl;
+
+	ptl = pud_lock(mm, pud);
+	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+	if (pfn_t_devmap(pfn))
+		entry = pud_mkdevmap(entry);
+	if (write) {
+		entry = pud_mkyoung(pud_mkdirty(entry));
+		entry = maybe_pud_mkwrite(entry, vma);
+	}
+	set_pud_at(mm, addr, pud, entry);
+	update_mmu_cache_pud(vma, addr, pud);
+	spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+			pud_t *pud, pfn_t pfn, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	/*
+	 * If we had pud_special, we could avoid all these restrictions,
+	 * but we need to be consistent with PTEs and architectures that
+	 * can't support a 'special' bit.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON(!pfn_t_devmap(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	track_pfn_insert(vma, &pgprot, pfn);
+
+	insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
 {
@@ -887,6 +941,123 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud)
+{
+	pud_t _pud;
+
+	/*
+	 * We should set the dirty bit only for FOLL_WRITE but for now
+	 * the dirty bit in the pud is meaningless.  And if the dirty
+	 * bit will become meaningful and we'll only set it with
+	 * FOLL_WRITE, an atomic set_bit will be required on the pud to
+	 * set the young bit, instead of the current set_pud_at.
+	 */
+	_pud = pud_mkyoung(pud_mkdirty(*pud));
+	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+				pud, _pud, 1))
+		update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, int flags)
+{
+	unsigned long pfn = pud_pfn(*pud);
+	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap;
+	struct page *page;
+
+	assert_spin_locked(pud_lockptr(mm, pud));
+
+	if (flags & FOLL_WRITE && !pud_write(*pud))
+		return NULL;
+
+	if (pud_present(*pud) && pud_devmap(*pud))
+		/* pass */;
+	else
+		return NULL;
+
+	if (flags & FOLL_TOUCH)
+		touch_pud(vma, addr, pud);
+
+	/*
+	 * device mapped pages can only be returned if the
+	 * caller will manage the page reference count.
+	 */
+	if (!(flags & FOLL_GET))
+		return ERR_PTR(-EEXIST);
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	put_dev_pagemap(pgmap);
+
+	return page;
+}
+
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+		  struct vm_area_struct *vma)
+{
+	spinlock_t *dst_ptl, *src_ptl;
+	pud_t pud;
+	int ret;
+
+	dst_ptl = pud_lock(dst_mm, dst_pud);
+	src_ptl = pud_lockptr(src_mm, src_pud);
+	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+	ret = -EAGAIN;
+	pud = *src_pud;
+	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+		goto out_unlock;
+
+	/*
+	 * When page table lock is held, the huge zero pud should not be
+	 * under splitting since we don't split the page itself, only pud to
+	 * a page table.
+	 */
+	if (is_huge_zero_pud(pud)) {
+		/* No huge zero pud yet */
+	}
+
+	pudp_set_wrprotect(src_mm, addr, src_pud);
+	pud = pud_mkold(pud_wrprotect(pud));
+	set_pud_at(dst_mm, addr, dst_pud, pud);
+
+	ret = 0;
+out_unlock:
+	spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
+	return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+	pud_t entry;
+	unsigned long haddr;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+		goto unlock;
+
+	entry = pud_mkyoung(orig_pud);
+	if (write)
+		entry = pud_mkdirty(entry);
+	haddr = vmf->address & HPAGE_PUD_MASK;
+	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+	spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	pmd_t entry;
@@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 	return NULL;
 }
 
+/*
+ * Returns true if a given pud maps a thp, false otherwise.
+ *
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+	spinlock_t *ptl;
+
+	ptl = pud_lock(vma->vm_mm, pud);
+	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+		return ptl;
+	spin_unlock(ptl);
+	return NULL;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pud_t *pud, unsigned long addr)
+{
+	pud_t orig_pud;
+	spinlock_t *ptl;
+
+	ptl = __pud_trans_huge_lock(pud, vma);
+	if (!ptl)
+		return 0;
+	/*
+	 * For architectures like ppc64 we look at deposited pgtable
+	 * when calling pudp_huge_get_and_clear. So do the
+	 * pgtable_trans_huge_withdraw after finishing pudp related
+	 * operations.
+	 */
+	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
+			tlb->fullmm);
+	tlb_remove_pud_tlb_entry(tlb, pud, addr);
+	if (vma_is_dax(vma)) {
+		spin_unlock(ptl);
+		/* No zero page support yet */
+	} else {
+		/* No support for anonymous PUD pages yet */
+		BUG();
+	}
+	return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long haddr)
+{
+	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+	count_vm_event(THP_SPLIT_PMD);
+
+	pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PUD_MASK;
+
+	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
+	ptl = pud_lock(mm, pud);
+	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+		goto out;
+	__split_huge_pud_locked(vma, pud, haddr);
+
+out:
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 		unsigned long haddr, pmd_t *pmd)
 {
diff --git a/mm/memory.c b/mm/memory.c
index e721e8eba570..41e2a2d4b2a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
1001 next = pmd_addr_end(addr, end); 1001 next = pmd_addr_end(addr, end);
1002 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { 1002 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
1003 int err; 1003 int err;
1004 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 1004 VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
1005 err = copy_huge_pmd(dst_mm, src_mm, 1005 err = copy_huge_pmd(dst_mm, src_mm,
1006 dst_pmd, src_pmd, addr, vma); 1006 dst_pmd, src_pmd, addr, vma);
1007 if (err == -ENOMEM) 1007 if (err == -ENOMEM)
@@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
1032 src_pud = pud_offset(src_pgd, addr); 1032 src_pud = pud_offset(src_pgd, addr);
1033 do { 1033 do {
1034 next = pud_addr_end(addr, end); 1034 next = pud_addr_end(addr, end);
1035 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
1036 int err;
1037
1038 VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
1039 err = copy_huge_pud(dst_mm, src_mm,
1040 dst_pud, src_pud, addr, vma);
1041 if (err == -ENOMEM)
1042 return -ENOMEM;
1043 if (!err)
1044 continue;
1045 /* fall through */
1046 }
1035 if (pud_none_or_clear_bad(src_pud)) 1047 if (pud_none_or_clear_bad(src_pud))
1036 continue; 1048 continue;
1037 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 1049 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1263 pud = pud_offset(pgd, addr); 1275 pud = pud_offset(pgd, addr);
1264 do { 1276 do {
1265 next = pud_addr_end(addr, end); 1277 next = pud_addr_end(addr, end);
1278 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
1279 if (next - addr != HPAGE_PUD_SIZE) {
1280 VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1281 split_huge_pud(vma, pud, addr);
1282 } else if (zap_huge_pud(tlb, vma, pud, addr))
1283 goto next;
1284 /* fall through */
1285 }
1266 if (pud_none_or_clear_bad(pud)) 1286 if (pud_none_or_clear_bad(pud))
1267 continue; 1287 continue;
1268 next = zap_pmd_range(tlb, vma, pud, addr, next, details); 1288 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1289next:
1290 cond_resched();
1269 } while (pud++, addr = next, addr != end); 1291 } while (pud++, addr = next, addr != end);
1270 1292
1271 return addr; 1293 return addr;
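[Note] This mirrors the PMD logic one level up: if the range being unmapped does not cover the whole huge PUD, the entry must be split first (which requires mmap_sem, hence the rwsem_is_locked() assertion); only a fully covered PUD is zapped in one shot by zap_huge_pud(). The geometry constants come from include/linux/huge_mm.h as extended by this patch; on x86-64 they work out to (sketch, assumed values):

	#define HPAGE_PUD_SHIFT	PUD_SHIFT			/* 30 on x86-64 */
	#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)	/* 1 GiB */
	#define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
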
@@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
3490 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); 3512 return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
3491} 3513}
3492 3514
3515static int create_huge_pud(struct vm_fault *vmf)
3516{
3517#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3518 /* No support for anonymous transparent PUD pages yet */
3519 if (vma_is_anonymous(vmf->vma))
3520 return VM_FAULT_FALLBACK;
3521 if (vmf->vma->vm_ops->huge_fault)
3522 return vmf->vma->vm_ops->huge_fault(vmf);
3523#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3524 return VM_FAULT_FALLBACK;
3525}
3526
3527static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
3528{
3529#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3530 /* No support for anonymous transparent PUD pages yet */
3531 if (vma_is_anonymous(vmf->vma))
3532 return VM_FAULT_FALLBACK;
3533 if (vmf->vma->vm_ops->huge_fault)
3534 return vmf->vma->vm_ops->huge_fault(vmf);
3535#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3536 return VM_FAULT_FALLBACK;
3537}
3538
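[Note] Both helpers punt to the VMA's ->huge_fault handler (introduced by the parent commit) and fall back to smaller page sizes when the VMA is anonymous or no handler is registered. A hypothetical driver-side hookup could look like the following; every example_* name is invented for illustration, and FAULT_FLAG_SIZE_PUD is the flag set by __handle_mm_fault() below:

	/* Illustration only; example_* names are hypothetical. */
	static int example_huge_fault(struct vm_fault *vmf)
	{
		if (vmf->flags & FAULT_FLAG_SIZE_PUD)
			return example_insert_pud(vmf);	/* hypothetical 1 GiB path */
		return example_insert_pmd(vmf);		/* hypothetical 2 MiB path */
	}

	static const struct vm_operations_struct example_vm_ops = {
		/* .fault and friends omitted for brevity */
		.huge_fault	= example_huge_fault,
	};
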
3493/* 3539/*
3494 * These routines also need to handle stuff like marking pages dirty 3540 * These routines also need to handle stuff like marking pages dirty
3495 * and/or accessed for architectures that don't do it in hardware (most 3541 * and/or accessed for architectures that don't do it in hardware (most
@@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3605 }; 3651 };
3606 struct mm_struct *mm = vma->vm_mm; 3652 struct mm_struct *mm = vma->vm_mm;
3607 pgd_t *pgd; 3653 pgd_t *pgd;
3608 pud_t *pud;
3609 int ret; 3654 int ret;
3610 3655
3611 pgd = pgd_offset(mm, address); 3656 pgd = pgd_offset(mm, address);
3612 pud = pud_alloc(mm, pgd, address); 3657
3613 if (!pud) 3658 vmf.pud = pud_alloc(mm, pgd, address);
3659 if (!vmf.pud)
3614 return VM_FAULT_OOM; 3660 return VM_FAULT_OOM;
3615 vmf.pmd = pmd_alloc(mm, pud, address); 3661 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
3662 vmf.flags |= FAULT_FLAG_SIZE_PUD;
3663 ret = create_huge_pud(&vmf);
3664 if (!(ret & VM_FAULT_FALLBACK))
3665 return ret;
3666 } else {
3667 pud_t orig_pud = *vmf.pud;
3668
3669 barrier();
3670 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
3671 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3672
3673 vmf.flags |= FAULT_FLAG_SIZE_PUD;
3674
3675 /* NUMA case for anonymous PUDs would go here */
3676
3677 if (dirty && !pud_write(orig_pud)) {
3678 ret = wp_huge_pud(&vmf, orig_pud);
3679 if (!(ret & VM_FAULT_FALLBACK))
3680 return ret;
3681 } else {
3682 huge_pud_set_accessed(&vmf, orig_pud);
3683 return 0;
3684 }
3685 }
3686 }
3687
3688 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3616 if (!vmf.pmd) 3689 if (!vmf.pmd)
3617 return VM_FAULT_OOM; 3690 return VM_FAULT_OOM;
3618 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { 3691 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
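[Note] The PUD-level block added to __handle_mm_fault() is a copy of the PMD-level logic one step up: try to create a huge PUD if the slot is empty, otherwise handle a write-protect fault via wp_huge_pud() or just mark the existing entry accessed, and only then drop down to allocate and walk the PMD table. huge_pud_set_accessed() is added elsewhere in this patch; it presumably mirrors huge_pmd_set_accessed(), roughly:

	/* Sketch, assuming it parallels the PMD version. */
	void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
	{
		pud_t entry;
		unsigned long haddr;

		vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
		if (unlikely(!pud_same(*vmf->pud, orig_pud)))
			goto unlock;		/* raced with another fault */
		entry = pud_mkyoung(orig_pud);
		haddr = vmf->address & HPAGE_PUD_MASK;
		if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry,
					  vmf->flags & FAULT_FLAG_WRITE))
			update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
	unlock:
		spin_unlock(vmf->ptl);
	}
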
@@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3743 */ 3816 */
3744int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3817int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3745{ 3818{
3819 spinlock_t *ptl;
3746 pmd_t *new = pmd_alloc_one(mm, address); 3820 pmd_t *new = pmd_alloc_one(mm, address);
3747 if (!new) 3821 if (!new)
3748 return -ENOMEM; 3822 return -ENOMEM;
3749 3823
3750 smp_wmb(); /* See comment in __pte_alloc */ 3824 smp_wmb(); /* See comment in __pte_alloc */
3751 3825
3752 spin_lock(&mm->page_table_lock); 3826 ptl = pud_lock(mm, pud);
3753#ifndef __ARCH_HAS_4LEVEL_HACK 3827#ifndef __ARCH_HAS_4LEVEL_HACK
3754 if (!pud_present(*pud)) { 3828 if (!pud_present(*pud)) {
3755 mm_inc_nr_pmds(mm); 3829 mm_inc_nr_pmds(mm);
@@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3763 } else /* Another has populated it */ 3837 } else /* Another has populated it */
3764 pmd_free(mm, new); 3838 pmd_free(mm, new);
3765#endif /* __ARCH_HAS_4LEVEL_HACK */ 3839#endif /* __ARCH_HAS_4LEVEL_HACK */
3766 spin_unlock(&mm->page_table_lock); 3840 spin_unlock(ptl);
3767 return 0; 3841 return 0;
3768} 3842}
3769#endif /* __PAGETABLE_PMD_FOLDED */ 3843#endif /* __PAGETABLE_PMD_FOLDED */
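[Note] The __pmd_alloc() hunk swaps the open-coded mm->page_table_lock for the new pud_lock() helper added to include/linux/mm.h by this patch. Split PUD locks are not implemented, so pud_lock() still resolves to the per-mm page_table_lock; the helper just gives one place to switch later. The addition is roughly:

	/* Sketch of the helpers assumed to be added to include/linux/mm.h. */
	static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
	{
		return &mm->page_table_lock;	/* no split PUD locks yet */
	}

	static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
	{
		spinlock_t *ptl = pud_lockptr(mm, pud);

		spin_lock(ptl);
		return ptl;
	}
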
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 207244489a68..03761577ae86 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
78 78
79 pud = pud_offset(pgd, addr); 79 pud = pud_offset(pgd, addr);
80 do { 80 do {
81 again:
81 next = pud_addr_end(addr, end); 82 next = pud_addr_end(addr, end);
82 if (pud_none_or_clear_bad(pud)) { 83 if (pud_none(*pud) || !walk->vma) {
83 if (walk->pte_hole) 84 if (walk->pte_hole)
84 err = walk->pte_hole(addr, next, walk); 85 err = walk->pte_hole(addr, next, walk);
85 if (err) 86 if (err)
86 break; 87 break;
87 continue; 88 continue;
88 } 89 }
90
91 if (walk->pud_entry) {
92 spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
93
94 if (ptl) {
95 err = walk->pud_entry(pud, addr, next, walk);
96 spin_unlock(ptl);
97 if (err)
98 break;
99 continue;
100 }
101 }
102
103 split_huge_pud(walk->vma, pud, addr);
104 if (pud_none(*pud))
105 goto again;
106
89 if (walk->pmd_entry || walk->pte_entry) 107 if (walk->pmd_entry || walk->pte_entry)
90 err = walk_pmd_range(pud, addr, next, walk); 108 err = walk_pmd_range(pud, addr, next, walk);
91 if (err) 109 if (err)
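[Note] With this change ->pud_entry is invoked only when pud_trans_huge_lock() succeeds, i.e. for a huge (or devmap) PUD with the PUD lock already held, so the callback may inspect *pud without extra locking; any other PUD is split (or re-checked if it became none) and walked at PMD/PTE level as before. A hypothetical user of the new callback, with example_* names invented for illustration:

	/* Illustration only: count huge PUD mappings in an address space. */
	static int example_pud_entry(pud_t *pud, unsigned long addr,
				     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *count = walk->private;

		/* Called with the PUD lock held, only for huge/devmap PUDs. */
		(*count)++;
		return 0;
	}

	static unsigned long example_count_huge_puds(struct mm_struct *mm)
	{
		unsigned long count = 0;
		struct mm_walk walk = {
			.pud_entry	= example_pud_entry,
			.mm		= mm,
			.private	= &count,
		};

		down_read(&mm->mmap_sem);
		walk_page_range(0, TASK_SIZE, &walk);
		up_read(&mm->mmap_sem);
		return count;
	}
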
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 71c5f9109f2a..4ed5908c65b0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
123 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 123 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
124 return pmd; 124 return pmd;
125} 125}
126
127#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
128pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
129 pud_t *pudp)
130{
131 pud_t pud;
132
133 VM_BUG_ON(address & ~HPAGE_PUD_MASK);
134 VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
135 pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
136 flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
137 return pud;
138}
139#endif
126#endif 140#endif
127 141
128#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 142#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
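[Note] pudp_huge_clear_flush() is the generic fallback used by __split_huge_pud_locked() above (via pudp_huge_clear_flush_notify()): it atomically clears the huge PUD and flushes the covered HPAGE_PUD_SIZE range (1 GiB on x86-64). flush_pud_tlb_range() is expected to be provided by include/asm-generic/pgtable.h elsewhere in this patch as a fallback to flush_tlb_range(), presumably along the lines of:

	/* Assumed generic fallback, mirroring flush_pmd_tlb_range(). */
	#ifndef __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
	#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
	#endif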