author     Dave Jiang <dave.jiang@intel.com>               2017-02-24 17:56:59 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-02-24 20:46:54 -0500
commit     a2d581675d485eb7188f521f36efc114639a3096 (patch)
tree       ae566f77b965fed344458698fe6bb01280558647
parent     bd233f538d51c2cae6f0bfc2cf7f0960e1683b8a (diff)
mm,fs,dax: change ->pmd_fault to ->huge_fault
Patch series "1G transparent hugepage support for device dax", v2.

The following series implements support for 1G transparent hugepages on
x86 for device dax. The bulk of the code was written by Matthew Wilcox
a while back supporting transparent 1G hugepages for fs DAX. I have
forward ported the relevant bits to 4.10-rc. The current submission has
only the necessary code to support device DAX.

Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs. Given the expected capacities of persistent memory devices,
an in-memory database may want to reduce TLB pressure beyond what it can
already achieve with 2MB mappings of a device-dax file. We have customer
feedback to that effect, as Willy mentioned in his previous version of
these patches [1].

[1]: https://lkml.org/lkml/2016/1/31/52

Comments from Nilesh @ Oracle:

There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server, we are looking at the following:

processes          : 10,000
memory             : 6TB
pte @ 4k page size : 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size : 120,000 / 512 = ~240GB
pud @ 1G page size : 240GB / 512 = ~480MB

As you can see, with 2M pages this system will use up an exorbitant
amount of DRAM to hold the page tables, but the 1G pages finally bring
it down to a reasonable level. Memory sizes will keep increasing, so
this number will keep increasing.

An argument can be made to convert the applications from the process
model to the thread model, but in the real world that may not always be
practical. Hopefully this helps explain the use case where this is
valuable.

This patch (of 3):

In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The
vm_fault structure is extended to include a union of the different page
table pointers that may be needed, and three flag bits are reserved to
indicate which type of pointer is in the union.

[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
  Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
  Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
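As a sanity check on the figures quoted from Nilesh above, the arithmetic can be reproduced with a small standalone userspace sketch (not part of the patch; the 6TB and 10,000-process inputs come straight from the changelog, and the changelog rounds to decimal GB/MB):

#include <stdio.h>

int main(void)
{
        unsigned long long mem   = 6ULL << 40;  /* 6TB mapped by every process */
        unsigned long long procs = 10000;       /* process-model application */

        /* one 8-byte page-table entry per 4KiB page, per process */
        unsigned long long pte = mem / 4096 * 8 * procs;
        /* each higher level needs 512x fewer entries */
        unsigned long long pmd = pte / 512;
        unsigned long long pud = pmd / 512;

        printf("pte tables: %llu GiB\n", pte >> 30);  /* 120,000 ("120,000GB" above) */
        printf("pmd tables: %llu GiB\n", pmd >> 30);  /* ~234, rounded to ~240GB above */
        printf("pud tables: %llu MiB\n", pud >> 20);  /* ~468, rounded to ~480MB above */
        return 0;
}

The drop from ~120,000 GiB of PTE tables to well under a gigabyte of PUD tables is the motivation for pushing the fault path beyond PMD size.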
-rw-r--r--  drivers/dax/dax.c    | 34
-rw-r--r--  fs/dax.c             | 45
-rw-r--r--  fs/ext2/file.c       |  2
-rw-r--r--  fs/ext4/file.c       | 23
-rw-r--r--  fs/xfs/xfs_file.c    | 10
-rw-r--r--  fs/xfs/xfs_trace.h   |  2
-rw-r--r--  include/linux/dax.h  |  6
-rw-r--r--  include/linux/mm.h   | 10
-rw-r--r--  mm/memory.c          | 18
9 files changed, 74 insertions, 76 deletions
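The hunks below convert every existing ->pmd_fault user to the new callback. As a quick orientation before the per-file diffs, here is a minimal, hypothetical sketch of what a driver adopting ->huge_fault looks like; the my_dev_* names are illustrative only, and the dispatch pattern mirrors dax_dev_fault() and dax_iomap_fault() in the hunks that follow:

/* Hypothetical driver sketch -- my_dev_pte_fault()/my_dev_pmd_fault() are
 * assumed to exist elsewhere in the driver; only the dispatch on the new
 * FAULT_FLAG_SIZE_* bits is the point here.
 */
static int my_dev_huge_fault(struct vm_fault *vmf)
{
        switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
        case FAULT_FLAG_SIZE_PTE:
                return my_dev_pte_fault(vmf);   /* 4k mapping */
        case FAULT_FLAG_SIZE_PMD:
                return my_dev_pmd_fault(vmf);   /* 2M mapping via vmf->pmd */
        default:
                /* unsupported size: let the core retry with smaller pages */
                return VM_FAULT_FALLBACK;
        }
}

static const struct vm_operations_struct my_dev_vm_ops = {
        .fault = my_dev_huge_fault,
        .huge_fault = my_dev_huge_fault,
};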
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 0261f332bf3e..922ec461dcaa 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
         return -1;
 }
 
-static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
         struct device *dev = &dax_dev->dev;
         struct dax_region *dax_region;
@@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
         return VM_FAULT_NOPAGE;
 }
 
-static int dax_dev_fault(struct vm_fault *vmf)
-{
-        struct vm_area_struct *vma = vmf->vma;
-        int rc;
-        struct file *filp = vma->vm_file;
-        struct dax_dev *dax_dev = filp->private_data;
-
-        dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-                        current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-                        ? "write" : "read", vma->vm_start, vma->vm_end);
-        rcu_read_lock();
-        rc = __dax_dev_fault(dax_dev, vmf);
-        rcu_read_unlock();
-
-        return rc;
-}
-
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
         unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
                         vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_fault *vmf)
+static int dax_dev_fault(struct vm_fault *vmf)
 {
         int rc;
         struct file *filp = vmf->vma->vm_file;
@@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
                         vmf->vma->vm_start, vmf->vma->vm_end);
 
         rcu_read_lock();
-        rc = __dax_dev_pmd_fault(dax_dev, vmf);
+        switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+        case FAULT_FLAG_SIZE_PTE:
+                rc = __dax_dev_pte_fault(dax_dev, vmf);
+                break;
+        case FAULT_FLAG_SIZE_PMD:
+                rc = __dax_dev_pmd_fault(dax_dev, vmf);
+                break;
+        default:
+                return VM_FAULT_FALLBACK;
+        }
         rcu_read_unlock();
 
         return rc;
@@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
 
 static const struct vm_operations_struct dax_dev_vm_ops = {
         .fault = dax_dev_fault,
-        .pmd_fault = dax_dev_pmd_fault,
+        .huge_fault = dax_dev_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/dax.c b/fs/dax.c
index f955c0df33bb..c3c29fbf64be 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1118,16 +1118,8 @@ static int dax_fault_return(int error)
         return VM_FAULT_SIGBUS;
 }
 
-/**
- * dax_iomap_fault - handle a page fault on a DAX file
- * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
- *
- * When a page fault occurs, filesystems may call this helper in their fault
- * or mkwrite handler for DAX files. Assumes the caller has done all the
- * necessary locking for the page fault to proceed successfully.
- */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pte_fault(struct vm_fault *vmf,
+                        const struct iomap_ops *ops)
 {
         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
         struct inode *inode = mapping->host;
@@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
         }
         return vmf_ret;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
 #ifdef CONFIG_FS_DAX_PMD
 /*
@@ -1335,7 +1326,8 @@ fallback:
         return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+                        const struct iomap_ops *ops)
 {
         struct vm_area_struct *vma = vmf->vma;
         struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1443,5 +1435,32 @@ out:
         trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
         return result;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#else
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+{
+        return VM_FAULT_FALLBACK;
+}
 #endif /* CONFIG_FS_DAX_PMD */
+
+/**
+ * dax_iomap_fault - handle a page fault on a DAX file
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in
+ * their fault handler for DAX files. dax_iomap_fault() assumes the caller
+ * has done all the necessary locking for page fault to proceed
+ * successfully.
+ */
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+        switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+        case FAULT_FLAG_SIZE_PTE:
+                return dax_iomap_pte_fault(vmf, ops);
+        case FAULT_FLAG_SIZE_PMD:
+                return dax_iomap_pmd_fault(vmf, ops);
+        default:
+                return VM_FAULT_FALLBACK;
+        }
+}
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 0bf0d971205a..68738832beda 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
 static const struct vm_operations_struct ext2_dax_vm_ops = {
         .fault = ext2_dax_fault,
         /*
-         * .pmd_fault is not supported for DAX because allocation in ext2
+         * .huge_fault is not supported for DAX because allocation in ext2
          * cannot be reliably aligned to huge page sizes and so pmd faults
          * will always fail and fail back to regular faults.
          */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21e1f17fe36d..502d2d07d191 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf)
         return result;
 }
 
-static int
-ext4_dax_pmd_fault(struct vm_fault *vmf)
-{
-        int result;
-        struct inode *inode = file_inode(vmf->vma->vm_file);
-        struct super_block *sb = inode->i_sb;
-        bool write = vmf->flags & FAULT_FLAG_WRITE;
-
-        if (write) {
-                sb_start_pagefault(sb);
-                file_update_time(vmf->vma->vm_file);
-        }
-        down_read(&EXT4_I(inode)->i_mmap_sem);
-        result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
-        up_read(&EXT4_I(inode)->i_mmap_sem);
-        if (write)
-                sb_end_pagefault(sb);
-
-        return result;
-}
-
 /*
  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
@@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
         .fault = ext4_dax_fault,
-        .pmd_fault = ext4_dax_pmd_fault,
+        .huge_fault = ext4_dax_fault,
         .page_mkwrite = ext4_dax_fault,
         .pfn_mkwrite = ext4_dax_pfn_mkwrite,
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9cc10136ba0b..990e03819370 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1429,12 +1429,12 @@ xfs_filemap_fault(
 /*
  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
  * both read and write faults. Hence we need to handle both cases. There is no
- * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * ->huge_mkwrite callout for huge pages, so we have a single function here to
  * handle both cases here. @flags carries the information on the type of fault
  * occuring.
  */
 STATIC int
-xfs_filemap_pmd_fault(
+xfs_filemap_huge_fault(
         struct vm_fault *vmf)
 {
         struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault(
         if (!IS_DAX(inode))
                 return VM_FAULT_FALLBACK;
 
-        trace_xfs_filemap_pmd_fault(ip);
+        trace_xfs_filemap_huge_fault(ip);
 
         if (vmf->flags & FAULT_FLAG_WRITE) {
                 sb_start_pagefault(inode->i_sb);
@@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault(
         }
 
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-        ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
+        ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
         if (vmf->flags & FAULT_FLAG_WRITE)
@@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite(
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
         .fault = xfs_filemap_fault,
-        .pmd_fault = xfs_filemap_pmd_fault,
+        .huge_fault = xfs_filemap_huge_fault,
         .map_pages = filemap_map_pages,
         .page_mkwrite = xfs_filemap_page_mkwrite,
         .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fb7555e73a62..383ac227ce2c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
+DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
 DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index eeb02421c848..cf9af225962b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry)
                 return PMD_SHIFT - PAGE_SHIFT;
         return 0;
 }
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
         return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
-                const struct iomap_ops *ops)
-{
-        return VM_FAULT_FALLBACK;
-}
 #endif
 int dax_pfn_mkwrite(struct vm_fault *vmf);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dd80ba6568a..035a688e5472 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,6 +285,11 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE       0x80    /* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100   /* The fault was during an instruction fetch */
 
+#define FAULT_FLAG_SIZE_MASK    0x7000  /* Support up to 8-level page tables */
+#define FAULT_FLAG_SIZE_PTE     0x0000  /* First level (eg 4k) */
+#define FAULT_FLAG_SIZE_PMD     0x1000  /* Second level (eg 2MB) */
+#define FAULT_FLAG_SIZE_PUD     0x2000  /* Third level (eg 1GB) */
+
 #define FAULT_FLAG_TRACE \
         { FAULT_FLAG_WRITE,             "WRITE" }, \
         { FAULT_FLAG_MKWRITE,           "MKWRITE" }, \
@@ -314,6 +319,9 @@ struct vm_fault {
         unsigned long address;          /* Faulting virtual address */
         pmd_t *pmd;                     /* Pointer to pmd entry matching
                                          * the 'address' */
+        pud_t *pud;                     /* Pointer to pud entry matching
+                                         * the 'address'
+                                         */
         pte_t orig_pte;                 /* Value of PTE at the time of fault */
 
         struct page *cow_page;          /* Page handler may use for COW fault */
@@ -351,7 +359,7 @@ struct vm_operations_struct {
         void (*close)(struct vm_area_struct * area);
         int (*mremap)(struct vm_area_struct * area);
         int (*fault)(struct vm_fault *vmf);
-        int (*pmd_fault)(struct vm_fault *vmf);
+        int (*huge_fault)(struct vm_fault *vmf);
         void (*map_pages)(struct vm_fault *vmf,
                         pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/mm/memory.c b/mm/memory.c
index cf97d88158cd..e721e8eba570 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf)
 {
         if (vma_is_anonymous(vmf->vma))
                 return do_huge_pmd_anonymous_page(vmf);
-        if (vmf->vma->vm_ops->pmd_fault)
-                return vmf->vma->vm_ops->pmd_fault(vmf);
+        if (vmf->vma->vm_ops->huge_fault)
+                return vmf->vma->vm_ops->huge_fault(vmf);
         return VM_FAULT_FALLBACK;
 }
 
@@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
         if (vma_is_anonymous(vmf->vma))
                 return do_huge_pmd_wp_page(vmf, orig_pmd);
-        if (vmf->vma->vm_ops->pmd_fault)
-                return vmf->vma->vm_ops->pmd_fault(vmf);
+        if (vmf->vma->vm_ops->huge_fault)
+                return vmf->vma->vm_ops->huge_fault(vmf);
 
         /* COW handled on pte level: split pmd */
         VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         struct mm_struct *mm = vma->vm_mm;
         pgd_t *pgd;
         pud_t *pud;
+        int ret;
 
         pgd = pgd_offset(mm, address);
         pud = pud_alloc(mm, pgd, address);
@@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         if (!vmf.pmd)
                 return VM_FAULT_OOM;
         if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-                int ret = create_huge_pmd(&vmf);
+                vmf.flags |= FAULT_FLAG_SIZE_PMD;
+                ret = create_huge_pmd(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
+                /* fall through path, remove PMD flag */
+                vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
         } else {
                 pmd_t orig_pmd = *vmf.pmd;
-                int ret;
 
                 barrier();
                 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+                        vmf.flags |= FAULT_FLAG_SIZE_PMD;
                         if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                 return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
@@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         ret = wp_huge_pmd(&vmf, orig_pmd);
                         if (!(ret & VM_FAULT_FALLBACK))
                                 return ret;
+                        /* fall through path, remove PUD flag */
+                        vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
                 } else {
                         huge_pmd_set_accessed(&vmf, orig_pmd);
                         return 0;