summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
authorDave Jiang <dave.jiang@intel.com>2017-02-24 17:56:59 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-24 20:46:54 -0500
commita2d581675d485eb7188f521f36efc114639a3096 (patch)
treeae566f77b965fed344458698fe6bb01280558647 /mm/memory.c
parentbd233f538d51c2cae6f0bfc2cf7f0960e1683b8a (diff)
mm,fs,dax: change ->pmd_fault to ->huge_fault
Patch series "1G transparent hugepage support for device dax", v2. The following series implements support for 1G transparent hugepage on x86 for device dax. The bulk of the code was written by Matthew Wilcox a while back supporting transparent 1G hugepage for fs DAX. I have forward ported the relevant bits to 4.10-rc. The current submission has only the necessary code to support device DAX. Comments from Dan Williams: So the motivation and intended user of this functionality mirrors the motivation and users of 1GB page support in hugetlbfs. Given expected capacities of persistent memory devices an in-memory database may want to reduce tlb pressure beyond what they can already achieve with 2MB mappings of a device-dax file. We have customer feedback to that effect as Willy mentioned in his previous version of these patches [1]. [1]: https://lkml.org/lkml/2016/1/31/52 Comments from Nilesh @ Oracle: There are applications which have a process model; and if you assume 10,000 processes attempting to mmap all the 6TB memory available on a server; we are looking at the following: processes : 10,000 memory : 6TB pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB pmd @ 2M page size: 120,000 / 512 = ~240GB pud @ 1G page size: 240GB / 512 = ~480MB As you can see with 2M pages, this system will use up an exorbitant amount of DRAM to hold the page tables; but the 1G pages finally bring it down to a reasonable level. Memory sizes will keep increasing; so this number will keep increasing. An argument can be made to convert the applications from process model to thread model, but in the real world that may not always be practical. Hopefully this helps explain the use case where this is valuable. This patch (of 3): In preparation for adding the ability to handle PUD pages, convert vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. 
The vm_fault structure is extended to include a union of the different page table pointers that may be needed, and three flag bits are reserved to indicate which type of pointer is in the union. [ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()] Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com [dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path] Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> Signed-off-by: Dave Jiang <dave.jiang@intel.com> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Jan Kara <jack@suse.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Dave Jiang <dave.jiang@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c18
1 files changed, 12 insertions, 6 deletions
diff --git a/mm/memory.c b/mm/memory.c
index cf97d88158cd..e721e8eba570 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf)
3466{ 3466{
3467 if (vma_is_anonymous(vmf->vma)) 3467 if (vma_is_anonymous(vmf->vma))
3468 return do_huge_pmd_anonymous_page(vmf); 3468 return do_huge_pmd_anonymous_page(vmf);
3469 if (vmf->vma->vm_ops->pmd_fault) 3469 if (vmf->vma->vm_ops->huge_fault)
3470 return vmf->vma->vm_ops->pmd_fault(vmf); 3470 return vmf->vma->vm_ops->huge_fault(vmf);
3471 return VM_FAULT_FALLBACK; 3471 return VM_FAULT_FALLBACK;
3472} 3472}
3473 3473
@@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3475{ 3475{
3476 if (vma_is_anonymous(vmf->vma)) 3476 if (vma_is_anonymous(vmf->vma))
3477 return do_huge_pmd_wp_page(vmf, orig_pmd); 3477 return do_huge_pmd_wp_page(vmf, orig_pmd);
3478 if (vmf->vma->vm_ops->pmd_fault) 3478 if (vmf->vma->vm_ops->huge_fault)
3479 return vmf->vma->vm_ops->pmd_fault(vmf); 3479 return vmf->vma->vm_ops->huge_fault(vmf);
3480 3480
3481 /* COW handled on pte level: split pmd */ 3481 /* COW handled on pte level: split pmd */
3482 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); 3482 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3606 struct mm_struct *mm = vma->vm_mm; 3606 struct mm_struct *mm = vma->vm_mm;
3607 pgd_t *pgd; 3607 pgd_t *pgd;
3608 pud_t *pud; 3608 pud_t *pud;
3609 int ret;
3609 3610
3610 pgd = pgd_offset(mm, address); 3611 pgd = pgd_offset(mm, address);
3611 pud = pud_alloc(mm, pgd, address); 3612 pud = pud_alloc(mm, pgd, address);
@@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3615 if (!vmf.pmd) 3616 if (!vmf.pmd)
3616 return VM_FAULT_OOM; 3617 return VM_FAULT_OOM;
3617 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { 3618 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
3618 int ret = create_huge_pmd(&vmf); 3619 vmf.flags |= FAULT_FLAG_SIZE_PMD;
3620 ret = create_huge_pmd(&vmf);
3619 if (!(ret & VM_FAULT_FALLBACK)) 3621 if (!(ret & VM_FAULT_FALLBACK))
3620 return ret; 3622 return ret;
3623 /* fall through path, remove PMD flag */
3624 vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
3621 } else { 3625 } else {
3622 pmd_t orig_pmd = *vmf.pmd; 3626 pmd_t orig_pmd = *vmf.pmd;
3623 int ret;
3624 3627
3625 barrier(); 3628 barrier();
3626 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3629 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3630 vmf.flags |= FAULT_FLAG_SIZE_PMD;
3627 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) 3631 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3628 return do_huge_pmd_numa_page(&vmf, orig_pmd); 3632 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3629 3633
@@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3632 ret = wp_huge_pmd(&vmf, orig_pmd); 3636 ret = wp_huge_pmd(&vmf, orig_pmd);
3633 if (!(ret & VM_FAULT_FALLBACK)) 3637 if (!(ret & VM_FAULT_FALLBACK))
3634 return ret; 3638 return ret;
3639 /* fall through path, remove PUD flag */
3640 vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
3635 } else { 3641 } else {
3636 huge_pmd_set_accessed(&vmf, orig_pmd); 3642 huge_pmd_set_accessed(&vmf, orig_pmd);
3637 return 0; 3643 return 0;