author		Andrea Arcangeli <aarcange@redhat.com>	2011-01-13 18:47:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-13 20:32:47 -0500
commit		60ab3244ec85c44276c585a2a20d3750402e1cf4 (patch)
tree		e69e866b370243fc58a6fc721e5347a265e8fd4f
parent		a664b2d8555c659127bf8fe049a58449d394a707 (diff)
thp: khugepaged: make khugepaged aware about madvise
MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it takes little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware of madvise.

MADV_HUGEPAGE: register the mm in khugepaged immediately, without waiting for a page fault (which may never happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults).

MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged, to stop collapsing pages where it is not wanted.

[akpm@linux-foundation.org: tweak comment]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
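As context for readers, here is a minimal userspace sketch, not part of the commit, of the pattern this change enables: giving huge-page advice to a mapping whose pages have already been faulted in. The 16 MiB size and the bare-bones error handling are illustrative assumptions.

#define _GNU_SOURCE		/* for MAP_ANONYMOUS and MADV_*HUGEPAGE */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (16UL * 1024 * 1024)	/* a few PMD-sized (2 MiB) units */

int main(void)
{
	void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 0, LEN);	/* fault everything in as 4 KiB pages first */

	/*
	 * With "enabled" set to madvise, advice given only after the
	 * initial faults previously never reached khugepaged; with this
	 * patch the mm is registered with khugepaged immediately, so the
	 * already-mapped range can still be collapsed into huge pages.
	 */
	if (madvise(p, LEN, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	/* ...and MADV_NOHUGEPAGE makes khugepaged skip the vma again. */
	if (madvise(p, LEN, MADV_NOHUGEPAGE))
		perror("madvise(MADV_NOHUGEPAGE)");

	munmap(p, LEN);
	return 0;
}

With /sys/kernel/mm/transparent_hugepage/enabled set to madvise, the AnonHugePages counter in /proc/meminfo would be one place to watch the collapse happen.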
-rw-r--r--	include/linux/huge_mm.h	6
-rw-r--r--	mm/huge_memory.c	23
-rw-r--r--	mm/madvise.c	2
3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a8b7e42d19ec..bddfba1d7b85 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
-extern int hugepage_madvise(unsigned long *vm_flags, int advice);
+extern int hugepage_madvise(struct vm_area_struct *vma,
+			    unsigned long *vm_flags, int advice);
 extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    unsigned long start,
 				    unsigned long end,
@@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page)
 	do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)	\
 	do { } while (0)
-static inline int hugepage_madvise(unsigned long *vm_flags, int advice)
+static inline int hugepage_madvise(struct vm_area_struct *vma,
+				   unsigned long *vm_flags, int advice)
 {
 	BUG();
 	return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fce667c0281d..004c9c2aac78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1389,7 +1389,8 @@ out:
 	return ret;
 }
 
-int hugepage_madvise(unsigned long *vm_flags, int advice)
+int hugepage_madvise(struct vm_area_struct *vma,
+		     unsigned long *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
@@ -1404,6 +1405,13 @@ int hugepage_madvise(unsigned long *vm_flags, int advice)
 			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
+		/*
+		 * If the vma become good for khugepaged to scan,
+		 * register it here without waiting a page fault that
+		 * may not happen any time soon.
+		 */
+		if (unlikely(khugepaged_enter_vma_merge(vma)))
+			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
 		/*
@@ -1417,6 +1425,11 @@ int hugepage_madvise(unsigned long *vm_flags, int advice)
 			return -EINVAL;
 		*vm_flags &= ~VM_HUGEPAGE;
 		*vm_flags |= VM_NOHUGEPAGE;
+		/*
+		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+		 * this vma even if we leave the mm registered in khugepaged if
+		 * it got registered before VM_NOHUGEPAGE was set.
+		 */
 		break;
 	}
 
1422 1435
@@ -1784,7 +1797,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
 		goto out;
 
-	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+	    (vma->vm_flags & VM_NOHUGEPAGE))
 		goto out;
 
 	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
@@ -2007,8 +2021,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			break;
 		}
 
-		if (!(vma->vm_flags & VM_HUGEPAGE) &&
-		    !khugepaged_always()) {
+		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+		     !khugepaged_always()) ||
+		    (vma->vm_flags & VM_NOHUGEPAGE)) {
 			progress++;
 			continue;
 		}
diff --git a/mm/madvise.c b/mm/madvise.c
index bbac126e03ed..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -73,7 +73,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(&new_flags, behavior);
+		error = hugepage_madvise(vma, &new_flags, behavior);
 		if (error)
 			goto out;
 		break;
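Taken together, the two checks added to mm/huge_memory.c implement a single scan-eligibility rule. A hedged restatement as a standalone helper follows; the function name khugepaged_should_skip is ours, not the kernel's, and khugepaged_always() is the existing helper in mm/huge_memory.c.

/*
 * Illustrative only: a predicate equivalent to the condition the patch
 * adds in collapse_huge_page() and khugepaged_scan_mm_slot().
 */
static inline int khugepaged_should_skip(struct vm_area_struct *vma)
{
	/* MADV_NOHUGEPAGE always wins, even when "enabled" is set to always */
	if (vma->vm_flags & VM_NOHUGEPAGE)
		return 1;
	/*
	 * Otherwise skip only vmas that did not opt in via MADV_HUGEPAGE
	 * while the global mode is not "always".
	 */
	return !(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always();
}

This is equivalent to the (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE) test used in both hunks above.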