author     Andrea Arcangeli <aarcange@redhat.com>            2011-01-13 18:47:05 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:45 -0500
commit     0bbbc0b33d141f78a0d9218a54a47f50621220d3 (patch)
tree       3ef3363c189ac536926119731eb86dcf989f4adb
parent     d39d33c332c611094f84cee39715866f4cbf79e2 (diff)
thp: add numa awareness to hugepage allocations
It's mostly a matter of replacing alloc_pages with alloc_pages_vma after
introducing alloc_pages_vma. khugepaged needs special handling, as the
allocation has to happen inside collapse_huge_page where the vma is known,
and on failure an error has to be returned to the outer loop so that it
sleeps for alloc_sleep_millisecs before retrying. For CONFIG_NUMA=n the
more efficient logic of handling allocation failures in khugepaged is
retained.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/gfp.h |  7
-rw-r--r--  mm/huge_memory.c    | 87
-rw-r--r--  mm/mempolicy.c      | 13
3 files changed, 87 insertions, 20 deletions
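At a glance, the interface change is: alloc_page_vma() gains an order-taking counterpart, alloc_pages_vma(), and the THP fault paths pass HPAGE_PMD_ORDER plus the faulting vma and address through it, so the huge page is placed according to the vma's NUMA mempolicy. A minimal before/after sketch of the fault-path call, condensed from the hunks below (illustrative, not a literal excerpt):

        /* before this patch: vma mempolicy ignored, placement follows the task policy */
        page = alloc_hugepage(transparent_hugepage_defrag(vma));

        /* after this patch: the vma and fault address steer NUMA placement */
        page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), vma, haddr);

        /* ...which boils down to */
        page = alloc_pages_vma(alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma)),
                               HPAGE_PMD_ORDER, vma, haddr);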
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d95082cc6f4a..a3b148a91874 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
         return alloc_pages_current(gfp_mask, order);
 }
-extern struct page *alloc_page_vma(gfp_t gfp_mask,
+extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                         struct vm_area_struct *vma, unsigned long addr);
 #else
 #define alloc_pages(gfp_mask, order) \
                 alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_pages_vma(gfp_mask, order, vma, addr) \
+        alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr) \
+        alloc_pages_vma(gfp_mask, 0, vma, addr)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
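For CONFIG_NUMA=n kernels the new interface costs nothing: the alloc_pages_vma() macro above simply drops the vma and address, and alloc_page_vma() survives as an order-0 wrapper in both configurations. Roughly, after preprocessing (illustrative expansion, not part of the patch):

        /* CONFIG_NUMA=n */
        page = alloc_pages_vma(gfp_mask, order, vma, addr);
        /* expands to */
        page = alloc_pages(gfp_mask, order);            /* vma and addr discarded */

        /* either configuration */
        page = alloc_page_vma(gfp_mask, vma, addr);
        /* expands to */
        page = alloc_pages_vma(gfp_mask, 0, vma, addr); /* order 0, as before */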
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0415a83afd66..f6559e7711bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
         return ret;
 }
 
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+        return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+                                              struct vm_area_struct *vma,
+                                              unsigned long haddr)
+{
+        return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+                               HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-        return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+        return alloc_pages(alloc_hugepage_gfpmask(defrag),
                            HPAGE_PMD_ORDER);
 }
+#endif
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmd,
@@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 return VM_FAULT_OOM;
         if (unlikely(khugepaged_enter(vma)))
                 return VM_FAULT_OOM;
-        page = alloc_hugepage(transparent_hugepage_defrag(vma));
+        page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                  vma, haddr);
         if (unlikely(!page))
                 goto out;
         if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
         if (transparent_hugepage_enabled(vma) &&
             !transparent_hugepage_debug_cow())
-                new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                                              vma, haddr);
         else
                 new_page = NULL;
 
@@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm,
         unsigned long hstart, hend;
 
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
         VM_BUG_ON(!*hpage);
+#else
+        VM_BUG_ON(*hpage);
+#endif
 
         /*
          * Prevent all access to pagetables with the exception of
@@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm,
         if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                 goto out;
 
+#ifndef CONFIG_NUMA
         new_page = *hpage;
-        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+#else
+        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+        if (unlikely(!new_page)) {
+                *hpage = ERR_PTR(-ENOMEM);
                 goto out;
+        }
+#endif
+        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+                goto out_put_page;
 
         anon_vma_lock(vma->anon_vma);
 
@@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                 spin_unlock(&mm->page_table_lock);
                 anon_vma_unlock(vma->anon_vma);
                 mem_cgroup_uncharge_page(new_page);
-                goto out;
+                goto out_put_page;
         }
 
         /*
@@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm,
         mm->nr_ptes--;
         spin_unlock(&mm->page_table_lock);
 
+#ifndef CONFIG_NUMA
         *hpage = NULL;
+#endif
         khugepaged_pages_collapsed++;
 out:
         up_write(&mm->mmap_sem);
+        return;
+
+out_put_page:
+#ifdef CONFIG_NUMA
+        put_page(new_page);
+#endif
+        goto out;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage)
         while (progress < pages) {
                 cond_resched();
 
+#ifndef CONFIG_NUMA
                 if (!*hpage) {
                         *hpage = alloc_hugepage(khugepaged_defrag());
                         if (unlikely(!*hpage))
                                 break;
                 }
+#else
+                if (IS_ERR(*hpage))
+                        break;
+#endif
 
                 spin_lock(&khugepaged_mm_lock);
                 if (!khugepaged_scan.mm_slot)
@@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage)
         }
 }
 
+static void khugepaged_alloc_sleep(void)
+{
+        DEFINE_WAIT(wait);
+        add_wait_queue(&khugepaged_wait, &wait);
+        schedule_timeout_interruptible(
+                msecs_to_jiffies(
+                        khugepaged_alloc_sleep_millisecs));
+        remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
 static struct page *khugepaged_alloc_hugepage(void)
 {
         struct page *hpage;
 
         do {
                 hpage = alloc_hugepage(khugepaged_defrag());
-                if (!hpage) {
-                        DEFINE_WAIT(wait);
-                        add_wait_queue(&khugepaged_wait, &wait);
-                        schedule_timeout_interruptible(
-                                msecs_to_jiffies(
-                                        khugepaged_alloc_sleep_millisecs));
-                        remove_wait_queue(&khugepaged_wait, &wait);
-                }
+                if (!hpage)
+                        khugepaged_alloc_sleep();
         } while (unlikely(!hpage) &&
                  likely(khugepaged_enabled()));
         return hpage;
 }
+#endif
 
 static void khugepaged_loop(void)
 {
         struct page *hpage;
 
+#ifdef CONFIG_NUMA
+        hpage = NULL;
+#endif
         while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
                 hpage = khugepaged_alloc_hugepage();
                 if (unlikely(!hpage))
                         break;
+#else
+                if (IS_ERR(hpage)) {
+                        khugepaged_alloc_sleep();
+                        hpage = NULL;
+                }
+#endif
 
                 khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
                 if (hpage)
                         put_page(hpage);
+#endif
                 if (khugepaged_has_work()) {
                         DEFINE_WAIT(wait);
                         if (!khugepaged_scan_sleep_millisecs)
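The khugepaged side changes its error protocol when CONFIG_NUMA is set: *hpage is no longer a page preallocated by khugepaged_do_scan(), it acts as a status slot. collapse_huge_page() allocates with alloc_hugepage_vma() once the target vma is known and stores ERR_PTR(-ENOMEM) in *hpage when that fails; the outer loop spots the error with IS_ERR() and backs off via khugepaged_alloc_sleep(). A condensed sketch of that loop (CONFIG_NUMA=y; locking and scan bookkeeping omitted, not compilable on its own):

        struct page *hpage = NULL;

        while (likely(khugepaged_enabled())) {
                if (IS_ERR(hpage)) {
                        /* last collapse_huge_page() hit -ENOMEM:
                         * sleep for alloc_sleep_millisecs before retrying */
                        khugepaged_alloc_sleep();
                        hpage = NULL;
                }
                /* collapse_huge_page() calls alloc_hugepage_vma() against the
                 * vma being collapsed and sets *hpage = ERR_PTR(-ENOMEM) when
                 * the allocation fails */
                khugepaged_do_scan(&hpage);
        }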
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83b7df309fc4..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 }
 
 /**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
  *
  * @gfp:
  *      %GFP_USER    user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *      %GFP_FS      allocation should not call back into a file system.
  *      %GFP_ATOMIC  don't sleep.
  *
+ * @order:Order of the GFP allocation.
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
  *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *      Should be called with the mm_sem of the vma hold.
  */
 struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr)
 {
         struct mempolicy *pol = get_vma_policy(current, vma, addr);
         struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                 mpol_cond_put(pol);
-                page = alloc_page_interleave(gfp, 0, nid);
+                page = alloc_page_interleave(gfp, order, nid);
                 put_mems_allowed();
                 return page;
         }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                 /*
                  * slow path: ref counted shared policy
                  */
-                struct page *page = __alloc_pages_nodemask(gfp, 0,
+                struct page *page = __alloc_pages_nodemask(gfp, order,
                                                 zl, policy_nodemask(gfp, pol));
                 __mpol_put(pol);
                 put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
         /*
          * fast path: default or task policy
          */
-        page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+        page = __alloc_pages_nodemask(gfp, order, zl,
+                                      policy_nodemask(gfp, pol));
         put_mems_allowed();
         return page;
 }
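With order threaded through all three mempolicy paths above (interleave, shared policy, task/default policy), the requested order reaches alloc_page_interleave() and __alloc_pages_nodemask() instead of being hardwired to 0 as in the old alloc_page_vma(). The THP call shape this enables, taken from alloc_hugepage_vma() in the huge_memory.c hunk:

        /* huge page allocated on the node the vma's mempolicy selects for haddr */
        page = alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
                               HPAGE_PMD_ORDER, vma, haddr);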