Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  2
-rw-r--r--  mm/huge_memory.c      | 78
-rw-r--r--  mm/kmemleak-test.c    |  6
-rw-r--r--  mm/kmemleak.c         | 13
-rw-r--r--  mm/memblock.c         |  2
-rw-r--r--  mm/memcontrol.c       | 98
-rw-r--r--  mm/memory-failure.c   | 94
-rw-r--r--  mm/memory.c           | 34
-rw-r--r--  mm/mempolicy.c        | 16
-rw-r--r--  mm/migrate.c          | 15
-rw-r--r--  mm/mlock.c            |  7
-rw-r--r--  mm/mremap.c           |  4
-rw-r--r--  mm/page_alloc.c       | 23
-rw-r--r--  mm/pgtable-generic.c  |  1
-rw-r--r--  mm/rmap.c             | 54
-rw-r--r--  mm/shmem.c            |  4
-rw-r--r--  mm/swapfile.c         |  2
-rw-r--r--  mm/truncate.c         |  2
-rw-r--r--  mm/vmscan.c           | 39
19 files changed, 323 insertions(+), 171 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
179config COMPACTION 179config COMPACTION
180 bool "Allow for memory compaction" 180 bool "Allow for memory compaction"
181 select MIGRATION 181 select MIGRATION
182 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU 182 depends on MMU
183 help 183 help
184 Allows the compaction of memory for the allocation of huge pages. 184 Allows the compaction of memory for the allocation of huge pages.
185 185
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..113e35c47502 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
650 650
651static inline struct page *alloc_hugepage_vma(int defrag, 651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma, 652 struct vm_area_struct *vma,
653 unsigned long haddr) 653 unsigned long haddr, int nd)
654{ 654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), 655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr); 656 HPAGE_PMD_ORDER, vma, haddr, nd);
657} 657}
658 658
659#ifndef CONFIG_NUMA 659#ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
678 if (unlikely(khugepaged_enter(vma))) 678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM; 679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr); 681 vma, haddr, numa_node_id());
682 if (unlikely(!page)) 682 if (unlikely(!page))
683 goto out; 683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
799 } 799 }
800 800
801 for (i = 0; i < HPAGE_PMD_NR; i++) { 801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 802 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
803 vma, address); 803 vma, address, page_to_nid(page));
804 if (unlikely(!pages[i] || 804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm, 805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) { 806 GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
902 if (transparent_hugepage_enabled(vma) && 902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow()) 903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr); 905 vma, haddr, numa_node_id());
906 else 906 else
907 new_page = NULL; 907 new_page = NULL;
908 908
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
1162 /* after clearing PageTail the gup refcount can be released */ 1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb(); 1163 smp_mb();
1164 1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1165 /*
 1166 * retain the hwpoison flag of a poisoned tail page:
 1167 * this fixes the wrong process being killed on a KVM guest
 1168 * by memory-failure.
1169 */
1170 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1166 page_tail->flags |= (page->flags & 1171 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) | 1172 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) | 1173 (1L << PG_swapbacked) |
@@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1740static void collapse_huge_page(struct mm_struct *mm, 1745static void collapse_huge_page(struct mm_struct *mm,
1741 unsigned long address, 1746 unsigned long address,
1742 struct page **hpage, 1747 struct page **hpage,
1743 struct vm_area_struct *vma) 1748 struct vm_area_struct *vma,
1749 int node)
1744{ 1750{
1745 pgd_t *pgd; 1751 pgd_t *pgd;
1746 pud_t *pud; 1752 pud_t *pud;
@@ -1756,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm,
1756#ifndef CONFIG_NUMA 1762#ifndef CONFIG_NUMA
1757 VM_BUG_ON(!*hpage); 1763 VM_BUG_ON(!*hpage);
1758 new_page = *hpage; 1764 new_page = *hpage;
1765 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1766 up_read(&mm->mmap_sem);
1767 return;
1768 }
1759#else 1769#else
1760 VM_BUG_ON(*hpage); 1770 VM_BUG_ON(*hpage);
1761 /* 1771 /*
@@ -1768,18 +1778,19 @@ static void collapse_huge_page(struct mm_struct *mm,
1768 * mmap_sem in read mode is good idea also to allow greater 1778 * mmap_sem in read mode is good idea also to allow greater
1769 * scalability. 1779 * scalability.
1770 */ 1780 */
1771 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); 1781 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1782 node);
1772 if (unlikely(!new_page)) { 1783 if (unlikely(!new_page)) {
1773 up_read(&mm->mmap_sem); 1784 up_read(&mm->mmap_sem);
1774 *hpage = ERR_PTR(-ENOMEM); 1785 *hpage = ERR_PTR(-ENOMEM);
1775 return; 1786 return;
1776 } 1787 }
1777#endif
1778 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1788 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1779 up_read(&mm->mmap_sem); 1789 up_read(&mm->mmap_sem);
1780 put_page(new_page); 1790 put_page(new_page);
1781 return; 1791 return;
1782 } 1792 }
1793#endif
1783 1794
1784 /* after allocating the hugepage upgrade to mmap_sem write mode */ 1795 /* after allocating the hugepage upgrade to mmap_sem write mode */
1785 up_read(&mm->mmap_sem); 1796 up_read(&mm->mmap_sem);
@@ -1806,6 +1817,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1806 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 1817 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1807 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) 1818 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1808 goto out; 1819 goto out;
1820 if (is_vma_temporary_stack(vma))
1821 goto out;
1809 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 1822 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1810 1823
1811 pgd = pgd_offset(mm, address); 1824 pgd = pgd_offset(mm, address);
@@ -1847,7 +1860,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1847 set_pmd_at(mm, address, pmd, _pmd); 1860 set_pmd_at(mm, address, pmd, _pmd);
1848 spin_unlock(&mm->page_table_lock); 1861 spin_unlock(&mm->page_table_lock);
1849 anon_vma_unlock(vma->anon_vma); 1862 anon_vma_unlock(vma->anon_vma);
1850 mem_cgroup_uncharge_page(new_page);
1851 goto out; 1863 goto out;
1852 } 1864 }
1853 1865
@@ -1893,6 +1905,7 @@ out_up_write:
1893 return; 1905 return;
1894 1906
1895out: 1907out:
1908 mem_cgroup_uncharge_page(new_page);
1896#ifdef CONFIG_NUMA 1909#ifdef CONFIG_NUMA
1897 put_page(new_page); 1910 put_page(new_page);
1898#endif 1911#endif
@@ -1912,6 +1925,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1912 struct page *page; 1925 struct page *page;
1913 unsigned long _address; 1926 unsigned long _address;
1914 spinlock_t *ptl; 1927 spinlock_t *ptl;
1928 int node = -1;
1915 1929
1916 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1930 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1917 1931
@@ -1942,6 +1956,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1942 page = vm_normal_page(vma, _address, pteval); 1956 page = vm_normal_page(vma, _address, pteval);
1943 if (unlikely(!page)) 1957 if (unlikely(!page))
1944 goto out_unmap; 1958 goto out_unmap;
1959 /*
 1960 * Choose the node of the first page. This could
1961 * be more sophisticated and look at more pages,
1962 * but isn't for now.
1963 */
1964 if (node == -1)
1965 node = page_to_nid(page);
1945 VM_BUG_ON(PageCompound(page)); 1966 VM_BUG_ON(PageCompound(page));
1946 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 1967 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1947 goto out_unmap; 1968 goto out_unmap;
@@ -1958,7 +1979,7 @@ out_unmap:
1958 pte_unmap_unlock(pte, ptl); 1979 pte_unmap_unlock(pte, ptl);
1959 if (ret) 1980 if (ret)
1960 /* collapse_huge_page will return with the mmap_sem released */ 1981 /* collapse_huge_page will return with the mmap_sem released */
1961 collapse_huge_page(mm, address, hpage, vma); 1982 collapse_huge_page(mm, address, hpage, vma, node);
1962out: 1983out:
1963 return ret; 1984 return ret;
1964} 1985}
@@ -2027,32 +2048,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2027 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2048 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2028 !khugepaged_always()) || 2049 !khugepaged_always()) ||
2029 (vma->vm_flags & VM_NOHUGEPAGE)) { 2050 (vma->vm_flags & VM_NOHUGEPAGE)) {
2051 skip:
2030 progress++; 2052 progress++;
2031 continue; 2053 continue;
2032 } 2054 }
2033
2034 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 2055 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2035 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { 2056 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
2036 khugepaged_scan.address = vma->vm_end; 2057 goto skip;
2037 progress++; 2058 if (is_vma_temporary_stack(vma))
2038 continue; 2059 goto skip;
2039 } 2060
2040 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 2061 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2041 2062
2042 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2063 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2043 hend = vma->vm_end & HPAGE_PMD_MASK; 2064 hend = vma->vm_end & HPAGE_PMD_MASK;
2044 if (hstart >= hend) { 2065 if (hstart >= hend)
2045 progress++; 2066 goto skip;
2046 continue; 2067 if (khugepaged_scan.address > hend)
2047 } 2068 goto skip;
2048 if (khugepaged_scan.address < hstart) 2069 if (khugepaged_scan.address < hstart)
2049 khugepaged_scan.address = hstart; 2070 khugepaged_scan.address = hstart;
2050 if (khugepaged_scan.address > hend) { 2071 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2051 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2052 progress++;
2053 continue;
2054 }
2055 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2056 2072
2057 while (khugepaged_scan.address < hend) { 2073 while (khugepaged_scan.address < hend) {
2058 int ret; 2074 int ret;
@@ -2081,7 +2097,7 @@ breakouterloop:
2081breakouterloop_mmap_sem: 2097breakouterloop_mmap_sem:
2082 2098
2083 spin_lock(&khugepaged_mm_lock); 2099 spin_lock(&khugepaged_mm_lock);
2084 BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2100 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2085 /* 2101 /*
2086 * Release the current mm_slot if this mm is about to die, or 2102 * Release the current mm_slot if this mm is about to die, or
2087 * if we scanned all vmas of this mm. 2103 * if we scanned all vmas of this mm.
@@ -2236,9 +2252,9 @@ static int khugepaged(void *none)
2236 2252
2237 for (;;) { 2253 for (;;) {
2238 mutex_unlock(&khugepaged_mutex); 2254 mutex_unlock(&khugepaged_mutex);
2239 BUG_ON(khugepaged_thread != current); 2255 VM_BUG_ON(khugepaged_thread != current);
2240 khugepaged_loop(); 2256 khugepaged_loop();
2241 BUG_ON(khugepaged_thread != current); 2257 VM_BUG_ON(khugepaged_thread != current);
2242 2258
2243 mutex_lock(&khugepaged_mutex); 2259 mutex_lock(&khugepaged_mutex);
2244 if (!khugepaged_enabled()) 2260 if (!khugepaged_enabled())
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
75 * after the module is removed. 75 * after the module is removed.
76 */ 76 */
77 for (i = 0; i < 10; i++) { 77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 78 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); 79 pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem) 80 if (!elem)
81 return -ENOMEM; 81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list); 82 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list); 83 list_add_tail(&elem->list, &test_list);
86 } 84 }
87 85
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
113#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
114 114
115/* GFP bitmask for kmemleak internal allocations */ 115/* GFP bitmask for kmemleak internal allocations */
116#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) 116#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
117 __GFP_NORETRY | __GFP_NOMEMALLOC | \
118 __GFP_NOWARN)
117 119
118/* scanning area inside a memory block */ 120/* scanning area inside a memory block */
119struct kmemleak_scan_area { 121struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
511 struct kmemleak_object *object; 513 struct kmemleak_object *object;
512 struct prio_tree_node *node; 514 struct prio_tree_node *node;
513 515
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 516 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
515 if (!object) { 517 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 518 pr_warning("Cannot allocate a kmemleak_object structure\n");
519 kmemleak_disable();
517 return NULL; 520 return NULL;
518 } 521 }
519 522
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
734 return; 737 return;
735 } 738 }
736 739
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); 740 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
738 if (!area) { 741 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n"); 742 pr_warning("Cannot allocate a scan area\n");
740 goto out; 743 goto out;
741 } 744 }
742 745
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
137 137
138 BUG_ON(0 == size); 138 BUG_ON(0 == size);
139 139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */ 140 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 141 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit; 142 end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
612 /* pagein of a big page is an event. So, ignore page size */ 612 /* pagein of a big page is an event. So, ignore page size */
613 if (nr_pages > 0) 613 if (nr_pages > 0)
614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
615 else 615 else {
616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
617 nr_pages = -nr_pages; /* for event */
618 }
617 619
618 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); 620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
619 621
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1111 return false; 1113 return false;
1112} 1114}
1113 1115
1116/**
1117 * mem_cgroup_check_margin - check if the memory cgroup allows charging
1118 * @mem: memory cgroup to check
1119 * @bytes: the number of bytes the caller intends to charge
1120 *
 1121 * Returns true if @mem can be charged @bytes without exceeding
 1122 * the limit, false otherwise.
1123 */
1124static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
1125{
1126 if (!res_counter_check_margin(&mem->res, bytes))
1127 return false;
1128 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
1129 return false;
1130 return true;
1131}
1132
1114static unsigned int get_swappiness(struct mem_cgroup *memcg) 1133static unsigned int get_swappiness(struct mem_cgroup *memcg)
1115{ 1134{
1116 struct cgroup *cgrp = memcg->css.cgroup; 1135 struct cgroup *cgrp = memcg->css.cgroup;
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1832 if (likely(!ret)) 1851 if (likely(!ret))
1833 return CHARGE_OK; 1852 return CHARGE_OK;
1834 1853
1854 res_counter_uncharge(&mem->res, csize);
1835 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 1855 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1836 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1856 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1837 } else 1857 } else
1838 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1858 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1839 1859 /*
1840 if (csize > PAGE_SIZE) /* change csize and retry */ 1860 * csize can be either a huge page (HPAGE_SIZE), a batch of
1861 * regular pages (CHARGE_SIZE), or a single regular page
1862 * (PAGE_SIZE).
1863 *
1864 * Never reclaim on behalf of optional batching, retry with a
1865 * single page instead.
1866 */
1867 if (csize == CHARGE_SIZE)
1841 return CHARGE_RETRY; 1868 return CHARGE_RETRY;
1842 1869
1843 if (!(gfp_mask & __GFP_WAIT)) 1870 if (!(gfp_mask & __GFP_WAIT))
1844 return CHARGE_WOULDBLOCK; 1871 return CHARGE_WOULDBLOCK;
1845 1872
1846 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1873 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1847 gfp_mask, flags); 1874 gfp_mask, flags);
1875 if (mem_cgroup_check_margin(mem_over_limit, csize))
1876 return CHARGE_RETRY;
1848 /* 1877 /*
1849 * try_to_free_mem_cgroup_pages() might not give us a full 1878 * Even though the limit is exceeded at this point, reclaim
1850 * picture of reclaim. Some pages are reclaimed and might be 1879 * may have been able to free some pages. Retry the charge
1851 * moved to swap cache or just unmapped from the cgroup. 1880 * before killing the task.
1852 * Check the limit again to see if the reclaim reduced the 1881 *
1853 * current usage of the cgroup before giving up 1882 * Only for regular pages, though: huge pages are rather
1883 * unlikely to succeed so close to the limit, and we fall back
1884 * to regular pages anyway in case of failure.
1854 */ 1885 */
1855 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 1886 if (csize == PAGE_SIZE && ret)
1856 return CHARGE_RETRY; 1887 return CHARGE_RETRY;
1857 1888
1858 /* 1889 /*
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2144 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2175 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2145 unsigned long flags; 2176 unsigned long flags;
2146 2177
2178 if (mem_cgroup_disabled())
2179 return;
2147 /* 2180 /*
2148 * We have no races with charge/uncharge but will have races with 2181 * We have no races with charge/uncharge but will have races with
2149 * page state accounting. 2182 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2233{ 2266{
2234 int ret = -EINVAL; 2267 int ret = -EINVAL;
2235 unsigned long flags; 2268 unsigned long flags;
2236 2269 /*
 2270 * The page is isolated from the LRU, so the collapse function
 2271 * will not handle it. But page splitting can still happen;
 2272 * do this check under compound_page_lock(), which the caller
 2273 * should hold.
2274 */
2237 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) 2275 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2238 return -EBUSY; 2276 return -EBUSY;
2239 2277
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2265 struct cgroup *cg = child->css.cgroup; 2303 struct cgroup *cg = child->css.cgroup;
2266 struct cgroup *pcg = cg->parent; 2304 struct cgroup *pcg = cg->parent;
2267 struct mem_cgroup *parent; 2305 struct mem_cgroup *parent;
2268 int charge = PAGE_SIZE; 2306 int page_size = PAGE_SIZE;
2269 unsigned long flags; 2307 unsigned long flags;
2270 int ret; 2308 int ret;
2271 2309
@@ -2278,23 +2316,26 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2278 goto out; 2316 goto out;
2279 if (isolate_lru_page(page)) 2317 if (isolate_lru_page(page))
2280 goto put; 2318 goto put;
2281 /* The page is isolated from LRU and we have no race with splitting */ 2319
2282 charge = PAGE_SIZE << compound_order(page); 2320 if (PageTransHuge(page))
2321 page_size = HPAGE_SIZE;
2283 2322
2284 parent = mem_cgroup_from_cont(pcg); 2323 parent = mem_cgroup_from_cont(pcg);
2285 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); 2324 ret = __mem_cgroup_try_charge(NULL, gfp_mask,
2325 &parent, false, page_size);
2286 if (ret || !parent) 2326 if (ret || !parent)
2287 goto put_back; 2327 goto put_back;
2288 2328
2289 if (charge > PAGE_SIZE) 2329 if (page_size > PAGE_SIZE)
2290 flags = compound_lock_irqsave(page); 2330 flags = compound_lock_irqsave(page);
2291 2331
2292 ret = mem_cgroup_move_account(pc, child, parent, true, charge); 2332 ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
2293 if (ret) 2333 if (ret)
2294 mem_cgroup_cancel_charge(parent, charge); 2334 mem_cgroup_cancel_charge(parent, page_size);
2295put_back: 2335
2296 if (charge > PAGE_SIZE) 2336 if (page_size > PAGE_SIZE)
2297 compound_unlock_irqrestore(page, flags); 2337 compound_unlock_irqrestore(page, flags);
2338put_back:
2298 putback_lru_page(page); 2339 putback_lru_page(page);
2299put: 2340put:
2300 put_page(page); 2341 put_page(page);
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2312 gfp_t gfp_mask, enum charge_type ctype) 2353 gfp_t gfp_mask, enum charge_type ctype)
2313{ 2354{
2314 struct mem_cgroup *mem = NULL; 2355 struct mem_cgroup *mem = NULL;
2356 int page_size = PAGE_SIZE;
2315 struct page_cgroup *pc; 2357 struct page_cgroup *pc;
2358 bool oom = true;
2316 int ret; 2359 int ret;
2317 int page_size = PAGE_SIZE;
2318 2360
2319 if (PageTransHuge(page)) { 2361 if (PageTransHuge(page)) {
2320 page_size <<= compound_order(page); 2362 page_size <<= compound_order(page);
2321 VM_BUG_ON(!PageTransHuge(page)); 2363 VM_BUG_ON(!PageTransHuge(page));
2364 /*
2365 * Never OOM-kill a process for a huge page. The
2366 * fault handler will fall back to regular pages.
2367 */
2368 oom = false;
2322 } 2369 }
2323 2370
2324 pc = lookup_page_cgroup(page); 2371 pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2327 return 0; 2374 return 0;
2328 prefetchw(pc); 2375 prefetchw(pc);
2329 2376
2330 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); 2377 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
2331 if (ret || !mem) 2378 if (ret || !mem)
2332 return ret; 2379 return ret;
2333 2380
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
5013static int __init enable_swap_account(char *s) 5060static int __init enable_swap_account(char *s)
5014{ 5061{
5015 /* consider enabled if no parameter or 1 is given */ 5062 /* consider enabled if no parameter or 1 is given */
5016 if (!s || !strcmp(s, "1")) 5063 if (!(*s) || !strcmp(s, "=1"))
5017 really_do_swap_account = 1; 5064 really_do_swap_account = 1;
5018 else if (!strcmp(s, "0")) 5065 else if (!strcmp(s, "=0"))
5019 really_do_swap_account = 0; 5066 really_do_swap_account = 0;
5020 return 1; 5067 return 1;
5021} 5068}
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
5023 5070
5024static int __init disable_swap_account(char *s) 5071static int __init disable_swap_account(char *s)
5025{ 5072{
5026 enable_swap_account("0"); 5073 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5074 enable_swap_account("=0");
5027 return 1; 5075 return 1;
5028} 5076}
5029__setup("noswapaccount", disable_swap_account); 5077__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
233 } 233 }
234 234
235 /* 235 /*
236 * Only all shrink_slab here (which would also 236 * Only call shrink_slab here (which would also shrink other caches) if
237 * shrink other caches) if access is not potentially fatal. 237 * access is not potentially fatal.
238 */ 238 */
239 if (access) { 239 if (access) {
240 int nr; 240 int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
391 read_lock(&tasklist_lock); 389 read_lock(&tasklist_lock);
392 av = page_lock_anon_vma(page); 390 av = page_lock_anon_vma(page);
393 if (av == NULL) /* Not actually mapped anymore */ 391 if (av == NULL) /* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
856 int ret; 854 int ret;
857 int kill = 1; 855 int kill = 1;
858 struct page *hpage = compound_head(p); 856 struct page *hpage = compound_head(p);
857 struct page *ppage;
859 858
860 if (PageReserved(p) || PageSlab(p)) 859 if (PageReserved(p) || PageSlab(p))
861 return SWAP_SUCCESS; 860 return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
897 } 896 }
898 897
899 /* 898 /*
 899 * ppage: the poisoned page to work on.
 900 * If p is a regular (4k) page,
 901 * ppage == the real poisoned page;
 902 * else p is hugetlb or THP, and ppage == the head page.
903 */
904 ppage = hpage;
905
906 if (PageTransHuge(hpage)) {
907 /*
908 * Verify that this isn't a hugetlbfs head page, the check for
 909 * PageAnon is just to avoid tripping a split_huge_page
 910 * internal debug check, as split_huge_page refuses to deal with
 911 * anything that isn't an anon page. PageAnon can't go away from
 912 * under us because we hold a refcount on the hpage; without a
 913 * refcount on the hpage, split_huge_page can't be safely called
 914 * in the first place, and having a refcount on the tail isn't
 915 * enough to be safe.
916 */
917 if (!PageHuge(hpage) && PageAnon(hpage)) {
918 if (unlikely(split_huge_page(hpage))) {
919 /*
 920 * FIXME: if splitting the THP fails, it is
 921 * better to stop the following operation rather
 922 * than cause a panic by unmapping. The system might
 923 * survive if the page is freed later.
924 */
925 printk(KERN_INFO
926 "MCE %#lx: failed to split THP\n", pfn);
927
928 BUG_ON(!PageHWPoison(p));
929 return SWAP_FAIL;
930 }
931 /* THP is split, so ppage should be the real poisoned page. */
932 ppage = p;
933 }
934 }
935
936 /*
900 * First collect all the processes that have the page 937 * First collect all the processes that have the page
901 * mapped in dirty form. This has to be done before try_to_unmap, 938 * mapped in dirty form. This has to be done before try_to_unmap,
902 * because ttu takes the rmap data structures down. 939 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
905 * there's nothing that can be done. 942 * there's nothing that can be done.
906 */ 943 */
907 if (kill) 944 if (kill)
908 collect_procs(hpage, &tokill); 945 collect_procs(ppage, &tokill);
946
947 if (hpage != ppage)
948 lock_page_nosync(ppage);
909 949
910 ret = try_to_unmap(hpage, ttu); 950 ret = try_to_unmap(ppage, ttu);
911 if (ret != SWAP_SUCCESS) 951 if (ret != SWAP_SUCCESS)
912 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 952 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
913 pfn, page_mapcount(hpage)); 953 pfn, page_mapcount(ppage));
954
955 if (hpage != ppage)
956 unlock_page(ppage);
914 957
915 /* 958 /*
916 * Now that the dirty bit has been propagated to the 959 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
921 * use a more force-full uncatchable kill to prevent 964 * use a more force-full uncatchable kill to prevent
922 * any accesses to the poisoned memory. 965 * any accesses to the poisoned memory.
923 */ 966 */
924 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 967 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
925 ret != SWAP_SUCCESS, p, pfn); 968 ret != SWAP_SUCCESS, p, pfn);
926 969
927 return ret; 970 return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1022 * The check (unnecessarily) ignores LRU pages being isolated and 1065 * The check (unnecessarily) ignores LRU pages being isolated and
1023 * walked by the page reclaim code, however that's not a big loss. 1066 * walked by the page reclaim code, however that's not a big loss.
1024 */ 1067 */
1025 if (!PageLRU(p) && !PageHuge(p)) 1068 if (!PageHuge(p) && !PageTransCompound(p)) {
1026 shake_page(p, 0); 1069 if (!PageLRU(p))
1027 if (!PageLRU(p) && !PageHuge(p)) { 1070 shake_page(p, 0);
1028 /* 1071 if (!PageLRU(p)) {
1029 * shake_page could have turned it free. 1072 /*
1030 */ 1073 * shake_page could have turned it free.
1031 if (is_free_buddy_page(p)) { 1074 */
1032 action_result(pfn, "free buddy, 2nd try", DELAYED); 1075 if (is_free_buddy_page(p)) {
1033 return 0; 1076 action_result(pfn, "free buddy, 2nd try",
1077 DELAYED);
1078 return 0;
1079 }
1080 action_result(pfn, "non LRU", IGNORED);
1081 put_page(p);
1082 return -EBUSY;
1034 } 1083 }
1035 action_result(pfn, "non LRU", IGNORED);
1036 put_page(p);
1037 return -EBUSY;
1038 } 1084 }
1039 1085
1040 /* 1086 /*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1064 * For error on the tail page, we should set PG_hwpoison 1110 * For error on the tail page, we should set PG_hwpoison
1065 * on the head page to show that the hugepage is hwpoisoned 1111 * on the head page to show that the hugepage is hwpoisoned
1066 */ 1112 */
1067 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1113 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1068 action_result(pfn, "hugepage already hardware poisoned", 1114 action_result(pfn, "hugepage already hardware poisoned",
1069 IGNORED); 1115 IGNORED);
1070 unlock_page(hpage); 1116 unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1341 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true); 1342 true);
1297 if (ret) { 1343 if (ret) {
1298 putback_lru_pages(&pagelist); 1344 struct page *page1, *page2;
1345 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1346 put_page(page1);
1347
1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1348 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1300 pfn, ret, page->flags); 1349 pfn, ret, page->flags);
1301 if (ret > 0) 1350 if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1468 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true); 1469 0, true);
1421 if (ret) { 1470 if (ret) {
1471 putback_lru_pages(&pagelist);
1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1472 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1423 pfn, ret, page->flags); 1473 pfn, ret, page->flags);
1424 if (ret > 0) 1474 if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2219 &ptl); 2219 &ptl);
2220 if (!pte_same(*page_table, orig_pte)) { 2220 if (!pte_same(*page_table, orig_pte)) {
2221 unlock_page(old_page); 2221 unlock_page(old_page);
2222 page_cache_release(old_page);
2223 goto unlock; 2222 goto unlock;
2224 } 2223 }
2225 page_cache_release(old_page); 2224 page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2289 &ptl); 2288 &ptl);
2290 if (!pte_same(*page_table, orig_pte)) { 2289 if (!pte_same(*page_table, orig_pte)) {
2291 unlock_page(old_page); 2290 unlock_page(old_page);
2292 page_cache_release(old_page);
2293 goto unlock; 2291 goto unlock;
2294 } 2292 }
2295 2293
@@ -2367,16 +2365,6 @@ gotten:
2367 } 2365 }
2368 __SetPageUptodate(new_page); 2366 __SetPageUptodate(new_page);
2369 2367
2370 /*
2371 * Don't let another task, with possibly unlocked vma,
2372 * keep the mlocked page.
2373 */
2374 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2375 lock_page(old_page); /* for LRU manipulation */
2376 clear_page_mlock(old_page);
2377 unlock_page(old_page);
2378 }
2379
2380 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2368 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2381 goto oom_free_new; 2369 goto oom_free_new;
2382 2370
@@ -2444,10 +2432,20 @@ gotten:
2444 2432
2445 if (new_page) 2433 if (new_page)
2446 page_cache_release(new_page); 2434 page_cache_release(new_page);
2447 if (old_page)
2448 page_cache_release(old_page);
2449unlock: 2435unlock:
2450 pte_unmap_unlock(page_table, ptl); 2436 pte_unmap_unlock(page_table, ptl);
2437 if (old_page) {
2438 /*
2439 * Don't let another task, with possibly unlocked vma,
2440 * keep the mlocked page.
2441 */
2442 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2443 lock_page(old_page); /* LRU manipulation */
2444 munlock_vma_page(old_page);
2445 unlock_page(old_page);
2446 }
2447 page_cache_release(old_page);
2448 }
2451 return ret; 2449 return ret;
2452oom_free_new: 2450oom_free_new:
2453 page_cache_release(new_page); 2451 page_cache_release(new_page);
@@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
2650 details.last_index = ULONG_MAX; 2648 details.last_index = ULONG_MAX;
2651 details.i_mmap_lock = &mapping->i_mmap_lock; 2649 details.i_mmap_lock = &mapping->i_mmap_lock;
2652 2650
2651 mutex_lock(&mapping->unmap_mutex);
2653 spin_lock(&mapping->i_mmap_lock); 2652 spin_lock(&mapping->i_mmap_lock);
2654 2653
2655 /* Protect against endless unmapping loops */ 2654 /* Protect against endless unmapping loops */
@@ -2666,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
2666 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2665 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2667 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2666 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2668 spin_unlock(&mapping->i_mmap_lock); 2667 spin_unlock(&mapping->i_mmap_lock);
2668 mutex_unlock(&mapping->unmap_mutex);
2669} 2669}
2670EXPORT_SYMBOL(unmap_mapping_range); 2670EXPORT_SYMBOL(unmap_mapping_range);
2671 2671
@@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3053 goto out; 3053 goto out;
3054 } 3054 }
3055 charged = 1; 3055 charged = 1;
3056 /*
3057 * Don't let another task, with possibly unlocked vma,
3058 * keep the mlocked page.
3059 */
3060 if (vma->vm_flags & VM_LOCKED)
3061 clear_page_mlock(vmf.page);
3062 copy_user_highpage(page, vmf.page, address, vma); 3056 copy_user_highpage(page, vmf.page, address, vma);
3063 __SetPageUptodate(page); 3057 __SetPageUptodate(page);
3064 } else { 3058 } else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1524} 1524}
1525 1525
1526/* Return a zonelist indicated by gfp for node representing a mempolicy */ 1526/* Return a zonelist indicated by gfp for node representing a mempolicy */
1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) 1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528 int nd)
1528{ 1529{
1529 int nd = numa_node_id();
1530
1531 switch (policy->mode) { 1530 switch (policy->mode) {
1532 case MPOL_PREFERRED: 1531 case MPOL_PREFERRED:
1533 if (!(policy->flags & MPOL_F_LOCAL)) 1532 if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1679 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1678 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1680 huge_page_shift(hstate_vma(vma))), gfp_flags); 1679 huge_page_shift(hstate_vma(vma))), gfp_flags);
1681 } else { 1680 } else {
1682 zl = policy_zonelist(gfp_flags, *mpol); 1681 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1683 if ((*mpol)->mode == MPOL_BIND) 1682 if ((*mpol)->mode == MPOL_BIND)
1684 *nodemask = &(*mpol)->v.nodes; 1683 *nodemask = &(*mpol)->v.nodes;
1685 } 1684 }
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1820 */ 1819 */
1821struct page * 1820struct page *
1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1821alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr) 1822 unsigned long addr, int node)
1824{ 1823{
1825 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1824 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1826 struct zonelist *zl; 1825 struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1830 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1829 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1831 unsigned nid; 1830 unsigned nid;
1832 1831
1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1832 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1834 mpol_cond_put(pol); 1833 mpol_cond_put(pol);
1835 page = alloc_page_interleave(gfp, order, nid); 1834 page = alloc_page_interleave(gfp, order, nid);
1836 put_mems_allowed(); 1835 put_mems_allowed();
1837 return page; 1836 return page;
1838 } 1837 }
1839 zl = policy_zonelist(gfp, pol); 1838 zl = policy_zonelist(gfp, pol, node);
1840 if (unlikely(mpol_needs_cond_ref(pol))) { 1839 if (unlikely(mpol_needs_cond_ref(pol))) {
1841 /* 1840 /*
1842 * slow path: ref counted shared policy 1841 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1892 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1891 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1893 else 1892 else
1894 page = __alloc_pages_nodemask(gfp, order, 1893 page = __alloc_pages_nodemask(gfp, order,
1895 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1894 policy_zonelist(gfp, pol, numa_node_id()),
1895 policy_nodemask(gfp, pol));
1896 put_mems_allowed(); 1896 put_mems_allowed();
1897 return page; 1897 return page;
1898} 1898}
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
772unlock: 772unlock:
773 unlock_page(page); 773 unlock_page(page);
774 774
775move_newpage:
775 if (rc != -EAGAIN) { 776 if (rc != -EAGAIN) {
776 /* 777 /*
777 * A page that has been migrated has all references 778 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
785 putback_lru_page(page); 786 putback_lru_page(page);
786 } 787 }
787 788
788move_newpage:
789
790 /* 789 /*
791 * Move the new page to the LRU. If migration was not successful 790 * Move the new page to the LRU. If migration was not successful
792 * then this will free the page. 791 * then this will free the page.
@@ -888,7 +887,7 @@ out:
888 * are movable anymore because to has become empty 887 * are movable anymore because to has become empty
889 * or no retryable pages exist anymore. 888 * or no retryable pages exist anymore.
890 * Caller should call putback_lru_pages to return pages to the LRU 889 * Caller should call putback_lru_pages to return pages to the LRU
891 * or free list. 890 * or free list only if ret != 0.
892 * 891 *
893 * Return: Number of pages not migrated or error code. 892 * Return: Number of pages not migrated or error code.
894 */ 893 */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
981 } 980 }
982 rc = 0; 981 rc = 0;
983out: 982out:
984
985 list_for_each_entry_safe(page, page2, from, lru)
986 put_page(page);
987
988 if (rc) 983 if (rc)
989 return rc; 984 return rc;
990 985
@@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1292 return -EPERM; 1287 return -EPERM;
1293 1288
1294 /* Find the mm_struct */ 1289 /* Find the mm_struct */
1295 read_lock(&tasklist_lock); 1290 rcu_read_lock();
1296 task = pid ? find_task_by_vpid(pid) : current; 1291 task = pid ? find_task_by_vpid(pid) : current;
1297 if (!task) { 1292 if (!task) {
1298 read_unlock(&tasklist_lock); 1293 rcu_read_unlock();
1299 return -ESRCH; 1294 return -ESRCH;
1300 } 1295 }
1301 mm = get_task_mm(task); 1296 mm = get_task_mm(task);
1302 read_unlock(&tasklist_lock); 1297 rcu_read_unlock();
1303 1298
1304 if (!mm) 1299 if (!mm)
1305 return -EINVAL; 1300 return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) 178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
179 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
180 180
181 /*
182 * We want mlock to succeed for regions that have any permissions
183 * other than PROT_NONE.
184 */
185 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
186 gup_flags |= FOLL_FORCE;
187
181 if (vma->vm_flags & VM_LOCKED) 188 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK; 189 gup_flags |= FOLL_MLOCK;
183 190
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
94 */ 94 */
95 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
96 spin_lock(&mapping->i_mmap_lock); 96 spin_lock(&mapping->i_mmap_lock);
97 if (new_vma->vm_truncate_count && 97 new_vma->vm_truncate_count = 0;
98 new_vma->vm_truncate_count != vma->vm_truncate_count)
99 new_vma->vm_truncate_count = 0;
100 } 98 }
101 99
102 /* 100 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..cdef1d4b4e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
1088 pset = per_cpu_ptr(zone->pageset, cpu); 1088 pset = per_cpu_ptr(zone->pageset, cpu);
1089 1089
1090 pcp = &pset->pcp; 1090 pcp = &pset->pcp;
1091 free_pcppages_bulk(zone, pcp->count, pcp); 1091 if (pcp->count) {
1092 pcp->count = 0; 1092 free_pcppages_bulk(zone, pcp->count, pcp);
1093 pcp->count = 0;
1094 }
1093 local_irq_restore(flags); 1095 local_irq_restore(flags);
1094 } 1096 }
1095} 1097}
@@ -2034,6 +2036,14 @@ restart:
2034 */ 2036 */
2035 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2037 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2036 2038
2039 /*
2040 * Find the true preferred zone if the allocation is unconstrained by
2041 * cpusets.
2042 */
2043 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2044 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2045 &preferred_zone);
2046
2037 /* This is the last chance, in general, before the goto nopage. */ 2047 /* This is the last chance, in general, before the goto nopage. */
2038 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2048 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2039 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2049 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2192 2202
2193 get_mems_allowed(); 2203 get_mems_allowed();
2194 /* The preferred zone is used for statistics later */ 2204 /* The preferred zone is used for statistics later */
2195 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2205 first_zones_zonelist(zonelist, high_zoneidx,
2206 nodemask ? : &cpuset_current_mems_allowed,
2207 &preferred_zone);
2196 if (!preferred_zone) { 2208 if (!preferred_zone) {
2197 put_mems_allowed(); 2209 put_mems_allowed();
2198 return NULL; 2210 return NULL;
@@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5364 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5376 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5365 unsigned long check = pfn + iter; 5377 unsigned long check = pfn + iter;
5366 5378
5367 if (!pfn_valid_within(check)) { 5379 if (!pfn_valid_within(check))
5368 iter++;
5369 continue; 5380 continue;
5370 } 5381
5371 page = pfn_to_page(check); 5382 page = pfn_to_page(check);
5372 if (!page_count(page)) { 5383 if (!page_count(page)) {
5373 if (PageBuddy(page)) 5384 if (PageBuddy(page))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2010 Linus Torvalds 6 * Copyright (C) 2010 Linus Torvalds
7 */ 7 */
8 8
9#include <linux/pagemap.h>
9#include <asm/tlb.h> 10#include <asm/tlb.h>
10#include <asm-generic/pgtable.h> 11#include <asm-generic/pgtable.h>
11 12
diff --git a/mm/rmap.c b/mm/rmap.c
index f21f4a1d6a1c..941bf82e8961 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
497 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
498 int referenced = 0; 498 int referenced = 0;
499 499
500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list.
504 */
505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED;
508 goto out;
509 }
510
511 /* Pretend the page is referenced if the task has the
512 swap token and is in the middle of a page fault. */
513 if (mm != current->mm && has_swap_token(mm) &&
514 rwsem_is_locked(&mm->mmap_sem))
515 referenced++;
516
517 if (unlikely(PageTransHuge(page))) { 500 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd; 501 pmd_t *pmd;
519 502
520 spin_lock(&mm->page_table_lock); 503 spin_lock(&mm->page_table_lock);
504 /*
505 * rmap might return false positives; we must filter
506 * these out using page_check_address_pmd().
507 */
521 pmd = page_check_address_pmd(page, mm, address, 508 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG); 509 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) && 510 if (!pmd) {
524 pmdp_clear_flush_young_notify(vma, address, pmd)) 511 spin_unlock(&mm->page_table_lock);
512 goto out;
513 }
514
515 if (vma->vm_flags & VM_LOCKED) {
516 spin_unlock(&mm->page_table_lock);
517 *mapcount = 0; /* break early from loop */
518 *vm_flags |= VM_LOCKED;
519 goto out;
520 }
521
522 /* go ahead even if the pmd is pmd_trans_splitting() */
523 if (pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++; 524 referenced++;
526 spin_unlock(&mm->page_table_lock); 525 spin_unlock(&mm->page_table_lock);
527 } else { 526 } else {
528 pte_t *pte; 527 pte_t *pte;
529 spinlock_t *ptl; 528 spinlock_t *ptl;
530 529
530 /*
531 * rmap might return false positives; we must filter
532 * these out using page_check_address().
533 */
531 pte = page_check_address(page, mm, address, &ptl, 0); 534 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte) 535 if (!pte)
533 goto out; 536 goto out;
534 537
538 if (vma->vm_flags & VM_LOCKED) {
539 pte_unmap_unlock(pte, ptl);
540 *mapcount = 0; /* break early from loop */
541 *vm_flags |= VM_LOCKED;
542 goto out;
543 }
544
535 if (ptep_clear_flush_young_notify(vma, address, pte)) { 545 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /* 546 /*
537 * Don't treat a reference through a sequentially read 547 * Don't treat a reference through a sequentially read
@@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
546 pte_unmap_unlock(pte, ptl); 556 pte_unmap_unlock(pte, ptl);
547 } 557 }
548 558
559 /* Pretend the page is referenced if the task has the
560 swap token and is in the middle of a page fault. */
561 if (mm != current->mm && has_swap_token(mm) &&
562 rwsem_is_locked(&mm->mmap_sem))
563 referenced++;
564
549 (*mapcount)--; 565 (*mapcount)--;
550 566
551 if (referenced) 567 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..3437b65d6d6e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2144{ 2144{
2145 struct inode *inode = dentry->d_inode; 2145 struct inode *inode = dentry->d_inode;
2146 2146
2147 if (*len < 3) 2147 if (*len < 3) {
2148 *len = 3;
2148 return 255; 2149 return 255;
2150 }
2149 2151
2150 if (inode_unhashed(inode)) { 2152 if (inode_unhashed(inode)) {
2151 /* Unfortunately insert_inode_hash is not idempotent, 2153 /* Unfortunately insert_inode_hash is not idempotent,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1940 1940
1941 error = -EINVAL; 1941 error = -EINVAL;
1942 if (S_ISBLK(inode->i_mode)) { 1942 if (S_ISBLK(inode->i_mode)) {
1943 bdev = I_BDEV(inode); 1943 bdev = bdgrab(I_BDEV(inode));
1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, 1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon); 1945 sys_swapon);
1946 if (error < 0) { 1946 if (error < 0) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..d64296be00d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
225 next = start; 225 next = start;
226 while (next <= end && 226 while (next <= end &&
227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
228 mem_cgroup_uncharge_start();
228 for (i = 0; i < pagevec_count(&pvec); i++) { 229 for (i = 0; i < pagevec_count(&pvec); i++) {
229 struct page *page = pvec.pages[i]; 230 struct page *page = pvec.pages[i];
230 pgoff_t page_index = page->index; 231 pgoff_t page_index = page->index;
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 unlock_page(page); 248 unlock_page(page);
248 } 249 }
249 pagevec_release(&pvec); 250 pagevec_release(&pvec);
251 mem_cgroup_uncharge_end();
250 cond_resched(); 252 cond_resched();
251 } 253 }
252 254
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f5d90dedebba..6771ea70bfe7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1842 return false; 1842 return false;
1843 1843
1844 /* 1844 /* Consider stopping depending on scan and reclaim activity */
1845 * If we failed to reclaim and have scanned the full list, stop. 1845 if (sc->gfp_mask & __GFP_REPEAT) {
1846 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far 1846 /*
1847 * faster but obviously would be less likely to succeed 1847 * For __GFP_REPEAT allocations, stop reclaiming if the
1848 * allocation. If this is desirable, use GFP_REPEAT to decide 1848 * full LRU list has been scanned and we are still failing
1849 * if both reclaimed and scanned should be checked or just 1849 * to reclaim pages. This full LRU scan is potentially
1850 * reclaimed 1850 * expensive but a __GFP_REPEAT caller really wants to succeed
1851 */ 1851 */
1852 if (!nr_reclaimed && !nr_scanned) 1852 if (!nr_reclaimed && !nr_scanned)
1853 return false; 1853 return false;
1854 } else {
1855 /*
1856 * For non-__GFP_REPEAT allocations which can presumably
1857 * fail without consequence, stop if we failed to reclaim
1858 * any pages from the last SWAP_CLUSTER_MAX number of
1859 * pages that were scanned. This will return to the
 1860 * caller faster at the risk that reclaim/compaction and
 1861 * the resulting allocation attempt fail.
1862 */
1863 if (!nr_reclaimed)
1864 return false;
1865 }
1854 1866
1855 /* 1867 /*
1856 * If we have not reclaimed enough pages for compaction and the 1868 * If we have not reclaimed enough pages for compaction and the
@@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone,
1882 unsigned long nr[NR_LRU_LISTS]; 1894 unsigned long nr[NR_LRU_LISTS];
1883 unsigned long nr_to_scan; 1895 unsigned long nr_to_scan;
1884 enum lru_list l; 1896 enum lru_list l;
1885 unsigned long nr_reclaimed; 1897 unsigned long nr_reclaimed, nr_scanned;
1886 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1898 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1887 unsigned long nr_scanned = sc->nr_scanned;
1888 1899
1889restart: 1900restart:
1890 nr_reclaimed = 0; 1901 nr_reclaimed = 0;
1902 nr_scanned = sc->nr_scanned;
1891 get_scan_count(zone, sc, nr, priority); 1903 get_scan_count(zone, sc, nr, priority);
1892 1904
1893 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1905 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2083 struct zone *preferred_zone; 2095 struct zone *preferred_zone;
2084 2096
2085 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2097 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2086 NULL, &preferred_zone); 2098 &cpuset_current_mems_allowed,
2099 &preferred_zone);
2087 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2100 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2088 } 2101 }
2089 } 2102 }