author		Pekka Enberg <penberg@kernel.org>	2011-03-11 11:10:45 -0500
committer	Pekka Enberg <penberg@kernel.org>	2011-03-11 11:10:45 -0500
commit		c9149556756d56c68451a4a8735c37e7062fd3d7 (patch)
tree		a2dae56b22adaa9a23c8f92f30c3b3ad3b610850 /mm
parent		d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 (diff)
parent		5bfe53a77e8a3ffce4a10003c75f464a138e272d (diff)
Merge branch 'slab/rcu' into slab/next
Conflicts: mm/slub.c
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig		|  2
-rw-r--r--	mm/huge_memory.c	| 72
-rw-r--r--	mm/kmemleak-test.c	|  6
-rw-r--r--	mm/kmemleak.c		| 13
-rw-r--r--	mm/memblock.c		|  2
-rw-r--r--	mm/memcontrol.c		| 98
-rw-r--r--	mm/memory-failure.c	| 94
-rw-r--r--	mm/memory.c		| 34
-rw-r--r--	mm/mempolicy.c		| 16
-rw-r--r--	mm/migrate.c		| 15
-rw-r--r--	mm/mlock.c		|  7
-rw-r--r--	mm/mremap.c		|  4
-rw-r--r--	mm/page_alloc.c		| 23
-rw-r--r--	mm/pgtable-generic.c	|  1
-rw-r--r--	mm/slab.c		| 39
-rw-r--r--	mm/slub.c		| 77
-rw-r--r--	mm/swapfile.c		|  2
-rw-r--r--	mm/truncate.c		|  2
-rw-r--r--	mm/vmscan.c		| 39
19 files changed, 356 insertions(+), 190 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..dbe99a5f2073 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					       struct vm_area_struct *vma,
-					       unsigned long haddr)
+					       unsigned long haddr, int nd)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
 	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-				  vma, haddr);
+				  vma, haddr, numa_node_id());
 	if (unlikely(!page))
 		goto out;
 	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-					  vma, address);
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
+					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
 						       GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr);
+					      vma, haddr, numa_node_id());
 	else
 		new_page = NULL;
 
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
 	/* after clearing PageTail the gup refcount can be released */
 	smp_mb();
 
-	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	/*
+	 * retain hwpoison flag of the poisoned tail page:
+	 * fix for the unsuitable process killed on Guest Machine(KVM)
+	 * by the memory-failure.
+	 */
+	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
 	page_tail->flags |= (page->flags &
 			     ((1L << PG_referenced) |
 			      (1L << PG_swapbacked) |
@@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
 			       struct page **hpage,
-			       struct vm_area_struct *vma)
+			       struct vm_area_struct *vma,
+			       int node)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1768,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+				      node);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -1806,6 +1813,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
 	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
 		goto out;
+	if (is_vma_temporary_stack(vma))
+		goto out;
 	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 	pgd = pgd_offset(mm, address);
@@ -1847,7 +1856,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 		set_pmd_at(mm, address, pmd, _pmd);
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
-		mem_cgroup_uncharge_page(new_page);
 		goto out;
 	}
 
@@ -1893,6 +1901,7 @@ out_up_write:
 	return;
 
 out:
+	mem_cgroup_uncharge_page(new_page);
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
@@ -1912,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
+	int node = -1;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1942,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+		/*
+		 * Chose the node of the first page. This could
+		 * be more sophisticated and look at more pages,
+		 * but isn't for now.
+		 */
+		if (node == -1)
+			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -1958,7 +1975,7 @@ out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret)
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma);
+		collapse_huge_page(mm, address, hpage, vma, node);
 out:
 	return ret;
 }
@@ -2027,32 +2044,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
 		     !khugepaged_always()) ||
 		    (vma->vm_flags & VM_NOHUGEPAGE)) {
+		skip:
 			progress++;
 			continue;
 		}
-
 		/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-		if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
-			khugepaged_scan.address = vma->vm_end;
-			progress++;
-			continue;
-		}
+		if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+			goto skip;
+		if (is_vma_temporary_stack(vma))
+			goto skip;
+
 		VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 		hend = vma->vm_end & HPAGE_PMD_MASK;
-		if (hstart >= hend) {
-			progress++;
-			continue;
-		}
+		if (hstart >= hend)
+			goto skip;
+		if (khugepaged_scan.address > hend)
+			goto skip;
 		if (khugepaged_scan.address < hstart)
 			khugepaged_scan.address = hstart;
-		if (khugepaged_scan.address > hend) {
-			khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-			progress++;
-			continue;
-		}
-		BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
 		while (khugepaged_scan.address < hend) {
 			int ret;
@@ -2081,7 +2093,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
 
 	spin_lock(&khugepaged_mm_lock);
-	BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
 	/*
 	 * Release the current mm_slot if this mm is about to die, or
 	 * if we scanned all vmas of this mm.
@@ -2236,9 +2248,9 @@ static int khugepaged(void *none)
 
 	for (;;) {
 		mutex_unlock(&khugepaged_mutex);
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 		khugepaged_loop();
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 
 		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_enabled())
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
 	 * after the module is removed.
 	 */
 	for (i = 0; i < 10; i++) {
-		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
-		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
 		if (!elem)
 			return -ENOMEM;
-		memset(elem, 0, sizeof(*elem));
 		INIT_LIST_HEAD(&elem->list);
-
 		list_add_tail(&elem->list, &test_list);
 	}
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK	(GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
+				 __GFP_NOWARN)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
 
-	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
-		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
 		return NULL;
 	}
 
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 	if (!area) {
-		kmemleak_warn("Cannot allocate a scan area\n");
+		pr_warning("Cannot allocate a scan area\n");
 		goto out;
 	}
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
 
 	BUG_ON(0 == size);
 
-	size = memblock_align_up(size, align);
-
 	/* Pump up max_addr */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
 		end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+		nr_pages = -nr_pages; /* for event */
+	}
 
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	if (likely(!ret))
 		return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
 	unsigned long flags;
 
+	if (mem_cgroup_disabled())
+		return;
 	/*
 	 * We have no races with charge/uncharge but will have races with
 	 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 {
 	int ret = -EINVAL;
 	unsigned long flags;
-
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
 	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
 		return -EBUSY;
 
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
-	int charge = PAGE_SIZE;
+	int page_size = PAGE_SIZE;
 	unsigned long flags;
 	int ret;
 
@@ -2278,23 +2316,26 @@
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
-	/* The page is isolated from LRU and we have no race with splitting */
-	charge = PAGE_SIZE << compound_order(page);
+
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
 
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				&parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	if (charge > PAGE_SIZE)
+	if (page_size > PAGE_SIZE)
 		flags = compound_lock_irqsave(page);
 
-	ret = mem_cgroup_move_account(pc, child, parent, true, charge);
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, charge);
-put_back:
-	if (charge > PAGE_SIZE)
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
 		compound_unlock_irqrestore(page, flags);
+put_back:
 	putback_lru_page(page);
 put:
 	put_page(page);
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page.  The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	if (!PageHuge(page) && unlikely(split_huge_page(page)))
-		return;
 	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int kill = 1;
 	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	}
 
 	/*
+	 * ppage: poisoned page
+	 *   if p is regular page(4k page)
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just for avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away fro
+		 * under us because we hold a refcount on the hpage, without a
+		 * refcount on the hpage. split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough * to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP is failed, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
+
+	if (hpage != ppage)
+		lock_page_nosync(ppage);
 
-	ret = try_to_unmap(hpage, ttu);
+	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(hpage));
+				pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
 		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
+						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
 		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
 	}
 
 	/*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
 		action_result(pfn, "hugepage already hardware poisoned",
 				IGNORED);
 		unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
 				true);
 	if (ret) {
-		putback_lru_pages(&pagelist);
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 				0, true);
 	if (ret) {
+		putback_lru_pages(&pagelist);
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				 &ptl);
 	if (!pte_same(*page_table, orig_pte)) {
 		unlock_page(old_page);
-		page_cache_release(old_page);
 		goto unlock;
 	}
 	page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				 &ptl);
 	if (!pte_same(*page_table, orig_pte)) {
 		unlock_page(old_page);
-		page_cache_release(old_page);
 		goto unlock;
 	}
 
@@ -2367,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2444,10 +2432,20 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (old_page) {
+		/*
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
+		 */
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
+		}
+		page_cache_release(old_page);
+	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
@@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
 	details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2666,6 +2665,7 @@
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out;
 		}
 		charged = 1;
-		/*
-		 * Don't let another task, with possibly unlocked vma,
-		 * keep the mlocked page.
-		 */
-		if (vma->vm_flags & VM_LOCKED)
-			clear_page_mlock(vmf.page);
 		copy_user_highpage(page, vmf.page, address, vma);
 		__SetPageUptodate(page);
 	} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+			policy_zonelist(gfp, pol, numa_node_id()),
+			policy_nodemask(gfp, pol));
 	put_mems_allowed();
 	return page;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -888,7 +887,7 @@ out:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  * Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
  *
  * Return: Number of pages not migrated or error code.
  */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
 	}
 	rc = 0;
 out:
-
-	list_for_each_entry_safe(page, page2, from, lru)
-		put_page(page);
-
 	if (rc)
 		return rc;
 
@@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 		return -EPERM;
 
 	/* Find the mm_struct */
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = pid ? find_task_by_vpid(pid) : current;
 	if (!task) {
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	mm = get_task_mm(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	if (!mm)
 		return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
 		gup_flags |= FOLL_WRITE;
 
+	/*
+	 * We want mlock to succeed for regions that have any permissions
+	 * other than PROT_NONE.
+	 */
+	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+		gup_flags |= FOLL_FORCE;
+
 	if (vma->vm_flags & VM_LOCKED)
 		gup_flags |= FOLL_MLOCK;
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		 */
 		mapping = vma->vm_file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
-		if (new_vma->vm_truncate_count &&
-		    new_vma->vm_truncate_count != vma->vm_truncate_count)
-			new_vma->vm_truncate_count = 0;
+		new_vma->vm_truncate_count = 0;
 	}
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..cdef1d4b4e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -2034,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
@@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
 		unsigned long check = pfn + iter;
 
-		if (!pfn_valid_within(check)) {
-			iter++;
+		if (!pfn_valid_within(check))
 			continue;
-		}
+
 		page = pfn_to_page(check);
 		if (!page_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2010  Linus Torvalds
  */
 
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/mm/slab.c b/mm/slab.c
index 4bab2d1a8291..7d92f08b88d7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
 #define SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
 /*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
-	struct list_head list;
-	unsigned long colouroff;
-	void *s_mem;		/* including colour offset */
-	unsigned int inuse;	/* num of objs active in slab */
-	kmem_bufctl_t free;
-	unsigned short nodeid;
-};
-
-/*
  * struct slab_rcu
  *
  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
  *
  * rcu_read_lock before reading the address, then rcu_read_unlock after
  * taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
  */
 struct slab_rcu {
 	struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
 };
 
 /*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+	union {
+		struct {
+			struct list_head list;
+			unsigned long colouroff;
+			void *s_mem;		/* including colour offset */
+			unsigned int inuse;	/* num of objs active in slab */
+			kmem_bufctl_t free;
+			unsigned short nodeid;
+		};
+		struct slab_rcu __slab_cover_slab_rcu;
+	};
+};
+
+/*
  * struct array_cache
  *
  * Purpose:
diff --git a/mm/slub.c b/mm/slub.c
index ea6f0390996f..e841d8921c22 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -305,11 +305,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 	return s->size;
 }
 
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+	return ((PAGE_SIZE << order) - reserved) / size;
+}
+
 static inline struct kmem_cache_order_objects oo_make(int order,
-						unsigned long size)
+		unsigned long size, int reserved)
 {
 	struct kmem_cache_order_objects x = {
-		(order << OO_SHIFT) + (PAGE_SIZE << order) / size
+		(order << OO_SHIFT) + order_objects(order, size, reserved)
 	};
 
 	return x;
@@ -641,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 		return 1;
 
 	start = page_address(page);
-	length = (PAGE_SIZE << compound_order(page));
+	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 	end = start + length;
 	remainder = length % s->size;
 	if (!remainder)
@@ -722,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
 		return 0;
 	}
 
-	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+	maxobj = order_objects(compound_order(page), s->size, s->reserved);
 	if (page->objects > maxobj) {
 		slab_err(s, page, "objects %u > max %u",
 			s->name, page->objects, maxobj);
@@ -772,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 		nr++;
 	}
 
-	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+	max_objects = order_objects(compound_order(page), s->size, s->reserved);
 	if (max_objects > MAX_OBJS_PER_PAGE)
 		max_objects = MAX_OBJS_PER_PAGE;
 
@@ -1273,21 +1278,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__free_pages(page, order);
 }
 
+#define need_reserve_slab_rcu						\
+	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
 static void rcu_free_slab(struct rcu_head *h)
 {
 	struct page *page;
 
-	page = container_of((struct list_head *)h, struct page, lru);
+	if (need_reserve_slab_rcu)
+		page = virt_to_head_page(h);
+	else
+		page = container_of((struct list_head *)h, struct page, lru);
+
 	__free_slab(page->slab, page);
 }
 
 static void free_slab(struct kmem_cache *s, struct page *page)
 {
 	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
-		/*
-		 * RCU free overloads the RCU head over the LRU
-		 */
-		struct rcu_head *head = (void *)&page->lru;
+		struct rcu_head *head;
+
+		if (need_reserve_slab_rcu) {
+			int order = compound_order(page);
+			int offset = (PAGE_SIZE << order) - s->reserved;
+
+			VM_BUG_ON(s->reserved != sizeof(*head));
+			head = page_address(page) + offset;
+		} else {
+			/*
+			 * RCU free overloads the RCU head over the LRU
+			 */
+			head = (void *)&page->lru;
+		}
 
 		call_rcu(head, rcu_free_slab);
 	} else
@@ -2012,13 +2034,13 @@ static int slub_nomerge;
  * the smallest order which will fit the object.
  */
 static inline int slab_order(int size, int min_objects,
-				int max_order, int fract_leftover)
+				int max_order, int fract_leftover, int reserved)
 {
 	int order;
 	int rem;
 	int min_order = slub_min_order;
 
-	if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
 		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 
 	for (order = max(min_order,
@@ -2027,10 +2049,10 @@ static inline int slab_order(int size, int min_objects,
 
 		unsigned long slab_size = PAGE_SIZE << order;
 
-		if (slab_size < min_objects * size)
+		if (slab_size < min_objects * size + reserved)
 			continue;
 
-		rem = slab_size % size;
+		rem = (slab_size - reserved) % size;
 
 		if (rem <= slab_size / fract_leftover)
 			break;
@@ -2040,7 +2062,7 @@ static inline int slab_order(int size, int min_objects,
 	return order;
 }
 
-static inline int calculate_order(int size)
+static inline int calculate_order(int size, int reserved)
 {
 	int order;
 	int min_objects;
@@ -2058,14 +2080,14 @@ static inline int calculate_order(int size)
 	min_objects = slub_min_objects;
 	if (!min_objects)
 		min_objects = 4 * (fls(nr_cpu_ids) + 1);
-	max_objects = (PAGE_SIZE << slub_max_order)/size;
+	max_objects = order_objects(slub_max_order, size, reserved);
 	min_objects = min(min_objects, max_objects);
 
 	while (min_objects > 1) {
 		fraction = 16;
 		while (fraction >= 4) {
 			order = slab_order(size, min_objects,
-					slub_max_order, fraction);
+					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
 				return order;
 			fraction /= 2;
@@ -2077,14 +2099,14 @@ static inline int calculate_order(int size)
 	 * We were unable to place multiple objects in a slab. Now
 	 * lets see if we can place a single object there.
 	 */
-	order = slab_order(size, 1, slub_max_order, 1);
+	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
 		return order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = slab_order(size, 1, MAX_ORDER, 1);
+	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
 	if (order < MAX_ORDER)
 		return order;
 	return -ENOSYS;
@@ -2335,7 +2357,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size);
+		order = calculate_order(size, s->reserved);
 
 	if (order < 0)
 		return 0;
@@ -2353,8 +2375,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	/*
 	 * Determine the number of objects per slab
 	 */
-	s->oo = oo_make(order, size);
-	s->min = oo_make(get_order(size), size);
+	s->oo = oo_make(order, size, s->reserved);
+	s->min = oo_make(get_order(size), size, s->reserved);
 	if (oo_objects(s->oo) > oo_objects(s->max))
 		s->max = s->oo;
 
@@ -2373,6 +2395,10 @@ static int kmem_cache_open(struct kmem_cache *s,
 	s->objsize = size;
 	s->align = align;
 	s->flags = kmem_cache_flags(size, flags, name, ctor);
+	s->reserved = 0;
+
+	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+		s->reserved = sizeof(struct rcu_head);
 
 	if (!calculate_sizes(s, -1))
 		goto error;
@@ -4014,6 +4040,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
 }
 SLAB_ATTR_RO(destroy_by_rcu);
 
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
 #ifdef CONFIG_SLUB_DEBUG
 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 {
@@ -4300,6 +4332,7 @@ static struct attribute *slab_attrs[] = {
 	&reclaim_account_attr.attr,
 	&destroy_by_rcu_attr.attr,
 	&shrink_attr.attr,
+	&reserved_attr.attr,
 #ifdef CONFIG_SLUB_DEBUG
 	&total_objects_attr.attr,
 	&slabs_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	error = -EINVAL;
 	if (S_ISBLK(inode->i_mode)) {
-		bdev = I_BDEV(inode);
+		bdev = bdgrab(I_BDEV(inode));
 		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
 				   sys_swapon);
 		if (error < 0) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..d64296be00d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	next = start;
 	while (next <= end &&
 	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f5d90dedebba..6771ea70bfe7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
 		return false;
 
-	/*
-	 * If we failed to reclaim and have scanned the full list, stop.
-	 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
-	 * faster but obviously would be less likely to succeed
-	 * allocation. If this is desirable, use GFP_REPEAT to decide
-	 * if both reclaimed and scanned should be checked or just
-	 * reclaimed
-	 */
-	if (!nr_reclaimed && !nr_scanned)
-		return false;
+	/* Consider stopping depending on scan and reclaim activity */
+	if (sc->gfp_mask & __GFP_REPEAT) {
+		/*
+		 * For __GFP_REPEAT allocations, stop reclaiming if the
+		 * full LRU list has been scanned and we are still failing
+		 * to reclaim pages. This full LRU scan is potentially
+		 * expensive but a __GFP_REPEAT caller really wants to succeed
+		 */
+		if (!nr_reclaimed && !nr_scanned)
+			return false;
+	} else {
+		/*
+		 * For non-__GFP_REPEAT allocations which can presumably
+		 * fail without consequence, stop if we failed to reclaim
+		 * any pages from the last SWAP_CLUSTER_MAX number of
+		 * pages that were scanned. This will return to the
+		 * caller faster at the risk reclaim/compaction and
+		 * the resulting allocation attempt fails
+		 */
+		if (!nr_reclaimed)
+			return false;
+	}
 
 	/*
 	 * If we have not reclaimed enough pages for compaction and the
@@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list l;
-	unsigned long nr_reclaimed;
+	unsigned long nr_reclaimed, nr_scanned;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	unsigned long nr_scanned = sc->nr_scanned;
 
 restart:
 	nr_reclaimed = 0;
+	nr_scanned = sc->nr_scanned;
 	get_scan_count(zone, sc, nr, priority);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		struct zone *preferred_zone;
 
 		first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-					NULL, &preferred_zone);
+					&cpuset_current_mems_allowed,
+					&preferred_zone);
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 	}
 }