author     Sascha Hauer <s.hauer@pengutronix.de>   2011-02-11 02:32:18 -0500
committer  Sascha Hauer <s.hauer@pengutronix.de>   2011-02-11 02:33:14 -0500
commit     f19693a17c6705e197eb24d4618060eaac1b535c (patch)
tree       fc39dc23297c0e6be730cb0dfd74a34d9c0b8bfd /mm
parent     23b120cdfae4f5c29da69de750d545bad719ead4 (diff)
parent     100b33c8bd8a3235fd0b7948338d6cbb3db3c63d (diff)
Merge commit 'v2.6.38-rc4' into imx-for-2.6.39
Conflicts:
	arch/arm/mach-mxs/clock-mx28.c

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              2
-rw-r--r--  mm/compaction.c        11
-rw-r--r--  mm/huge_memory.c       12
-rw-r--r--  mm/kmemleak-test.c      6
-rw-r--r--  mm/kmemleak.c          13
-rw-r--r--  mm/memblock.c           8
-rw-r--r--  mm/memcontrol.c       266
-rw-r--r--  mm/memory-failure.c    94
-rw-r--r--  mm/migrate.c            9
-rw-r--r--  mm/mlock.c              7
-rw-r--r--  mm/page_alloc.c        18
-rw-r--r--  mm/pgtable-generic.c    1
-rw-r--r--  mm/truncate.c          11
-rw-r--r--  mm/vmscan.c             4
14 files changed, 312 insertions, 150 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 6d592a021072..8be430b812de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -406,6 +406,10 @@ static int compact_finished(struct zone *zone,
 	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
 		return COMPACT_CONTINUE;
 
+	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
@@ -454,6 +458,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 		return COMPACT_SKIPPED;
 
 	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 004c9c2aac78..b6c1ce3c53b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
 
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		/*
+		 * retain hwpoison flag of the poisoned tail page:
+		 *   fix for the unsuitable process killed on Guest Machine(KVM)
+		 *   by the memory-failure.
+		 */
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
@@ -1203,6 +1208,8 @@ static void __split_huge_page_refcount(struct page *page)
 		BUG_ON(!PageDirty(page_tail));
 		BUG_ON(!PageSwapBacked(page_tail));
 
+		mem_cgroup_split_huge_fixup(page, page_tail);
+
 		lru_add_page_tail(zone, page, page_tail);
 	}
 
@@ -1837,9 +1844,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
 	spin_unlock(ptl);
-	pte_unmap(pte);
 
 	if (unlikely(!isolated)) {
+		pte_unmap(pte);
 		spin_lock(&mm->page_table_lock);
 		BUG_ON(!pmd_none(*pmd));
 		set_pmd_at(mm, address, pmd, _pmd);
@@ -1856,6 +1863,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	anon_vma_unlock(vma->anon_vma);
 
 	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+	pte_unmap(pte);
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
 	VM_BUG_ON(page_count(pgtable) != 1);
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
 	 * after the module is removed.
 	 */
 	for (i = 0; i < 10; i++) {
-		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
-		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
 		if (!elem)
 			return -ENOMEM;
-		memset(elem, 0, sizeof(*elem));
 		INIT_LIST_HEAD(&elem->list);
-
 		list_add_tail(&elem->list, &test_list);
 	}
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK	(GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
+				 __GFP_NOWARN)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
 
-	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
-		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
 		return NULL;
 	}
 
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 	if (!area) {
-		kmemleak_warn("Cannot allocate a scan area\n");
+		pr_warning("Cannot allocate a scan area\n");
 		goto out;
 	}
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 400dc62697d7..bdba245d8afd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -683,13 +683,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
-	int idx = memblock_search(&memblock.reserved, base);
+	int idx = memblock_search(&memblock.memory, base);
 
 	if (idx == -1)
 		return 0;
-	return memblock.reserved.regions[idx].base <= base &&
-		(memblock.reserved.regions[idx].base +
-		 memblock.reserved.regions[idx].size) >= (base + size);
+	return memblock.memory.regions[idx].base <= base &&
+		(memblock.memory.regions[idx].base +
+		 memblock.memory.regions[idx].size) >= (base + size);
 }
 
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ab841031436..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -600,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 bool charge)
+					 bool file, int nr_pages)
 {
-	int val = (charge) ? 1 : -1;
-
 	preempt_disable();
 
-	if (PageCgroupCache(pc))
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+	if (file)
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-	if (charge)
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
 	preempt_enable();
 }
@@ -815,7 +816,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
@@ -836,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc))
+		return;
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
+	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -857,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
@@ -1030,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return NULL;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
 	if (!mz)
 		return NULL;
@@ -1119,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1615,7 +1626,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	if (unlikely(!mem || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem))) {
+	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
 		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
@@ -1840,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	if (likely(!ret))
 		return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2084,14 +2107,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 enum charge_type ctype)
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+				       struct page_cgroup *pc,
+				       enum charge_type ctype,
+				       int page_size)
 {
+	int nr_pages = page_size >> PAGE_SHIFT;
+
+	/* try_charge() can return NULL to *memcg, taking care of it. */
+	if (!mem)
+		return;
+
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
+		mem_cgroup_cancel_charge(mem, page_size);
+		return;
+	}
+	/*
+	 * we don't need page_cgroup_lock about tail pages, becase they are not
+	 * accessed by any other context at this point.
+	 */
 	pc->mem_cgroup = mem;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2115,43 +2151,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, true);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+	unlock_page_cgroup(pc);
+	/*
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
+	 */
+	memcg_check_events(mem, pc->page);
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-				       struct page_cgroup *pc,
-				       enum charge_type ctype,
-				       int page_size)
-{
-	int i;
-	int count = page_size >> PAGE_SHIFT;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-	/* try_charge() can return NULL to *memcg, taking care of it. */
-	if (!mem)
-		return;
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+	struct page_cgroup *head_pc = lookup_page_cgroup(head);
+	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+	unsigned long flags;
 
-	lock_page_cgroup(pc);
-	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem, page_size);
+	if (mem_cgroup_disabled())
 		return;
-	}
-
 	/*
-	 * we don't need page_cgroup_lock about tail pages, becase they are not
-	 * accessed by any other context at this point.
+	 * We have no races with charge/uncharge but will have races with
+	 * page state accounting.
 	 */
-	for (i = 0; i < count; i++)
-		____mem_cgroup_commit_charge(mem, pc + i, ctype);
+	move_lock_page_cgroup(head_pc, &flags);
 
-	unlock_page_cgroup(pc);
-	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
-	 */
-	memcg_check_events(mem, pc->page);
+	tail_pc->mem_cgroup = head_pc->mem_cgroup;
+	smp_wmb(); /* see __commit_charge() */
+	if (PageCgroupAcctLRU(head_pc)) {
+		enum lru_list lru;
+		struct mem_cgroup_per_zone *mz;
+
+		/*
+		 * LRU flags cannot be copied because we need to add tail
+		 *.page to LRU by generic call and our hook will be called.
+		 * We hold lru_lock, then, reduce counter directly.
+		 */
+		lru = page_lru(head);
+		mz = page_cgroup_zoneinfo(head_pc);
+		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	}
+	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+	move_unlock_page_cgroup(head_pc, &flags);
 }
+#endif
 
 /**
  * __mem_cgroup_move_account - move account of the page
@@ -2171,8 +2221,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+	int charge_size)
 {
+	int nr_pages = charge_size >> PAGE_SHIFT;
+
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!page_is_cgroup_locked(pc));
@@ -2186,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, pc, false);
+	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
-		mem_cgroup_cancel_charge(from, PAGE_SIZE);
+		mem_cgroup_cancel_charge(from, charge_size);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, pc, true);
+	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2208,15 +2261,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to,
+	bool uncharge, int charge_size)
 {
 	int ret = -EINVAL;
 	unsigned long flags;
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
+	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+		return -EBUSY;
 
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		move_lock_page_cgroup(pc, &flags);
-		__mem_cgroup_move_account(pc, from, to, uncharge);
+		__mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
 		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
@@ -2241,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
+	int page_size = PAGE_SIZE;
+	unsigned long flags;
 	int ret;
 
 	/* Is ROOT ? */
@@ -2253,15 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (isolate_lru_page(page))
 		goto put;
 
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
+
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
-				      PAGE_SIZE);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				      &parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	ret = mem_cgroup_move_account(pc, child, parent, true);
+	if (page_size > PAGE_SIZE)
+		flags = compound_lock_irqsave(page);
+
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, PAGE_SIZE);
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
+		compound_unlock_irqrestore(page, flags);
 put_back:
 	putback_lru_page(page);
 put:
@@ -2280,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page.  The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2295,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -2546,7 +2625,6 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-	int i;
 	int count;
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
@@ -2596,8 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	for (i = 0; i < count; i++)
-		mem_cgroup_charge_statistics(mem, pc + i, false);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -4844,7 +4921,7 @@ retry:
 			goto put;
 		pc = lookup_page_cgroup(page);
 		if (!mem_cgroup_move_account(pc,
-					mc.from, mc.to, false)) {
+					mc.from, mc.to, false, PAGE_SIZE)) {
 			mc.precharge--;
 			/* we uncharge from mc.from later. */
 			mc.moved_charge++;
@@ -4983,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -4993,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	if (!PageHuge(page) && unlikely(split_huge_page(page)))
-		return;
 	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int kill = 1;
 	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	}
 
 	/*
+	 * ppage: poisoned page
+	 *   if p is regular page(4k page)
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just for avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away fro
+		 * under us because we hold a refcount on the hpage, without a
+		 * refcount on the hpage. split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough * to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP is failed, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
+
+	if (hpage != ppage)
+		lock_page_nosync(ppage);
 
-	ret = try_to_unmap(hpage, ttu);
+	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(hpage));
+				pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
 		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
+						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
 		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
 	}
 
 	/*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
 		action_result(pfn, "hugepage already hardware poisoned",
 			      IGNORED);
 		unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
 				true);
 	if (ret) {
-		putback_lru_pages(&pagelist);
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
 	if (ret) {
+		putback_lru_pages(&pagelist);
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..766115253807 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -888,7 +887,7 @@ out:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  * Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
  *
  * Return: Number of pages not migrated or error code.
  */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
 	}
 	rc = 0;
 out:
-
-	list_for_each_entry_safe(page, page2, from, lru)
-		put_page(page);
-
 	if (rc)
 		return rc;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
 		gup_flags |= FOLL_WRITE;
 
+	/*
+	 * We want mlock to succeed for regions that have any permissions
+	 * other than PROT_NONE.
+	 */
+	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+		gup_flags |= FOLL_FORCE;
+
 	if (vma->vm_flags & VM_LOCKED)
 		gup_flags |= FOLL_MLOCK;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..a873e61e312e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -2034,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2010  Linus Torvalds
  */
 
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 3c2d5ddfa0d4..49feb46e77b8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -549,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache);
  * @inode: inode
  * @newsize: new file size
  *
- * truncate_setsize updastes i_size update and performs pagecache
- * truncation (if necessary) for a file size updates. It will be
- * typically be called from the filesystem's setattr function when
- * ATTR_SIZE is passed in.
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will be typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
  *
- * Must be called with inode_mutex held and after all filesystem
- * specific block truncation has been performed.
+ * Must be called with inode_mutex held and before all filesystem specific
+ * block truncation has been performed.
  */
 void truncate_setsize(struct inode *inode, loff_t newsize)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 47a50962ce81..148c6e630df2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,7 +41,6 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
-#include <linux/compaction.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2084,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			struct zone *preferred_zone;
 
 			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-							NULL, &preferred_zone);
+						&cpuset_current_mems_allowed,
+						&preferred_zone);
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 		}
 	}