aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-06-21 01:29:08 -0400
committerDavid S. Miller <davem@davemloft.net>2011-06-21 01:29:08 -0400
commit9f6ec8d697c08963d83880ccd35c13c5ace716ea (patch)
treead8d93cf6fcdd09b86ade09f5fcbbc66cdb1cca2 /mm
parent4aa3a715551c93eda32d79bd52042ce500bd5383 (diff)
parent56299378726d5f2ba8d3c8cbbd13cb280ba45e4f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts: drivers/net/wireless/iwlwifi/iwl-agn-rxon.c drivers/net/wireless/rtlwifi/pci.c net/netfilter/ipvs/ip_vs_core.c
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c76
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/huge_memory.c5
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/memcontrol.c81
-rw-r--r--mm/memory-failure.c4
-rw-r--r--mm/memory.c8
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/page_cgroup.c71
-rw-r--r--mm/rmap.c106
-rw-r--r--mm/slab.c9
-rw-r--r--mm/slub.c12
-rw-r--r--mm/thrash.c105
-rw-r--r--mm/vmscan.c20
17 files changed, 383 insertions, 154 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9e..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
144 int nr_freepages = cc->nr_freepages; 144 int nr_freepages = cc->nr_freepages;
145 struct list_head *freelist = &cc->freepages; 145 struct list_head *freelist = &cc->freepages;
146 146
147 /*
148 * Initialise the free scanner. The starting point is where we last
149 * scanned from (or the end of the zone if starting). The low point
150 * is the end of the pageblock the migration scanner is using.
151 */
147 pfn = cc->free_pfn; 152 pfn = cc->free_pfn;
148 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 153 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
149 high_pfn = low_pfn; 154
155 /*
156 * Take care that if the migration scanner is at the end of the zone
157 * that the free scanner does not accidentally move to the next zone
158 * in the next isolation cycle.
159 */
160 high_pfn = min(low_pfn, pfn);
150 161
151 /* 162 /*
152 * Isolate free pages until enough are available to migrate the 163 * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
240 return isolated > (inactive + active) / 2; 251 return isolated > (inactive + active) / 2;
241} 252}
242 253
254/* possible outcome of isolate_migratepages */
255typedef enum {
256 ISOLATE_ABORT, /* Abort compaction now */
257 ISOLATE_NONE, /* No pages isolated, continue scanning */
258 ISOLATE_SUCCESS, /* Pages isolated, migrate */
259} isolate_migrate_t;
260
243/* 261/*
244 * Isolate all pages that can be migrated from the block pointed to by 262 * Isolate all pages that can be migrated from the block pointed to by
245 * the migrate scanner within compact_control. 263 * the migrate scanner within compact_control.
246 */ 264 */
247static unsigned long isolate_migratepages(struct zone *zone, 265static isolate_migrate_t isolate_migratepages(struct zone *zone,
248 struct compact_control *cc) 266 struct compact_control *cc)
249{ 267{
250 unsigned long low_pfn, end_pfn; 268 unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
261 /* Do not cross the free scanner or scan within a memory hole */ 279 /* Do not cross the free scanner or scan within a memory hole */
262 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 280 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
263 cc->migrate_pfn = end_pfn; 281 cc->migrate_pfn = end_pfn;
264 return 0; 282 return ISOLATE_NONE;
265 } 283 }
266 284
267 /* 285 /*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
270 * delay for some time until fewer pages are isolated 288 * delay for some time until fewer pages are isolated
271 */ 289 */
272 while (unlikely(too_many_isolated(zone))) { 290 while (unlikely(too_many_isolated(zone))) {
291 /* async migration should just abort */
292 if (!cc->sync)
293 return ISOLATE_ABORT;
294
273 congestion_wait(BLK_RW_ASYNC, HZ/10); 295 congestion_wait(BLK_RW_ASYNC, HZ/10);
274 296
275 if (fatal_signal_pending(current)) 297 if (fatal_signal_pending(current))
276 return 0; 298 return ISOLATE_ABORT;
277 } 299 }
278 300
279 /* Time to isolate some pages for migration */ 301 /* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
358 380
359 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 381 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
360 382
361 return cc->nr_migratepages; 383 return ISOLATE_SUCCESS;
362} 384}
363 385
364/* 386/*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
420 if (cc->free_pfn <= cc->migrate_pfn) 442 if (cc->free_pfn <= cc->migrate_pfn)
421 return COMPACT_COMPLETE; 443 return COMPACT_COMPLETE;
422 444
423 /* Compaction run is not finished if the watermark is not met */
424 watermark = low_wmark_pages(zone);
425 watermark += (1 << cc->order);
426
427 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
428 return COMPACT_CONTINUE;
429
430 /* 445 /*
431 * order == -1 is expected when compacting via 446 * order == -1 is expected when compacting via
432 * /proc/sys/vm/compact_memory 447 * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
434 if (cc->order == -1) 449 if (cc->order == -1)
435 return COMPACT_CONTINUE; 450 return COMPACT_CONTINUE;
436 451
452 /* Compaction run is not finished if the watermark is not met */
453 watermark = low_wmark_pages(zone);
454 watermark += (1 << cc->order);
455
456 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
457 return COMPACT_CONTINUE;
458
437 /* Direct compactor: Is a suitable page free? */ 459 /* Direct compactor: Is a suitable page free? */
438 for (order = cc->order; order < MAX_ORDER; order++) { 460 for (order = cc->order; order < MAX_ORDER; order++) {
439 /* Job done if page is free of the right migratetype */ 461 /* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
461 unsigned long watermark; 483 unsigned long watermark;
462 484
463 /* 485 /*
486 * order == -1 is expected when compacting via
487 * /proc/sys/vm/compact_memory
488 */
489 if (order == -1)
490 return COMPACT_CONTINUE;
491
492 /*
464 * Watermarks for order-0 must be met for compaction. Note the 2UL. 493 * Watermarks for order-0 must be met for compaction. Note the 2UL.
465 * This is because during migration, copies of pages need to be 494 * This is because during migration, copies of pages need to be
466 * allocated and for a short time, the footprint is higher 495 * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
470 return COMPACT_SKIPPED; 499 return COMPACT_SKIPPED;
471 500
472 /* 501 /*
473 * order == -1 is expected when compacting via
474 * /proc/sys/vm/compact_memory
475 */
476 if (order == -1)
477 return COMPACT_CONTINUE;
478
479 /*
480 * fragmentation index determines if allocation failures are due to 502 * fragmentation index determines if allocation failures are due to
481 * low memory or external fragmentation 503 * low memory or external fragmentation
482 * 504 *
483 * index of -1 implies allocations might succeed dependingon watermarks 505 * index of -1000 implies allocations might succeed depending on
506 * watermarks
484 * index towards 0 implies failure is due to lack of memory 507 * index towards 0 implies failure is due to lack of memory
485 * index towards 1000 implies failure is due to fragmentation 508 * index towards 1000 implies failure is due to fragmentation
486 * 509 *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
490 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 513 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
491 return COMPACT_SKIPPED; 514 return COMPACT_SKIPPED;
492 515
493 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) 516 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
517 0, 0))
494 return COMPACT_PARTIAL; 518 return COMPACT_PARTIAL;
495 519
496 return COMPACT_CONTINUE; 520 return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
522 unsigned long nr_migrate, nr_remaining; 546 unsigned long nr_migrate, nr_remaining;
523 int err; 547 int err;
524 548
525 if (!isolate_migratepages(zone, cc)) 549 switch (isolate_migratepages(zone, cc)) {
550 case ISOLATE_ABORT:
551 ret = COMPACT_PARTIAL;
552 goto out;
553 case ISOLATE_NONE:
526 continue; 554 continue;
555 case ISOLATE_SUCCESS:
556 ;
557 }
527 558
528 nr_migrate = cc->nr_migratepages; 559 nr_migrate = cc->nr_migratepages;
529 err = migrate_pages(&cc->migratepages, compaction_alloc, 560 err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
547 578
548 } 579 }
549 580
581out:
550 /* Release free pages and check accounting */ 582 /* Release free pages and check accounting */
551 cc->nr_freepages -= release_freepages(&cc->freepages); 583 cc->nr_freepages -= release_freepages(&cc->freepages);
552 VM_BUG_ON(cc->nr_freepages != 0); 584 VM_BUG_ON(cc->nr_freepages != 0);
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64b..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file)
2000 error = security_inode_killpriv(dentry); 2000 error = security_inode_killpriv(dentry);
2001 if (!error && killsuid) 2001 if (!error && killsuid)
2002 error = __remove_suid(dentry, killsuid); 2002 error = __remove_suid(dentry, killsuid);
2003 if (!error) 2003 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2004 inode->i_flags |= S_NOSEC; 2004 inode->i_flags |= S_NOSEC;
2005 2005
2006 return error; 2006 return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3cb..81532f297fd2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
2234 while (likely(khugepaged_enabled())) { 2234 while (likely(khugepaged_enabled())) {
2235#ifndef CONFIG_NUMA 2235#ifndef CONFIG_NUMA
2236 hpage = khugepaged_alloc_hugepage(); 2236 hpage = khugepaged_alloc_hugepage();
2237 if (unlikely(!hpage)) { 2237 if (unlikely(!hpage))
2238 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2239 break; 2238 break;
2240 }
2241 count_vm_event(THP_COLLAPSE_ALLOC);
2242#else 2239#else
2243 if (IS_ERR(hpage)) { 2240 if (IS_ERR(hpage)) {
2244 khugepaged_alloc_sleep(); 2241 khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f33bb319b73f..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1033 */ 1033 */
1034 chg = vma_needs_reservation(h, vma, addr); 1034 chg = vma_needs_reservation(h, vma, addr);
1035 if (chg < 0) 1035 if (chg < 0)
1036 return ERR_PTR(chg); 1036 return ERR_PTR(-VM_FAULT_OOM);
1037 if (chg) 1037 if (chg)
1038 if (hugetlb_get_quota(inode->i_mapping, chg)) 1038 if (hugetlb_get_quota(inode->i_mapping, chg))
1039 return ERR_PTR(-ENOSPC); 1039 return ERR_PTR(-VM_FAULT_SIGBUS);
1040 1040
1041 spin_lock(&hugetlb_lock); 1041 spin_lock(&hugetlb_lock);
1042 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1042 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
1111 WARN_ON(page_count(page) != 1); 1111 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1112 prep_compound_huge_page(page, h->order);
1113 prep_new_huge_page(h, page, page_to_nid(page)); 1113 prep_new_huge_page(h, page, page_to_nid(page));
1114 /*
1115 * If we had gigantic hugepages allocated at boot time, we need
1116 * to restore the 'stolen' pages to totalram_pages in order to
1117 * fix confusing memory reports from free(1) and another
1118 * side-effects, like CommitLimit going negative.
1119 */
1120 if (h->order > (MAX_ORDER - 1))
1121 totalram_pages += 1 << h->order;
1114 } 1122 }
1115} 1123}
1116 1124
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef2260..9a68b0cf0a1c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1303 ksm_scan.mm_slot = slot; 1303 ksm_scan.mm_slot = slot;
1304 spin_unlock(&ksm_mmlist_lock); 1304 spin_unlock(&ksm_mmlist_lock);
1305 /*
1306 * Although we tested list_empty() above, a racing __ksm_exit
1307 * of the last mm on the list may have removed it since then.
1308 */
1309 if (slot == &ksm_mm_head)
1310 return NULL;
1305next_mm: 1311next_mm:
1306 ksm_scan.address = 0; 1312 ksm_scan.address = 0;
1307 ksm_scan.rmap_list = &slot->rmap_list; 1313 ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..cf7d027a8844 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -359,7 +359,7 @@ enum charge_type {
359static void mem_cgroup_get(struct mem_cgroup *mem); 359static void mem_cgroup_get(struct mem_cgroup *mem);
360static void mem_cgroup_put(struct mem_cgroup *mem); 360static void mem_cgroup_put(struct mem_cgroup *mem);
361static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 361static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
362static void drain_all_stock_async(void); 362static void drain_all_stock_async(struct mem_cgroup *mem);
363 363
364static struct mem_cgroup_per_zone * 364static struct mem_cgroup_per_zone *
365mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 365mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
735 struct mem_cgroup, css); 735 struct mem_cgroup, css);
736} 736}
737 737
738static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 738struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
739{ 739{
740 struct mem_cgroup *mem = NULL; 740 struct mem_cgroup *mem = NULL;
741 741
@@ -1663,15 +1663,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1664 1664
1665 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1665 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1666 if (root_mem->memsw_is_minimum) 1666 if (!check_soft && root_mem->memsw_is_minimum)
1667 noswap = true; 1667 noswap = true;
1668 1668
1669 while (1) { 1669 while (1) {
1670 victim = mem_cgroup_select_victim(root_mem); 1670 victim = mem_cgroup_select_victim(root_mem);
1671 if (victim == root_mem) { 1671 if (victim == root_mem) {
1672 loop++; 1672 loop++;
1673 if (loop >= 1) 1673 /*
1674 drain_all_stock_async(); 1674 * We are not draining per cpu cached charges during
1675 * soft limit reclaim because global reclaim doesn't
1676 * care about charges. It tries to free some memory and
1677 * charges will not give any.
1678 */
1679 if (!check_soft && loop >= 1)
1680 drain_all_stock_async(root_mem);
1675 if (loop >= 2) { 1681 if (loop >= 2) {
1676 /* 1682 /*
1677 * If we have not been able to reclaim 1683 * If we have not been able to reclaim
@@ -1934,9 +1940,11 @@ struct memcg_stock_pcp {
1934 struct mem_cgroup *cached; /* this never be root cgroup */ 1940 struct mem_cgroup *cached; /* this never be root cgroup */
1935 unsigned int nr_pages; 1941 unsigned int nr_pages;
1936 struct work_struct work; 1942 struct work_struct work;
1943 unsigned long flags;
1944#define FLUSHING_CACHED_CHARGE (0)
1937}; 1945};
1938static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1946static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1939static atomic_t memcg_drain_count; 1947static DEFINE_MUTEX(percpu_charge_mutex);
1940 1948
1941/* 1949/*
1942 * Try to consume stocked charge on this cpu. If success, one page is consumed 1950 * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +1992,7 @@ static void drain_local_stock(struct work_struct *dummy)
1984{ 1992{
1985 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1993 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1986 drain_stock(stock); 1994 drain_stock(stock);
1995 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1987} 1996}
1988 1997
1989/* 1998/*
@@ -2008,26 +2017,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2008 * expects some charges will be back to res_counter later but cannot wait for 2017 * expects some charges will be back to res_counter later but cannot wait for
2009 * it. 2018 * it.
2010 */ 2019 */
2011static void drain_all_stock_async(void) 2020static void drain_all_stock_async(struct mem_cgroup *root_mem)
2012{ 2021{
2013 int cpu; 2022 int cpu, curcpu;
2014 /* This function is for scheduling "drain" in asynchronous way. 2023 /*
2015 * The result of "drain" is not directly handled by callers. Then, 2024 * If someone calls draining, avoid adding more kworker runs.
2016 * if someone is calling drain, we don't have to call drain more.
2017 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
2018 * there is a race. We just do loose check here.
2019 */ 2025 */
2020 if (atomic_read(&memcg_drain_count)) 2026 if (!mutex_trylock(&percpu_charge_mutex))
2021 return; 2027 return;
2022 /* Notify other cpus that system-wide "drain" is running */ 2028 /* Notify other cpus that system-wide "drain" is running */
2023 atomic_inc(&memcg_drain_count);
2024 get_online_cpus(); 2029 get_online_cpus();
2030 /*
2031 * Get a hint for avoiding draining charges on the current cpu,
2032 * which must be exhausted by our charging. It is not required that
2033 * this be a precise check, so we use raw_smp_processor_id() instead of
2034 * getcpu()/putcpu().
2035 */
2036 curcpu = raw_smp_processor_id();
2025 for_each_online_cpu(cpu) { 2037 for_each_online_cpu(cpu) {
2026 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2038 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2027 schedule_work_on(cpu, &stock->work); 2039 struct mem_cgroup *mem;
2040
2041 if (cpu == curcpu)
2042 continue;
2043
2044 mem = stock->cached;
2045 if (!mem)
2046 continue;
2047 if (mem != root_mem) {
2048 if (!root_mem->use_hierarchy)
2049 continue;
2050 /* check whether "mem" is under tree of "root_mem" */
2051 if (!css_is_ancestor(&mem->css, &root_mem->css))
2052 continue;
2053 }
2054 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2055 schedule_work_on(cpu, &stock->work);
2028 } 2056 }
2029 put_online_cpus(); 2057 put_online_cpus();
2030 atomic_dec(&memcg_drain_count); 2058 mutex_unlock(&percpu_charge_mutex);
2031 /* We don't wait for flush_work */ 2059 /* We don't wait for flush_work */
2032} 2060}
2033 2061
@@ -2035,9 +2063,9 @@ static void drain_all_stock_async(void)
2035static void drain_all_stock_sync(void) 2063static void drain_all_stock_sync(void)
2036{ 2064{
2037 /* called when force_empty is called */ 2065 /* called when force_empty is called */
2038 atomic_inc(&memcg_drain_count); 2066 mutex_lock(&percpu_charge_mutex);
2039 schedule_on_each_cpu(drain_local_stock); 2067 schedule_on_each_cpu(drain_local_stock);
2040 atomic_dec(&memcg_drain_count); 2068 mutex_unlock(&percpu_charge_mutex);
2041} 2069}
2042 2070
2043/* 2071/*
@@ -4640,6 +4668,7 @@ static struct cftype mem_cgroup_files[] = {
4640 { 4668 {
4641 .name = "numa_stat", 4669 .name = "numa_stat",
4642 .open = mem_control_numa_stat_open, 4670 .open = mem_control_numa_stat_open,
4671 .mode = S_IRUGO,
4643 }, 4672 },
4644#endif 4673#endif
4645}; 4674};
@@ -5414,18 +5443,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5414 struct cgroup *old_cont, 5443 struct cgroup *old_cont,
5415 struct task_struct *p) 5444 struct task_struct *p)
5416{ 5445{
5417 struct mm_struct *mm; 5446 struct mm_struct *mm = get_task_mm(p);
5418 5447
5419 if (!mc.to)
5420 /* no need to move charge */
5421 return;
5422
5423 mm = get_task_mm(p);
5424 if (mm) { 5448 if (mm) {
5425 mem_cgroup_move_charge(mm); 5449 if (mc.to)
5450 mem_cgroup_move_charge(mm);
5451 put_swap_token(mm);
5426 mmput(mm); 5452 mmput(mm);
5427 } 5453 }
5428 mem_cgroup_clear_mc(); 5454 if (mc.to)
5455 mem_cgroup_clear_mc();
5429} 5456}
5430#else /* !CONFIG_MMU */ 5457#else /* !CONFIG_MMU */
5431static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5458static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928d..eac0ba561491 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
52#include <linux/swapops.h> 52#include <linux/swapops.h>
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h>
55#include "internal.h" 56#include "internal.h"
56 57
57int sysctl_memory_failure_early_kill __read_mostly = 0; 58int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1468,7 +1469,8 @@ int soft_offline_page(struct page *page, int flags)
1468 put_page(page); 1469 put_page(page);
1469 if (!ret) { 1470 if (!ret) {
1470 LIST_HEAD(pagelist); 1471 LIST_HEAD(pagelist);
1471 1472 inc_zone_page_state(page, NR_ISOLATED_ANON +
1473 page_is_file_cache(page));
1472 list_add(&page->lru, &pagelist); 1474 list_add(&page->lru, &pagelist);
1473 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1475 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1474 0, true); 1476 0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..87d935333f0d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1112,11 +1112,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1112 int force_flush = 0; 1112 int force_flush = 0;
1113 int rss[NR_MM_COUNTERS]; 1113 int rss[NR_MM_COUNTERS];
1114 spinlock_t *ptl; 1114 spinlock_t *ptl;
1115 pte_t *start_pte;
1115 pte_t *pte; 1116 pte_t *pte;
1116 1117
1117again: 1118again:
1118 init_rss_vec(rss); 1119 init_rss_vec(rss);
1119 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1120 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1121 pte = start_pte;
1120 arch_enter_lazy_mmu_mode(); 1122 arch_enter_lazy_mmu_mode();
1121 do { 1123 do {
1122 pte_t ptent = *pte; 1124 pte_t ptent = *pte;
@@ -1196,7 +1198,7 @@ again:
1196 1198
1197 add_mm_rss_vec(mm, rss); 1199 add_mm_rss_vec(mm, rss);
1198 arch_leave_lazy_mmu_mode(); 1200 arch_leave_lazy_mmu_mode();
1199 pte_unmap_unlock(pte - 1, ptl); 1201 pte_unmap_unlock(start_pte, ptl);
1200 1202
1201 /* 1203 /*
1202 * mmu_gather ran out of room to batch pages, we break out of 1204 * mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1298,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1296 1298
1297/** 1299/**
1298 * unmap_vmas - unmap a range of memory covered by a list of vma's 1300 * unmap_vmas - unmap a range of memory covered by a list of vma's
1299 * @tlbp: address of the caller's struct mmu_gather 1301 * @tlb: address of the caller's struct mmu_gather
1300 * @vma: the starting vma 1302 * @vma: the starting vma
1301 * @start_addr: virtual address at which to start unmapping 1303 * @start_addr: virtual address at which to start unmapping
1302 * @end_addr: virtual address at which to end unmapping 1304 * @end_addr: virtual address at which to end unmapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32f..02159c755136 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
494 /* init node's zones as empty zones, we don't have any present pages.*/ 494 /* init node's zones as empty zones, we don't have any present pages.*/
495 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 495 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
496 496
497 /*
498 * The node we allocated has no zone fallback lists. For avoiding
499 * to access not-initialized zonelist, build here.
500 */
501 build_all_zonelists(NULL);
502
497 return pgdat; 503 return pgdat;
498} 504}
499 505
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983d..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
288 */ 288 */
289 __dec_zone_page_state(page, NR_FILE_PAGES); 289 __dec_zone_page_state(page, NR_FILE_PAGES);
290 __inc_zone_page_state(newpage, NR_FILE_PAGES); 290 __inc_zone_page_state(newpage, NR_FILE_PAGES);
291 if (PageSwapBacked(page)) { 291 if (!PageSwapCache(page) && PageSwapBacked(page)) {
292 __dec_zone_page_state(page, NR_SHMEM); 292 __dec_zone_page_state(page, NR_SHMEM);
293 __inc_zone_page_state(newpage, NR_SHMEM); 293 __inc_zone_page_state(newpage, NR_SHMEM);
294 } 294 }
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e117..d49736ff8a8d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
906 if (anon_vma) 906 if (anon_vma)
907 return anon_vma; 907 return anon_vma;
908try_prev: 908try_prev:
909 /* 909 near = vma->vm_prev;
910 * It is potentially slow to have to call find_vma_prev here.
911 * But it's only on the first write fault on the vma, not
912 * every time, and we could devise a way to avoid it later
913 * (e.g. stash info in next's anon_vma_node when assigning
914 * an anon_vma, or when trying vma_merge). Another time.
915 */
916 BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
917 if (!near) 910 if (!near)
918 goto none; 911 goto none;
919 912
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2044 return -EINVAL; 2037 return -EINVAL;
2045 2038
2046 /* Find the first overlapping VMA */ 2039 /* Find the first overlapping VMA */
2047 vma = find_vma_prev(mm, start, &prev); 2040 vma = find_vma(mm, start);
2048 if (!vma) 2041 if (!vma)
2049 return 0; 2042 return 0;
2043 prev = vma->vm_prev;
2050 /* we have start < vma->vm_end */ 2044 /* we have start < vma->vm_end */
2051 2045
2052 /* if it doesn't overlap, we have nothing.. */ 2046 /* if it doesn't overlap, we have nothing.. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1be..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
162} 162}
163#endif 163#endif
164 164
165static int __meminit init_section_page_cgroup(unsigned long pfn) 165static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
166{ 166{
167 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section; 168 struct mem_section *section;
169 unsigned long table_size; 169 unsigned long table_size;
170 unsigned long nr; 170 unsigned long nr;
171 int nid, index; 171 int index;
172 172
173 nr = pfn_to_section_nr(pfn); 173 nr = pfn_to_section_nr(pfn);
174 section = __nr_to_section(nr); 174 section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
176 if (section->page_cgroup) 176 if (section->page_cgroup)
177 return 0; 177 return 0;
178 178
179 nid = page_to_nid(pfn_to_page(pfn));
180 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 179 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
181 base = alloc_page_cgroup(table_size, nid); 180 base = alloc_page_cgroup(table_size, nid);
182 181
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
196 pc = base + index; 195 pc = base + index;
197 init_page_cgroup(pc, nr); 196 init_page_cgroup(pc, nr);
198 } 197 }
199 198 /*
199 * The passed "pfn" may not be aligned to SECTION. For the calculation
200 * we need to apply a mask.
201 */
202 pfn &= PAGE_SECTION_MASK;
200 section->page_cgroup = base - pfn; 203 section->page_cgroup = base - pfn;
201 total_usage += table_size; 204 total_usage += table_size;
202 return 0; 205 return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
225 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = start_pfn & ~(PAGES_PER_SECTION - 1);
226 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
227 230
231 if (nid == -1) {
232 /*
233 * In this case, "nid" already exists and contains valid memory.
234 * "start_pfn" passed to us is a pfn which is an arg for
235 * online__pages(), and start_pfn should exist.
236 */
237 nid = pfn_to_nid(start_pfn);
238 VM_BUG_ON(!node_state(nid, N_ONLINE));
239 }
240
228 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { 241 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
229 if (!pfn_present(pfn)) 242 if (!pfn_present(pfn))
230 continue; 243 continue;
231 fail = init_section_page_cgroup(pfn); 244 fail = init_section_page_cgroup(pfn, nid);
232 } 245 }
233 if (!fail) 246 if (!fail)
234 return 0; 247 return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
284void __init page_cgroup_init(void) 297void __init page_cgroup_init(void)
285{ 298{
286 unsigned long pfn; 299 unsigned long pfn;
287 int fail = 0; 300 int nid;
288 301
289 if (mem_cgroup_disabled()) 302 if (mem_cgroup_disabled())
290 return; 303 return;
291 304
292 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 305 for_each_node_state(nid, N_HIGH_MEMORY) {
293 if (!pfn_present(pfn)) 306 unsigned long start_pfn, end_pfn;
294 continue; 307
295 fail = init_section_page_cgroup(pfn); 308 start_pfn = node_start_pfn(nid);
296 } 309 end_pfn = node_end_pfn(nid);
297 if (fail) { 310 /*
298 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); 311 * start_pfn and end_pfn may not be aligned to SECTION and the
299 panic("Out of memory"); 312 * page->flags of out of node pages are not initialized. So we
300 } else { 313 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
301 hotplug_memory_notifier(page_cgroup_callback, 0); 314 */
315 for (pfn = start_pfn;
316 pfn < end_pfn;
317 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
318
319 if (!pfn_valid(pfn))
320 continue;
321 /*
322 * Nodes's pfns can be overlapping.
323 * We know some arch can have a nodes layout such as
324 * -------------pfn-------------->
325 * N0 | N1 | N2 | N0 | N1 | N2|....
326 */
327 if (pfn_to_nid(pfn) != nid)
328 continue;
329 if (init_section_page_cgroup(pfn, nid))
330 goto oom;
331 }
302 } 332 }
333 hotplug_memory_notifier(page_cgroup_callback, 0);
303 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 334 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
304 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" 335 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
305 " want memory cgroups\n"); 336 "don't want memory cgroups\n");
337 return;
338oom:
339 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
340 panic("Out of memory");
306} 341}
307 342
308void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 343void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88dd..27dfd3b82b0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -112,9 +112,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
112 kmem_cache_free(anon_vma_cachep, anon_vma); 112 kmem_cache_free(anon_vma_cachep, anon_vma);
113} 113}
114 114
115static inline struct anon_vma_chain *anon_vma_chain_alloc(void) 115static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
116{ 116{
117 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 117 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
118} 118}
119 119
120static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 120static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
159 struct mm_struct *mm = vma->vm_mm; 159 struct mm_struct *mm = vma->vm_mm;
160 struct anon_vma *allocated; 160 struct anon_vma *allocated;
161 161
162 avc = anon_vma_chain_alloc(); 162 avc = anon_vma_chain_alloc(GFP_KERNEL);
163 if (!avc) 163 if (!avc)
164 goto out_enomem; 164 goto out_enomem;
165 165
@@ -200,6 +200,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
200 return -ENOMEM; 200 return -ENOMEM;
201} 201}
202 202
203/*
204 * This is a useful helper function for locking the anon_vma root as
205 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
206 * have the same vma.
207 *
208 * Such anon_vma's should have the same root, so you'd expect to see
209 * just a single mutex_lock for the whole traversal.
210 */
211static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
212{
213 struct anon_vma *new_root = anon_vma->root;
214 if (new_root != root) {
215 if (WARN_ON_ONCE(root))
216 mutex_unlock(&root->mutex);
217 root = new_root;
218 mutex_lock(&root->mutex);
219 }
220 return root;
221}
222
223static inline void unlock_anon_vma_root(struct anon_vma *root)
224{
225 if (root)
226 mutex_unlock(&root->mutex);
227}
228
203static void anon_vma_chain_link(struct vm_area_struct *vma, 229static void anon_vma_chain_link(struct vm_area_struct *vma,
204 struct anon_vma_chain *avc, 230 struct anon_vma_chain *avc,
205 struct anon_vma *anon_vma) 231 struct anon_vma *anon_vma)
@@ -208,13 +234,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
208 avc->anon_vma = anon_vma; 234 avc->anon_vma = anon_vma;
209 list_add(&avc->same_vma, &vma->anon_vma_chain); 235 list_add(&avc->same_vma, &vma->anon_vma_chain);
210 236
211 anon_vma_lock(anon_vma);
212 /* 237 /*
213 * It's critical to add new vmas to the tail of the anon_vma, 238 * It's critical to add new vmas to the tail of the anon_vma,
214 * see comment in huge_memory.c:__split_huge_page(). 239 * see comment in huge_memory.c:__split_huge_page().
215 */ 240 */
216 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 241 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
217 anon_vma_unlock(anon_vma);
218} 242}
219 243
220/* 244/*
@@ -224,13 +248,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
224int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 248int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
225{ 249{
226 struct anon_vma_chain *avc, *pavc; 250 struct anon_vma_chain *avc, *pavc;
251 struct anon_vma *root = NULL;
227 252
228 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 253 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
229 avc = anon_vma_chain_alloc(); 254 struct anon_vma *anon_vma;
230 if (!avc) 255
231 goto enomem_failure; 256 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
232 anon_vma_chain_link(dst, avc, pavc->anon_vma); 257 if (unlikely(!avc)) {
258 unlock_anon_vma_root(root);
259 root = NULL;
260 avc = anon_vma_chain_alloc(GFP_KERNEL);
261 if (!avc)
262 goto enomem_failure;
263 }
264 anon_vma = pavc->anon_vma;
265 root = lock_anon_vma_root(root, anon_vma);
266 anon_vma_chain_link(dst, avc, anon_vma);
233 } 267 }
268 unlock_anon_vma_root(root);
234 return 0; 269 return 0;
235 270
236 enomem_failure: 271 enomem_failure:
@@ -263,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
263 anon_vma = anon_vma_alloc(); 298 anon_vma = anon_vma_alloc();
264 if (!anon_vma) 299 if (!anon_vma)
265 goto out_error; 300 goto out_error;
266 avc = anon_vma_chain_alloc(); 301 avc = anon_vma_chain_alloc(GFP_KERNEL);
267 if (!avc) 302 if (!avc)
268 goto out_error_free_anon_vma; 303 goto out_error_free_anon_vma;
269 304
@@ -280,7 +315,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 get_anon_vma(anon_vma->root); 315 get_anon_vma(anon_vma->root);
281 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 316 /* Mark this anon_vma as the one where our new (COWed) pages go. */
282 vma->anon_vma = anon_vma; 317 vma->anon_vma = anon_vma;
318 anon_vma_lock(anon_vma);
283 anon_vma_chain_link(vma, avc, anon_vma); 319 anon_vma_chain_link(vma, avc, anon_vma);
320 anon_vma_unlock(anon_vma);
284 321
285 return 0; 322 return 0;
286 323
@@ -291,36 +328,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
291 return -ENOMEM; 328 return -ENOMEM;
292} 329}
293 330
294static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
295{
296 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
297 int empty;
298
299 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
300 if (!anon_vma)
301 return;
302
303 anon_vma_lock(anon_vma);
304 list_del(&anon_vma_chain->same_anon_vma);
305
306 /* We must garbage collect the anon_vma if it's empty */
307 empty = list_empty(&anon_vma->head);
308 anon_vma_unlock(anon_vma);
309
310 if (empty)
311 put_anon_vma(anon_vma);
312}
313
314void unlink_anon_vmas(struct vm_area_struct *vma) 331void unlink_anon_vmas(struct vm_area_struct *vma)
315{ 332{
316 struct anon_vma_chain *avc, *next; 333 struct anon_vma_chain *avc, *next;
334 struct anon_vma *root = NULL;
317 335
318 /* 336 /*
319 * Unlink each anon_vma chained to the VMA. This list is ordered 337 * Unlink each anon_vma chained to the VMA. This list is ordered
320 * from newest to oldest, ensuring the root anon_vma gets freed last. 338 * from newest to oldest, ensuring the root anon_vma gets freed last.
321 */ 339 */
322 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 340 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
323 anon_vma_unlink(avc); 341 struct anon_vma *anon_vma = avc->anon_vma;
342
343 root = lock_anon_vma_root(root, anon_vma);
344 list_del(&avc->same_anon_vma);
345
346 /*
347 * Leave empty anon_vmas on the list - we'll need
348 * to free them outside the lock.
349 */
350 if (list_empty(&anon_vma->head))
351 continue;
352
353 list_del(&avc->same_vma);
354 anon_vma_chain_free(avc);
355 }
356 unlock_anon_vma_root(root);
357
358 /*
359 * Iterate the list once more, it now only contains empty and unlinked
360 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
361 * needing to acquire the anon_vma->root->mutex.
362 */
363 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
364 struct anon_vma *anon_vma = avc->anon_vma;
365
366 put_anon_vma(anon_vma);
367
324 list_del(&avc->same_vma); 368 list_del(&avc->same_vma);
325 anon_vma_chain_free(avc); 369 anon_vma_chain_free(avc);
326 } 370 }
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8ae..d96e223de775 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
3604 * Release an obj back to its cache. If the obj has a constructed state, it must 3604 * Release an obj back to its cache. If the obj has a constructed state, it must
3605 * be in this state _before_ it is released. Called with disabled ints. 3605 * be in this state _before_ it is released. Called with disabled ints.
3606 */ 3606 */
3607static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3607static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3608 void *caller)
3608{ 3609{
3609 struct array_cache *ac = cpu_cache_get(cachep); 3610 struct array_cache *ac = cpu_cache_get(cachep);
3610 3611
3611 check_irq_off(); 3612 check_irq_off();
3612 kmemleak_free_recursive(objp, cachep->flags); 3613 kmemleak_free_recursive(objp, cachep->flags);
3613 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3614 objp = cache_free_debugcheck(cachep, objp, caller);
3614 3615
3615 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3616 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3616 3617
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3801 debug_check_no_locks_freed(objp, obj_size(cachep)); 3802 debug_check_no_locks_freed(objp, obj_size(cachep));
3802 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3803 debug_check_no_obj_freed(objp, obj_size(cachep)); 3804 debug_check_no_obj_freed(objp, obj_size(cachep));
3804 __cache_free(cachep, objp); 3805 __cache_free(cachep, objp, __builtin_return_address(0));
3805 local_irq_restore(flags); 3806 local_irq_restore(flags);
3806 3807
3807 trace_kmem_cache_free(_RET_IP_, objp); 3808 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
3831 c = virt_to_cache(objp); 3832 c = virt_to_cache(objp);
3832 debug_check_no_locks_freed(objp, obj_size(c)); 3833 debug_check_no_locks_freed(objp, obj_size(c));
3833 debug_check_no_obj_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c));
3834 __cache_free(c, (void *)objp); 3835 __cache_free(c, (void *)objp, __builtin_return_address(0));
3835 local_irq_restore(flags); 3836 local_irq_restore(flags);
3836} 3837}
3837EXPORT_SYMBOL(kfree); 3838EXPORT_SYMBOL(kfree);
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b0..35f351f26193 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2322 2322
2323#ifdef CONFIG_CMPXCHG_LOCAL
2324 /* 2323 /*
2325 * Must align to double word boundary for the double cmpxchg instructions 2324 * Must align to double word boundary for the double cmpxchg
2326 * to work. 2325 * instructions to work; see __pcpu_double_call_return_bool().
2327 */ 2326 */
2328 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); 2327 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2329#else 2328 2 * sizeof(void *));
2330 /* Regular alignment is sufficient */
2331 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2332#endif
2333 2329
2334 if (!s->cpu_slab) 2330 if (!s->cpu_slab)
2335 return 0; 2331 return 0;
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..fabf2d0f5169 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg;
27static unsigned int global_faults; 33static unsigned int global_faults;
34static unsigned int last_aging;
35
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
38{
39 struct mem_cgroup *memcg;
40
41 memcg = try_get_mem_cgroup_from_mm(mm);
42 if (memcg)
43 css_put(mem_cgroup_css(memcg));
44
45 return memcg;
46}
47#else
48static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
49{
50 return NULL;
51}
52#endif
28 53
29void grab_swap_token(struct mm_struct *mm) 54void grab_swap_token(struct mm_struct *mm)
30{ 55{
31 int current_interval; 56 int current_interval;
57 unsigned int old_prio = mm->token_priority;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults;
45 } 73 }
46 74
47 if (mm != swap_token_mm) { 75 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 76 mm->token_priority += 2;
77 goto update_priority;
78 }
79
80 if (current_interval < mm->last_interval)
81 mm->token_priority++;
82 else {
83 if (likely(mm->token_priority > 0))
84 mm->token_priority--;
62 } 85 }
63 86
87 /* Check if we deserve the token */
88 if (mm->token_priority > swap_token_mm->token_priority)
89 goto replace_token;
90
91update_priority:
92 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
93
64out: 94out:
65 mm->faultstamp = global_faults; 95 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 96 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 97 spin_unlock(&swap_token_lock);
98 return;
99
100replace_token:
101 mm->token_priority += 2;
102 trace_replace_swap_token(swap_token_mm, mm);
103 swap_token_mm = mm;
104 swap_token_memcg = swap_token_memcg_from_mm(mm);
105 last_aging = global_faults;
106 goto out;
68} 107}
69 108
70/* Called on process exit. */ 109/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 110void __put_swap_token(struct mm_struct *mm)
72{ 111{
73 spin_lock(&swap_token_lock); 112 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 113 if (likely(mm == swap_token_mm)) {
114 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 115 swap_token_mm = NULL;
116 swap_token_memcg = NULL;
117 }
76 spin_unlock(&swap_token_lock); 118 spin_unlock(&swap_token_lock);
77} 119}
120
121static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
122{
123 if (!a)
124 return true;
125 if (!b)
126 return true;
127 if (a == b)
128 return true;
129 return false;
130}
131
132void disable_swap_token(struct mem_cgroup *memcg)
133{
134 /* memcg reclaim don't disable unrelated mm token. */
135 if (match_memcg(memcg, swap_token_memcg)) {
136 spin_lock(&swap_token_lock);
137 if (match_memcg(memcg, swap_token_memcg)) {
138 trace_disable_swap_token(swap_token_mm);
139 swap_token_mm = NULL;
140 swap_token_memcg = NULL;
141 }
142 spin_unlock(&swap_token_lock);
143 }
144}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..8ff834e19c24 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1124 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1125 scan++; 1125 scan++;
1126 } else { 1126 } else {
1127 /* the page is freed already. */ 1127 /*
1128 if (!page_count(cursor_page)) 1128 * Check if the page is freed already.
1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1129 continue; 1141 continue;
1130 break; 1142 break;
1131 } 1143 }
@@ -2081,7 +2093,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2081 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2093 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2082 sc->nr_scanned = 0; 2094 sc->nr_scanned = 0;
2083 if (!priority) 2095 if (!priority)
2084 disable_swap_token(); 2096 disable_swap_token(sc->mem_cgroup);
2085 total_scanned += shrink_zones(priority, zonelist, sc); 2097 total_scanned += shrink_zones(priority, zonelist, sc);
2086 /* 2098 /*
2087 * Don't shrink slabs when reclaiming memory from 2099 * Don't shrink slabs when reclaiming memory from
@@ -2407,7 +2419,7 @@ loop_again:
2407 2419
2408 /* The swap token gets in the way of swapout... */ 2420 /* The swap token gets in the way of swapout... */
2409 if (!priority) 2421 if (!priority)
2410 disable_swap_token(); 2422 disable_swap_token(NULL);
2411 2423
2412 all_zones_ok = 1; 2424 all_zones_ok = 1;
2413 balanced = 0; 2425 balanced = 0;