path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c          5
-rw-r--r--  mm/filemap.c            18
-rw-r--r--  mm/huge_memory.c        93
-rw-r--r--  mm/ksm.c                11
-rw-r--r--  mm/memcontrol.c       1102
-rw-r--r--  mm/memory-failure.c      2
-rw-r--r--  mm/memory.c              4
-rw-r--r--  mm/memory_hotplug.c      2
-rw-r--r--  mm/mempolicy.c           2
-rw-r--r--  mm/migrate.c           173
-rw-r--r--  mm/oom_kill.c           42
-rw-r--r--  mm/page_alloc.c         55
-rw-r--r--  mm/page_cgroup.c       164
-rw-r--r--  mm/rmap.c               20
-rw-r--r--  mm/slub.c                9
-rw-r--r--  mm/swap.c               79
-rw-r--r--  mm/swap_state.c         10
-rw-r--r--  mm/swapfile.c            9
-rw-r--r--  mm/vmalloc.c             9
-rw-r--r--  mm/vmscan.c            680
-rw-r--r--  mm/vmstat.c              2
21 files changed, 1306 insertions(+), 1185 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index e6670c34eb49..71a58f67f481 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -350,7 +350,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
350 } 350 }
351 351
352 if (!cc->sync) 352 if (!cc->sync)
353 mode |= ISOLATE_CLEAN; 353 mode |= ISOLATE_ASYNC_MIGRATE;
354 354
355 /* Try isolate the page */ 355 /* Try isolate the page */
356 if (__isolate_lru_page(page, mode, 0) != 0) 356 if (__isolate_lru_page(page, mode, 0) != 0)
@@ -557,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
557 nr_migrate = cc->nr_migratepages; 557 nr_migrate = cc->nr_migratepages;
558 err = migrate_pages(&cc->migratepages, compaction_alloc, 558 err = migrate_pages(&cc->migratepages, compaction_alloc,
559 (unsigned long)cc, false, 559 (unsigned long)cc, false,
560 cc->sync); 560 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
561 update_nr_listpages(cc); 561 update_nr_listpages(cc);
562 nr_remaining = cc->nr_migratepages; 562 nr_remaining = cc->nr_migratepages;
563 563
@@ -671,6 +671,7 @@ static int compact_node(int nid)
671 .nr_freepages = 0, 671 .nr_freepages = 0,
672 .nr_migratepages = 0, 672 .nr_migratepages = 0,
673 .order = -1, 673 .order = -1,
674 .sync = true,
674 }; 675 };
675 676
676 zone = &pgdat->node_zones[zoneid]; 677 zone = &pgdat->node_zones[zoneid];
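The compaction hunks above switch migrate_pages() from a boolean sync flag to a three-state migration mode. A minimal sketch of the assumed enum behind MIGRATE_ASYNC and MIGRATE_SYNC_LIGHT follows; the comments paraphrase the intended semantics and are not copied from the patch series.

/*
 * Assumed shape of the migration-mode enum used by the hunk above
 * (illustrative sketch, not part of the patch).
 */
enum migrate_mode {
	MIGRATE_ASYNC,		/* never block during migration */
	MIGRATE_SYNC_LIGHT,	/* allow cheap blocking, avoid expensive waits */
	MIGRATE_SYNC,		/* fully synchronous migration */
};

/*
 * compact_zone() maps the old boolean onto it, per the diff:
 *	cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC
 */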
diff --git a/mm/filemap.c b/mm/filemap.c
index c4ee2e918bea..97f49ed35bd2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
394{ 394{
395 int error; 395 int error;
396 struct mem_cgroup *memcg = NULL;
397 396
398 VM_BUG_ON(!PageLocked(old)); 397 VM_BUG_ON(!PageLocked(old));
399 VM_BUG_ON(!PageLocked(new)); 398 VM_BUG_ON(!PageLocked(new));
400 VM_BUG_ON(new->mapping); 399 VM_BUG_ON(new->mapping);
401 400
402 /*
403 * This is not page migration, but prepare_migration and
404 * end_migration does enough work for charge replacement.
405 *
406 * In the longer term we probably want a specialized function
407 * for moving the charge from old to new in a more efficient
408 * manner.
409 */
410 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
411 if (error)
412 return error;
413
414 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 401 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
415 if (!error) { 402 if (!error) {
416 struct address_space *mapping = old->mapping; 403 struct address_space *mapping = old->mapping;
@@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
432 if (PageSwapBacked(new)) 419 if (PageSwapBacked(new))
433 __inc_zone_page_state(new, NR_SHMEM); 420 __inc_zone_page_state(new, NR_SHMEM);
434 spin_unlock_irq(&mapping->tree_lock); 421 spin_unlock_irq(&mapping->tree_lock);
422 /* mem_cgroup codes must not be called under tree_lock */
423 mem_cgroup_replace_page_cache(old, new);
435 radix_tree_preload_end(); 424 radix_tree_preload_end();
436 if (freepage) 425 if (freepage)
437 freepage(old); 426 freepage(old);
438 page_cache_release(old); 427 page_cache_release(old);
439 mem_cgroup_end_migration(memcg, old, new, true);
440 } else {
441 mem_cgroup_end_migration(memcg, old, new, false);
442 } 428 }
443 429
444 return error; 430 return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 36b3d988b4ef..b3ffc21ce801 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = {
487 .attrs = khugepaged_attr, 487 .attrs = khugepaged_attr,
488 .name = "khugepaged", 488 .name = "khugepaged",
489}; 489};
490#endif /* CONFIG_SYSFS */
491 490
492static int __init hugepage_init(void) 491static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
493{ 492{
494 int err; 493 int err;
495#ifdef CONFIG_SYSFS
496 static struct kobject *hugepage_kobj;
497#endif
498
499 err = -EINVAL;
500 if (!has_transparent_hugepage()) {
501 transparent_hugepage_flags = 0;
502 goto out;
503 }
504 494
505#ifdef CONFIG_SYSFS 495 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
506 err = -ENOMEM; 496 if (unlikely(!*hugepage_kobj)) {
507 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
508 if (unlikely(!hugepage_kobj)) {
509 printk(KERN_ERR "hugepage: failed kobject create\n"); 497 printk(KERN_ERR "hugepage: failed kobject create\n");
510 goto out; 498 return -ENOMEM;
511 } 499 }
512 500
513 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); 501 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
514 if (err) { 502 if (err) {
515 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 503 printk(KERN_ERR "hugepage: failed register hugeage group\n");
516 goto out; 504 goto delete_obj;
517 } 505 }
518 506
519 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); 507 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
520 if (err) { 508 if (err) {
521 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 509 printk(KERN_ERR "hugepage: failed register hugeage group\n");
522 goto out; 510 goto remove_hp_group;
523 } 511 }
524#endif 512
513 return 0;
514
515remove_hp_group:
516 sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
517delete_obj:
518 kobject_put(*hugepage_kobj);
519 return err;
520}
521
522static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
523{
524 sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
525 sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
526 kobject_put(hugepage_kobj);
527}
528#else
529static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
530{
531 return 0;
532}
533
534static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
535{
536}
537#endif /* CONFIG_SYSFS */
538
539static int __init hugepage_init(void)
540{
541 int err;
542 struct kobject *hugepage_kobj;
543
544 if (!has_transparent_hugepage()) {
545 transparent_hugepage_flags = 0;
546 return -EINVAL;
547 }
548
549 err = hugepage_init_sysfs(&hugepage_kobj);
550 if (err)
551 return err;
525 552
526 err = khugepaged_slab_init(); 553 err = khugepaged_slab_init();
527 if (err) 554 if (err)
@@ -545,7 +572,9 @@ static int __init hugepage_init(void)
545 572
546 set_recommended_min_free_kbytes(); 573 set_recommended_min_free_kbytes();
547 574
575 return 0;
548out: 576out:
577 hugepage_exit_sysfs(hugepage_kobj);
549 return err; 578 return err;
550} 579}
551module_init(hugepage_init) 580module_init(hugepage_init)
@@ -997,7 +1026,7 @@ out:
997} 1026}
998 1027
999int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1028int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1000 pmd_t *pmd) 1029 pmd_t *pmd, unsigned long addr)
1001{ 1030{
1002 int ret = 0; 1031 int ret = 0;
1003 1032
@@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1013 pgtable = get_pmd_huge_pte(tlb->mm); 1042 pgtable = get_pmd_huge_pte(tlb->mm);
1014 page = pmd_page(*pmd); 1043 page = pmd_page(*pmd);
1015 pmd_clear(pmd); 1044 pmd_clear(pmd);
1045 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1016 page_remove_rmap(page); 1046 page_remove_rmap(page);
1017 VM_BUG_ON(page_mapcount(page) < 0); 1047 VM_BUG_ON(page_mapcount(page) < 0);
1018 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1048 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1116 entry = pmd_modify(entry, newprot); 1146 entry = pmd_modify(entry, newprot);
1117 set_pmd_at(mm, addr, pmd, entry); 1147 set_pmd_at(mm, addr, pmd, entry);
1118 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1119 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1120 ret = 1; 1149 ret = 1;
1121 } 1150 }
1122 } else 1151 } else
@@ -1199,16 +1228,16 @@ static int __split_huge_page_splitting(struct page *page,
1199static void __split_huge_page_refcount(struct page *page) 1228static void __split_huge_page_refcount(struct page *page)
1200{ 1229{
1201 int i; 1230 int i;
1202 unsigned long head_index = page->index;
1203 struct zone *zone = page_zone(page); 1231 struct zone *zone = page_zone(page);
1204 int zonestat;
1205 int tail_count = 0; 1232 int tail_count = 0;
1206 1233
1207 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1234 /* prevent PageLRU to go away from under us, and freeze lru stats */
1208 spin_lock_irq(&zone->lru_lock); 1235 spin_lock_irq(&zone->lru_lock);
1209 compound_lock(page); 1236 compound_lock(page);
1237 /* complete memcg works before add pages to LRU */
1238 mem_cgroup_split_huge_fixup(page);
1210 1239
1211 for (i = 1; i < HPAGE_PMD_NR; i++) { 1240 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1212 struct page *page_tail = page + i; 1241 struct page *page_tail = page + i;
1213 1242
1214 /* tail_page->_mapcount cannot change */ 1243 /* tail_page->_mapcount cannot change */
@@ -1271,14 +1300,13 @@ static void __split_huge_page_refcount(struct page *page)
1271 BUG_ON(page_tail->mapping); 1300 BUG_ON(page_tail->mapping);
1272 page_tail->mapping = page->mapping; 1301 page_tail->mapping = page->mapping;
1273 1302
1274 page_tail->index = ++head_index; 1303 page_tail->index = page->index + i;
1275 1304
1276 BUG_ON(!PageAnon(page_tail)); 1305 BUG_ON(!PageAnon(page_tail));
1277 BUG_ON(!PageUptodate(page_tail)); 1306 BUG_ON(!PageUptodate(page_tail));
1278 BUG_ON(!PageDirty(page_tail)); 1307 BUG_ON(!PageDirty(page_tail));
1279 BUG_ON(!PageSwapBacked(page_tail)); 1308 BUG_ON(!PageSwapBacked(page_tail));
1280 1309
1281 mem_cgroup_split_huge_fixup(page, page_tail);
1282 1310
1283 lru_add_page_tail(zone, page, page_tail); 1311 lru_add_page_tail(zone, page, page_tail);
1284 } 1312 }
@@ -1288,15 +1316,6 @@ static void __split_huge_page_refcount(struct page *page)
1288 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1316 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1289 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1317 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1290 1318
1291 /*
1292 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1293 * so adjust those appropriately if this page is on the LRU.
1294 */
1295 if (PageLRU(page)) {
1296 zonestat = NR_LRU_BASE + page_lru(page);
1297 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1298 }
1299
1300 ClearPageCompound(page); 1319 ClearPageCompound(page);
1301 compound_unlock(page); 1320 compound_unlock(page);
1302 spin_unlock_irq(&zone->lru_lock); 1321 spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/ksm.c b/mm/ksm.c
index 310544a379ae..1925ffbfb27f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,6 +28,7 @@
28#include <linux/kthread.h> 28#include <linux/kthread.h>
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/memcontrol.h>
31#include <linux/rbtree.h> 32#include <linux/rbtree.h>
32#include <linux/memory.h> 33#include <linux/memory.h>
33#include <linux/mmu_notifier.h> 34#include <linux/mmu_notifier.h>
@@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page,
1571 1572
1572 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1573 if (new_page) { 1574 if (new_page) {
1575 /*
1576 * The memcg-specific accounting when moving
1577 * pages around the LRU lists relies on the
1578 * page's owner (memcg) to be valid. Usually,
1579 * pages are assigned to a new owner before
1580 * being put on the LRU list, but since this
1581 * is not the case here, the stale owner from
1582 * a previous allocation cycle must be reset.
1583 */
1584 mem_cgroup_reset_owner(new_page);
1574 copy_user_highpage(new_page, page, address, vma); 1585 copy_user_highpage(new_page, page, address, vma);
1575 1586
1576 SetPageDirty(new_page); 1587 SetPageDirty(new_page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d87aa3510c5e..602207be9853 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu {
123 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 unsigned long targets[MEM_CGROUP_NTARGETS];
124}; 124};
125 125
126struct mem_cgroup_reclaim_iter {
127 /* css_id of the last scanned hierarchy member */
128 int position;
129 /* scan generation, increased every round-trip */
130 unsigned int generation;
131};
132
126/* 133/*
127 * per-zone information in memory controller. 134 * per-zone information in memory controller.
128 */ 135 */
129struct mem_cgroup_per_zone { 136struct mem_cgroup_per_zone {
130 /* 137 struct lruvec lruvec;
131 * spin_lock to protect the per cgroup LRU
132 */
133 struct list_head lists[NR_LRU_LISTS];
134 unsigned long count[NR_LRU_LISTS]; 138 unsigned long count[NR_LRU_LISTS];
135 139
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141
136 struct zone_reclaim_stat reclaim_stat; 142 struct zone_reclaim_stat reclaim_stat;
137 struct rb_node tree_node; /* RB tree node */ 143 struct rb_node tree_node; /* RB tree node */
138 unsigned long long usage_in_excess;/* Set to the value by which */ 144 unsigned long long usage_in_excess;/* Set to the value by which */
@@ -233,11 +239,6 @@ struct mem_cgroup {
233 * per zone LRU lists. 239 * per zone LRU lists.
234 */ 240 */
235 struct mem_cgroup_lru_info info; 241 struct mem_cgroup_lru_info info;
236 /*
237 * While reclaiming in a hierarchy, we cache the last child we
238 * reclaimed from.
239 */
240 int last_scanned_child;
241 int last_scanned_node; 242 int last_scanned_node;
242#if MAX_NUMNODES > 1 243#if MAX_NUMNODES > 1
243 nodemask_t scan_nodes; 244 nodemask_t scan_nodes;
@@ -366,8 +367,6 @@ enum charge_type {
366#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 367#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
367#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 368#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
368#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 369#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
369#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
370#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
371 370
372static void mem_cgroup_get(struct mem_cgroup *memcg); 371static void mem_cgroup_get(struct mem_cgroup *memcg);
373static void mem_cgroup_put(struct mem_cgroup *memcg); 372static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
566 struct mem_cgroup_per_zone *mz; 565 struct mem_cgroup_per_zone *mz;
567 struct mem_cgroup_tree_per_zone *mctz; 566 struct mem_cgroup_tree_per_zone *mctz;
568 567
569 for_each_node_state(node, N_POSSIBLE) { 568 for_each_node(node) {
570 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 569 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
571 mz = mem_cgroup_zoneinfo(memcg, node, zone); 570 mz = mem_cgroup_zoneinfo(memcg, node, zone);
572 mctz = soft_limit_tree_node_zone(node, zone); 571 mctz = soft_limit_tree_node_zone(node, zone);
@@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
656 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 655 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
657} 656}
658 657
659void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
660{
661 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
662}
663
664void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
665{
666 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
667}
668
669static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 658static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
670 enum mem_cgroup_events_index idx) 659 enum mem_cgroup_events_index idx)
671{ 660{
@@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
749 return total; 738 return total;
750} 739}
751 740
752static bool __memcg_event_check(struct mem_cgroup *memcg, int target) 741static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
742 enum mem_cgroup_events_target target)
753{ 743{
754 unsigned long val, next; 744 unsigned long val, next;
755 745
756 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 746 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
757 next = __this_cpu_read(memcg->stat->targets[target]); 747 next = __this_cpu_read(memcg->stat->targets[target]);
758 /* from time_after() in jiffies.h */ 748 /* from time_after() in jiffies.h */
759 return ((long)next - (long)val < 0); 749 if ((long)next - (long)val < 0) {
760} 750 switch (target) {
761 751 case MEM_CGROUP_TARGET_THRESH:
762static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) 752 next = val + THRESHOLDS_EVENTS_TARGET;
763{ 753 break;
764 unsigned long val, next; 754 case MEM_CGROUP_TARGET_SOFTLIMIT:
765 755 next = val + SOFTLIMIT_EVENTS_TARGET;
766 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 756 break;
767 757 case MEM_CGROUP_TARGET_NUMAINFO:
768 switch (target) { 758 next = val + NUMAINFO_EVENTS_TARGET;
769 case MEM_CGROUP_TARGET_THRESH: 759 break;
770 next = val + THRESHOLDS_EVENTS_TARGET; 760 default:
771 break; 761 break;
772 case MEM_CGROUP_TARGET_SOFTLIMIT: 762 }
773 next = val + SOFTLIMIT_EVENTS_TARGET; 763 __this_cpu_write(memcg->stat->targets[target], next);
774 break; 764 return true;
775 case MEM_CGROUP_TARGET_NUMAINFO:
776 next = val + NUMAINFO_EVENTS_TARGET;
777 break;
778 default:
779 return;
780 } 765 }
781 766 return false;
782 __this_cpu_write(memcg->stat->targets[target], next);
783} 767}
784 768
785/* 769/*
@@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
790{ 774{
791 preempt_disable(); 775 preempt_disable();
792 /* threshold event is triggered in finer grain than soft limit */ 776 /* threshold event is triggered in finer grain than soft limit */
793 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { 777 if (unlikely(mem_cgroup_event_ratelimit(memcg,
778 MEM_CGROUP_TARGET_THRESH))) {
779 bool do_softlimit, do_numainfo;
780
781 do_softlimit = mem_cgroup_event_ratelimit(memcg,
782 MEM_CGROUP_TARGET_SOFTLIMIT);
783#if MAX_NUMNODES > 1
784 do_numainfo = mem_cgroup_event_ratelimit(memcg,
785 MEM_CGROUP_TARGET_NUMAINFO);
786#endif
787 preempt_enable();
788
794 mem_cgroup_threshold(memcg); 789 mem_cgroup_threshold(memcg);
795 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); 790 if (unlikely(do_softlimit))
796 if (unlikely(__memcg_event_check(memcg,
797 MEM_CGROUP_TARGET_SOFTLIMIT))) {
798 mem_cgroup_update_tree(memcg, page); 791 mem_cgroup_update_tree(memcg, page);
799 __mem_cgroup_target_update(memcg,
800 MEM_CGROUP_TARGET_SOFTLIMIT);
801 }
802#if MAX_NUMNODES > 1 792#if MAX_NUMNODES > 1
803 if (unlikely(__memcg_event_check(memcg, 793 if (unlikely(do_numainfo))
804 MEM_CGROUP_TARGET_NUMAINFO))) {
805 atomic_inc(&memcg->numainfo_events); 794 atomic_inc(&memcg->numainfo_events);
806 __mem_cgroup_target_update(memcg,
807 MEM_CGROUP_TARGET_NUMAINFO);
808 }
809#endif 795#endif
810 } 796 } else
811 preempt_enable(); 797 preempt_enable();
812} 798}
813 799
814struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 800struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
853 return memcg; 839 return memcg;
854} 840}
855 841
856/* The caller has to guarantee "mem" exists before calling this */ 842/**
857static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) 843 * mem_cgroup_iter - iterate over memory cgroup hierarchy
844 * @root: hierarchy root
845 * @prev: previously returned memcg, NULL on first invocation
846 * @reclaim: cookie for shared reclaim walks, NULL for full walks
847 *
848 * Returns references to children of the hierarchy below @root, or
849 * @root itself, or %NULL after a full round-trip.
850 *
851 * Caller must pass the return value in @prev on subsequent
852 * invocations for reference counting, or use mem_cgroup_iter_break()
853 * to cancel a hierarchy walk before the round-trip is complete.
854 *
855 * Reclaimers can specify a zone and a priority level in @reclaim to
856 * divide up the memcgs in the hierarchy among all concurrent
857 * reclaimers operating on the same zone and priority.
858 */
859struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
860 struct mem_cgroup *prev,
861 struct mem_cgroup_reclaim_cookie *reclaim)
858{ 862{
859 struct cgroup_subsys_state *css; 863 struct mem_cgroup *memcg = NULL;
860 int found; 864 int id = 0;
861 865
862 if (!memcg) /* ROOT cgroup has the smallest ID */ 866 if (mem_cgroup_disabled())
863 return root_mem_cgroup; /*css_put/get against root is ignored*/
864 if (!memcg->use_hierarchy) {
865 if (css_tryget(&memcg->css))
866 return memcg;
867 return NULL; 867 return NULL;
868 }
869 rcu_read_lock();
870 /*
871 * searching a memory cgroup which has the smallest ID under given
872 * ROOT cgroup. (ID >= 1)
873 */
874 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
875 if (css && css_tryget(css))
876 memcg = container_of(css, struct mem_cgroup, css);
877 else
878 memcg = NULL;
879 rcu_read_unlock();
880 return memcg;
881}
882 868
883static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 869 if (!root)
884 struct mem_cgroup *root, 870 root = root_mem_cgroup;
885 bool cond)
886{
887 int nextid = css_id(&iter->css) + 1;
888 int found;
889 int hierarchy_used;
890 struct cgroup_subsys_state *css;
891 871
892 hierarchy_used = iter->use_hierarchy; 872 if (prev && !reclaim)
873 id = css_id(&prev->css);
893 874
894 css_put(&iter->css); 875 if (prev && prev != root)
895 /* If no ROOT, walk all, ignore hierarchy */ 876 css_put(&prev->css);
896 if (!cond || (root && !hierarchy_used))
897 return NULL;
898 877
899 if (!root) 878 if (!root->use_hierarchy && root != root_mem_cgroup) {
900 root = root_mem_cgroup; 879 if (prev)
880 return NULL;
881 return root;
882 }
901 883
902 do { 884 while (!memcg) {
903 iter = NULL; 885 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
904 rcu_read_lock(); 886 struct cgroup_subsys_state *css;
887
888 if (reclaim) {
889 int nid = zone_to_nid(reclaim->zone);
890 int zid = zone_idx(reclaim->zone);
891 struct mem_cgroup_per_zone *mz;
905 892
906 css = css_get_next(&mem_cgroup_subsys, nextid, 893 mz = mem_cgroup_zoneinfo(root, nid, zid);
907 &root->css, &found); 894 iter = &mz->reclaim_iter[reclaim->priority];
908 if (css && css_tryget(css)) 895 if (prev && reclaim->generation != iter->generation)
909 iter = container_of(css, struct mem_cgroup, css); 896 return NULL;
897 id = iter->position;
898 }
899
900 rcu_read_lock();
901 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
902 if (css) {
903 if (css == &root->css || css_tryget(css))
904 memcg = container_of(css,
905 struct mem_cgroup, css);
906 } else
907 id = 0;
910 rcu_read_unlock(); 908 rcu_read_unlock();
911 /* If css is NULL, no more cgroups will be found */
912 nextid = found + 1;
913 } while (css && !iter);
914 909
915 return iter; 910 if (reclaim) {
911 iter->position = id;
912 if (!css)
913 iter->generation++;
914 else if (!prev && memcg)
915 reclaim->generation = iter->generation;
916 }
917
918 if (prev && !css)
919 return NULL;
920 }
921 return memcg;
916} 922}
917/*
918 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
919 * be careful that "break" loop is not allowed. We have reference count.
920 * Instead of that modify "cond" to be false and "continue" to exit the loop.
921 */
922#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
923 for (iter = mem_cgroup_start_loop(root);\
924 iter != NULL;\
925 iter = mem_cgroup_get_next(iter, root, cond))
926 923
927#define for_each_mem_cgroup_tree(iter, root) \ 924/**
928 for_each_mem_cgroup_tree_cond(iter, root, true) 925 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
926 * @root: hierarchy root
927 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
928 */
929void mem_cgroup_iter_break(struct mem_cgroup *root,
930 struct mem_cgroup *prev)
931{
932 if (!root)
933 root = root_mem_cgroup;
934 if (prev && prev != root)
935 css_put(&prev->css);
936}
929 937
930#define for_each_mem_cgroup_all(iter) \ 938/*
931 for_each_mem_cgroup_tree_cond(iter, NULL, true) 939 * Iteration constructs for visiting all cgroups (under a tree). If
940 * loops are exited prematurely (break), mem_cgroup_iter_break() must
941 * be used for reference counting.
942 */
943#define for_each_mem_cgroup_tree(iter, root) \
944 for (iter = mem_cgroup_iter(root, NULL, NULL); \
945 iter != NULL; \
946 iter = mem_cgroup_iter(root, iter, NULL))
932 947
948#define for_each_mem_cgroup(iter) \
949 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
950 iter != NULL; \
951 iter = mem_cgroup_iter(NULL, iter, NULL))
933 952
934static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 953static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
935{ 954{
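A minimal usage sketch of the new mem_cgroup_iter() interface introduced above, following its kerneldoc and the for_each_mem_cgroup_tree() macro in this hunk; some_condition() is a hypothetical predicate standing in for whatever makes a walker stop early.

/* Illustrative hierarchy walk (not part of the patch). */
static void example_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for (iter = mem_cgroup_iter(root, NULL, NULL);
	     iter != NULL;
	     iter = mem_cgroup_iter(root, iter, NULL)) {
		if (some_condition(iter)) {	/* hypothetical predicate */
			/* breaking out early: drop the reference via the helper */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}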
@@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
949 goto out; 968 goto out;
950 969
951 switch (idx) { 970 switch (idx) {
952 case PGMAJFAULT:
953 mem_cgroup_pgmajfault(memcg, 1);
954 break;
955 case PGFAULT: 971 case PGFAULT:
956 mem_cgroup_pgfault(memcg, 1); 972 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
973 break;
974 case PGMAJFAULT:
975 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
957 break; 976 break;
958 default: 977 default:
959 BUG(); 978 BUG();
@@ -963,6 +982,27 @@ out:
963} 982}
964EXPORT_SYMBOL(mem_cgroup_count_vm_event); 983EXPORT_SYMBOL(mem_cgroup_count_vm_event);
965 984
985/**
986 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
987 * @zone: zone of the wanted lruvec
988 * @mem: memcg of the wanted lruvec
989 *
990 * Returns the lru list vector holding pages for the given @zone and
991 * @mem. This can be the global zone lruvec, if the memory controller
992 * is disabled.
993 */
994struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
995 struct mem_cgroup *memcg)
996{
997 struct mem_cgroup_per_zone *mz;
998
999 if (mem_cgroup_disabled())
1000 return &zone->lruvec;
1001
1002 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1003 return &mz->lruvec;
1004}
1005
966/* 1006/*
967 * Following LRU functions are allowed to be used without PCG_LOCK. 1007 * Following LRU functions are allowed to be used without PCG_LOCK.
968 * Operations are called by routine of global LRU independently from memcg. 1008 * Operations are called by routine of global LRU independently from memcg.
@@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
977 * When moving account, the page is not on LRU. It's isolated. 1017 * When moving account, the page is not on LRU. It's isolated.
978 */ 1018 */
979 1019
980void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 1020/**
981{ 1021 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
982 struct page_cgroup *pc; 1022 * @zone: zone of the page
983 struct mem_cgroup_per_zone *mz; 1023 * @page: the page
984 1024 * @lru: current lru
985 if (mem_cgroup_disabled()) 1025 *
986 return; 1026 * This function accounts for @page being added to @lru, and returns
987 pc = lookup_page_cgroup(page); 1027 * the lruvec for the given @zone and the memcg @page is charged to.
988 /* can happen while we handle swapcache. */ 1028 *
989 if (!TestClearPageCgroupAcctLRU(pc)) 1029 * The callsite is then responsible for physically linking the page to
990 return; 1030 * the returned lruvec->lists[@lru].
991 VM_BUG_ON(!pc->mem_cgroup);
992 /*
993 * We don't check PCG_USED bit. It's cleared when the "page" is finally
994 * removed from global LRU.
995 */
996 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
997 /* huge page split is done under lru_lock. so, we have no races. */
998 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
999 if (mem_cgroup_is_root(pc->mem_cgroup))
1000 return;
1001 VM_BUG_ON(list_empty(&pc->lru));
1002 list_del_init(&pc->lru);
1003}
1004
1005void mem_cgroup_del_lru(struct page *page)
1006{
1007 mem_cgroup_del_lru_list(page, page_lru(page));
1008}
1009
1010/*
1011 * Writeback is about to end against a page which has been marked for immediate
1012 * reclaim. If it still appears to be reclaimable, move it to the tail of the
1013 * inactive list.
1014 */ 1031 */
1015void mem_cgroup_rotate_reclaimable_page(struct page *page) 1032struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1033 enum lru_list lru)
1016{ 1034{
1017 struct mem_cgroup_per_zone *mz; 1035 struct mem_cgroup_per_zone *mz;
1036 struct mem_cgroup *memcg;
1018 struct page_cgroup *pc; 1037 struct page_cgroup *pc;
1019 enum lru_list lru = page_lru(page);
1020 1038
1021 if (mem_cgroup_disabled()) 1039 if (mem_cgroup_disabled())
1022 return; 1040 return &zone->lruvec;
1023 1041
1024 pc = lookup_page_cgroup(page); 1042 pc = lookup_page_cgroup(page);
1025 /* unused or root page is not rotated. */ 1043 memcg = pc->mem_cgroup;
1026 if (!PageCgroupUsed(pc)) 1044 mz = page_cgroup_zoneinfo(memcg, page);
1027 return; 1045 /* compound_order() is stabilized through lru_lock */
1028 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1029 smp_rmb(); 1047 return &mz->lruvec;
1030 if (mem_cgroup_is_root(pc->mem_cgroup))
1031 return;
1032 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1033 list_move_tail(&pc->lru, &mz->lists[lru]);
1034} 1048}
1035 1049
1036void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1050/**
1051 * mem_cgroup_lru_del_list - account for removing an lru page
1052 * @page: the page
1053 * @lru: target lru
1054 *
1055 * This function accounts for @page being removed from @lru.
1056 *
1057 * The callsite is then responsible for physically unlinking
1058 * @page->lru.
1059 */
1060void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1037{ 1061{
1038 struct mem_cgroup_per_zone *mz; 1062 struct mem_cgroup_per_zone *mz;
1063 struct mem_cgroup *memcg;
1039 struct page_cgroup *pc; 1064 struct page_cgroup *pc;
1040 1065
1041 if (mem_cgroup_disabled()) 1066 if (mem_cgroup_disabled())
1042 return; 1067 return;
1043 1068
1044 pc = lookup_page_cgroup(page); 1069 pc = lookup_page_cgroup(page);
1045 /* unused or root page is not rotated. */ 1070 memcg = pc->mem_cgroup;
1046 if (!PageCgroupUsed(pc)) 1071 VM_BUG_ON(!memcg);
1047 return; 1072 mz = page_cgroup_zoneinfo(memcg, page);
1048 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1049 smp_rmb();
1050 if (mem_cgroup_is_root(pc->mem_cgroup))
1051 return;
1052 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1053 list_move(&pc->lru, &mz->lists[lru]);
1054}
1055
1056void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1057{
1058 struct page_cgroup *pc;
1059 struct mem_cgroup_per_zone *mz;
1060
1061 if (mem_cgroup_disabled())
1062 return;
1063 pc = lookup_page_cgroup(page);
1064 VM_BUG_ON(PageCgroupAcctLRU(pc));
1065 /*
1066 * putback: charge:
1067 * SetPageLRU SetPageCgroupUsed
1068 * smp_mb smp_mb
1069 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1070 *
1071 * Ensure that one of the two sides adds the page to the memcg
1072 * LRU during a race.
1073 */
1074 smp_mb();
1075 if (!PageCgroupUsed(pc))
1076 return;
1077 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1078 smp_rmb();
1079 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1080 /* huge page split is done under lru_lock. so, we have no races. */ 1073 /* huge page split is done under lru_lock. so, we have no races. */
1081 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1074 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
1082 SetPageCgroupAcctLRU(pc); 1075 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1083 if (mem_cgroup_is_root(pc->mem_cgroup))
1084 return;
1085 list_add(&pc->lru, &mz->lists[lru]);
1086}
1087
1088/*
1089 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1090 * while it's linked to lru because the page may be reused after it's fully
1091 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
1092 * It's done under lock_page and expected that zone->lru_lock isnever held.
1093 */
1094static void mem_cgroup_lru_del_before_commit(struct page *page)
1095{
1096 unsigned long flags;
1097 struct zone *zone = page_zone(page);
1098 struct page_cgroup *pc = lookup_page_cgroup(page);
1099
1100 /*
1101 * Doing this check without taking ->lru_lock seems wrong but this
1102 * is safe. Because if page_cgroup's USED bit is unset, the page
1103 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1104 * set, the commit after this will fail, anyway.
1105 * This all charge/uncharge is done under some mutual execustion.
1106 * So, we don't need to taking care of changes in USED bit.
1107 */
1108 if (likely(!PageLRU(page)))
1109 return;
1110
1111 spin_lock_irqsave(&zone->lru_lock, flags);
1112 /*
1113 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1114 * is guarded by lock_page() because the page is SwapCache.
1115 */
1116 if (!PageCgroupUsed(pc))
1117 mem_cgroup_del_lru_list(page, page_lru(page));
1118 spin_unlock_irqrestore(&zone->lru_lock, flags);
1119} 1076}
1120 1077
1121static void mem_cgroup_lru_add_after_commit(struct page *page) 1078void mem_cgroup_lru_del(struct page *page)
1122{ 1079{
1123 unsigned long flags; 1080 mem_cgroup_lru_del_list(page, page_lru(page));
1124 struct zone *zone = page_zone(page);
1125 struct page_cgroup *pc = lookup_page_cgroup(page);
1126 /*
1127 * putback: charge:
1128 * SetPageLRU SetPageCgroupUsed
1129 * smp_mb smp_mb
1130 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1131 *
1132 * Ensure that one of the two sides adds the page to the memcg
1133 * LRU during a race.
1134 */
1135 smp_mb();
1136 /* taking care of that the page is added to LRU while we commit it */
1137 if (likely(!PageLRU(page)))
1138 return;
1139 spin_lock_irqsave(&zone->lru_lock, flags);
1140 /* link when the page is linked to LRU but page_cgroup isn't */
1141 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1142 mem_cgroup_add_lru_list(page, page_lru(page));
1143 spin_unlock_irqrestore(&zone->lru_lock, flags);
1144} 1081}
1145 1082
1146 1083/**
1147void mem_cgroup_move_lists(struct page *page, 1084 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1148 enum lru_list from, enum lru_list to) 1085 * @zone: zone of the page
1086 * @page: the page
1087 * @from: current lru
1088 * @to: target lru
1089 *
1090 * This function accounts for @page being moved between the lrus @from
1091 * and @to, and returns the lruvec for the given @zone and the memcg
1092 * @page is charged to.
1093 *
1094 * The callsite is then responsible for physically relinking
1095 * @page->lru to the returned lruvec->lists[@to].
1096 */
1097struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1098 struct page *page,
1099 enum lru_list from,
1100 enum lru_list to)
1149{ 1101{
1150 if (mem_cgroup_disabled()) 1102 /* XXX: Optimize this, especially for @from == @to */
1151 return; 1103 mem_cgroup_lru_del_list(page, from);
1152 mem_cgroup_del_lru_list(page, from); 1104 return mem_cgroup_lru_add_list(zone, page, to);
1153 mem_cgroup_add_lru_list(page, to);
1154} 1105}
1155 1106
1156/* 1107/*
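The kerneldoc above splits LRU maintenance between the memcg hooks (statistics, picking the lruvec) and the callsite (the actual list manipulation). A hedged sketch of that callsite contract, assuming the caller uses plain list_add()/list_del() on page->lru as the comments describe:

/*
 * Illustrative callsites (not part of the patch): the memcg hook does the
 * accounting and hands back the right lruvec; the caller links the page.
 */
static void example_add_to_lru(struct zone *zone, struct page *page,
			       enum lru_list lru)
{
	struct lruvec *lruvec;

	lruvec = mem_cgroup_lru_add_list(zone, page, lru);
	list_add(&page->lru, &lruvec->lists[lru]);
}

static void example_del_from_lru(struct page *page, enum lru_list lru)
{
	/* account first, then physically unlink page->lru */
	mem_cgroup_lru_del_list(page, lru);
	list_del(&page->lru);
}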
@@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1175 struct task_struct *p; 1126 struct task_struct *p;
1176 1127
1177 p = find_lock_task_mm(task); 1128 p = find_lock_task_mm(task);
1178 if (!p) 1129 if (p) {
1179 return 0; 1130 curr = try_get_mem_cgroup_from_mm(p->mm);
1180 curr = try_get_mem_cgroup_from_mm(p->mm); 1131 task_unlock(p);
1181 task_unlock(p); 1132 } else {
1133 /*
1134 * All threads may have already detached their mm's, but the oom
1135 * killer still needs to detect if they have already been oom
1136 * killed to prevent needlessly killing additional tasks.
1137 */
1138 task_lock(task);
1139 curr = mem_cgroup_from_task(task);
1140 if (curr)
1141 css_get(&curr->css);
1142 task_unlock(task);
1143 }
1182 if (!curr) 1144 if (!curr)
1183 return 0; 1145 return 0;
1184 /* 1146 /*
@@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1258 return &mz->reclaim_stat; 1220 return &mz->reclaim_stat;
1259} 1221}
1260 1222
1261unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1262 struct list_head *dst,
1263 unsigned long *scanned, int order,
1264 isolate_mode_t mode,
1265 struct zone *z,
1266 struct mem_cgroup *mem_cont,
1267 int active, int file)
1268{
1269 unsigned long nr_taken = 0;
1270 struct page *page;
1271 unsigned long scan;
1272 LIST_HEAD(pc_list);
1273 struct list_head *src;
1274 struct page_cgroup *pc, *tmp;
1275 int nid = zone_to_nid(z);
1276 int zid = zone_idx(z);
1277 struct mem_cgroup_per_zone *mz;
1278 int lru = LRU_FILE * file + active;
1279 int ret;
1280
1281 BUG_ON(!mem_cont);
1282 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1283 src = &mz->lists[lru];
1284
1285 scan = 0;
1286 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1287 if (scan >= nr_to_scan)
1288 break;
1289
1290 if (unlikely(!PageCgroupUsed(pc)))
1291 continue;
1292
1293 page = lookup_cgroup_page(pc);
1294
1295 if (unlikely(!PageLRU(page)))
1296 continue;
1297
1298 scan++;
1299 ret = __isolate_lru_page(page, mode, file);
1300 switch (ret) {
1301 case 0:
1302 list_move(&page->lru, dst);
1303 mem_cgroup_del_lru(page);
1304 nr_taken += hpage_nr_pages(page);
1305 break;
1306 case -EBUSY:
1307 /* we don't affect global LRU but rotate in our LRU */
1308 mem_cgroup_rotate_lru_list(page, page_lru(page));
1309 break;
1310 default:
1311 break;
1312 }
1313 }
1314
1315 *scanned = scan;
1316
1317 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1318 0, 0, 0, mode);
1319
1320 return nr_taken;
1321}
1322
1323#define mem_cgroup_from_res_counter(counter, member) \ 1223#define mem_cgroup_from_res_counter(counter, member) \
1324 container_of(counter, struct mem_cgroup, member) 1224 container_of(counter, struct mem_cgroup, member)
1325 1225
@@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1536 return min(limit, memsw); 1436 return min(limit, memsw);
1537} 1437}
1538 1438
1539/* 1439static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 * Visit the first child (need not be the first child as per the ordering 1440 gfp_t gfp_mask,
1541 * of the cgroup list, since we track last_scanned_child) of @mem and use 1441 unsigned long flags)
1542 * that to reclaim free pages from.
1543 */
1544static struct mem_cgroup *
1545mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1546{ 1442{
1547 struct mem_cgroup *ret = NULL; 1443 unsigned long total = 0;
1548 struct cgroup_subsys_state *css; 1444 bool noswap = false;
1549 int nextid, found; 1445 int loop;
1550
1551 if (!root_memcg->use_hierarchy) {
1552 css_get(&root_memcg->css);
1553 ret = root_memcg;
1554 }
1555 1446
1556 while (!ret) { 1447 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1557 rcu_read_lock(); 1448 noswap = true;
1558 nextid = root_memcg->last_scanned_child + 1; 1449 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1559 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, 1450 noswap = true;
1560 &found);
1561 if (css && css_tryget(css))
1562 ret = container_of(css, struct mem_cgroup, css);
1563 1451
1564 rcu_read_unlock(); 1452 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1565 /* Updates scanning parameter */ 1453 if (loop)
1566 if (!css) { 1454 drain_all_stock_async(memcg);
1567 /* this means start scan from ID:1 */ 1455 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1568 root_memcg->last_scanned_child = 0; 1456 /*
1569 } else 1457 * Allow limit shrinkers, which are triggered directly
1570 root_memcg->last_scanned_child = found; 1458 * by userspace, to catch signals and stop reclaim
1459 * after minimal progress, regardless of the margin.
1460 */
1461 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1462 break;
1463 if (mem_cgroup_margin(memcg))
1464 break;
1465 /*
1466 * If nothing was reclaimed after two attempts, there
1467 * may be no reclaimable pages in this hierarchy.
1468 */
1469 if (loop && !total)
1470 break;
1571 } 1471 }
1572 1472 return total;
1573 return ret;
1574} 1473}
1575 1474
1576/** 1475/**
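mem_cgroup_reclaim() above folds the old hierarchical-reclaim entry point for limit enforcement into a flag-driven helper; the MEM_CGROUP_RECLAIM_* bits are the ones kept near the top of this file. An illustrative call, assuming a limit-shrink path that optionally disallows swap:

/* Illustrative limit-shrink caller (not part of the patch). */
static void example_shrink(struct mem_cgroup *memcg, bool shrink_memsw)
{
	unsigned long flags = MEM_CGROUP_RECLAIM_SHRINK;

	if (shrink_memsw)
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	/* returns the number of reclaimed pages; ignored here */
	mem_cgroup_reclaim(memcg, GFP_KERNEL, flags);
}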
@@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1710} 1609}
1711#endif 1610#endif
1712 1611
1713/* 1612static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1714 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1613 struct zone *zone,
1715 * we reclaimed from, so that we don't end up penalizing one child extensively 1614 gfp_t gfp_mask,
1716 * based on its position in the children list. 1615 unsigned long *total_scanned)
1717 * 1616{
1718 * root_memcg is the original ancestor that we've been reclaim from. 1617 struct mem_cgroup *victim = NULL;
1719 * 1618 int total = 0;
1720 * We give up and return to the caller when we visit root_memcg twice.
1721 * (other groups can be removed while we're walking....)
1722 *
1723 * If shrink==true, for avoiding to free too much, this returns immedieately.
1724 */
1725static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1726 struct zone *zone,
1727 gfp_t gfp_mask,
1728 unsigned long reclaim_options,
1729 unsigned long *total_scanned)
1730{
1731 struct mem_cgroup *victim;
1732 int ret, total = 0;
1733 int loop = 0; 1619 int loop = 0;
1734 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1735 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1736 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1737 unsigned long excess; 1620 unsigned long excess;
1738 unsigned long nr_scanned; 1621 unsigned long nr_scanned;
1622 struct mem_cgroup_reclaim_cookie reclaim = {
1623 .zone = zone,
1624 .priority = 0,
1625 };
1739 1626
1740 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1627 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1741 1628
1742 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1743 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1744 noswap = true;
1745
1746 while (1) { 1629 while (1) {
1747 victim = mem_cgroup_select_victim(root_memcg); 1630 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1748 if (victim == root_memcg) { 1631 if (!victim) {
1749 loop++; 1632 loop++;
1750 /*
1751 * We are not draining per cpu cached charges during
1752 * soft limit reclaim because global reclaim doesn't
1753 * care about charges. It tries to free some memory and
1754 * charges will not give any.
1755 */
1756 if (!check_soft && loop >= 1)
1757 drain_all_stock_async(root_memcg);
1758 if (loop >= 2) { 1633 if (loop >= 2) {
1759 /* 1634 /*
1760 * If we have not been able to reclaim 1635 * If we have not been able to reclaim
1761 * anything, it might because there are 1636 * anything, it might because there are
1762 * no reclaimable pages under this hierarchy 1637 * no reclaimable pages under this hierarchy
1763 */ 1638 */
1764 if (!check_soft || !total) { 1639 if (!total)
1765 css_put(&victim->css);
1766 break; 1640 break;
1767 }
1768 /* 1641 /*
1769 * We want to do more targeted reclaim. 1642 * We want to do more targeted reclaim.
1770 * excess >> 2 is not to excessive so as to 1643 * excess >> 2 is not to excessive so as to
@@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1772 * coming back to reclaim from this cgroup 1645 * coming back to reclaim from this cgroup
1773 */ 1646 */
1774 if (total >= (excess >> 2) || 1647 if (total >= (excess >> 2) ||
1775 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1648 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1776 css_put(&victim->css);
1777 break; 1649 break;
1778 }
1779 } 1650 }
1780 }
1781 if (!mem_cgroup_reclaimable(victim, noswap)) {
1782 /* this cgroup's local usage == 0 */
1783 css_put(&victim->css);
1784 continue; 1651 continue;
1785 } 1652 }
1786 /* we use swappiness of local cgroup */ 1653 if (!mem_cgroup_reclaimable(victim, false))
1787 if (check_soft) { 1654 continue;
1788 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1655 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1789 noswap, zone, &nr_scanned); 1656 zone, &nr_scanned);
1790 *total_scanned += nr_scanned; 1657 *total_scanned += nr_scanned;
1791 } else 1658 if (!res_counter_soft_limit_excess(&root_memcg->res))
1792 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1659 break;
1793 noswap);
1794 css_put(&victim->css);
1795 /*
1796 * At shrinking usage, we can't check we should stop here or
1797 * reclaim more. It's depends on callers. last_scanned_child
1798 * will work enough for keeping fairness under tree.
1799 */
1800 if (shrink)
1801 return ret;
1802 total += ret;
1803 if (check_soft) {
1804 if (!res_counter_soft_limit_excess(&root_memcg->res))
1805 return total;
1806 } else if (mem_cgroup_margin(root_memcg))
1807 return total;
1808 } 1660 }
1661 mem_cgroup_iter_break(root_memcg, victim);
1809 return total; 1662 return total;
1810} 1663}
1811 1664
@@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1817static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1670static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1818{ 1671{
1819 struct mem_cgroup *iter, *failed = NULL; 1672 struct mem_cgroup *iter, *failed = NULL;
1820 bool cond = true;
1821 1673
1822 for_each_mem_cgroup_tree_cond(iter, memcg, cond) { 1674 for_each_mem_cgroup_tree(iter, memcg) {
1823 if (iter->oom_lock) { 1675 if (iter->oom_lock) {
1824 /* 1676 /*
1825 * this subtree of our hierarchy is already locked 1677 * this subtree of our hierarchy is already locked
1826 * so we cannot give a lock. 1678 * so we cannot give a lock.
1827 */ 1679 */
1828 failed = iter; 1680 failed = iter;
1829 cond = false; 1681 mem_cgroup_iter_break(memcg, iter);
1682 break;
1830 } else 1683 } else
1831 iter->oom_lock = true; 1684 iter->oom_lock = true;
1832 } 1685 }
@@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1838 * OK, we failed to lock the whole subtree so we have to clean up 1691 * OK, we failed to lock the whole subtree so we have to clean up
1839 * what we set up to the failing subtree 1692 * what we set up to the failing subtree
1840 */ 1693 */
1841 cond = true; 1694 for_each_mem_cgroup_tree(iter, memcg) {
1842 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1843 if (iter == failed) { 1695 if (iter == failed) {
1844 cond = false; 1696 mem_cgroup_iter_break(memcg, iter);
1845 continue; 1697 break;
1846 } 1698 }
1847 iter->oom_lock = false; 1699 iter->oom_lock = false;
1848 } 1700 }
@@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page,
2007 bool need_unlock = false; 1859 bool need_unlock = false;
2008 unsigned long uninitialized_var(flags); 1860 unsigned long uninitialized_var(flags);
2009 1861
2010 if (unlikely(!pc)) 1862 if (mem_cgroup_disabled())
2011 return; 1863 return;
2012 1864
2013 rcu_read_lock(); 1865 rcu_read_lock();
@@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2238 struct mem_cgroup *iter; 2090 struct mem_cgroup *iter;
2239 2091
2240 if ((action == CPU_ONLINE)) { 2092 if ((action == CPU_ONLINE)) {
2241 for_each_mem_cgroup_all(iter) 2093 for_each_mem_cgroup(iter)
2242 synchronize_mem_cgroup_on_move(iter, cpu); 2094 synchronize_mem_cgroup_on_move(iter, cpu);
2243 return NOTIFY_OK; 2095 return NOTIFY_OK;
2244 } 2096 }
@@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2246 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2098 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2247 return NOTIFY_OK; 2099 return NOTIFY_OK;
2248 2100
2249 for_each_mem_cgroup_all(iter) 2101 for_each_mem_cgroup(iter)
2250 mem_cgroup_drain_pcp_counter(iter, cpu); 2102 mem_cgroup_drain_pcp_counter(iter, cpu);
2251 2103
2252 stock = &per_cpu(memcg_stock, cpu); 2104 stock = &per_cpu(memcg_stock, cpu);
@@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2300 if (!(gfp_mask & __GFP_WAIT)) 2152 if (!(gfp_mask & __GFP_WAIT))
2301 return CHARGE_WOULDBLOCK; 2153 return CHARGE_WOULDBLOCK;
2302 2154
2303 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2155 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2304 gfp_mask, flags, NULL);
2305 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2156 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2306 return CHARGE_RETRY; 2157 return CHARGE_RETRY;
2307 /* 2158 /*
@@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2334} 2185}
2335 2186
2336/* 2187/*
2337 * Unlike exported interface, "oom" parameter is added. if oom==true, 2188 * __mem_cgroup_try_charge() does
2338 * oom-killer can be invoked. 2189 * 1. detect memcg to be charged against from passed *mm and *ptr,
2190 * 2. update res_counter
2191 * 3. call memory reclaim if necessary.
2192 *
2193 * In some special case, if the task is fatal, fatal_signal_pending() or
2194 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
2195 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
2196 * as possible without any hazards. 2: all pages should have a valid
2197 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2198 * pointer, that is treated as a charge to root_mem_cgroup.
2199 *
2200 * So __mem_cgroup_try_charge() will return
2201 * 0 ... on success, filling *ptr with a valid memcg pointer.
2202 * -ENOMEM ... charge failure because of resource limits.
2203 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
2204 *
2205 * Unlike the exported interface, an "oom" parameter is added. if oom==true,
2206 * the oom-killer can be invoked.
2339 */ 2207 */
2340static int __mem_cgroup_try_charge(struct mm_struct *mm, 2208static int __mem_cgroup_try_charge(struct mm_struct *mm,
2341 gfp_t gfp_mask, 2209 gfp_t gfp_mask,
@@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2364 * set, if so charge the init_mm (happens for pagecache usage). 2232 * set, if so charge the init_mm (happens for pagecache usage).
2365 */ 2233 */
2366 if (!*ptr && !mm) 2234 if (!*ptr && !mm)
2367 goto bypass; 2235 *ptr = root_mem_cgroup;
2368again: 2236again:
2369 if (*ptr) { /* css should be a valid one */ 2237 if (*ptr) { /* css should be a valid one */
2370 memcg = *ptr; 2238 memcg = *ptr;
@@ -2390,7 +2258,9 @@ again:
2390 * task-struct. So, mm->owner can be NULL. 2258 * task-struct. So, mm->owner can be NULL.
2391 */ 2259 */
2392 memcg = mem_cgroup_from_task(p); 2260 memcg = mem_cgroup_from_task(p);
2393 if (!memcg || mem_cgroup_is_root(memcg)) { 2261 if (!memcg)
2262 memcg = root_mem_cgroup;
2263 if (mem_cgroup_is_root(memcg)) {
2394 rcu_read_unlock(); 2264 rcu_read_unlock();
2395 goto done; 2265 goto done;
2396 } 2266 }
@@ -2465,8 +2335,8 @@ nomem:
2465 *ptr = NULL; 2335 *ptr = NULL;
2466 return -ENOMEM; 2336 return -ENOMEM;
2467bypass: 2337bypass:
2468 *ptr = NULL; 2338 *ptr = root_mem_cgroup;
2469 return 0; 2339 return -EINTR;
2470} 2340}
2471 2341
2472/* 2342/*
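With the bypass label above now returning -EINTR and pointing *ptr at root_mem_cgroup, callers can tell a hard failure from a bypassed charge. A hedged sketch of the calling convention documented earlier in this patch:

/* Illustrative caller (not part of the patch). */
static int example_try_charge(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *memcg = NULL;
	int ret;

	ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
	if (ret == -ENOMEM)
		return ret;	/* hard failure: over limit and reclaim failed */
	/*
	 * 0 or -EINTR: *memcg is valid (possibly root_mem_cgroup) and the
	 * caller proceeds to commit the charge against it.
	 */
	return 0;
}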
@@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2522 memcg = NULL; 2392 memcg = NULL;
2523 } else if (PageSwapCache(page)) { 2393 } else if (PageSwapCache(page)) {
2524 ent.val = page_private(page); 2394 ent.val = page_private(page);
2525 id = lookup_swap_cgroup(ent); 2395 id = lookup_swap_cgroup_id(ent);
2526 rcu_read_lock(); 2396 rcu_read_lock();
2527 memcg = mem_cgroup_lookup(id); 2397 memcg = mem_cgroup_lookup(id);
2528 if (memcg && !css_tryget(&memcg->css)) 2398 if (memcg && !css_tryget(&memcg->css))
@@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2574 2444
2575 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2445 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2576 unlock_page_cgroup(pc); 2446 unlock_page_cgroup(pc);
2447 WARN_ON_ONCE(PageLRU(page));
2577 /* 2448 /*
2578 * "charge_statistics" updated event counter. Then, check it. 2449 * "charge_statistics" updated event counter. Then, check it.
2579 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2450 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2585#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2456#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2586 2457
2587#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2458#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2588 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2459 (1 << PCG_MIGRATION))
2589/* 2460/*
2590 * Because tail pages are not marked as "used", set it. We're under 2461 * Because tail pages are not marked as "used", set it. We're under
2591 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2462 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2463 * charge/uncharge will be never happen and move_account() is done under
2464 * compound_lock(), so we don't have to take care of races.
2592 */ 2465 */
2593void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2466void mem_cgroup_split_huge_fixup(struct page *head)
2594{ 2467{
2595 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2468 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2596 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2469 struct page_cgroup *pc;
2597 unsigned long flags; 2470 int i;
2598 2471
2599 if (mem_cgroup_disabled()) 2472 if (mem_cgroup_disabled())
2600 return; 2473 return;
2601 /* 2474 for (i = 1; i < HPAGE_PMD_NR; i++) {
2602 * We have no races with charge/uncharge but will have races with 2475 pc = head_pc + i;
2603 * page state accounting. 2476 pc->mem_cgroup = head_pc->mem_cgroup;
2604 */ 2477 smp_wmb();/* see __commit_charge() */
2605 move_lock_page_cgroup(head_pc, &flags); 2478 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2606
2607 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2608 smp_wmb(); /* see __commit_charge() */
2609 if (PageCgroupAcctLRU(head_pc)) {
2610 enum lru_list lru;
2611 struct mem_cgroup_per_zone *mz;
2612
2613 /*
2614 * LRU flags cannot be copied because we need to add tail
2615 *.page to LRU by generic call and our hook will be called.
2616 * We hold lru_lock, then, reduce counter directly.
2617 */
2618 lru = page_lru(head);
2619 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2620 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2621 } 2479 }
2622 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2623 move_unlock_page_cgroup(head_pc, &flags);
2624} 2480}
2625#endif 2481#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2626 2482
2627/** 2483/**
2628 * mem_cgroup_move_account - move account of the page 2484 * mem_cgroup_move_account - move account of the page
@@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page,
2737 2593
2738 parent = mem_cgroup_from_cont(pcg); 2594 parent = mem_cgroup_from_cont(pcg);
2739 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2595 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2740 if (ret || !parent) 2596 if (ret)
2741 goto put_back; 2597 goto put_back;
2742 2598
2743 if (nr_pages > 1) 2599 if (nr_pages > 1)
@@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2783 } 2639 }
2784 2640
2785 pc = lookup_page_cgroup(page); 2641 pc = lookup_page_cgroup(page);
2786 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2787
2788 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2642 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2789 if (ret || !memcg) 2643 if (ret == -ENOMEM)
2790 return ret; 2644 return ret;
2791
2792 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2645 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2793 return 0; 2646 return 0;
2794} 2647}
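Only -ENOMEM is propagated here; elsewhere in this patch __mem_cgroup_try_charge() can also return -EINTR when the charge is bypassed (for example for a task that is being killed), and callers fold that back into success. A sketch of that caller convention, assembled from the hunks in this file; the helper name is invented:

    static int charge_or_bypass(struct mm_struct *mm, gfp_t gfp_mask,
                                struct mem_cgroup **memcgp)
    {
            int ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);

            if (ret == -EINTR)
                    ret = 0;        /* charge was bypassed; treat as success */
            return ret;             /* 0 on success, -ENOMEM on hard failure */
    }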
@@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page,
2798{ 2651{
2799 if (mem_cgroup_disabled()) 2652 if (mem_cgroup_disabled())
2800 return 0; 2653 return 0;
2801 /* 2654 VM_BUG_ON(page_mapped(page));
2802 * If already mapped, we don't have to account. 2655 VM_BUG_ON(page->mapping && !PageAnon(page));
2803 * If page cache, page->mapping has address_space. 2656 VM_BUG_ON(!mm);
2804 * But page->mapping may have out-of-use anon_vma pointer,
2805 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
2806 * is NULL.
2807 */
2808 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2809 return 0;
2810 if (unlikely(!mm))
2811 mm = &init_mm;
2812 return mem_cgroup_charge_common(page, mm, gfp_mask, 2657 return mem_cgroup_charge_common(page, mm, gfp_mask,
2813 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2658 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2814} 2659}
2815 2660
2816static void 2661static void
@@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2822 enum charge_type ctype) 2667 enum charge_type ctype)
2823{ 2668{
2824 struct page_cgroup *pc = lookup_page_cgroup(page); 2669 struct page_cgroup *pc = lookup_page_cgroup(page);
2670 struct zone *zone = page_zone(page);
2671 unsigned long flags;
2672 bool removed = false;
2673
2825 /* 2674 /*
2826 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page 2675
2827 * is already on the LRU. It means the page may be on some other page_cgroup's 2676
2828 * LRU. Take care of it. 2677 * LRU. Take care of it.
2829 */ 2678 */
2830 mem_cgroup_lru_del_before_commit(page); 2679 spin_lock_irqsave(&zone->lru_lock, flags);
2680 if (PageLRU(page)) {
2681 del_page_from_lru_list(zone, page, page_lru(page));
2682 ClearPageLRU(page);
2683 removed = true;
2684 }
2831 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); 2685 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2832 mem_cgroup_lru_add_after_commit(page); 2686 if (removed) {
2687 add_page_to_lru_list(zone, page, page_lru(page));
2688 SetPageLRU(page);
2689 }
2690 spin_unlock_irqrestore(&zone->lru_lock, flags);
2833 return; 2691 return;
2834} 2692}
2835 2693
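With mem_cgroup_lru_del_before_commit()/mem_cgroup_lru_add_after_commit() gone, the commit now open-codes an isolate/commit/putback sequence under zone->lru_lock. A generic sketch of that pattern; the helper and callback names are made up:

    static void with_page_off_lru(struct zone *zone, struct page *page,
                                  void (*commit)(struct page *))
    {
            unsigned long flags;
            bool removed = false;

            spin_lock_irqsave(&zone->lru_lock, flags);
            if (PageLRU(page)) {
                    del_page_from_lru_list(zone, page, page_lru(page));
                    ClearPageLRU(page);
                    removed = true;
            }
            commit(page);                   /* runs with the page off the LRU */
            if (removed) {
                    add_page_to_lru_list(zone, page, page_lru(page));
                    SetPageLRU(page);
            }
            spin_unlock_irqrestore(&zone->lru_lock, flags);
    }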
@@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2837 gfp_t gfp_mask) 2695 gfp_t gfp_mask)
2838{ 2696{
2839 struct mem_cgroup *memcg = NULL; 2697 struct mem_cgroup *memcg = NULL;
2698 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2840 int ret; 2699 int ret;
2841 2700
2842 if (mem_cgroup_disabled()) 2701 if (mem_cgroup_disabled())
@@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2846 2705
2847 if (unlikely(!mm)) 2706 if (unlikely(!mm))
2848 mm = &init_mm; 2707 mm = &init_mm;
2708 if (!page_is_file_cache(page))
2709 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2849 2710
2850 if (page_is_file_cache(page)) { 2711 if (!PageSwapCache(page))
2851 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); 2712 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2852 if (ret || !memcg) 2713 else { /* page is swapcache/shmem */
2853 return ret;
2854
2855 /*
2856 * FUSE reuses pages without going through the final
2857 * put that would remove them from the LRU list, make
2858 * sure that they get relinked properly.
2859 */
2860 __mem_cgroup_commit_charge_lrucare(page, memcg,
2861 MEM_CGROUP_CHARGE_TYPE_CACHE);
2862 return ret;
2863 }
2864 /* shmem */
2865 if (PageSwapCache(page)) {
2866 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2714 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2867 if (!ret) 2715 if (!ret)
2868 __mem_cgroup_commit_charge_swapin(page, memcg, 2716 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2869 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2717 }
2870 } else
2871 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2872 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2873
2874 return ret; 2718 return ret;
2875} 2719}
2876 2720
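The swapcache/shmem branch above is the usual two-step try/commit protocol. Shown below condensed into a stand-alone helper for readability only; the helper name is invented and error handling is trimmed:

    static int charge_swapcache_page(struct mm_struct *mm, struct page *page,
                                     gfp_t gfp_mask, enum charge_type type)
    {
            struct mem_cgroup *memcg = NULL;
            int ret;

            ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
            if (!ret)
                    __mem_cgroup_commit_charge_swapin(page, memcg, type);
            return ret;
    }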
@@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2882 */ 2726 */
2883int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2727int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2884 struct page *page, 2728 struct page *page,
2885 gfp_t mask, struct mem_cgroup **ptr) 2729 gfp_t mask, struct mem_cgroup **memcgp)
2886{ 2730{
2887 struct mem_cgroup *memcg; 2731 struct mem_cgroup *memcg;
2888 int ret; 2732 int ret;
2889 2733
2890 *ptr = NULL; 2734 *memcgp = NULL;
2891 2735
2892 if (mem_cgroup_disabled()) 2736 if (mem_cgroup_disabled())
2893 return 0; 2737 return 0;
@@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2905 memcg = try_get_mem_cgroup_from_page(page); 2749 memcg = try_get_mem_cgroup_from_page(page);
2906 if (!memcg) 2750 if (!memcg)
2907 goto charge_cur_mm; 2751 goto charge_cur_mm;
2908 *ptr = memcg; 2752 *memcgp = memcg;
2909 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2753 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2910 css_put(&memcg->css); 2754 css_put(&memcg->css);
2755 if (ret == -EINTR)
2756 ret = 0;
2911 return ret; 2757 return ret;
2912charge_cur_mm: 2758charge_cur_mm:
2913 if (unlikely(!mm)) 2759 if (unlikely(!mm))
2914 mm = &init_mm; 2760 mm = &init_mm;
2915 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2761 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2762 if (ret == -EINTR)
2763 ret = 0;
2764 return ret;
2916} 2765}
2917 2766
2918static void 2767static void
2919__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2768__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2920 enum charge_type ctype) 2769 enum charge_type ctype)
2921{ 2770{
2922 if (mem_cgroup_disabled()) 2771 if (mem_cgroup_disabled())
2923 return; 2772 return;
2924 if (!ptr) 2773 if (!memcg)
2925 return; 2774 return;
2926 cgroup_exclude_rmdir(&ptr->css); 2775 cgroup_exclude_rmdir(&memcg->css);
2927 2776
2928 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2777 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
2929 /* 2778 /*
2930 * Now swap is on-memory. This means this page may be 2779 * Now swap is on-memory. This means this page may be
2931 * counted both as mem and swap....double count. 2780 * counted both as mem and swap....double count.
@@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2935 */ 2784 */
2936 if (do_swap_account && PageSwapCache(page)) { 2785 if (do_swap_account && PageSwapCache(page)) {
2937 swp_entry_t ent = {.val = page_private(page)}; 2786 swp_entry_t ent = {.val = page_private(page)};
2787 struct mem_cgroup *swap_memcg;
2938 unsigned short id; 2788 unsigned short id;
2939 struct mem_cgroup *memcg;
2940 2789
2941 id = swap_cgroup_record(ent, 0); 2790 id = swap_cgroup_record(ent, 0);
2942 rcu_read_lock(); 2791 rcu_read_lock();
2943 memcg = mem_cgroup_lookup(id); 2792 swap_memcg = mem_cgroup_lookup(id);
2944 if (memcg) { 2793 if (swap_memcg) {
2945 /* 2794 /*
2946 * This recorded memcg can be an obsolete one. So, avoid 2795
2947 * calling css_tryget 2796 * calling css_tryget
2948 */ 2797 */
2949 if (!mem_cgroup_is_root(memcg)) 2798 if (!mem_cgroup_is_root(swap_memcg))
2950 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2799 res_counter_uncharge(&swap_memcg->memsw,
2951 mem_cgroup_swap_statistics(memcg, false); 2800 PAGE_SIZE);
2952 mem_cgroup_put(memcg); 2801 mem_cgroup_swap_statistics(swap_memcg, false);
2802 mem_cgroup_put(swap_memcg);
2953 } 2803 }
2954 rcu_read_unlock(); 2804 rcu_read_unlock();
2955 } 2805 }
@@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2958 * So, rmdir()->pre_destroy() can be called while we do this charge. 2808 * So, rmdir()->pre_destroy() can be called while we do this charge.
2959 * In that case, we need to call pre_destroy() again. check it here. 2809 * In that case, we need to call pre_destroy() again. check it here.
2960 */ 2810 */
2961 cgroup_release_and_wakeup_rmdir(&ptr->css); 2811 cgroup_release_and_wakeup_rmdir(&memcg->css);
2962} 2812}
2963 2813
2964void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2814void mem_cgroup_commit_charge_swapin(struct page *page,
2815 struct mem_cgroup *memcg)
2965{ 2816{
2966 __mem_cgroup_commit_charge_swapin(page, ptr, 2817 __mem_cgroup_commit_charge_swapin(page, memcg,
2967 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2818 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2968} 2819}
2969 2820
2970void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2821void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
@@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3054 * Check if our page_cgroup is valid 2905 * Check if our page_cgroup is valid
3055 */ 2906 */
3056 pc = lookup_page_cgroup(page); 2907 pc = lookup_page_cgroup(page);
3057 if (unlikely(!pc || !PageCgroupUsed(pc))) 2908 if (unlikely(!PageCgroupUsed(pc)))
3058 return NULL; 2909 return NULL;
3059 2910
3060 lock_page_cgroup(pc); 2911 lock_page_cgroup(pc);
@@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page)
3117 /* early check. */ 2968 /* early check. */
3118 if (page_mapped(page)) 2969 if (page_mapped(page))
3119 return; 2970 return;
3120 if (page->mapping && !PageAnon(page)) 2971 VM_BUG_ON(page->mapping && !PageAnon(page));
3121 return;
3122 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2972 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3123} 2973}
3124 2974
@@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void)
3176 batch->memcg = NULL; 3026 batch->memcg = NULL;
3177} 3027}
3178 3028
3029/*
3030 * A function for resetting pc->mem_cgroup for newly allocated pages.
3031 * This function should be called if the newpage will be added to the LRU
3032 * before accounting starts.
3033 */
3034void mem_cgroup_reset_owner(struct page *newpage)
3035{
3036 struct page_cgroup *pc;
3037
3038 if (mem_cgroup_disabled())
3039 return;
3040
3041 pc = lookup_page_cgroup(newpage);
3042 VM_BUG_ON(PageCgroupUsed(pc));
3043 pc->mem_cgroup = root_mem_cgroup;
3044}
3045
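The intended use shows up in the mm/migrate.c hunk later in this patch: a freshly allocated migration target is pointed at root_mem_cgroup before it can appear on an LRU list. A condensed sketch of that call site; the wrapper name is made up, and new_page_t is the allocation callback type used by migrate_pages():

    static struct page *alloc_migration_target(new_page_t get_new_page,
                                               unsigned long private,
                                               struct page *page)
    {
            int *result = NULL;
            struct page *newpage = get_new_page(page, private, &result);

            if (newpage)
                    mem_cgroup_reset_owner(newpage);
            return newpage;
    }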
3179#ifdef CONFIG_SWAP 3046#ifdef CONFIG_SWAP
3180/* 3047/*
3181 * called after __delete_from_swap_cache() and drop "page" account. 3048 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3293 * page belongs to. 3160 * page belongs to.
3294 */ 3161 */
3295int mem_cgroup_prepare_migration(struct page *page, 3162int mem_cgroup_prepare_migration(struct page *page,
3296 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3163 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
3297{ 3164{
3298 struct mem_cgroup *memcg = NULL; 3165 struct mem_cgroup *memcg = NULL;
3299 struct page_cgroup *pc; 3166 struct page_cgroup *pc;
3300 enum charge_type ctype; 3167 enum charge_type ctype;
3301 int ret = 0; 3168 int ret = 0;
3302 3169
3303 *ptr = NULL; 3170 *memcgp = NULL;
3304 3171
3305 VM_BUG_ON(PageTransHuge(page)); 3172 VM_BUG_ON(PageTransHuge(page));
3306 if (mem_cgroup_disabled()) 3173 if (mem_cgroup_disabled())
@@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page,
3351 if (!memcg) 3218 if (!memcg)
3352 return 0; 3219 return 0;
3353 3220
3354 *ptr = memcg; 3221 *memcgp = memcg;
3355 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3222 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3356 css_put(&memcg->css);/* drop extra refcnt */ 3223 css_put(&memcg->css);/* drop extra refcnt */
3357 if (ret || *ptr == NULL) { 3224 if (ret) {
3358 if (PageAnon(page)) { 3225 if (PageAnon(page)) {
3359 lock_page_cgroup(pc); 3226 lock_page_cgroup(pc);
3360 ClearPageCgroupMigration(pc); 3227 ClearPageCgroupMigration(pc);
@@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3364 */ 3231 */
3365 mem_cgroup_uncharge_page(page); 3232 mem_cgroup_uncharge_page(page);
3366 } 3233 }
3234 /* we'll need to revisit this error code (we have -EINTR) */
3367 return -ENOMEM; 3235 return -ENOMEM;
3368 } 3236 }
3369 /* 3237 /*
@@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3432 cgroup_release_and_wakeup_rmdir(&memcg->css); 3300 cgroup_release_and_wakeup_rmdir(&memcg->css);
3433} 3301}
3434 3302
3303/*
3304 * During page cache replacement, newpage is not under any memcg but it's on
3305 * the LRU. So, this function doesn't touch res_counter but handles the LRU
3306 * in the correct way. Both pages are locked so we cannot race with uncharge.
3307 */
3308void mem_cgroup_replace_page_cache(struct page *oldpage,
3309 struct page *newpage)
3310{
3311 struct mem_cgroup *memcg;
3312 struct page_cgroup *pc;
3313 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3314
3315 if (mem_cgroup_disabled())
3316 return;
3317
3318 pc = lookup_page_cgroup(oldpage);
3319 /* fix accounting on old pages */
3320 lock_page_cgroup(pc);
3321 memcg = pc->mem_cgroup;
3322 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3323 ClearPageCgroupUsed(pc);
3324 unlock_page_cgroup(pc);
3325
3326 if (PageSwapBacked(oldpage))
3327 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3328
3329 /*
3330 * Even if newpage->mapping was NULL before starting replacement,
3331 * the newpage may be on the LRU (or a pagevec for the LRU) already. We lock
3332 * LRU while we overwrite pc->mem_cgroup.
3333 */
3334 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
3335}
3336
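Presumably this is invoked from the page cache replacement path with both pages locked, matching the comment above. A hypothetical, simplified call site:

    static void account_cache_page_replacement(struct page *old, struct page *new)
    {
            VM_BUG_ON(!PageLocked(old));
            VM_BUG_ON(!PageLocked(new));
            mem_cgroup_replace_page_cache(old, new);
    }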
3435#ifdef CONFIG_DEBUG_VM 3337#ifdef CONFIG_DEBUG_VM
3436static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3338static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3437{ 3339{
3438 struct page_cgroup *pc; 3340 struct page_cgroup *pc;
3439 3341
3440 pc = lookup_page_cgroup(page); 3342 pc = lookup_page_cgroup(page);
3343 /*
3344 * Can be NULL while feeding pages into the page allocator for
3345 * the first time, i.e. during boot or memory hotplug;
3346 * or when mem_cgroup_disabled().
3347 */
3441 if (likely(pc) && PageCgroupUsed(pc)) 3348 if (likely(pc) && PageCgroupUsed(pc))
3442 return pc; 3349 return pc;
3443 return NULL; 3350 return NULL;
@@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page)
3457 3364
3458 pc = lookup_page_cgroup_used(page); 3365 pc = lookup_page_cgroup_used(page);
3459 if (pc) { 3366 if (pc) {
3460 int ret = -1; 3367 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3461 char *path;
3462
3463 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3464 pc, pc->flags, pc->mem_cgroup); 3368 pc, pc->flags, pc->mem_cgroup);
3465
3466 path = kmalloc(PATH_MAX, GFP_KERNEL);
3467 if (path) {
3468 rcu_read_lock();
3469 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3470 path, PATH_MAX);
3471 rcu_read_unlock();
3472 }
3473
3474 printk(KERN_CONT "(%s)\n",
3475 (ret < 0) ? "cannot get the path" : path);
3476 kfree(path);
3477 } 3369 }
3478} 3370}
3479#endif 3371#endif
@@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3534 if (!ret) 3426 if (!ret)
3535 break; 3427 break;
3536 3428
3537 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3429 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3538 MEM_CGROUP_RECLAIM_SHRINK, 3430 MEM_CGROUP_RECLAIM_SHRINK);
3539 NULL);
3540 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3431 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3541 /* Usage is reduced ? */ 3432 /* Usage is reduced ? */
3542 if (curusage >= oldusage) 3433 if (curusage >= oldusage)
@@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3594 if (!ret) 3485 if (!ret)
3595 break; 3486 break;
3596 3487
3597 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3488 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3598 MEM_CGROUP_RECLAIM_NOSWAP | 3489 MEM_CGROUP_RECLAIM_NOSWAP |
3599 MEM_CGROUP_RECLAIM_SHRINK, 3490 MEM_CGROUP_RECLAIM_SHRINK);
3600 NULL);
3601 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3491 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3602 /* Usage is reduced ? */ 3492 /* Usage is reduced ? */
3603 if (curusage >= oldusage) 3493 if (curusage >= oldusage)
@@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3640 break; 3530 break;
3641 3531
3642 nr_scanned = 0; 3532 nr_scanned = 0;
3643 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3533 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
3644 gfp_mask, 3534 gfp_mask, &nr_scanned);
3645 MEM_CGROUP_RECLAIM_SOFT,
3646 &nr_scanned);
3647 nr_reclaimed += reclaimed; 3535 nr_reclaimed += reclaimed;
3648 *total_scanned += nr_scanned; 3536 *total_scanned += nr_scanned;
3649 spin_lock(&mctz->lock); 3537 spin_lock(&mctz->lock);
@@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3711static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3599static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3712 int node, int zid, enum lru_list lru) 3600 int node, int zid, enum lru_list lru)
3713{ 3601{
3714 struct zone *zone;
3715 struct mem_cgroup_per_zone *mz; 3602 struct mem_cgroup_per_zone *mz;
3716 struct page_cgroup *pc, *busy;
3717 unsigned long flags, loop; 3603 unsigned long flags, loop;
3718 struct list_head *list; 3604 struct list_head *list;
3605 struct page *busy;
3606 struct zone *zone;
3719 int ret = 0; 3607 int ret = 0;
3720 3608
3721 zone = &NODE_DATA(node)->node_zones[zid]; 3609 zone = &NODE_DATA(node)->node_zones[zid];
3722 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3610 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3723 list = &mz->lists[lru]; 3611 list = &mz->lruvec.lists[lru];
3724 3612
3725 loop = MEM_CGROUP_ZSTAT(mz, lru); 3613 loop = MEM_CGROUP_ZSTAT(mz, lru);
3726 /* give some margin against EBUSY etc...*/ 3614 /* give some margin against EBUSY etc...*/
3727 loop += 256; 3615 loop += 256;
3728 busy = NULL; 3616 busy = NULL;
3729 while (loop--) { 3617 while (loop--) {
3618 struct page_cgroup *pc;
3730 struct page *page; 3619 struct page *page;
3731 3620
3732 ret = 0; 3621 ret = 0;
@@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3735 spin_unlock_irqrestore(&zone->lru_lock, flags); 3624 spin_unlock_irqrestore(&zone->lru_lock, flags);
3736 break; 3625 break;
3737 } 3626 }
3738 pc = list_entry(list->prev, struct page_cgroup, lru); 3627 page = list_entry(list->prev, struct page, lru);
3739 if (busy == pc) { 3628 if (busy == page) {
3740 list_move(&pc->lru, list); 3629 list_move(&page->lru, list);
3741 busy = NULL; 3630 busy = NULL;
3742 spin_unlock_irqrestore(&zone->lru_lock, flags); 3631 spin_unlock_irqrestore(&zone->lru_lock, flags);
3743 continue; 3632 continue;
3744 } 3633 }
3745 spin_unlock_irqrestore(&zone->lru_lock, flags); 3634 spin_unlock_irqrestore(&zone->lru_lock, flags);
3746 3635
3747 page = lookup_cgroup_page(pc); 3636 pc = lookup_page_cgroup(page);
3748 3637
3749 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3638 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3750 if (ret == -ENOMEM) 3639 if (ret == -ENOMEM || ret == -EINTR)
3751 break; 3640 break;
3752 3641
3753 if (ret == -EBUSY || ret == -EINVAL) { 3642 if (ret == -EBUSY || ret == -EINVAL) {
3754 /* found lock contention or "pc" is obsolete. */ 3643 /* found lock contention or "pc" is obsolete. */
3755 busy = pc; 3644 busy = page;
3756 cond_resched(); 3645 cond_resched();
3757 } else 3646 } else
3758 busy = NULL; 3647 busy = NULL;
@@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4846 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4735 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4847 mz = &pn->zoneinfo[zone]; 4736 mz = &pn->zoneinfo[zone];
4848 for_each_lru(l) 4737 for_each_lru(l)
4849 INIT_LIST_HEAD(&mz->lists[l]); 4738 INIT_LIST_HEAD(&mz->lruvec.lists[l]);
4850 mz->usage_in_excess = 0; 4739 mz->usage_in_excess = 0;
4851 mz->on_tree = false; 4740 mz->on_tree = false;
4852 mz->mem = memcg; 4741 mz->mem = memcg;
@@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4906 mem_cgroup_remove_from_trees(memcg); 4795 mem_cgroup_remove_from_trees(memcg);
4907 free_css_id(&mem_cgroup_subsys, &memcg->css); 4796 free_css_id(&mem_cgroup_subsys, &memcg->css);
4908 4797
4909 for_each_node_state(node, N_POSSIBLE) 4798 for_each_node(node)
4910 free_mem_cgroup_per_zone_info(memcg, node); 4799 free_mem_cgroup_per_zone_info(memcg, node);
4911 4800
4912 free_percpu(memcg->stat); 4801 free_percpu(memcg->stat);
@@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void)
4965 struct mem_cgroup_tree_per_zone *rtpz; 4854 struct mem_cgroup_tree_per_zone *rtpz;
4966 int tmp, node, zone; 4855 int tmp, node, zone;
4967 4856
4968 for_each_node_state(node, N_POSSIBLE) { 4857 for_each_node(node) {
4969 tmp = node; 4858 tmp = node;
4970 if (!node_state(node, N_NORMAL_MEMORY)) 4859 if (!node_state(node, N_NORMAL_MEMORY))
4971 tmp = -1; 4860 tmp = -1;
4972 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4861 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4973 if (!rtpn) 4862 if (!rtpn)
4974 return 1; 4863 goto err_cleanup;
4975 4864
4976 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4865 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4977 4866
@@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
4982 } 4871 }
4983 } 4872 }
4984 return 0; 4873 return 0;
4874
4875err_cleanup:
4876 for_each_node(node) {
4877 if (!soft_limit_tree.rb_tree_per_node[node])
4878 break;
4879 kfree(soft_limit_tree.rb_tree_per_node[node]);
4880 soft_limit_tree.rb_tree_per_node[node] = NULL;
4881 }
4882 return 1;
4883
4985} 4884}
4986 4885
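The new err_cleanup path relies on the allocations being made in for_each_node() order, so the unwind loop can stop at the first NULL slot. The same pattern stripped to its essentials; the table and helper names are invented, and the real code also uses kzalloc_node() for NUMA placement:

    static struct mem_cgroup_tree_per_node *tbl[MAX_NUMNODES];

    static int alloc_per_node_tables(void)
    {
            int node;

            for_each_node(node) {
                    tbl[node] = kzalloc(sizeof(*tbl[node]), GFP_KERNEL);
                    if (!tbl[node])
                            goto err_cleanup;
            }
            return 0;

    err_cleanup:
            for_each_node(node) {
                    if (!tbl[node])
                            break;          /* nothing past this was allocated */
                    kfree(tbl[node]);
                    tbl[node] = NULL;
            }
            return 1;
    }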
4987static struct cgroup_subsys_state * __ref 4886static struct cgroup_subsys_state * __ref
@@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4995 if (!memcg) 4894 if (!memcg)
4996 return ERR_PTR(error); 4895 return ERR_PTR(error);
4997 4896
4998 for_each_node_state(node, N_POSSIBLE) 4897 for_each_node(node)
4999 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4898 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5000 goto free_out; 4899 goto free_out;
5001 4900
@@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5033 res_counter_init(&memcg->res, NULL); 4932 res_counter_init(&memcg->res, NULL);
5034 res_counter_init(&memcg->memsw, NULL); 4933 res_counter_init(&memcg->memsw, NULL);
5035 } 4934 }
5036 memcg->last_scanned_child = 0;
5037 memcg->last_scanned_node = MAX_NUMNODES; 4935 memcg->last_scanned_node = MAX_NUMNODES;
5038 INIT_LIST_HEAD(&memcg->oom_notify); 4936 INIT_LIST_HEAD(&memcg->oom_notify);
5039 4937
@@ -5129,9 +5027,9 @@ one_by_one:
5129 } 5027 }
5130 ret = __mem_cgroup_try_charge(NULL, 5028 ret = __mem_cgroup_try_charge(NULL,
5131 GFP_KERNEL, 1, &memcg, false); 5029 GFP_KERNEL, 1, &memcg, false);
5132 if (ret || !memcg) 5030 if (ret)
5133 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5134 return -ENOMEM; 5032 return ret;
5135 mc.precharge++; 5033 mc.precharge++;
5136 } 5034 }
5137 return ret; 5035 return ret;
@@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5276 } 5174 }
5277 /* There is a swap entry and a page doesn't exist or isn't charged */ 5175 /* There is a swap entry and a page doesn't exist or isn't charged */
5278 if (ent.val && !ret && 5176 if (ent.val && !ret &&
5279 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5177 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
5280 ret = MC_TARGET_SWAP; 5178 ret = MC_TARGET_SWAP;
5281 if (target) 5179 if (target)
5282 target->ent = ent; 5180 target->ent = ent;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 06d3479513aa..56080ea36140 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags)
1557 page_is_file_cache(page)); 1557 page_is_file_cache(page));
1558 list_add(&page->lru, &pagelist); 1558 list_add(&page->lru, &pagelist);
1559 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1559 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1560 0, true); 1560 0, MIGRATE_SYNC);
1561 if (ret) { 1561 if (ret) {
1562 putback_lru_pages(&pagelist); 1562 putback_lru_pages(&pagelist);
1563 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1563 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 829d43735402..5e30583c2605 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{ 293{
294 struct mmu_gather_batch *batch; 294 struct mmu_gather_batch *batch;
295 295
296 tlb->need_flush = 1; 296 VM_BUG_ON(!tlb->need_flush);
297 297
298 if (tlb_fast_mode(tlb)) { 298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page); 299 free_page_and_swap_cache(page);
@@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1231 if (next-addr != HPAGE_PMD_SIZE) { 1231 if (next-addr != HPAGE_PMD_SIZE) {
1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd); 1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd)) 1234 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1235 continue; 1235 continue;
1236 /* fall through */ 1236 /* fall through */
1237 } 1237 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2168489c0bc9..6629fafd6ce4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
809 } 809 }
810 /* this function returns # of failed pages */ 810 /* this function returns # of failed pages */
811 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 811 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
812 true, true); 812 true, MIGRATE_SYNC);
813 if (ret) 813 if (ret)
814 putback_lru_pages(&source); 814 putback_lru_pages(&source);
815 } 815 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e3d58f088466..06b145fb64ab 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
942 942
943 if (!list_empty(&pagelist)) { 943 if (!list_empty(&pagelist)) {
944 err = migrate_pages(&pagelist, new_node_page, dest, 944 err = migrate_pages(&pagelist, new_node_page, dest,
945 false, true); 945 false, MIGRATE_SYNC);
946 if (err) 946 if (err)
947 putback_lru_pages(&pagelist); 947 putback_lru_pages(&pagelist);
948 } 948 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 89ea0854332e..9871a56d82c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -216,6 +216,56 @@ out:
216 pte_unmap_unlock(ptep, ptl); 216 pte_unmap_unlock(ptep, ptl);
217} 217}
218 218
219#ifdef CONFIG_BLOCK
220/* Returns true if all buffers are successfully locked */
221static bool buffer_migrate_lock_buffers(struct buffer_head *head,
222 enum migrate_mode mode)
223{
224 struct buffer_head *bh = head;
225
226 /* Simple case, sync compaction */
227 if (mode != MIGRATE_ASYNC) {
228 do {
229 get_bh(bh);
230 lock_buffer(bh);
231 bh = bh->b_this_page;
232
233 } while (bh != head);
234
235 return true;
236 }
237
238 /* async case, we cannot block on lock_buffer so use trylock_buffer */
239 do {
240 get_bh(bh);
241 if (!trylock_buffer(bh)) {
242 /*
243 * We failed to lock the buffer and cannot stall in
244 * async migration. Release the taken locks
245 */
246 struct buffer_head *failed_bh = bh;
247 put_bh(failed_bh);
248 bh = head;
249 while (bh != failed_bh) {
250 unlock_buffer(bh);
251 put_bh(bh);
252 bh = bh->b_this_page;
253 }
254 return false;
255 }
256
257 bh = bh->b_this_page;
258 } while (bh != head);
259 return true;
260}
261#else
262static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
263 enum migrate_mode mode)
264{
265 return true;
266}
267#endif /* CONFIG_BLOCK */
268
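The async caller of this helper has to be able to back out instead of sleeping; the migrate_page_move_mapping() hunk below returns -EAGAIN when the trylock pass fails. A trivial sketch of that contract, with an invented helper name:

    static int lock_buffers_or_bail(struct buffer_head *head,
                                    enum migrate_mode mode)
    {
            if (!buffer_migrate_lock_buffers(head, mode))
                    return -EAGAIN; /* only possible for MIGRATE_ASYNC */
            return 0;
    }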
219/* 269/*
220 * Replace the page in the mapping. 270 * Replace the page in the mapping.
221 * 271 *
@@ -225,7 +275,8 @@ out:
225 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 275 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
226 */ 276 */
227static int migrate_page_move_mapping(struct address_space *mapping, 277static int migrate_page_move_mapping(struct address_space *mapping,
228 struct page *newpage, struct page *page) 278 struct page *newpage, struct page *page,
279 struct buffer_head *head, enum migrate_mode mode)
229{ 280{
230 int expected_count; 281 int expected_count;
231 void **pslot; 282 void **pslot;
@@ -255,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
255 } 306 }
256 307
257 /* 308 /*
309 * In the async migration case of moving a page with buffers, lock the
310 * buffers using trylock before the mapping is moved. Otherwise, if the
311 * mapping were moved and we later failed to lock the buffers, we could
312 * not move the mapping back due to an elevated page count and would
313 * have to block waiting on other references to be dropped.
314 */
315 if (mode == MIGRATE_ASYNC && head &&
316 !buffer_migrate_lock_buffers(head, mode)) {
317 page_unfreeze_refs(page, expected_count);
318 spin_unlock_irq(&mapping->tree_lock);
319 return -EAGAIN;
320 }
321
322 /*
258 * Now we know that no one else is looking at the page. 323 * Now we know that no one else is looking at the page.
259 */ 324 */
260 get_page(newpage); /* add cache reference */ 325 get_page(newpage); /* add cache reference */
@@ -409,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page);
409 * Pages are locked upon entry and exit. 474 * Pages are locked upon entry and exit.
410 */ 475 */
411int migrate_page(struct address_space *mapping, 476int migrate_page(struct address_space *mapping,
412 struct page *newpage, struct page *page) 477 struct page *newpage, struct page *page,
478 enum migrate_mode mode)
413{ 479{
414 int rc; 480 int rc;
415 481
416 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 482 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
417 483
418 rc = migrate_page_move_mapping(mapping, newpage, page); 484 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
419 485
420 if (rc) 486 if (rc)
421 return rc; 487 return rc;
@@ -432,28 +498,28 @@ EXPORT_SYMBOL(migrate_page);
432 * exist. 498 * exist.
433 */ 499 */
434int buffer_migrate_page(struct address_space *mapping, 500int buffer_migrate_page(struct address_space *mapping,
435 struct page *newpage, struct page *page) 501 struct page *newpage, struct page *page, enum migrate_mode mode)
436{ 502{
437 struct buffer_head *bh, *head; 503 struct buffer_head *bh, *head;
438 int rc; 504 int rc;
439 505
440 if (!page_has_buffers(page)) 506 if (!page_has_buffers(page))
441 return migrate_page(mapping, newpage, page); 507 return migrate_page(mapping, newpage, page, mode);
442 508
443 head = page_buffers(page); 509 head = page_buffers(page);
444 510
445 rc = migrate_page_move_mapping(mapping, newpage, page); 511 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
446 512
447 if (rc) 513 if (rc)
448 return rc; 514 return rc;
449 515
450 bh = head; 516 /*
451 do { 517 * In the async case, migrate_page_move_mapping locked the buffers
452 get_bh(bh); 518 * with an IRQ-safe spinlock held. In the sync case, the buffers
453 lock_buffer(bh); 519 * need to be locked now
454 bh = bh->b_this_page; 520 */
455 521 if (mode != MIGRATE_ASYNC)
456 } while (bh != head); 522 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
457 523
458 ClearPagePrivate(page); 524 ClearPagePrivate(page);
459 set_page_private(newpage, page_private(page)); 525 set_page_private(newpage, page_private(page));
@@ -530,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page)
530 * Default handling if a filesystem does not provide a migration function. 596 * Default handling if a filesystem does not provide a migration function.
531 */ 597 */
532static int fallback_migrate_page(struct address_space *mapping, 598static int fallback_migrate_page(struct address_space *mapping,
533 struct page *newpage, struct page *page) 599 struct page *newpage, struct page *page, enum migrate_mode mode)
534{ 600{
535 if (PageDirty(page)) 601 if (PageDirty(page)) {
602 /* Only writeback pages in full synchronous migration */
603 if (mode != MIGRATE_SYNC)
604 return -EBUSY;
536 return writeout(mapping, page); 605 return writeout(mapping, page);
606 }
537 607
538 /* 608 /*
539 * Buffers may be managed in a filesystem specific way. 609 * Buffers may be managed in a filesystem specific way.
@@ -543,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping,
543 !try_to_release_page(page, GFP_KERNEL)) 613 !try_to_release_page(page, GFP_KERNEL))
544 return -EAGAIN; 614 return -EAGAIN;
545 615
546 return migrate_page(mapping, newpage, page); 616 return migrate_page(mapping, newpage, page, mode);
547} 617}
548 618
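Pulling the mode checks in this file together: MIGRATE_ASYNC never blocks on page or buffer locks, MIGRATE_SYNC_LIGHT may block on locks but still avoids dirty writeout and PageWriteback waits, and only MIGRATE_SYNC does everything. An illustrative pair of predicates summarizing that policy; the names are made up and not part of the patch:

    static bool migrate_mode_may_block(enum migrate_mode mode)
    {
            return mode != MIGRATE_ASYNC;   /* may sleep on page/buffer locks */
    }

    static bool migrate_mode_may_writeback(enum migrate_mode mode)
    {
            return mode == MIGRATE_SYNC;    /* dirty writeout, PageWriteback waits */
    }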
549/* 619/*
@@ -558,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping,
558 * == 0 - success 628 * == 0 - success
559 */ 629 */
560static int move_to_new_page(struct page *newpage, struct page *page, 630static int move_to_new_page(struct page *newpage, struct page *page,
561 int remap_swapcache, bool sync) 631 int remap_swapcache, enum migrate_mode mode)
562{ 632{
563 struct address_space *mapping; 633 struct address_space *mapping;
564 int rc; 634 int rc;
@@ -579,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
579 649
580 mapping = page_mapping(page); 650 mapping = page_mapping(page);
581 if (!mapping) 651 if (!mapping)
582 rc = migrate_page(mapping, newpage, page); 652 rc = migrate_page(mapping, newpage, page, mode);
583 else { 653 else if (mapping->a_ops->migratepage)
584 /* 654 /*
585 * Do not writeback pages if !sync and migratepage is 655 * Most pages have a mapping and most filesystems provide a
586 * not pointing to migrate_page() which is nonblocking 656 * migratepage callback. Anonymous pages are part of swap
587 * (swapcache/tmpfs uses migratepage = migrate_page). 657 * space which also has its own migratepage callback. This
658 * is the most common path for page migration.
588 */ 659 */
589 if (PageDirty(page) && !sync && 660 rc = mapping->a_ops->migratepage(mapping,
590 mapping->a_ops->migratepage != migrate_page) 661 newpage, page, mode);
591 rc = -EBUSY; 662 else
592 else if (mapping->a_ops->migratepage) 663 rc = fallback_migrate_page(mapping, newpage, page, mode);
593 /*
594 * Most pages have a mapping and most filesystems
595 * should provide a migration function. Anonymous
596 * pages are part of swap space which also has its
597 * own migration function. This is the most common
598 * path for page migration.
599 */
600 rc = mapping->a_ops->migratepage(mapping,
601 newpage, page);
602 else
603 rc = fallback_migrate_page(mapping, newpage, page);
604 }
605 664
606 if (rc) { 665 if (rc) {
607 newpage->mapping = NULL; 666 newpage->mapping = NULL;
@@ -616,7 +675,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
616} 675}
617 676
618static int __unmap_and_move(struct page *page, struct page *newpage, 677static int __unmap_and_move(struct page *page, struct page *newpage,
619 int force, bool offlining, bool sync) 678 int force, bool offlining, enum migrate_mode mode)
620{ 679{
621 int rc = -EAGAIN; 680 int rc = -EAGAIN;
622 int remap_swapcache = 1; 681 int remap_swapcache = 1;
@@ -625,7 +684,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
625 struct anon_vma *anon_vma = NULL; 684 struct anon_vma *anon_vma = NULL;
626 685
627 if (!trylock_page(page)) { 686 if (!trylock_page(page)) {
628 if (!force || !sync) 687 if (!force || mode == MIGRATE_ASYNC)
629 goto out; 688 goto out;
630 689
631 /* 690 /*
@@ -671,10 +730,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
671 730
672 if (PageWriteback(page)) { 731 if (PageWriteback(page)) {
673 /* 732 /*
674 * For !sync, there is no point retrying as the retry loop 733 * Only in the case of a full synchronous migration is it
675 * is expected to be too short for PageWriteback to be cleared 734 * necessary to wait for PageWriteback. In the async case,
735 * the retry loop is too short and in the sync-light case,
736 * the overhead of stalling is too much
676 */ 737 */
677 if (!sync) { 738 if (mode != MIGRATE_SYNC) {
678 rc = -EBUSY; 739 rc = -EBUSY;
679 goto uncharge; 740 goto uncharge;
680 } 741 }
@@ -745,7 +806,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
745 806
746skip_unmap: 807skip_unmap:
747 if (!page_mapped(page)) 808 if (!page_mapped(page))
748 rc = move_to_new_page(newpage, page, remap_swapcache, sync); 809 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
749 810
750 if (rc && remap_swapcache) 811 if (rc && remap_swapcache)
751 remove_migration_ptes(page, page); 812 remove_migration_ptes(page, page);
@@ -768,7 +829,8 @@ out:
768 * to the newly allocated page in newpage. 829 * to the newly allocated page in newpage.
769 */ 830 */
770static int unmap_and_move(new_page_t get_new_page, unsigned long private, 831static int unmap_and_move(new_page_t get_new_page, unsigned long private,
771 struct page *page, int force, bool offlining, bool sync) 832 struct page *page, int force, bool offlining,
833 enum migrate_mode mode)
772{ 834{
773 int rc = 0; 835 int rc = 0;
774 int *result = NULL; 836 int *result = NULL;
@@ -777,6 +839,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
777 if (!newpage) 839 if (!newpage)
778 return -ENOMEM; 840 return -ENOMEM;
779 841
842 mem_cgroup_reset_owner(newpage);
843
780 if (page_count(page) == 1) { 844 if (page_count(page) == 1) {
781 /* page was freed from under us. So we are done. */ 845 /* page was freed from under us. So we are done. */
782 goto out; 846 goto out;
@@ -786,7 +850,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
786 if (unlikely(split_huge_page(page))) 850 if (unlikely(split_huge_page(page)))
787 goto out; 851 goto out;
788 852
789 rc = __unmap_and_move(page, newpage, force, offlining, sync); 853 rc = __unmap_and_move(page, newpage, force, offlining, mode);
790out: 854out:
791 if (rc != -EAGAIN) { 855 if (rc != -EAGAIN) {
792 /* 856 /*
@@ -834,7 +898,8 @@ out:
834 */ 898 */
835static int unmap_and_move_huge_page(new_page_t get_new_page, 899static int unmap_and_move_huge_page(new_page_t get_new_page,
836 unsigned long private, struct page *hpage, 900 unsigned long private, struct page *hpage,
837 int force, bool offlining, bool sync) 901 int force, bool offlining,
902 enum migrate_mode mode)
838{ 903{
839 int rc = 0; 904 int rc = 0;
840 int *result = NULL; 905 int *result = NULL;
@@ -847,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 rc = -EAGAIN; 912 rc = -EAGAIN;
848 913
849 if (!trylock_page(hpage)) { 914 if (!trylock_page(hpage)) {
850 if (!force || !sync) 915 if (!force || mode != MIGRATE_SYNC)
851 goto out; 916 goto out;
852 lock_page(hpage); 917 lock_page(hpage);
853 } 918 }
@@ -858,7 +923,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
858 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 923 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
859 924
860 if (!page_mapped(hpage)) 925 if (!page_mapped(hpage))
861 rc = move_to_new_page(new_hpage, hpage, 1, sync); 926 rc = move_to_new_page(new_hpage, hpage, 1, mode);
862 927
863 if (rc) 928 if (rc)
864 remove_migration_ptes(hpage, hpage); 929 remove_migration_ptes(hpage, hpage);
@@ -901,7 +966,7 @@ out:
901 */ 966 */
902int migrate_pages(struct list_head *from, 967int migrate_pages(struct list_head *from,
903 new_page_t get_new_page, unsigned long private, bool offlining, 968 new_page_t get_new_page, unsigned long private, bool offlining,
904 bool sync) 969 enum migrate_mode mode)
905{ 970{
906 int retry = 1; 971 int retry = 1;
907 int nr_failed = 0; 972 int nr_failed = 0;
@@ -922,7 +987,7 @@ int migrate_pages(struct list_head *from,
922 987
923 rc = unmap_and_move(get_new_page, private, 988 rc = unmap_and_move(get_new_page, private,
924 page, pass > 2, offlining, 989 page, pass > 2, offlining,
925 sync); 990 mode);
926 991
927 switch(rc) { 992 switch(rc) {
928 case -ENOMEM: 993 case -ENOMEM:
@@ -952,7 +1017,7 @@ out:
952 1017
953int migrate_huge_pages(struct list_head *from, 1018int migrate_huge_pages(struct list_head *from,
954 new_page_t get_new_page, unsigned long private, bool offlining, 1019 new_page_t get_new_page, unsigned long private, bool offlining,
955 bool sync) 1020 enum migrate_mode mode)
956{ 1021{
957 int retry = 1; 1022 int retry = 1;
958 int nr_failed = 0; 1023 int nr_failed = 0;
@@ -969,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from,
969 1034
970 rc = unmap_and_move_huge_page(get_new_page, 1035 rc = unmap_and_move_huge_page(get_new_page,
971 private, page, pass > 2, offlining, 1036 private, page, pass > 2, offlining,
972 sync); 1037 mode);
973 1038
974 switch(rc) { 1039 switch(rc) {
975 case -ENOMEM: 1040 case -ENOMEM:
@@ -1098,7 +1163,7 @@ set_status:
1098 err = 0; 1163 err = 0;
1099 if (!list_empty(&pagelist)) { 1164 if (!list_empty(&pagelist)) {
1100 err = migrate_pages(&pagelist, new_page_node, 1165 err = migrate_pages(&pagelist, new_page_node,
1101 (unsigned long)pm, 0, true); 1166 (unsigned long)pm, 0, MIGRATE_SYNC);
1102 if (err) 1167 if (err)
1103 putback_lru_pages(&pagelist); 1168 putback_lru_pages(&pagelist);
1104 } 1169 }
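All migrate_pages()/migrate_huge_pages() callers now pass an explicit migrate_mode instead of a bool sync flag, as in the conversions above. A minimal caller sketch under the new interface; the wrapper name is invented:

    static int migrate_list_sync(struct list_head *pages, new_page_t get_new_page,
                                 unsigned long private)
    {
            int err = migrate_pages(pages, get_new_page, private, false, MIGRATE_SYNC);

            if (err)
                    putback_lru_pages(pages);
            return err;
    }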
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7c122faa05c5..2958fd8e7c9a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -152,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
152 152
153/* return true if the task is not adequate as candidate victim task. */ 153/* return true if the task is not adequate as candidate victim task. */
154static bool oom_unkillable_task(struct task_struct *p, 154static bool oom_unkillable_task(struct task_struct *p,
155 const struct mem_cgroup *mem, const nodemask_t *nodemask) 155 const struct mem_cgroup *memcg, const nodemask_t *nodemask)
156{ 156{
157 if (is_global_init(p)) 157 if (is_global_init(p))
158 return true; 158 return true;
@@ -160,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p,
160 return true; 160 return true;
161 161
162 /* When mem_cgroup_out_of_memory() and p is not member of the group */ 162 /* When mem_cgroup_out_of_memory() and p is not member of the group */
163 if (mem && !task_in_mem_cgroup(p, mem)) 163 if (memcg && !task_in_mem_cgroup(p, memcg))
164 return true; 164 return true;
165 165
166 /* p may not have freeable memory in nodemask */ 166 /* p may not have freeable memory in nodemask */
@@ -179,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p,
179 * predictable as possible. The goal is to return the highest value for the 179 * predictable as possible. The goal is to return the highest value for the
180 * task consuming the most memory to avoid subsequent oom failures. 180 * task consuming the most memory to avoid subsequent oom failures.
181 */ 181 */
182unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 182unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
183 const nodemask_t *nodemask, unsigned long totalpages) 183 const nodemask_t *nodemask, unsigned long totalpages)
184{ 184{
185 long points; 185 long points;
186 186
187 if (oom_unkillable_task(p, mem, nodemask)) 187 if (oom_unkillable_task(p, memcg, nodemask))
188 return 0; 188 return 0;
189 189
190 p = find_lock_task_mm(p); 190 p = find_lock_task_mm(p);
@@ -308,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
308 * (not docbooked, we don't want this one cluttering up the manual) 308 * (not docbooked, we don't want this one cluttering up the manual)
309 */ 309 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 310static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *mem, 311 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 312 const nodemask_t *nodemask)
313{ 313{
314 struct task_struct *g, *p; 314 struct task_struct *g, *p;
@@ -320,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
320 320
321 if (p->exit_state) 321 if (p->exit_state)
322 continue; 322 continue;
323 if (oom_unkillable_task(p, mem, nodemask)) 323 if (oom_unkillable_task(p, memcg, nodemask))
324 continue; 324 continue;
325 325
326 /* 326 /*
@@ -364,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
364 } 364 }
365 } 365 }
366 366
367 points = oom_badness(p, mem, nodemask, totalpages); 367 points = oom_badness(p, memcg, nodemask, totalpages);
368 if (points > *ppoints) { 368 if (points > *ppoints) {
369 chosen = p; 369 chosen = p;
370 *ppoints = points; 370 *ppoints = points;
@@ -387,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
387 * 387 *
388 * Call with tasklist_lock read-locked. 388 * Call with tasklist_lock read-locked.
389 */ 389 */
390static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) 390static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
391{ 391{
392 struct task_struct *p; 392 struct task_struct *p;
393 struct task_struct *task; 393 struct task_struct *task;
394 394
395 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 395 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
396 for_each_process(p) { 396 for_each_process(p) {
397 if (oom_unkillable_task(p, mem, nodemask)) 397 if (oom_unkillable_task(p, memcg, nodemask))
398 continue; 398 continue;
399 399
400 task = find_lock_task_mm(p); 400 task = find_lock_task_mm(p);
@@ -417,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
417} 417}
418 418
419static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 419static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
420 struct mem_cgroup *mem, const nodemask_t *nodemask) 420 struct mem_cgroup *memcg, const nodemask_t *nodemask)
421{ 421{
422 task_lock(current); 422 task_lock(current);
423 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 423 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -427,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
427 cpuset_print_task_mems_allowed(current); 427 cpuset_print_task_mems_allowed(current);
428 task_unlock(current); 428 task_unlock(current);
429 dump_stack(); 429 dump_stack();
430 mem_cgroup_print_oom_info(mem, p); 430 mem_cgroup_print_oom_info(memcg, p);
431 show_mem(SHOW_MEM_FILTER_NODES); 431 show_mem(SHOW_MEM_FILTER_NODES);
432 if (sysctl_oom_dump_tasks) 432 if (sysctl_oom_dump_tasks)
433 dump_tasks(mem, nodemask); 433 dump_tasks(memcg, nodemask);
434} 434}
435 435
436#define K(x) ((x) << (PAGE_SHIFT-10)) 436#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 437static int oom_kill_task(struct task_struct *p)
438{ 438{
439 struct task_struct *q; 439 struct task_struct *q;
440 struct mm_struct *mm; 440 struct mm_struct *mm;
@@ -484,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
484 484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages, 486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *mem, nodemask_t *nodemask, 487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message) 488 const char *message)
489{ 489{
490 struct task_struct *victim = p; 490 struct task_struct *victim = p;
@@ -493,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
493 unsigned int victim_points = 0; 493 unsigned int victim_points = 0;
494 494
495 if (printk_ratelimit()) 495 if (printk_ratelimit())
496 dump_header(p, gfp_mask, order, mem, nodemask); 496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 497
498 /* 498 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 499 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -524,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
524 /* 524 /*
525 * oom_badness() returns 0 if the thread is unkillable 525 * oom_badness() returns 0 if the thread is unkillable
526 */ 526 */
527 child_points = oom_badness(child, mem, nodemask, 527 child_points = oom_badness(child, memcg, nodemask,
528 totalpages); 528 totalpages);
529 if (child_points > victim_points) { 529 if (child_points > victim_points) {
530 victim = child; 530 victim = child;
@@ -533,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 533 }
534 } while_each_thread(p, t); 534 } while_each_thread(p, t);
535 535
536 return oom_kill_task(victim, mem); 536 return oom_kill_task(victim);
537} 537}
538 538
539/* 539/*
@@ -561,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 561}
562 562
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 563#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
565{ 565{
566 unsigned long limit; 566 unsigned long limit;
567 unsigned int points = 0; 567 unsigned int points = 0;
@@ -578,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
578 } 578 }
579 579
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
581 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 582 read_lock(&tasklist_lock);
583retry: 583retry:
584 p = select_bad_process(&points, limit, mem, NULL); 584 p = select_bad_process(&points, limit, memcg, NULL);
585 if (!p || PTR_ERR(p) == -1UL) 585 if (!p || PTR_ERR(p) == -1UL)
586 goto out; 586 goto out;
587 587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, 588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory")) 589 "Memory cgroup out of memory"))
590 goto retry; 590 goto retry;
591out: 591out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 794e6715c226..0027d8f4a1bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1981,14 +1981,20 @@ static struct page *
1981__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1981__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1982 struct zonelist *zonelist, enum zone_type high_zoneidx, 1982 struct zonelist *zonelist, enum zone_type high_zoneidx,
1983 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1983 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1984 int migratetype, unsigned long *did_some_progress, 1984 int migratetype, bool sync_migration,
1985 bool sync_migration) 1985 bool *deferred_compaction,
1986 unsigned long *did_some_progress)
1986{ 1987{
1987 struct page *page; 1988 struct page *page;
1988 1989
1989 if (!order || compaction_deferred(preferred_zone)) 1990 if (!order)
1990 return NULL; 1991 return NULL;
1991 1992
1993 if (compaction_deferred(preferred_zone)) {
1994 *deferred_compaction = true;
1995 return NULL;
1996 }
1997
1992 current->flags |= PF_MEMALLOC; 1998 current->flags |= PF_MEMALLOC;
1993 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1999 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1994 nodemask, sync_migration); 2000 nodemask, sync_migration);
@@ -2016,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2016 * but not enough to satisfy watermarks. 2022 * but not enough to satisfy watermarks.
2017 */ 2023 */
2018 count_vm_event(COMPACTFAIL); 2024 count_vm_event(COMPACTFAIL);
2019 defer_compaction(preferred_zone); 2025
2026 /*
2027 * As async compaction considers a subset of pageblocks, only
2028 * defer if the failure was a sync compaction failure.
2029 */
2030 if (sync_migration)
2031 defer_compaction(preferred_zone);
2020 2032
2021 cond_resched(); 2033 cond_resched();
2022 } 2034 }
@@ -2028,8 +2040,9 @@ static inline struct page *
2028__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2040__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2029 struct zonelist *zonelist, enum zone_type high_zoneidx, 2041 struct zonelist *zonelist, enum zone_type high_zoneidx,
2030 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2042 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2031 int migratetype, unsigned long *did_some_progress, 2043 int migratetype, bool sync_migration,
2032 bool sync_migration) 2044 bool *deferred_compaction,
2045 unsigned long *did_some_progress)
2033{ 2046{
2034 return NULL; 2047 return NULL;
2035} 2048}
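Taken together with the slowpath hunks below, the new deferred_compaction output lets the allocator fail a __GFP_NO_KSWAPD request (typically a transparent hugepage fault) outright when sync compaction was recently deferred, instead of falling into direct reclaim. The decision condensed into one predicate; the name is invented and the snippet is illustrative only:

    static bool fail_rather_than_reclaim(bool deferred_compaction, gfp_t gfp_mask)
    {
            /* sync compaction recently failed and the caller asked for low disruption */
            return deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD);
    }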
@@ -2179,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2179 unsigned long pages_reclaimed = 0; 2192 unsigned long pages_reclaimed = 0;
2180 unsigned long did_some_progress; 2193 unsigned long did_some_progress;
2181 bool sync_migration = false; 2194 bool sync_migration = false;
2195 bool deferred_compaction = false;
2182 2196
2183 /* 2197 /*
2184 * In the slowpath, we sanity check order to avoid ever trying to 2198 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2259,12 +2273,22 @@ rebalance:
2259 zonelist, high_zoneidx, 2273 zonelist, high_zoneidx,
2260 nodemask, 2274 nodemask,
2261 alloc_flags, preferred_zone, 2275 alloc_flags, preferred_zone,
2262 migratetype, &did_some_progress, 2276 migratetype, sync_migration,
2263 sync_migration); 2277 &deferred_compaction,
2278 &did_some_progress);
2264 if (page) 2279 if (page)
2265 goto got_pg; 2280 goto got_pg;
2266 sync_migration = true; 2281 sync_migration = true;
2267 2282
2283 /*
2284 * If compaction is deferred for high-order allocations, it is because
2285 * sync compaction recently failed. If this is the case and the caller
2286 * has requested the system not be heavily disrupted, fail the
2287 * allocation now instead of entering direct reclaim.
2288 */
2289 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2290 goto nopage;
2291
2268 /* Try direct reclaim and then allocating */ 2292 /* Try direct reclaim and then allocating */
2269 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2293 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2270 zonelist, high_zoneidx, 2294 zonelist, high_zoneidx,
@@ -2328,8 +2352,9 @@ rebalance:
2328 zonelist, high_zoneidx, 2352 zonelist, high_zoneidx,
2329 nodemask, 2353 nodemask,
2330 alloc_flags, preferred_zone, 2354 alloc_flags, preferred_zone,
2331 migratetype, &did_some_progress, 2355 migratetype, sync_migration,
2332 sync_migration); 2356 &deferred_compaction,
2357 &did_some_progress);
2333 if (page) 2358 if (page)
2334 goto got_pg; 2359 goto got_pg;
2335 } 2360 }
@@ -4237,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4237 for (j = 0; j < MAX_NR_ZONES; j++) { 4262 for (j = 0; j < MAX_NR_ZONES; j++) {
4238 struct zone *zone = pgdat->node_zones + j; 4263 struct zone *zone = pgdat->node_zones + j;
4239 unsigned long size, realsize, memmap_pages; 4264 unsigned long size, realsize, memmap_pages;
4240 enum lru_list l; 4265 enum lru_list lru;
4241 4266
4242 size = zone_spanned_pages_in_node(nid, j, zones_size); 4267 size = zone_spanned_pages_in_node(nid, j, zones_size);
4243 realsize = size - zone_absent_pages_in_node(nid, j, 4268 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4287,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4287 zone->zone_pgdat = pgdat; 4312 zone->zone_pgdat = pgdat;
4288 4313
4289 zone_pcp_init(zone); 4314 zone_pcp_init(zone);
4290 for_each_lru(l) 4315 for_each_lru(lru)
4291 INIT_LIST_HEAD(&zone->lru[l].list); 4316 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4292 zone->reclaim_stat.recent_rotated[0] = 0; 4317 zone->reclaim_stat.recent_rotated[0] = 0;
4293 zone->reclaim_stat.recent_rotated[1] = 0; 4318 zone->reclaim_stat.recent_rotated[1] = 0;
4294 zone->reclaim_stat.recent_scanned[0] = 0; 4319 zone->reclaim_stat.recent_scanned[0] = 0;
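
To picture the data-structure change in this hunk, the sketch below groups the per-zone LRU list heads into a single embedded lruvec and initialises them in one loop; the struct layouts and enum values are simplified assumptions, not the kernel definitions.

#include <stdio.h>

/* Simplified doubly-linked list head, in the spirit of struct list_head. */
struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

enum lru_list {
    LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
    LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
    LRU_UNEVICTABLE, NR_LRU_LISTS
};

/* The per-zone LRU lists are now grouped into one lruvec. */
struct lruvec { struct list_head lists[NR_LRU_LISTS]; };
struct zone   { struct lruvec lruvec; };

int main(void)
{
    struct zone zone;
    for (int lru = 0; lru < NR_LRU_LISTS; lru++)
        INIT_LIST_HEAD(&zone.lruvec.lists[lru]);
    printf("initialised %d LRU lists\n", NR_LRU_LISTS);
    return 0;
}
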
@@ -4642,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
4642 4667
4643 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4668 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4644 struct zone *zone = &pgdat->node_zones[zone_type]; 4669 struct zone *zone = &pgdat->node_zones[zone_type];
4645 if (zone->present_pages) 4670 if (zone->present_pages) {
4646 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4671 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4672 break;
4673 }
4647 } 4674 }
4648#endif 4675#endif
4649} 4676}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 2d123f94a8df..de1616aa9b1e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,13 +11,6 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15{
16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL;
19 INIT_LIST_HEAD(&pc->lru);
20}
21static unsigned long total_usage; 14static unsigned long total_usage;
22 15
23#if !defined(CONFIG_SPARSEMEM) 16#if !defined(CONFIG_SPARSEMEM)
@@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
35 struct page_cgroup *base; 28 struct page_cgroup *base;
36 29
37 base = NODE_DATA(page_to_nid(page))->node_page_cgroup; 30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base)) 38 if (unlikely(!base))
39 return NULL; 39 return NULL;
40 40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; 41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset; 42 return base + offset;
43} 43}
44 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
58static int __init alloc_node_page_cgroup(int nid) 45static int __init alloc_node_page_cgroup(int nid)
59{ 46{
60 struct page_cgroup *base, *pc; 47 struct page_cgroup *base;
61 unsigned long table_size; 48 unsigned long table_size;
62 unsigned long start_pfn, nr_pages, index; 49 unsigned long nr_pages;
63 50
64 start_pfn = NODE_DATA(nid)->node_start_pfn;
65 nr_pages = NODE_DATA(nid)->node_spanned_pages; 51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
66
67 if (!nr_pages) 52 if (!nr_pages)
68 return 0; 53 return 0;
69 54
@@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid)
73 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
74 if (!base) 59 if (!base)
75 return -ENOMEM; 60 return -ENOMEM;
76 for (index = 0; index < nr_pages; index++) {
77 pc = base + index;
78 init_page_cgroup(pc, nid);
79 }
80 NODE_DATA(nid)->node_page_cgroup = base; 61 NODE_DATA(nid)->node_page_cgroup = base;
81 total_usage += table_size; 62 total_usage += table_size;
82 return 0; 63 return 0;
@@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
111{ 92{
112 unsigned long pfn = page_to_pfn(page); 93 unsigned long pfn = page_to_pfn(page);
113 struct mem_section *section = __pfn_to_section(pfn); 94 struct mem_section *section = __pfn_to_section(pfn);
114 95#ifdef CONFIG_DEBUG_VM
96 /*
97 * The sanity checks the page allocator does upon freeing a
98 * page can reach here before the page_cgroup arrays are
99 * allocated when feeding a range of pages to the allocator
100 * for the first time during bootup or memory hotplug.
101 */
115 if (!section->page_cgroup) 102 if (!section->page_cgroup)
116 return NULL; 103 return NULL;
104#endif
117 return section->page_cgroup + pfn; 105 return section->page_cgroup + pfn;
118} 106}
119 107
120struct page *lookup_cgroup_page(struct page_cgroup *pc)
121{
122 struct mem_section *section;
123 struct page *page;
124 unsigned long nr;
125
126 nr = page_cgroup_array_id(pc);
127 section = __nr_to_section(nr);
128 page = pfn_to_page(pc - section->page_cgroup);
129 VM_BUG_ON(pc != lookup_page_cgroup(page));
130 return page;
131}
132
133static void *__meminit alloc_page_cgroup(size_t size, int nid) 108static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 109{
110 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
135 void *addr = NULL; 111 void *addr = NULL;
136 gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
137 112
138 addr = alloc_pages_exact_nid(nid, size, flags); 113 addr = alloc_pages_exact_nid(nid, size, flags);
139 if (addr) { 114 if (addr) {
@@ -142,39 +117,20 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid)
142 } 117 }
143 118
144 if (node_state(nid, N_HIGH_MEMORY)) 119 if (node_state(nid, N_HIGH_MEMORY))
145 addr = vmalloc_node(size, nid); 120 addr = vzalloc_node(size, nid);
146 else 121 else
147 addr = vmalloc(size); 122 addr = vzalloc(size);
148 123
149 return addr; 124 return addr;
150} 125}
151 126
152#ifdef CONFIG_MEMORY_HOTPLUG
153static void free_page_cgroup(void *addr)
154{
155 if (is_vmalloc_addr(addr)) {
156 vfree(addr);
157 } else {
158 struct page *page = virt_to_page(addr);
159 size_t table_size =
160 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
161
162 BUG_ON(PageReserved(page));
163 free_pages_exact(addr, table_size);
164 }
165}
166#endif
167
168static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) 127static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
169{ 128{
170 struct page_cgroup *base, *pc;
171 struct mem_section *section; 129 struct mem_section *section;
130 struct page_cgroup *base;
172 unsigned long table_size; 131 unsigned long table_size;
173 unsigned long nr;
174 int index;
175 132
176 nr = pfn_to_section_nr(pfn); 133 section = __pfn_to_section(pfn);
177 section = __nr_to_section(nr);
178 134
179 if (section->page_cgroup) 135 if (section->page_cgroup)
180 return 0; 136 return 0;
@@ -194,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
194 return -ENOMEM; 150 return -ENOMEM;
195 } 151 }
196 152
197 for (index = 0; index < PAGES_PER_SECTION; index++) {
198 pc = base + index;
199 init_page_cgroup(pc, nr);
200 }
201 /* 153 /*
202 * The passed "pfn" may not be aligned to SECTION. For the calculation 154 * The passed "pfn" may not be aligned to SECTION. For the calculation
203 * we need to apply a mask. 155 * we need to apply a mask.
@@ -208,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
208 return 0; 160 return 0;
209} 161}
210#ifdef CONFIG_MEMORY_HOTPLUG 162#ifdef CONFIG_MEMORY_HOTPLUG
163static void free_page_cgroup(void *addr)
164{
165 if (is_vmalloc_addr(addr)) {
166 vfree(addr);
167 } else {
168 struct page *page = virt_to_page(addr);
169 size_t table_size =
170 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
171
172 BUG_ON(PageReserved(page));
173 free_pages_exact(addr, table_size);
174 }
175}
176
211void __free_page_cgroup(unsigned long pfn) 177void __free_page_cgroup(unsigned long pfn)
212{ 178{
213 struct mem_section *ms; 179 struct mem_section *ms;
@@ -366,7 +332,6 @@ struct swap_cgroup {
366 unsigned short id; 332 unsigned short id;
367}; 333};
368#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 334#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
369#define SC_POS_MASK (SC_PER_PAGE - 1)
370 335
371/* 336/*
372 * SwapCgroup implements "lookup" and "exchange" operations. 337 * SwapCgroup implements "lookup" and "exchange" operations.
@@ -408,6 +373,21 @@ not_enough_page:
408 return -ENOMEM; 373 return -ENOMEM;
409} 374}
410 375
376static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
377 struct swap_cgroup_ctrl **ctrlp)
378{
379 pgoff_t offset = swp_offset(ent);
380 struct swap_cgroup_ctrl *ctrl;
381 struct page *mappage;
382
383 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
384 if (ctrlp)
385 *ctrlp = ctrl;
386
387 mappage = ctrl->map[offset / SC_PER_PAGE];
388 return page_address(mappage) + offset % SC_PER_PAGE;
389}
390
411/** 391/**
412 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 392 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
413 * @ent: swap entry to be cmpxchged 393 * @ent: swap entry to be cmpxchged
@@ -420,21 +400,13 @@ not_enough_page:
420unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 400unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
421 unsigned short old, unsigned short new) 401 unsigned short old, unsigned short new)
422{ 402{
423 int type = swp_type(ent);
424 unsigned long offset = swp_offset(ent);
425 unsigned long idx = offset / SC_PER_PAGE;
426 unsigned long pos = offset & SC_POS_MASK;
427 struct swap_cgroup_ctrl *ctrl; 403 struct swap_cgroup_ctrl *ctrl;
428 struct page *mappage;
429 struct swap_cgroup *sc; 404 struct swap_cgroup *sc;
430 unsigned long flags; 405 unsigned long flags;
431 unsigned short retval; 406 unsigned short retval;
432 407
433 ctrl = &swap_cgroup_ctrl[type]; 408 sc = lookup_swap_cgroup(ent, &ctrl);
434 409
435 mappage = ctrl->map[idx];
436 sc = page_address(mappage);
437 sc += pos;
438 spin_lock_irqsave(&ctrl->lock, flags); 410 spin_lock_irqsave(&ctrl->lock, flags);
439 retval = sc->id; 411 retval = sc->id;
440 if (retval == old) 412 if (retval == old)
@@ -455,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
455 */ 427 */
456unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) 428unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
457{ 429{
458 int type = swp_type(ent);
459 unsigned long offset = swp_offset(ent);
460 unsigned long idx = offset / SC_PER_PAGE;
461 unsigned long pos = offset & SC_POS_MASK;
462 struct swap_cgroup_ctrl *ctrl; 430 struct swap_cgroup_ctrl *ctrl;
463 struct page *mappage;
464 struct swap_cgroup *sc; 431 struct swap_cgroup *sc;
465 unsigned short old; 432 unsigned short old;
466 unsigned long flags; 433 unsigned long flags;
467 434
468 ctrl = &swap_cgroup_ctrl[type]; 435 sc = lookup_swap_cgroup(ent, &ctrl);
469 436
470 mappage = ctrl->map[idx];
471 sc = page_address(mappage);
472 sc += pos;
473 spin_lock_irqsave(&ctrl->lock, flags); 437 spin_lock_irqsave(&ctrl->lock, flags);
474 old = sc->id; 438 old = sc->id;
475 sc->id = id; 439 sc->id = id;
@@ -479,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
479} 443}
480 444
481/** 445/**
482 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 446 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
483 * @ent: swap entry to be looked up. 447 * @ent: swap entry to be looked up.
484 * 448 *
485 * Returns CSS ID of mem_cgroup on success. 0 on failure. (0 is invalid ID) 449 * Returns CSS ID of mem_cgroup on success. 0 on failure. (0 is invalid ID)
486 */ 450 */
487unsigned short lookup_swap_cgroup(swp_entry_t ent) 451unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
488{ 452{
489 int type = swp_type(ent); 453 return lookup_swap_cgroup(ent, NULL)->id;
490 unsigned long offset = swp_offset(ent);
491 unsigned long idx = offset / SC_PER_PAGE;
492 unsigned long pos = offset & SC_POS_MASK;
493 struct swap_cgroup_ctrl *ctrl;
494 struct page *mappage;
495 struct swap_cgroup *sc;
496 unsigned short ret;
497
498 ctrl = &swap_cgroup_ctrl[type];
499 mappage = ctrl->map[idx];
500 sc = page_address(mappage);
501 sc += pos;
502 ret = sc->id;
503 return ret;
504} 454}
505 455
506int swap_cgroup_swapon(int type, unsigned long max_pages) 456int swap_cgroup_swapon(int type, unsigned long max_pages)
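
The swap_cgroup refactoring above funnels the repeated index arithmetic into one helper. A small user-space model of that arithmetic, assuming a two-page map for brevity: the swap offset picks a map page via offset / SC_PER_PAGE and a slot within it via offset % SC_PER_PAGE.

#include <stdio.h>
#include <stdlib.h>

#define SC_PAGE_SIZE 4096
struct swap_cgroup { unsigned short id; };
#define SC_PER_PAGE (SC_PAGE_SIZE / sizeof(struct swap_cgroup))

struct swap_cgroup_ctrl { struct swap_cgroup **map; };

/* Mirror of the new helper: resolve a swap offset to its record. */
static struct swap_cgroup *lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
                                              unsigned long offset)
{
    struct swap_cgroup *mappage = ctrl->map[offset / SC_PER_PAGE];
    return mappage + offset % SC_PER_PAGE;
}

int main(void)
{
    struct swap_cgroup *pages[2] = {
        calloc(SC_PER_PAGE, sizeof(struct swap_cgroup)),
        calloc(SC_PER_PAGE, sizeof(struct swap_cgroup)),
    };
    struct swap_cgroup_ctrl ctrl = { .map = pages };
    unsigned long offset = SC_PER_PAGE + 7;   /* second map page, slot 7 */

    lookup_swap_cgroup(&ctrl, offset)->id = 42;
    printf("record id: %u\n", (unsigned)lookup_swap_cgroup(&ctrl, offset)->id);

    free(pages[0]);
    free(pages[1]);
    return 0;
}
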
diff --git a/mm/rmap.c b/mm/rmap.c
index a2e5ce1fa081..c8454e06b6c8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -773,7 +773,7 @@ out:
773} 773}
774 774
775static int page_referenced_anon(struct page *page, 775static int page_referenced_anon(struct page *page,
776 struct mem_cgroup *mem_cont, 776 struct mem_cgroup *memcg,
777 unsigned long *vm_flags) 777 unsigned long *vm_flags)
778{ 778{
779 unsigned int mapcount; 779 unsigned int mapcount;
@@ -796,7 +796,7 @@ static int page_referenced_anon(struct page *page,
796 * counting on behalf of references from different 796 * counting on behalf of references from different
797 * cgroups 797 * cgroups
798 */ 798 */
799 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 799 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
800 continue; 800 continue;
801 referenced += page_referenced_one(page, vma, address, 801 referenced += page_referenced_one(page, vma, address,
802 &mapcount, vm_flags); 802 &mapcount, vm_flags);
@@ -811,7 +811,7 @@ static int page_referenced_anon(struct page *page,
811/** 811/**
812 * page_referenced_file - referenced check for object-based rmap 812 * page_referenced_file - referenced check for object-based rmap
813 * @page: the page we're checking references on. 813 * @page: the page we're checking references on.
814 * @mem_cont: target memory controller 814 * @memcg: target memory control group
815 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 815 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
816 * 816 *
817 * For an object-based mapped page, find all the places it is mapped and 817 * For an object-based mapped page, find all the places it is mapped and
@@ -822,7 +822,7 @@ static int page_referenced_anon(struct page *page,
822 * This function is only called from page_referenced for object-based pages. 822 * This function is only called from page_referenced for object-based pages.
823 */ 823 */
824static int page_referenced_file(struct page *page, 824static int page_referenced_file(struct page *page,
825 struct mem_cgroup *mem_cont, 825 struct mem_cgroup *memcg,
826 unsigned long *vm_flags) 826 unsigned long *vm_flags)
827{ 827{
828 unsigned int mapcount; 828 unsigned int mapcount;
@@ -864,7 +864,7 @@ static int page_referenced_file(struct page *page,
864 * counting on behalf of references from different 864 * counting on behalf of references from different
865 * cgroups 865 * cgroups
866 */ 866 */
867 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 867 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
868 continue; 868 continue;
869 referenced += page_referenced_one(page, vma, address, 869 referenced += page_referenced_one(page, vma, address,
870 &mapcount, vm_flags); 870 &mapcount, vm_flags);
@@ -880,7 +880,7 @@ static int page_referenced_file(struct page *page,
880 * page_referenced - test if the page was referenced 880 * page_referenced - test if the page was referenced
881 * @page: the page to test 881 * @page: the page to test
882 * @is_locked: caller holds lock on the page 882 * @is_locked: caller holds lock on the page
883 * @mem_cont: target memory controller 883 * @memcg: target memory cgroup
884 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 884 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
885 * 885 *
886 * Quick test_and_clear_referenced for all mappings to a page, 886 * Quick test_and_clear_referenced for all mappings to a page,
@@ -888,7 +888,7 @@ static int page_referenced_file(struct page *page,
888 */ 888 */
889int page_referenced(struct page *page, 889int page_referenced(struct page *page,
890 int is_locked, 890 int is_locked,
891 struct mem_cgroup *mem_cont, 891 struct mem_cgroup *memcg,
892 unsigned long *vm_flags) 892 unsigned long *vm_flags)
893{ 893{
894 int referenced = 0; 894 int referenced = 0;
@@ -904,13 +904,13 @@ int page_referenced(struct page *page,
904 } 904 }
905 } 905 }
906 if (unlikely(PageKsm(page))) 906 if (unlikely(PageKsm(page)))
907 referenced += page_referenced_ksm(page, mem_cont, 907 referenced += page_referenced_ksm(page, memcg,
908 vm_flags); 908 vm_flags);
909 else if (PageAnon(page)) 909 else if (PageAnon(page))
910 referenced += page_referenced_anon(page, mem_cont, 910 referenced += page_referenced_anon(page, memcg,
911 vm_flags); 911 vm_flags);
912 else if (page->mapping) 912 else if (page->mapping)
913 referenced += page_referenced_file(page, mem_cont, 913 referenced += page_referenced_file(page, memcg,
914 vm_flags); 914 vm_flags);
915 if (we_locked) 915 if (we_locked)
916 unlock_page(page); 916 unlock_page(page);
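
The rmap hunks are a pure rename (mem_cont becomes memcg), but the filter they carry forward is worth spelling out: when a target memcg is given, mappings owned by other cgroups are skipped so their referenced bits are not consumed on its behalf. A hedged stand-alone sketch of that filter, with purely illustrative types:

#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

struct mem_cgroup { int id; };
struct vm_area_struct { struct mem_cgroup *owner; int referenced; };

/* Illustrative stand-in for mm_match_cgroup(). */
static bool mm_match_cgroup(const struct vm_area_struct *vma,
                            const struct mem_cgroup *memcg)
{
    return vma->owner == memcg;
}

/* Count references, but only on behalf of the target memcg (if any). */
static int page_referenced_filtered(struct vm_area_struct *vmas, int nr,
                                    struct mem_cgroup *memcg)
{
    int referenced = 0;
    for (int i = 0; i < nr; i++) {
        if (memcg && !mm_match_cgroup(&vmas[i], memcg))
            continue;           /* different cgroup: leave its young bit alone */
        referenced += vmas[i].referenced;
    }
    return referenced;
}

int main(void)
{
    struct mem_cgroup a = { 1 }, b = { 2 };
    struct vm_area_struct vmas[] = { { &a, 1 }, { &b, 1 }, { &a, 0 } };

    printf("references charged to memcg a: %d\n",
           page_referenced_filtered(vmas, 3, &a));
    printf("global scan (no target memcg): %d\n",
           page_referenced_filtered(vmas, 3, NULL));
    return 0;
}
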
diff --git a/mm/slub.c b/mm/slub.c
index 5d37b5e44140..4907563ef7ff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
366 const char *n) 366 const char *n)
367{ 367{
368 VM_BUG_ON(!irqs_disabled()); 368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE 369#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
370 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
370 if (s->flags & __CMPXCHG_DOUBLE) { 371 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist, &page->counters, 372 if (cmpxchg_double(&page->freelist, &page->counters,
372 freelist_old, counters_old, 373 freelist_old, counters_old,
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
400 void *freelist_new, unsigned long counters_new, 401 void *freelist_new, unsigned long counters_new,
401 const char *n) 402 const char *n)
402{ 403{
403#ifdef CONFIG_CMPXCHG_DOUBLE 404#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
405 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
404 if (s->flags & __CMPXCHG_DOUBLE) { 406 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist, &page->counters, 407 if (cmpxchg_double(&page->freelist, &page->counters,
406 freelist_old, counters_old, 408 freelist_old, counters_old,
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
3014 } 3016 }
3015 } 3017 }
3016 3018
3017#ifdef CONFIG_CMPXCHG_DOUBLE 3019#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
3020 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3018 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3021 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3019 /* Enable fast mode */ 3022 /* Enable fast mode */
3020 s->flags |= __CMPXCHG_DOUBLE; 3023 s->flags |= __CMPXCHG_DOUBLE;
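
The slub hunks only tighten the preprocessor guard (the lockless path now also needs CONFIG_HAVE_ALIGNED_STRUCT_PAGE), but the operation behind the guard is the interesting part: freelist and counters must change together. A user-space sketch of the locked fallback semantics, with a plain mutex standing in for the slab lock (not the kernel implementation):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_slab {
    void *freelist;
    unsigned long counters;
    pthread_mutex_t lock;      /* stand-in for slab_lock() */
};

/* Fallback semantics: compare both words, and only then update both. */
static bool cmpxchg_double_slab(struct page_slab *page,
                                void *freelist_old, unsigned long counters_old,
                                void *freelist_new, unsigned long counters_new)
{
    bool ok = false;

    pthread_mutex_lock(&page->lock);
    if (page->freelist == freelist_old && page->counters == counters_old) {
        page->freelist = freelist_new;
        page->counters = counters_new;
        ok = true;
    }
    pthread_mutex_unlock(&page->lock);
    return ok;
}

int main(void)
{
    int obj;
    struct page_slab page = { &obj, 3, PTHREAD_MUTEX_INITIALIZER };

    printf("first swap:  %d\n", cmpxchg_double_slab(&page, &obj, 3, NULL, 2));
    printf("stale retry: %d\n", cmpxchg_double_slab(&page, &obj, 3, NULL, 1));
    return 0;
}
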
diff --git a/mm/swap.c b/mm/swap.c
index 67a09a633a09..b0f529b38979 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/percpu_counter.h> 26#include <linux/percpu_counter.h>
28#include <linux/percpu.h> 27#include <linux/percpu.h>
29#include <linux/cpu.h> 28#include <linux/cpu.h>
@@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page)
54 spin_lock_irqsave(&zone->lru_lock, flags); 53 spin_lock_irqsave(&zone->lru_lock, flags);
55 VM_BUG_ON(!PageLRU(page)); 54 VM_BUG_ON(!PageLRU(page));
56 __ClearPageLRU(page); 55 __ClearPageLRU(page);
57 del_page_from_lru(zone, page); 56 del_page_from_lru_list(zone, page, page_off_lru(page));
58 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
59 } 58 }
60} 59}
@@ -232,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
232static void pagevec_move_tail_fn(struct page *page, void *arg) 231static void pagevec_move_tail_fn(struct page *page, void *arg)
233{ 232{
234 int *pgmoved = arg; 233 int *pgmoved = arg;
235 struct zone *zone = page_zone(page);
236 234
237 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 235 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
238 enum lru_list lru = page_lru_base_type(page); 236 enum lru_list lru = page_lru_base_type(page);
239 list_move_tail(&page->lru, &zone->lru[lru].list); 237 struct lruvec *lruvec;
240 mem_cgroup_rotate_reclaimable_page(page); 238
239 lruvec = mem_cgroup_lru_move_lists(page_zone(page),
240 page, lru, lru);
241 list_move_tail(&page->lru, &lruvec->lists[lru]);
241 (*pgmoved)++; 242 (*pgmoved)++;
242 } 243 }
243} 244}
@@ -368,7 +369,6 @@ void mark_page_accessed(struct page *page)
368 SetPageReferenced(page); 369 SetPageReferenced(page);
369 } 370 }
370} 371}
371
372EXPORT_SYMBOL(mark_page_accessed); 372EXPORT_SYMBOL(mark_page_accessed);
373 373
374void __lru_cache_add(struct page *page, enum lru_list lru) 374void __lru_cache_add(struct page *page, enum lru_list lru)
@@ -377,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
377 377
378 page_cache_get(page); 378 page_cache_get(page);
379 if (!pagevec_add(pvec, page)) 379 if (!pagevec_add(pvec, page))
380 ____pagevec_lru_add(pvec, lru); 380 __pagevec_lru_add(pvec, lru);
381 put_cpu_var(lru_add_pvecs); 381 put_cpu_var(lru_add_pvecs);
382} 382}
383EXPORT_SYMBOL(__lru_cache_add); 383EXPORT_SYMBOL(__lru_cache_add);
@@ -476,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
476 */ 476 */
477 SetPageReclaim(page); 477 SetPageReclaim(page);
478 } else { 478 } else {
479 struct lruvec *lruvec;
479 /* 480 /*
480 * The page's writeback finished while it sat in the pagevec, 481 * The page's writeback finished while it sat in the pagevec,
481 * so move the page to the tail of the inactive list. 482 * so move the page to the tail of the inactive list.
482 */ 483 */
483 list_move_tail(&page->lru, &zone->lru[lru].list); 484 lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
484 mem_cgroup_rotate_reclaimable_page(page); 485 list_move_tail(&page->lru, &lruvec->lists[lru]);
485 __count_vm_event(PGROTATED); 486 __count_vm_event(PGROTATED);
486 } 487 }
487 488
@@ -504,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu)
504 for_each_lru(lru) { 505 for_each_lru(lru) {
505 pvec = &pvecs[lru - LRU_BASE]; 506 pvec = &pvecs[lru - LRU_BASE];
506 if (pagevec_count(pvec)) 507 if (pagevec_count(pvec))
507 ____pagevec_lru_add(pvec, lru); 508 __pagevec_lru_add(pvec, lru);
508 } 509 }
509 510
510 pvec = &per_cpu(lru_rotate_pvecs, cpu); 511 pvec = &per_cpu(lru_rotate_pvecs, cpu);
@@ -616,7 +617,7 @@ void release_pages(struct page **pages, int nr, int cold)
616 } 617 }
617 VM_BUG_ON(!PageLRU(page)); 618 VM_BUG_ON(!PageLRU(page));
618 __ClearPageLRU(page); 619 __ClearPageLRU(page);
619 del_page_from_lru(zone, page); 620 del_page_from_lru_list(zone, page, page_off_lru(page));
620 } 621 }
621 622
622 list_add(&page->lru, &pages_to_free); 623 list_add(&page->lru, &pages_to_free);
@@ -644,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec)
644 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 645 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
645 pagevec_reinit(pvec); 646 pagevec_reinit(pvec);
646} 647}
647
648EXPORT_SYMBOL(__pagevec_release); 648EXPORT_SYMBOL(__pagevec_release);
649 649
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE
650/* used by __split_huge_page_refcount() */ 651/* used by __split_huge_page_refcount() */
651void lru_add_page_tail(struct zone* zone, 652void lru_add_page_tail(struct zone* zone,
652 struct page *page, struct page *page_tail) 653 struct page *page, struct page *page_tail)
@@ -654,7 +655,6 @@ void lru_add_page_tail(struct zone* zone,
654 int active; 655 int active;
655 enum lru_list lru; 656 enum lru_list lru;
656 const int file = 0; 657 const int file = 0;
657 struct list_head *head;
658 658
659 VM_BUG_ON(!PageHead(page)); 659 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 660 VM_BUG_ON(PageCompound(page_tail));
@@ -673,18 +673,30 @@ void lru_add_page_tail(struct zone* zone,
673 lru = LRU_INACTIVE_ANON; 673 lru = LRU_INACTIVE_ANON;
674 } 674 }
675 update_page_reclaim_stat(zone, page_tail, file, active); 675 update_page_reclaim_stat(zone, page_tail, file, active);
676 if (likely(PageLRU(page)))
677 head = page->lru.prev;
678 else
679 head = &zone->lru[lru].list;
680 __add_page_to_lru_list(zone, page_tail, lru, head);
681 } else { 676 } else {
682 SetPageUnevictable(page_tail); 677 SetPageUnevictable(page_tail);
683 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); 678 lru = LRU_UNEVICTABLE;
679 }
680
681 if (likely(PageLRU(page)))
682 list_add_tail(&page_tail->lru, &page->lru);
683 else {
684 struct list_head *list_head;
685 /*
686 * Head page has not yet been counted, as an hpage,
687 * so we must account for each subpage individually.
688 *
689 * Use the standard add function to put page_tail on the list,
690 * but then correct its position so they all end up in order.
691 */
692 add_page_to_lru_list(zone, page_tail, lru);
693 list_head = page_tail->lru.prev;
694 list_move_tail(&page_tail->lru, list_head);
684 } 695 }
685} 696}
697#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
686 698
687static void ____pagevec_lru_add_fn(struct page *page, void *arg) 699static void __pagevec_lru_add_fn(struct page *page, void *arg)
688{ 700{
689 enum lru_list lru = (enum lru_list)arg; 701 enum lru_list lru = (enum lru_list)arg;
690 struct zone *zone = page_zone(page); 702 struct zone *zone = page_zone(page);
@@ -706,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg)
706 * Add the passed pages to the LRU, then drop the caller's refcount 718 * Add the passed pages to the LRU, then drop the caller's refcount
707 * on them. Reinitialises the caller's pagevec. 719 * on them. Reinitialises the caller's pagevec.
708 */ 720 */
709void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 721void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
710{ 722{
711 VM_BUG_ON(is_unevictable_lru(lru)); 723 VM_BUG_ON(is_unevictable_lru(lru));
712 724
713 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); 725 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
714}
715
716EXPORT_SYMBOL(____pagevec_lru_add);
717
718/*
719 * Try to drop buffers from the pages in a pagevec
720 */
721void pagevec_strip(struct pagevec *pvec)
722{
723 int i;
724
725 for (i = 0; i < pagevec_count(pvec); i++) {
726 struct page *page = pvec->pages[i];
727
728 if (page_has_private(page) && trylock_page(page)) {
729 if (page_has_private(page))
730 try_to_release_page(page, 0);
731 unlock_page(page);
732 }
733 }
734} 726}
727EXPORT_SYMBOL(__pagevec_lru_add);
735 728
736/** 729/**
737 * pagevec_lookup - gang pagecache lookup 730 * pagevec_lookup - gang pagecache lookup
@@ -755,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
755 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 748 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
756 return pagevec_count(pvec); 749 return pagevec_count(pvec);
757} 750}
758
759EXPORT_SYMBOL(pagevec_lookup); 751EXPORT_SYMBOL(pagevec_lookup);
760 752
761unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 753unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
@@ -765,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
765 nr_pages, pvec->pages); 757 nr_pages, pvec->pages);
766 return pagevec_count(pvec); 758 return pagevec_count(pvec);
767} 759}
768
769EXPORT_SYMBOL(pagevec_lookup_tag); 760EXPORT_SYMBOL(pagevec_lookup_tag);
770 761
771/* 762/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..470038a91873 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
300 new_page = alloc_page_vma(gfp_mask, vma, addr); 300 new_page = alloc_page_vma(gfp_mask, vma, addr);
301 if (!new_page) 301 if (!new_page)
302 break; /* Out of memory */ 302 break; /* Out of memory */
303 /*
304 * The memcg-specific accounting when moving
305 * pages around the LRU lists relies on the
306 * page's owner (memcg) to be valid. Usually,
307 * pages are assigned to a new owner before
308 * being put on the LRU list, but since this
309 * is not the case here, the stale owner from
310 * a previous allocation cycle must be reset.
311 */
312 mem_cgroup_reset_owner(new_page);
303 } 313 }
304 314
305 /* 315 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9520592d4231..d999f090dfda 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -847,12 +847,13 @@ unsigned int count_swap_pages(int type, int free)
847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
848 unsigned long addr, swp_entry_t entry, struct page *page) 848 unsigned long addr, swp_entry_t entry, struct page *page)
849{ 849{
850 struct mem_cgroup *ptr; 850 struct mem_cgroup *memcg;
851 spinlock_t *ptl; 851 spinlock_t *ptl;
852 pte_t *pte; 852 pte_t *pte;
853 int ret = 1; 853 int ret = 1;
854 854
855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
856 GFP_KERNEL, &memcg)) {
856 ret = -ENOMEM; 857 ret = -ENOMEM;
857 goto out_nolock; 858 goto out_nolock;
858 } 859 }
@@ -860,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
860 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
861 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
862 if (ret > 0) 863 if (ret > 0)
863 mem_cgroup_cancel_charge_swapin(ptr); 864 mem_cgroup_cancel_charge_swapin(memcg);
864 ret = 0; 865 ret = 0;
865 goto out; 866 goto out;
866 } 867 }
@@ -871,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
871 set_pte_at(vma->vm_mm, addr, pte, 872 set_pte_at(vma->vm_mm, addr, pte,
872 pte_mkold(mk_pte(page, vma->vm_page_prot))); 873 pte_mkold(mk_pte(page, vma->vm_page_prot)));
873 page_add_anon_rmap(page, vma, addr); 874 page_add_anon_rmap(page, vma, addr);
874 mem_cgroup_commit_charge_swapin(page, ptr); 875 mem_cgroup_commit_charge_swapin(page, memcg);
875 swap_free(entry); 876 swap_free(entry);
876 /* 877 /*
877 * Move the page to the active list so it is not 878 * Move the page to the active list so it is not
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 877ca046f43d..86ce9a526c17 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2380 if (!vas || !vms) 2380 if (!vas || !vms)
2381 goto err_free; 2381 goto err_free2;
2382 2382
2383 for (area = 0; area < nr_vms; area++) { 2383 for (area = 0; area < nr_vms; area++) {
2384 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); 2384 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
@@ -2476,11 +2476,10 @@ found:
2476 2476
2477err_free: 2477err_free:
2478 for (area = 0; area < nr_vms; area++) { 2478 for (area = 0; area < nr_vms; area++) {
2479 if (vas) 2479 kfree(vas[area]);
2480 kfree(vas[area]); 2480 kfree(vms[area]);
2481 if (vms)
2482 kfree(vms[area]);
2483 } 2481 }
2482err_free2:
2484 kfree(vas); 2483 kfree(vas);
2485 kfree(vms); 2484 kfree(vms);
2486 return NULL; 2485 return NULL;
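
The vmalloc fix above is the usual staged-cleanup shape: once the top-level arrays are known to be non-NULL, the per-element frees need no NULL checks, and an earlier failure jumps past them to a later label. A compact stand-alone illustration of the same shape, with generic names rather than the kernel's:

#include <stdio.h>
#include <stdlib.h>

/* Allocate nr items behind an index array; free in two stages on failure. */
static int **alloc_table(int nr)
{
    int **items = calloc(nr, sizeof(*items));
    if (!items)
        goto err_free2;          /* nothing per-element to free yet */

    for (int i = 0; i < nr; i++) {
        items[i] = malloc(sizeof(int));
        if (!items[i])
            goto err_free;       /* partial success: unwind the elements */
        *items[i] = i;
    }
    return items;

err_free:
    for (int i = 0; i < nr; i++)
        free(items[i]);          /* calloc zeroed the slots, so free(NULL) is fine */
err_free2:
    free(items);
    return NULL;
}

int main(void)
{
    int **t = alloc_table(4);
    printf("allocated: %s\n", t ? "yes" : "no");
    if (t) {
        for (int i = 0; i < 4; i++)
            free(t[i]);
        free(t);
    }
    return 0;
}
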
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26f4a8a4e0c7..2880396f7953 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -103,8 +103,11 @@ struct scan_control {
103 */ 103 */
104 reclaim_mode_t reclaim_mode; 104 reclaim_mode_t reclaim_mode;
105 105
106 /* Which cgroup do we reclaim from */ 106 /*
107 struct mem_cgroup *mem_cgroup; 107 * The memory cgroup that hit its limit and as a result is the
108 * primary target of this reclaim invocation.
109 */
110 struct mem_cgroup *target_mem_cgroup;
108 111
109 /* 112 /*
110 * Nodemask of nodes allowed by the caller. If NULL, all nodes 113 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +116,11 @@ struct scan_control {
113 nodemask_t *nodemask; 116 nodemask_t *nodemask;
114}; 117};
115 118
119struct mem_cgroup_zone {
120 struct mem_cgroup *mem_cgroup;
121 struct zone *zone;
122};
123
116#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 124#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
117 125
118#ifdef ARCH_HAS_PREFETCH 126#ifdef ARCH_HAS_PREFETCH
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list);
153static DECLARE_RWSEM(shrinker_rwsem); 161static DECLARE_RWSEM(shrinker_rwsem);
154 162
155#ifdef CONFIG_CGROUP_MEM_RES_CTLR 163#ifdef CONFIG_CGROUP_MEM_RES_CTLR
156#define scanning_global_lru(sc) (!(sc)->mem_cgroup) 164static bool global_reclaim(struct scan_control *sc)
165{
166 return !sc->target_mem_cgroup;
167}
168
169static bool scanning_global_lru(struct mem_cgroup_zone *mz)
170{
171 return !mz->mem_cgroup;
172}
157#else 173#else
158#define scanning_global_lru(sc) (1) 174static bool global_reclaim(struct scan_control *sc)
175{
176 return true;
177}
178
179static bool scanning_global_lru(struct mem_cgroup_zone *mz)
180{
181 return true;
182}
159#endif 183#endif
160 184
161static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, 185static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
162 struct scan_control *sc)
163{ 186{
164 if (!scanning_global_lru(sc)) 187 if (!scanning_global_lru(mz))
165 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); 188 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
166 189
167 return &zone->reclaim_stat; 190 return &mz->zone->reclaim_stat;
168} 191}
169 192
170static unsigned long zone_nr_lru_pages(struct zone *zone, 193static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
171 struct scan_control *sc, enum lru_list lru) 194 enum lru_list lru)
172{ 195{
173 if (!scanning_global_lru(sc)) 196 if (!scanning_global_lru(mz))
174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, 197 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru)); 198 zone_to_nid(mz->zone),
199 zone_idx(mz->zone),
200 BIT(lru));
176 201
177 return zone_page_state(zone, NR_LRU_BASE + lru); 202 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
178} 203}
179 204
180 205
@@ -677,12 +702,13 @@ enum page_references {
677}; 702};
678 703
679static enum page_references page_check_references(struct page *page, 704static enum page_references page_check_references(struct page *page,
705 struct mem_cgroup_zone *mz,
680 struct scan_control *sc) 706 struct scan_control *sc)
681{ 707{
682 int referenced_ptes, referenced_page; 708 int referenced_ptes, referenced_page;
683 unsigned long vm_flags; 709 unsigned long vm_flags;
684 710
685 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); 711 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
686 referenced_page = TestClearPageReferenced(page); 712 referenced_page = TestClearPageReferenced(page);
687 713
688 /* Lumpy reclaim - ignore references */ 714 /* Lumpy reclaim - ignore references */
@@ -738,7 +764,7 @@ static enum page_references page_check_references(struct page *page,
738 * shrink_page_list() returns the number of reclaimed pages 764 * shrink_page_list() returns the number of reclaimed pages
739 */ 765 */
740static unsigned long shrink_page_list(struct list_head *page_list, 766static unsigned long shrink_page_list(struct list_head *page_list,
741 struct zone *zone, 767 struct mem_cgroup_zone *mz,
742 struct scan_control *sc, 768 struct scan_control *sc,
743 int priority, 769 int priority,
744 unsigned long *ret_nr_dirty, 770 unsigned long *ret_nr_dirty,
@@ -769,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
769 goto keep; 795 goto keep;
770 796
771 VM_BUG_ON(PageActive(page)); 797 VM_BUG_ON(PageActive(page));
772 VM_BUG_ON(page_zone(page) != zone); 798 VM_BUG_ON(page_zone(page) != mz->zone);
773 799
774 sc->nr_scanned++; 800 sc->nr_scanned++;
775 801
@@ -803,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
803 } 829 }
804 } 830 }
805 831
806 references = page_check_references(page, sc); 832 references = page_check_references(page, mz, sc);
807 switch (references) { 833 switch (references) {
808 case PAGEREF_ACTIVATE: 834 case PAGEREF_ACTIVATE:
809 goto activate_locked; 835 goto activate_locked;
@@ -994,8 +1020,8 @@ keep_lumpy:
994 * back off and wait for congestion to clear because further reclaim 1020 * back off and wait for congestion to clear because further reclaim
995 * will encounter the same problem 1021 * will encounter the same problem
996 */ 1022 */
997 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) 1023 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
998 zone_set_flag(zone, ZONE_CONGESTED); 1024 zone_set_flag(mz->zone, ZONE_CONGESTED);
999 1025
1000 free_hot_cold_page_list(&free_pages, 1); 1026 free_hot_cold_page_list(&free_pages, 1);
1001 1027
@@ -1049,8 +1075,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1049 1075
1050 ret = -EBUSY; 1076 ret = -EBUSY;
1051 1077
1052 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) 1078 /*
1053 return ret; 1079 * To minimise LRU disruption, the caller can indicate that it only
1080 * wants to isolate pages it will be able to operate on without
1081 * blocking - clean pages for the most part.
1082 *
1083 * ISOLATE_CLEAN means that only clean pages should be isolated. This
 1084 * is used by reclaim when it cannot write to backing storage
1085 *
 1086 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1087 * that it is possible to migrate without blocking
1088 */
1089 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1090 /* All the caller can do on PageWriteback is block */
1091 if (PageWriteback(page))
1092 return ret;
1093
1094 if (PageDirty(page)) {
1095 struct address_space *mapping;
1096
1097 /* ISOLATE_CLEAN means only clean pages */
1098 if (mode & ISOLATE_CLEAN)
1099 return ret;
1100
1101 /*
1102 * Only pages without mappings or that have a
1103 * ->migratepage callback are possible to migrate
1104 * without blocking
1105 */
1106 mapping = page_mapping(page);
1107 if (mapping && !mapping->a_ops->migratepage)
1108 return ret;
1109 }
1110 }
1054 1111
1055 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) 1112 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1056 return ret; 1113 return ret;
@@ -1079,25 +1136,36 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1079 * Appropriate locks must be held before calling this function. 1136 * Appropriate locks must be held before calling this function.
1080 * 1137 *
1081 * @nr_to_scan: The number of pages to look through on the list. 1138 * @nr_to_scan: The number of pages to look through on the list.
1082 * @src: The LRU list to pull pages off. 1139 * @mz: The mem_cgroup_zone to pull pages from.
1083 * @dst: The temp list to put pages on to. 1140 * @dst: The temp list to put pages on to.
1084 * @scanned: The number of pages that were scanned. 1141 * @nr_scanned: The number of pages that were scanned.
1085 * @order: The caller's attempted allocation order 1142 * @order: The caller's attempted allocation order
1086 * @mode: One of the LRU isolation modes 1143 * @mode: One of the LRU isolation modes
1144 * @active: True [1] if isolating active pages
1087 * @file: True [1] if isolating file [!anon] pages 1145 * @file: True [1] if isolating file [!anon] pages
1088 * 1146 *
1089 * returns how many pages were moved onto *@dst. 1147 * returns how many pages were moved onto *@dst.
1090 */ 1148 */
1091static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1149static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1092 struct list_head *src, struct list_head *dst, 1150 struct mem_cgroup_zone *mz, struct list_head *dst,
1093 unsigned long *scanned, int order, isolate_mode_t mode, 1151 unsigned long *nr_scanned, int order, isolate_mode_t mode,
1094 int file) 1152 int active, int file)
1095{ 1153{
1154 struct lruvec *lruvec;
1155 struct list_head *src;
1096 unsigned long nr_taken = 0; 1156 unsigned long nr_taken = 0;
1097 unsigned long nr_lumpy_taken = 0; 1157 unsigned long nr_lumpy_taken = 0;
1098 unsigned long nr_lumpy_dirty = 0; 1158 unsigned long nr_lumpy_dirty = 0;
1099 unsigned long nr_lumpy_failed = 0; 1159 unsigned long nr_lumpy_failed = 0;
1100 unsigned long scan; 1160 unsigned long scan;
1161 int lru = LRU_BASE;
1162
1163 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1164 if (active)
1165 lru += LRU_ACTIVE;
1166 if (file)
1167 lru += LRU_FILE;
1168 src = &lruvec->lists[lru];
1101 1169
1102 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1170 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1103 struct page *page; 1171 struct page *page;
@@ -1113,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1113 1181
1114 switch (__isolate_lru_page(page, mode, file)) { 1182 switch (__isolate_lru_page(page, mode, file)) {
1115 case 0: 1183 case 0:
1184 mem_cgroup_lru_del(page);
1116 list_move(&page->lru, dst); 1185 list_move(&page->lru, dst);
1117 mem_cgroup_del_lru(page);
1118 nr_taken += hpage_nr_pages(page); 1186 nr_taken += hpage_nr_pages(page);
1119 break; 1187 break;
1120 1188
1121 case -EBUSY: 1189 case -EBUSY:
1122 /* else it is being freed elsewhere */ 1190 /* else it is being freed elsewhere */
1123 list_move(&page->lru, src); 1191 list_move(&page->lru, src);
1124 mem_cgroup_rotate_lru_list(page, page_lru(page));
1125 continue; 1192 continue;
1126 1193
1127 default: 1194 default:
@@ -1171,13 +1238,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1171 break; 1238 break;
1172 1239
1173 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1240 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1241 unsigned int isolated_pages;
1242
1243 mem_cgroup_lru_del(cursor_page);
1174 list_move(&cursor_page->lru, dst); 1244 list_move(&cursor_page->lru, dst);
1175 mem_cgroup_del_lru(cursor_page); 1245 isolated_pages = hpage_nr_pages(cursor_page);
1176 nr_taken += hpage_nr_pages(cursor_page); 1246 nr_taken += isolated_pages;
1177 nr_lumpy_taken++; 1247 nr_lumpy_taken += isolated_pages;
1178 if (PageDirty(cursor_page)) 1248 if (PageDirty(cursor_page))
1179 nr_lumpy_dirty++; 1249 nr_lumpy_dirty += isolated_pages;
1180 scan++; 1250 scan++;
1251 pfn += isolated_pages - 1;
1181 } else { 1252 } else {
1182 /* 1253 /*
1183 * Check if the page is freed already. 1254 * Check if the page is freed already.
@@ -1203,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1203 nr_lumpy_failed++; 1274 nr_lumpy_failed++;
1204 } 1275 }
1205 1276
1206 *scanned = scan; 1277 *nr_scanned = scan;
1207 1278
1208 trace_mm_vmscan_lru_isolate(order, 1279 trace_mm_vmscan_lru_isolate(order,
1209 nr_to_scan, scan, 1280 nr_to_scan, scan,
1210 nr_taken, 1281 nr_taken,
1211 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1282 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1212 mode); 1283 mode, file);
1213 return nr_taken; 1284 return nr_taken;
1214} 1285}
1215 1286
1216static unsigned long isolate_pages_global(unsigned long nr,
1217 struct list_head *dst,
1218 unsigned long *scanned, int order,
1219 isolate_mode_t mode,
1220 struct zone *z, int active, int file)
1221{
1222 int lru = LRU_BASE;
1223 if (active)
1224 lru += LRU_ACTIVE;
1225 if (file)
1226 lru += LRU_FILE;
1227 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1228 mode, file);
1229}
1230
1231/*
1232 * clear_active_flags() is a helper for shrink_active_list(), clearing
1233 * any active bits from the pages in the list.
1234 */
1235static unsigned long clear_active_flags(struct list_head *page_list,
1236 unsigned int *count)
1237{
1238 int nr_active = 0;
1239 int lru;
1240 struct page *page;
1241
1242 list_for_each_entry(page, page_list, lru) {
1243 int numpages = hpage_nr_pages(page);
1244 lru = page_lru_base_type(page);
1245 if (PageActive(page)) {
1246 lru += LRU_ACTIVE;
1247 ClearPageActive(page);
1248 nr_active += numpages;
1249 }
1250 if (count)
1251 count[lru] += numpages;
1252 }
1253
1254 return nr_active;
1255}
1256
1257/** 1287/**
1258 * isolate_lru_page - tries to isolate a page from its LRU list 1288 * isolate_lru_page - tries to isolate a page from its LRU list
1259 * @page: page to isolate from its LRU list 1289 * @page: page to isolate from its LRU list
@@ -1313,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file,
1313 if (current_is_kswapd()) 1343 if (current_is_kswapd())
1314 return 0; 1344 return 0;
1315 1345
1316 if (!scanning_global_lru(sc)) 1346 if (!global_reclaim(sc))
1317 return 0; 1347 return 0;
1318 1348
1319 if (file) { 1349 if (file) {
@@ -1327,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file,
1327 return isolated > inactive; 1357 return isolated > inactive;
1328} 1358}
1329 1359
1330/*
1331 * TODO: Try merging with migrations version of putback_lru_pages
1332 */
1333static noinline_for_stack void 1360static noinline_for_stack void
1334putback_lru_pages(struct zone *zone, struct scan_control *sc, 1361putback_inactive_pages(struct mem_cgroup_zone *mz,
1335 unsigned long nr_anon, unsigned long nr_file, 1362 struct list_head *page_list)
1336 struct list_head *page_list)
1337{ 1363{
1338 struct page *page; 1364 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1339 struct pagevec pvec; 1365 struct zone *zone = mz->zone;
1340 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1366 LIST_HEAD(pages_to_free);
1341
1342 pagevec_init(&pvec, 1);
1343 1367
1344 /* 1368 /*
1345 * Put back any unfreeable pages. 1369 * Put back any unfreeable pages.
1346 */ 1370 */
1347 spin_lock(&zone->lru_lock);
1348 while (!list_empty(page_list)) { 1371 while (!list_empty(page_list)) {
1372 struct page *page = lru_to_page(page_list);
1349 int lru; 1373 int lru;
1350 page = lru_to_page(page_list); 1374
1351 VM_BUG_ON(PageLRU(page)); 1375 VM_BUG_ON(PageLRU(page));
1352 list_del(&page->lru); 1376 list_del(&page->lru);
1353 if (unlikely(!page_evictable(page, NULL))) { 1377 if (unlikely(!page_evictable(page, NULL))) {
@@ -1364,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1364 int numpages = hpage_nr_pages(page); 1388 int numpages = hpage_nr_pages(page);
1365 reclaim_stat->recent_rotated[file] += numpages; 1389 reclaim_stat->recent_rotated[file] += numpages;
1366 } 1390 }
1367 if (!pagevec_add(&pvec, page)) { 1391 if (put_page_testzero(page)) {
1368 spin_unlock_irq(&zone->lru_lock); 1392 __ClearPageLRU(page);
1369 __pagevec_release(&pvec); 1393 __ClearPageActive(page);
1370 spin_lock_irq(&zone->lru_lock); 1394 del_page_from_lru_list(zone, page, lru);
1395
1396 if (unlikely(PageCompound(page))) {
1397 spin_unlock_irq(&zone->lru_lock);
1398 (*get_compound_page_dtor(page))(page);
1399 spin_lock_irq(&zone->lru_lock);
1400 } else
1401 list_add(&page->lru, &pages_to_free);
1371 } 1402 }
1372 } 1403 }
1373 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1374 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1375 1404
1376 spin_unlock_irq(&zone->lru_lock); 1405 /*
1377 pagevec_release(&pvec); 1406 * To save our caller's stack, now use input list for pages to free.
1407 */
1408 list_splice(&pages_to_free, page_list);
1378} 1409}
1379 1410
1380static noinline_for_stack void update_isolated_counts(struct zone *zone, 1411static noinline_for_stack void
1381 struct scan_control *sc, 1412update_isolated_counts(struct mem_cgroup_zone *mz,
1382 unsigned long *nr_anon, 1413 struct list_head *page_list,
1383 unsigned long *nr_file, 1414 unsigned long *nr_anon,
1384 struct list_head *isolated_list) 1415 unsigned long *nr_file)
1385{ 1416{
1386 unsigned long nr_active; 1417 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1418 struct zone *zone = mz->zone;
1387 unsigned int count[NR_LRU_LISTS] = { 0, }; 1419 unsigned int count[NR_LRU_LISTS] = { 0, };
1388 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1420 unsigned long nr_active = 0;
1421 struct page *page;
1422 int lru;
1423
1424 /*
1425 * Count pages and clear active flags
1426 */
1427 list_for_each_entry(page, page_list, lru) {
1428 int numpages = hpage_nr_pages(page);
1429 lru = page_lru_base_type(page);
1430 if (PageActive(page)) {
1431 lru += LRU_ACTIVE;
1432 ClearPageActive(page);
1433 nr_active += numpages;
1434 }
1435 count[lru] += numpages;
1436 }
1389 1437
1390 nr_active = clear_active_flags(isolated_list, count);
1391 __count_vm_events(PGDEACTIVATE, nr_active); 1438 __count_vm_events(PGDEACTIVATE, nr_active);
1392 1439
1393 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1440 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1401,8 +1448,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1401 1448
1402 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1449 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1403 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1450 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1404 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1405 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1406 1451
1407 reclaim_stat->recent_scanned[0] += *nr_anon; 1452 reclaim_stat->recent_scanned[0] += *nr_anon;
1408 reclaim_stat->recent_scanned[1] += *nr_file; 1453 reclaim_stat->recent_scanned[1] += *nr_file;
@@ -1454,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1454 * of reclaimed pages 1499 * of reclaimed pages
1455 */ 1500 */
1456static noinline_for_stack unsigned long 1501static noinline_for_stack unsigned long
1457shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, 1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1458 struct scan_control *sc, int priority, int file) 1503 struct scan_control *sc, int priority, int file)
1459{ 1504{
1460 LIST_HEAD(page_list); 1505 LIST_HEAD(page_list);
1461 unsigned long nr_scanned; 1506 unsigned long nr_scanned;
@@ -1466,6 +1511,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1466 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1467 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1468 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1514 struct zone *zone = mz->zone;
1469 1515
1470 while (unlikely(too_many_isolated(zone, file, sc))) { 1516 while (unlikely(too_many_isolated(zone, file, sc))) {
1471 congestion_wait(BLK_RW_ASYNC, HZ/10); 1517 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1488,9 +1534,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1488 1534
1489 spin_lock_irq(&zone->lru_lock); 1535 spin_lock_irq(&zone->lru_lock);
1490 1536
1491 if (scanning_global_lru(sc)) { 1537 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
1492 nr_taken = isolate_pages_global(nr_to_scan, &page_list, 1538 &nr_scanned, sc->order,
1493 &nr_scanned, sc->order, reclaim_mode, zone, 0, file); 1539 reclaim_mode, 0, file);
1540 if (global_reclaim(sc)) {
1494 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1495 if (current_is_kswapd()) 1542 if (current_is_kswapd())
1496 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1498,14 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1498 else 1545 else
1499 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1500 nr_scanned); 1547 nr_scanned);
1501 } else {
1502 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1503 &nr_scanned, sc->order, reclaim_mode, zone,
1504 sc->mem_cgroup, 0, file);
1505 /*
1506 * mem_cgroup_isolate_pages() keeps track of
1507 * scanned pages on its own.
1508 */
1509 } 1548 }
1510 1549
1511 if (nr_taken == 0) { 1550 if (nr_taken == 0) {
@@ -1513,26 +1552,37 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1513 return 0; 1552 return 0;
1514 } 1553 }
1515 1554
1516 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); 1555 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1556
1557 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1558 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1517 1559
1518 spin_unlock_irq(&zone->lru_lock); 1560 spin_unlock_irq(&zone->lru_lock);
1519 1561
1520 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, 1562 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1521 &nr_dirty, &nr_writeback); 1563 &nr_dirty, &nr_writeback);
1522 1564
1523 /* Check if we should synchronously wait for writeback */ 1565 /* Check if we should synchronously wait for writeback */
1524 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1566 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1525 set_reclaim_mode(priority, sc, true); 1567 set_reclaim_mode(priority, sc, true);
1526 nr_reclaimed += shrink_page_list(&page_list, zone, sc, 1568 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1527 priority, &nr_dirty, &nr_writeback); 1569 priority, &nr_dirty, &nr_writeback);
1528 } 1570 }
1529 1571
1530 local_irq_disable(); 1572 spin_lock_irq(&zone->lru_lock);
1573
1531 if (current_is_kswapd()) 1574 if (current_is_kswapd())
1532 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1575 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1533 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1576 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1534 1577
1535 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1578 putback_inactive_pages(mz, &page_list);
1579
1580 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1581 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1582
1583 spin_unlock_irq(&zone->lru_lock);
1584
1585 free_hot_cold_page_list(&page_list, 1);
1536 1586
1537 /* 1587 /*
1538 * If reclaim is isolating dirty pages under writeback, it implies 1588 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1588,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1588 1638
1589static void move_active_pages_to_lru(struct zone *zone, 1639static void move_active_pages_to_lru(struct zone *zone,
1590 struct list_head *list, 1640 struct list_head *list,
1641 struct list_head *pages_to_free,
1591 enum lru_list lru) 1642 enum lru_list lru)
1592{ 1643{
1593 unsigned long pgmoved = 0; 1644 unsigned long pgmoved = 0;
1594 struct pagevec pvec;
1595 struct page *page; 1645 struct page *page;
1596 1646
1597 pagevec_init(&pvec, 1); 1647 if (buffer_heads_over_limit) {
1648 spin_unlock_irq(&zone->lru_lock);
1649 list_for_each_entry(page, list, lru) {
1650 if (page_has_private(page) && trylock_page(page)) {
1651 if (page_has_private(page))
1652 try_to_release_page(page, 0);
1653 unlock_page(page);
1654 }
1655 }
1656 spin_lock_irq(&zone->lru_lock);
1657 }
1598 1658
1599 while (!list_empty(list)) { 1659 while (!list_empty(list)) {
1660 struct lruvec *lruvec;
1661
1600 page = lru_to_page(list); 1662 page = lru_to_page(list);
1601 1663
1602 VM_BUG_ON(PageLRU(page)); 1664 VM_BUG_ON(PageLRU(page));
1603 SetPageLRU(page); 1665 SetPageLRU(page);
1604 1666
1605 list_move(&page->lru, &zone->lru[lru].list); 1667 lruvec = mem_cgroup_lru_add_list(zone, page, lru);
1606 mem_cgroup_add_lru_list(page, lru); 1668 list_move(&page->lru, &lruvec->lists[lru]);
1607 pgmoved += hpage_nr_pages(page); 1669 pgmoved += hpage_nr_pages(page);
1608 1670
1609 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1671 if (put_page_testzero(page)) {
1610 spin_unlock_irq(&zone->lru_lock); 1672 __ClearPageLRU(page);
1611 if (buffer_heads_over_limit) 1673 __ClearPageActive(page);
1612 pagevec_strip(&pvec); 1674 del_page_from_lru_list(zone, page, lru);
1613 __pagevec_release(&pvec); 1675
1614 spin_lock_irq(&zone->lru_lock); 1676 if (unlikely(PageCompound(page))) {
1677 spin_unlock_irq(&zone->lru_lock);
1678 (*get_compound_page_dtor(page))(page);
1679 spin_lock_irq(&zone->lru_lock);
1680 } else
1681 list_add(&page->lru, pages_to_free);
1615 } 1682 }
1616 } 1683 }
1617 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1684 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1619,19 +1686,22 @@ static void move_active_pages_to_lru(struct zone *zone,
1619 __count_vm_events(PGDEACTIVATE, pgmoved); 1686 __count_vm_events(PGDEACTIVATE, pgmoved);
1620} 1687}
1621 1688
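
The hunk above changes putback so that each page's isolation reference is dropped as it is linked back, and pages whose reference count reaches zero are collected on pages_to_free instead of returning to the LRU. A minimal userspace sketch of that bookkeeping, using invented types in place of struct page and the kernel list API:

/*
 * Every isolated entry holds one extra reference; when that reference
 * is dropped and the count hits zero the entry is queued for freeing
 * rather than linked back onto the LRU.  Illustration only.
 */
#include <stdio.h>

struct item {
	int refcount;           /* 1 == only the isolation reference left */
	struct item *next;
};

static struct item *push(struct item *head, struct item *it)
{
	it->next = head;
	return it;
}

int main(void)
{
	struct item pool[4] = { {2}, {1}, {3}, {1} };
	struct item *isolated = NULL, *lru = NULL, *to_free = NULL;
	int nr_lru = 0, nr_free = 0, i;

	for (i = 0; i < 4; i++)
		isolated = push(isolated, &pool[i]);

	while (isolated) {
		struct item *it = isolated;

		isolated = it->next;
		if (--it->refcount == 0) {      /* put_page_testzero() analogue */
			to_free = push(to_free, it);
			nr_free++;
		} else {
			lru = push(lru, it);    /* back onto the LRU list */
			nr_lru++;
		}
	}
	printf("%d back on the LRU, %d queued for freeing\n", nr_lru, nr_free);
	return 0;
}
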
1622static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1689static void shrink_active_list(unsigned long nr_to_scan,
1623 struct scan_control *sc, int priority, int file) 1690 struct mem_cgroup_zone *mz,
1691 struct scan_control *sc,
1692 int priority, int file)
1624{ 1693{
1625 unsigned long nr_taken; 1694 unsigned long nr_taken;
1626 unsigned long pgscanned; 1695 unsigned long nr_scanned;
1627 unsigned long vm_flags; 1696 unsigned long vm_flags;
1628 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1697 LIST_HEAD(l_hold); /* The pages which were snipped off */
1629 LIST_HEAD(l_active); 1698 LIST_HEAD(l_active);
1630 LIST_HEAD(l_inactive); 1699 LIST_HEAD(l_inactive);
1631 struct page *page; 1700 struct page *page;
1632 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1701 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1633 unsigned long nr_rotated = 0; 1702 unsigned long nr_rotated = 0;
1634 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1703 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1704 struct zone *zone = mz->zone;
1635 1705
1636 lru_add_drain(); 1706 lru_add_drain();
1637 1707
@@ -1641,26 +1711,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1641 reclaim_mode |= ISOLATE_CLEAN; 1711 reclaim_mode |= ISOLATE_CLEAN;
1642 1712
1643 spin_lock_irq(&zone->lru_lock); 1713 spin_lock_irq(&zone->lru_lock);
1644 if (scanning_global_lru(sc)) { 1714
1645 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1715 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
1646 &pgscanned, sc->order, 1716 &nr_scanned, sc->order,
1647 reclaim_mode, zone, 1717 reclaim_mode, 1, file);
1648 1, file); 1718 if (global_reclaim(sc))
1649 zone->pages_scanned += pgscanned; 1719 zone->pages_scanned += nr_scanned;
1650 } else {
1651 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1652 &pgscanned, sc->order,
1653 reclaim_mode, zone,
1654 sc->mem_cgroup, 1, file);
1655 /*
1656 * mem_cgroup_isolate_pages() keeps track of
1657 * scanned pages on its own.
1658 */
1659 }
1660 1720
1661 reclaim_stat->recent_scanned[file] += nr_taken; 1721 reclaim_stat->recent_scanned[file] += nr_taken;
1662 1722
1663 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1723 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1664 if (file) 1724 if (file)
1665 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); 1725 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1666 else 1726 else
@@ -1678,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1678 continue; 1738 continue;
1679 } 1739 }
1680 1740
1681 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1741 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1682 nr_rotated += hpage_nr_pages(page); 1742 nr_rotated += hpage_nr_pages(page);
1683 /* 1743 /*
1684 * Identify referenced, file-backed active pages and 1744 * Identify referenced, file-backed active pages and
@@ -1711,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1711 */ 1771 */
1712 reclaim_stat->recent_rotated[file] += nr_rotated; 1772 reclaim_stat->recent_rotated[file] += nr_rotated;
1713 1773
1714 move_active_pages_to_lru(zone, &l_active, 1774 move_active_pages_to_lru(zone, &l_active, &l_hold,
1715 LRU_ACTIVE + file * LRU_FILE); 1775 LRU_ACTIVE + file * LRU_FILE);
1716 move_active_pages_to_lru(zone, &l_inactive, 1776 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1717 LRU_BASE + file * LRU_FILE); 1777 LRU_BASE + file * LRU_FILE);
1718 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1778 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1719 spin_unlock_irq(&zone->lru_lock); 1779 spin_unlock_irq(&zone->lru_lock);
1780
1781 free_hot_cold_page_list(&l_hold, 1);
1720} 1782}
1721 1783
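
The active-list shrinker above refiles pages under LRU_ACTIVE + file * LRU_FILE or LRU_BASE + file * LRU_FILE. A small standalone sketch of that index arithmetic; the enum values mirror the kernel's layout but are reproduced here only for illustration:

#include <stdio.h>

enum { LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2 };
enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	NR_EVICTABLE_LRU_LISTS
};

int main(void)
{
	int file, active;

	for (file = 0; file <= 1; file++)
		for (active = 0; active <= 1; active++) {
			/* the expression used by move_active_pages_to_lru callers */
			int lru = (active ? LRU_ACTIVE : LRU_BASE) + file * LRU_FILE;

			printf("file=%d active=%d -> lru index %d\n",
			       file, active, lru);
		}
	return 0;
}
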
1722#ifdef CONFIG_SWAP 1784#ifdef CONFIG_SWAP
@@ -1741,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
1741 * Returns true if the zone does not have enough inactive anon pages, 1803 * Returns true if the zone does not have enough inactive anon pages,
1742 * meaning some active anon pages need to be deactivated. 1804 * meaning some active anon pages need to be deactivated.
1743 */ 1805 */
1744static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) 1806static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1745{ 1807{
1746 int low;
1747
1748 /* 1808 /*
1749 * If we don't have swap space, anonymous page deactivation 1809 * If we don't have swap space, anonymous page deactivation
1750 * is pointless. 1810 * is pointless.
@@ -1752,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1752 if (!total_swap_pages) 1812 if (!total_swap_pages)
1753 return 0; 1813 return 0;
1754 1814
1755 if (scanning_global_lru(sc)) 1815 if (!scanning_global_lru(mz))
1756 low = inactive_anon_is_low_global(zone); 1816 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1757 else 1817 mz->zone);
1758 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); 1818
1759 return low; 1819 return inactive_anon_is_low_global(mz->zone);
1760} 1820}
1761#else 1821#else
1762static inline int inactive_anon_is_low(struct zone *zone, 1822static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1763 struct scan_control *sc)
1764{ 1823{
1765 return 0; 1824 return 0;
1766} 1825}
@@ -1778,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1778 1837
1779/** 1838/**
1780 * inactive_file_is_low - check if file pages need to be deactivated 1839 * inactive_file_is_low - check if file pages need to be deactivated
1781 * @zone: zone to check 1840 * @mz: memory cgroup and zone to check
1782 * @sc: scan control of this context
1783 * 1841 *
1784 * When the system is doing streaming IO, memory pressure here 1842 * When the system is doing streaming IO, memory pressure here
1785 * ensures that active file pages get deactivated, until more 1843 * ensures that active file pages get deactivated, until more
@@ -1791,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone)
1791 * This uses a different ratio than the anonymous pages, because 1849 * This uses a different ratio than the anonymous pages, because
1792 * the page cache uses a use-once replacement algorithm. 1850 * the page cache uses a use-once replacement algorithm.
1793 */ 1851 */
1794static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) 1852static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1795{ 1853{
1796 int low; 1854 if (!scanning_global_lru(mz))
1855 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1856 mz->zone);
1797 1857
1798 if (scanning_global_lru(sc)) 1858 return inactive_file_is_low_global(mz->zone);
1799 low = inactive_file_is_low_global(zone);
1800 else
1801 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1802 return low;
1803} 1859}
1804 1860
1805static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, 1861static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
1806 int file)
1807{ 1862{
1808 if (file) 1863 if (file)
1809 return inactive_file_is_low(zone, sc); 1864 return inactive_file_is_low(mz);
1810 else 1865 else
1811 return inactive_anon_is_low(zone, sc); 1866 return inactive_anon_is_low(mz);
1812} 1867}
1813 1868
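
inactive_list_is_low() above only dispatches on the list type; the global helpers it falls back to are outside this hunk, so the formulas in the following sketch (a plain active/inactive comparison for file pages, a zone-size-derived ratio for anon) are an approximation for illustration, not the kernel's exact code:

#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

static int inactive_file_low(unsigned long active, unsigned long inactive)
{
	/* page cache is use-once: deactivate whenever active outgrows inactive */
	return active > inactive;
}

static int inactive_anon_low(unsigned long active, unsigned long inactive,
			     unsigned long zone_pages)
{
	/* roughly sqrt(10 * zone size in GB), never less than 1 (assumed) */
	unsigned long gb = zone_pages >> (30 - 12);    /* 4K pages */
	unsigned long ratio = int_sqrt(10 * gb);

	if (!ratio)
		ratio = 1;
	return inactive * ratio < active;
}

int main(void)
{
	unsigned long zone_pages = 4UL << (30 - 12);   /* a 4GB zone */

	printf("file list low: %d\n", inactive_file_low(3000, 2000));
	printf("anon list low: %d\n", inactive_anon_low(300000, 40000, zone_pages));
	return 0;
}
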
1814static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1869static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1815 struct zone *zone, struct scan_control *sc, int priority) 1870 struct mem_cgroup_zone *mz,
1871 struct scan_control *sc, int priority)
1816{ 1872{
1817 int file = is_file_lru(lru); 1873 int file = is_file_lru(lru);
1818 1874
1819 if (is_active_lru(lru)) { 1875 if (is_active_lru(lru)) {
1820 if (inactive_list_is_low(zone, sc, file)) 1876 if (inactive_list_is_low(mz, file))
1821 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1877 shrink_active_list(nr_to_scan, mz, sc, priority, file);
1822 return 0; 1878 return 0;
1823 } 1879 }
1824 1880
1825 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1881 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
1826} 1882}
1827 1883
1828static int vmscan_swappiness(struct scan_control *sc) 1884static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1885 struct scan_control *sc)
1829{ 1886{
1830 if (scanning_global_lru(sc)) 1887 if (global_reclaim(sc))
1831 return vm_swappiness; 1888 return vm_swappiness;
1832 return mem_cgroup_swappiness(sc->mem_cgroup); 1889 return mem_cgroup_swappiness(mz->mem_cgroup);
1833} 1890}
1834 1891
1835/* 1892/*
@@ -1840,15 +1897,15 @@ static int vmscan_swappiness(struct scan_control *sc)
1840 * 1897 *
1841 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1898 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1842 */ 1899 */
1843static void get_scan_count(struct zone *zone, struct scan_control *sc, 1900static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1844 unsigned long *nr, int priority) 1901 unsigned long *nr, int priority)
1845{ 1902{
1846 unsigned long anon, file, free; 1903 unsigned long anon, file, free;
1847 unsigned long anon_prio, file_prio; 1904 unsigned long anon_prio, file_prio;
1848 unsigned long ap, fp; 1905 unsigned long ap, fp;
1849 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1906 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1850 u64 fraction[2], denominator; 1907 u64 fraction[2], denominator;
1851 enum lru_list l; 1908 enum lru_list lru;
1852 int noswap = 0; 1909 int noswap = 0;
1853 bool force_scan = false; 1910 bool force_scan = false;
1854 1911
@@ -1862,9 +1919,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1862 * latencies, so it's better to scan a minimum amount there as 1919 * latencies, so it's better to scan a minimum amount there as
1863 * well. 1920 * well.
1864 */ 1921 */
1865 if (scanning_global_lru(sc) && current_is_kswapd()) 1922 if (current_is_kswapd() && mz->zone->all_unreclaimable)
1866 force_scan = true; 1923 force_scan = true;
1867 if (!scanning_global_lru(sc)) 1924 if (!global_reclaim(sc))
1868 force_scan = true; 1925 force_scan = true;
1869 1926
1870 /* If we have no swap space, do not bother scanning anon pages. */ 1927 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1876,16 +1933,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1876 goto out; 1933 goto out;
1877 } 1934 }
1878 1935
1879 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1936 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
1880 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1937 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
1881 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1938 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
1882 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1939 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
1883 1940
1884 if (scanning_global_lru(sc)) { 1941 if (global_reclaim(sc)) {
1885 free = zone_page_state(zone, NR_FREE_PAGES); 1942 free = zone_page_state(mz->zone, NR_FREE_PAGES);
1886 /* If we have very few page cache pages, 1943 /* If we have very few page cache pages,
1887 force-scan anon pages. */ 1944 force-scan anon pages. */
1888 if (unlikely(file + free <= high_wmark_pages(zone))) { 1945 if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
1889 fraction[0] = 1; 1946 fraction[0] = 1;
1890 fraction[1] = 0; 1947 fraction[1] = 0;
1891 denominator = 1; 1948 denominator = 1;
@@ -1897,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1897 * With swappiness at 100, anonymous and file have the same priority. 1954 * With swappiness at 100, anonymous and file have the same priority.
1898 * This scanning priority is essentially the inverse of IO cost. 1955 * This scanning priority is essentially the inverse of IO cost.
1899 */ 1956 */
1900 anon_prio = vmscan_swappiness(sc); 1957 anon_prio = vmscan_swappiness(mz, sc);
1901 file_prio = 200 - vmscan_swappiness(sc); 1958 file_prio = 200 - vmscan_swappiness(mz, sc);
1902 1959
1903 /* 1960 /*
1904 * OK, so we have swap space and a fair amount of page cache 1961 * OK, so we have swap space and a fair amount of page cache
@@ -1911,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1911 * 1968 *
1912 * anon in [0], file in [1] 1969 * anon in [0], file in [1]
1913 */ 1970 */
1914 spin_lock_irq(&zone->lru_lock); 1971 spin_lock_irq(&mz->zone->lru_lock);
1915 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1972 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1916 reclaim_stat->recent_scanned[0] /= 2; 1973 reclaim_stat->recent_scanned[0] /= 2;
1917 reclaim_stat->recent_rotated[0] /= 2; 1974 reclaim_stat->recent_rotated[0] /= 2;
@@ -1932,24 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1932 1989
1933 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1990 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1934 fp /= reclaim_stat->recent_rotated[1] + 1; 1991 fp /= reclaim_stat->recent_rotated[1] + 1;
1935 spin_unlock_irq(&zone->lru_lock); 1992 spin_unlock_irq(&mz->zone->lru_lock);
1936 1993
1937 fraction[0] = ap; 1994 fraction[0] = ap;
1938 fraction[1] = fp; 1995 fraction[1] = fp;
1939 denominator = ap + fp + 1; 1996 denominator = ap + fp + 1;
1940out: 1997out:
1941 for_each_evictable_lru(l) { 1998 for_each_evictable_lru(lru) {
1942 int file = is_file_lru(l); 1999 int file = is_file_lru(lru);
1943 unsigned long scan; 2000 unsigned long scan;
1944 2001
1945 scan = zone_nr_lru_pages(zone, sc, l); 2002 scan = zone_nr_lru_pages(mz, lru);
1946 if (priority || noswap) { 2003 if (priority || noswap) {
1947 scan >>= priority; 2004 scan >>= priority;
1948 if (!scan && force_scan) 2005 if (!scan && force_scan)
1949 scan = SWAP_CLUSTER_MAX; 2006 scan = SWAP_CLUSTER_MAX;
1950 scan = div64_u64(scan * fraction[file], denominator); 2007 scan = div64_u64(scan * fraction[file], denominator);
1951 } 2008 }
1952 nr[l] = scan; 2009 nr[lru] = scan;
1953 } 2010 }
1954} 2011}
1955 2012
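
get_scan_count() above splits the scan target between anon and file using swappiness and the recently scanned/rotated counters. A worked example of that arithmetic with made-up numbers; the anon-side ap computation falls outside this hunk and is assumed symmetric to fp:

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;
	unsigned long anon_prio = swappiness;            /* 60  */
	unsigned long file_prio = 200 - swappiness;      /* 140 */

	/* reclaim_stat samples: [0] = anon, [1] = file */
	unsigned long recent_scanned[2] = { 4000, 12000 };
	unsigned long recent_rotated[2] = { 3000,  1000 };

	unsigned long long ap = (unsigned long long)(anon_prio + 1) *
				(recent_scanned[0] + 1) / (recent_rotated[0] + 1);
	unsigned long long fp = (unsigned long long)(file_prio + 1) *
				(recent_scanned[1] + 1) / (recent_rotated[1] + 1);
	unsigned long long denominator = ap + fp + 1;

	unsigned long lru_size = 1 << 20;                /* 1M pages on this LRU */
	int priority = 10;
	unsigned long scan = lru_size >> priority;       /* 1024 pages */

	scan = scan * fp / denominator;                  /* file share of the batch */

	printf("ap=%llu fp=%llu denom=%llu -> scan %lu file pages\n",
	       ap, fp, denominator, scan);
	return 0;
}
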
@@ -1960,7 +2017,7 @@ out:
1960 * back to the allocator and call try_to_compact_zone(), we ensure that 2017 * back to the allocator and call try_to_compact_zone(), we ensure that
1961 * there are enough free pages for it to be likely successful 2018 * there are enough free pages for it to be likely successful
1962 */ 2019 */
1963static inline bool should_continue_reclaim(struct zone *zone, 2020static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
1964 unsigned long nr_reclaimed, 2021 unsigned long nr_reclaimed,
1965 unsigned long nr_scanned, 2022 unsigned long nr_scanned,
1966 struct scan_control *sc) 2023 struct scan_control *sc)
@@ -2000,15 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
2000 * inactive lists are large enough, continue reclaiming 2057 * inactive lists are large enough, continue reclaiming
2001 */ 2058 */
2002 pages_for_compaction = (2UL << sc->order); 2059 pages_for_compaction = (2UL << sc->order);
2003 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2060 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
2004 if (nr_swap_pages > 0) 2061 if (nr_swap_pages > 0)
2005 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 2062 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
2006 if (sc->nr_reclaimed < pages_for_compaction && 2063 if (sc->nr_reclaimed < pages_for_compaction &&
2007 inactive_lru_pages > pages_for_compaction) 2064 inactive_lru_pages > pages_for_compaction)
2008 return true; 2065 return true;
2009 2066
2010 /* If compaction would go ahead or the allocation would succeed, stop */ 2067 /* If compaction would go ahead or the allocation would succeed, stop */
2011 switch (compaction_suitable(zone, sc->order)) { 2068 switch (compaction_suitable(mz->zone, sc->order)) {
2012 case COMPACT_PARTIAL: 2069 case COMPACT_PARTIAL:
2013 case COMPACT_CONTINUE: 2070 case COMPACT_CONTINUE:
2014 return false; 2071 return false;
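
should_continue_reclaim() above keeps reclaim going until roughly 2 << order pages have been reclaimed, provided the inactive lists could still supply them. A few lines of plain arithmetic showing those thresholds for an order-9 request; no kernel structures are modeled:

#include <stdio.h>

int main(void)
{
	int order = 9;
	unsigned long pages_for_compaction = 2UL << order;   /* 1024 pages */
	unsigned long nr_reclaimed = 300;
	unsigned long inactive_lru_pages = 50000;

	int keep_going = nr_reclaimed < pages_for_compaction &&
			 inactive_lru_pages > pages_for_compaction;

	printf("pages_for_compaction=%lu -> continue reclaim: %s\n",
	       pages_for_compaction, keep_going ? "yes" : "no");
	return 0;
}
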
@@ -2020,12 +2077,12 @@ static inline bool should_continue_reclaim(struct zone *zone,
2020/* 2077/*
2021 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2078 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2022 */ 2079 */
2023static void shrink_zone(int priority, struct zone *zone, 2080static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2024 struct scan_control *sc) 2081 struct scan_control *sc)
2025{ 2082{
2026 unsigned long nr[NR_LRU_LISTS]; 2083 unsigned long nr[NR_LRU_LISTS];
2027 unsigned long nr_to_scan; 2084 unsigned long nr_to_scan;
2028 enum lru_list l; 2085 enum lru_list lru;
2029 unsigned long nr_reclaimed, nr_scanned; 2086 unsigned long nr_reclaimed, nr_scanned;
2030 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2087 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2031 struct blk_plug plug; 2088 struct blk_plug plug;
@@ -2033,19 +2090,19 @@ static void shrink_zone(int priority, struct zone *zone,
2033restart: 2090restart:
2034 nr_reclaimed = 0; 2091 nr_reclaimed = 0;
2035 nr_scanned = sc->nr_scanned; 2092 nr_scanned = sc->nr_scanned;
2036 get_scan_count(zone, sc, nr, priority); 2093 get_scan_count(mz, sc, nr, priority);
2037 2094
2038 blk_start_plug(&plug); 2095 blk_start_plug(&plug);
2039 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2096 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2040 nr[LRU_INACTIVE_FILE]) { 2097 nr[LRU_INACTIVE_FILE]) {
2041 for_each_evictable_lru(l) { 2098 for_each_evictable_lru(lru) {
2042 if (nr[l]) { 2099 if (nr[lru]) {
2043 nr_to_scan = min_t(unsigned long, 2100 nr_to_scan = min_t(unsigned long,
2044 nr[l], SWAP_CLUSTER_MAX); 2101 nr[lru], SWAP_CLUSTER_MAX);
2045 nr[l] -= nr_to_scan; 2102 nr[lru] -= nr_to_scan;
2046 2103
2047 nr_reclaimed += shrink_list(l, nr_to_scan, 2104 nr_reclaimed += shrink_list(lru, nr_to_scan,
2048 zone, sc, priority); 2105 mz, sc, priority);
2049 } 2106 }
2050 } 2107 }
2051 /* 2108 /*
@@ -2066,17 +2123,89 @@ restart:
2066 * Even if we did not try to evict anon pages at all, we want to 2123 * Even if we did not try to evict anon pages at all, we want to
2067 * rebalance the anon lru active/inactive ratio. 2124 * rebalance the anon lru active/inactive ratio.
2068 */ 2125 */
2069 if (inactive_anon_is_low(zone, sc)) 2126 if (inactive_anon_is_low(mz))
2070 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 2127 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
2071 2128
2072 /* reclaim/compaction might need reclaim to continue */ 2129 /* reclaim/compaction might need reclaim to continue */
2073 if (should_continue_reclaim(zone, nr_reclaimed, 2130 if (should_continue_reclaim(mz, nr_reclaimed,
2074 sc->nr_scanned - nr_scanned, sc)) 2131 sc->nr_scanned - nr_scanned, sc))
2075 goto restart; 2132 goto restart;
2076 2133
2077 throttle_vm_writeout(sc->gfp_mask); 2134 throttle_vm_writeout(sc->gfp_mask);
2078} 2135}
2079 2136
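
The restart loop above drains the per-LRU scan targets in SWAP_CLUSTER_MAX-sized slices, round-robin across the evictable lists. A self-contained sketch of that batching, with a counter standing in for shrink_list():

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

enum { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, NR_LISTS };

int main(void)
{
	unsigned long nr[NR_LISTS] = { 100, 0, 250, 40 };
	unsigned long nr_reclaimed = 0;
	int lru;

	while (nr[INACTIVE_ANON] || nr[ACTIVE_FILE] || nr[INACTIVE_FILE]) {
		for (lru = 0; lru < NR_LISTS; lru++) {
			if (nr[lru]) {
				unsigned long nr_to_scan =
					nr[lru] < SWAP_CLUSTER_MAX ?
					nr[lru] : SWAP_CLUSTER_MAX;

				nr[lru] -= nr_to_scan;
				nr_reclaimed += nr_to_scan;   /* stand-in for shrink_list() */
			}
		}
	}
	printf("scanned %lu pages in slices of %lu\n",
	       nr_reclaimed, SWAP_CLUSTER_MAX);
	return 0;
}
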
2137static void shrink_zone(int priority, struct zone *zone,
2138 struct scan_control *sc)
2139{
2140 struct mem_cgroup *root = sc->target_mem_cgroup;
2141 struct mem_cgroup_reclaim_cookie reclaim = {
2142 .zone = zone,
2143 .priority = priority,
2144 };
2145 struct mem_cgroup *memcg;
2146
2147 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2148 do {
2149 struct mem_cgroup_zone mz = {
2150 .mem_cgroup = memcg,
2151 .zone = zone,
2152 };
2153
2154 shrink_mem_cgroup_zone(priority, &mz, sc);
2155 /*
2156 * Limit reclaim has historically picked one memcg and
2157 * scanned it with decreasing priority levels until
2158 * nr_to_reclaim had been reclaimed. This priority
2159 * cycle is thus over after a single memcg.
2160 *
2161 * Direct reclaim and kswapd, on the other hand, have
2162 * to scan all memory cgroups to fulfill the overall
2163 * scan target for the zone.
2164 */
2165 if (!global_reclaim(sc)) {
2166 mem_cgroup_iter_break(root, memcg);
2167 break;
2168 }
2169 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2170 } while (memcg);
2171}
2172
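
The new shrink_zone() above walks the memcg hierarchy, stopping after one memcg per priority cycle for limit reclaim but covering all of them for global reclaim. A sketch of that policy over a faked flat hierarchy; mem_cgroup_iter() and its shared reclaim cookie are not modeled:

#include <stdio.h>
#include <stdbool.h>

#define NR_MEMCGS 4

static void shrink_one(int memcg, int priority)
{
	printf("priority %d: shrinking memcg %d\n", priority, memcg);
}

static void shrink_zone_sketch(int priority, bool global_reclaim)
{
	int memcg;

	for (memcg = 0; memcg < NR_MEMCGS; memcg++) {
		shrink_one(memcg, priority);
		if (!global_reclaim)     /* limit reclaim: one memcg per cycle */
			break;
	}
}

int main(void)
{
	puts("global reclaim:");
	shrink_zone_sketch(12, true);
	puts("limit reclaim:");
	shrink_zone_sketch(12, false);
	return 0;
}
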
2173/* Returns true if compaction should go ahead for a high-order request */
2174static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2175{
2176 unsigned long balance_gap, watermark;
2177 bool watermark_ok;
2178
2179 /* Do not consider compaction for orders reclaim is meant to satisfy */
2180 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2181 return false;
2182
2183 /*
2184 * Compaction takes time to run and there are potentially other
2185 * callers using the pages just freed. Continue reclaiming until
2186 * there is a buffer of free pages available to give compaction
2187 * a reasonable chance of completing and allocating the page
2188 */
2189 balance_gap = min(low_wmark_pages(zone),
2190 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2191 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2192 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2193 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2194
2195 /*
2196 * If compaction is deferred, reclaim up to a point where
2197 * compaction will have a chance of success when re-enabled
2198 */
2199 if (compaction_deferred(zone))
2200 return watermark_ok;
2201
2202 /* If compaction is not ready to start, keep reclaiming */
2203 if (!compaction_suitable(zone, sc->order))
2204 return false;
2205
2206 return watermark_ok;
2207}
2208
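
compaction_ready() above decides whether reclaim may stop in favour of compaction. A worked example of its watermark arithmetic with invented page counts, assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 and letting a plain comparison stand in for zone_watermark_ok_safe():

#include <stdio.h>

#define BALANCE_GAP_RATIO 100UL   /* assumed value of KSWAPD_ZONE_BALANCE_GAP_RATIO */

int main(void)
{
	unsigned long present_pages = 262144;   /* 1GB zone of 4K pages */
	unsigned long low_wmark = 1500;
	unsigned long high_wmark = 2000;
	unsigned long free_pages = 6000;
	int order = 9;                          /* THP-sized request */

	unsigned long gap = (present_pages + BALANCE_GAP_RATIO - 1) /
			    BALANCE_GAP_RATIO;
	unsigned long balance_gap = gap < low_wmark ? gap : low_wmark;
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	printf("balance_gap=%lu watermark=%lu -> watermark_ok=%s\n",
	       balance_gap, watermark,
	       free_pages >= watermark ? "yes" : "no");
	return 0;
}
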
2080/* 2209/*
2081 * This is the direct reclaim path, for page-allocating processes. We only 2210 * This is the direct reclaim path, for page-allocating processes. We only
2082 * try to reclaim pages from zones which will satisfy the caller's allocation 2211 * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2094,8 +2223,9 @@ restart:
2094 * scan then give up on it. 2223 * scan then give up on it.
2095 * 2224 *
2096 * This function returns true if a zone is being reclaimed for a costly 2225 * This function returns true if a zone is being reclaimed for a costly
2097 * high-order allocation and compaction is either ready to begin or deferred. 2226 * high-order allocation and compaction is ready to begin. This indicates to
2098 * This indicates to the caller that it should retry the allocation or fail. 2227 * the caller that it should consider retrying the allocation instead of
2228 * further reclaim.
2099 */ 2229 */
2100static bool shrink_zones(int priority, struct zonelist *zonelist, 2230static bool shrink_zones(int priority, struct zonelist *zonelist,
2101 struct scan_control *sc) 2231 struct scan_control *sc)
@@ -2104,7 +2234,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2104 struct zone *zone; 2234 struct zone *zone;
2105 unsigned long nr_soft_reclaimed; 2235 unsigned long nr_soft_reclaimed;
2106 unsigned long nr_soft_scanned; 2236 unsigned long nr_soft_scanned;
2107 bool should_abort_reclaim = false; 2237 bool aborted_reclaim = false;
2108 2238
2109 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2239 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2110 gfp_zone(sc->gfp_mask), sc->nodemask) { 2240 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2114,7 +2244,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2114 * Take care memory controller reclaiming has small influence 2244 * Take care memory controller reclaiming has small influence
2115 * to global LRU. 2245 * to global LRU.
2116 */ 2246 */
2117 if (scanning_global_lru(sc)) { 2247 if (global_reclaim(sc)) {
2118 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2248 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2119 continue; 2249 continue;
2120 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2250 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -2129,10 +2259,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2129 * noticeable problem, like transparent huge page 2259 * noticeable problem, like transparent huge page
2130 * allocations. 2260 * allocations.
2131 */ 2261 */
2132 if (sc->order > PAGE_ALLOC_COSTLY_ORDER && 2262 if (compaction_ready(zone, sc)) {
2133 (compaction_suitable(zone, sc->order) || 2263 aborted_reclaim = true;
2134 compaction_deferred(zone))) {
2135 should_abort_reclaim = true;
2136 continue; 2264 continue;
2137 } 2265 }
2138 } 2266 }
@@ -2154,7 +2282,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2154 shrink_zone(priority, zone, sc); 2282 shrink_zone(priority, zone, sc);
2155 } 2283 }
2156 2284
2157 return should_abort_reclaim; 2285 return aborted_reclaim;
2158} 2286}
2159 2287
2160static bool zone_reclaimable(struct zone *zone) 2288static bool zone_reclaimable(struct zone *zone)
@@ -2208,25 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2208 struct zoneref *z; 2336 struct zoneref *z;
2209 struct zone *zone; 2337 struct zone *zone;
2210 unsigned long writeback_threshold; 2338 unsigned long writeback_threshold;
2339 bool aborted_reclaim;
2211 2340
2212 get_mems_allowed(); 2341 get_mems_allowed();
2213 delayacct_freepages_start(); 2342 delayacct_freepages_start();
2214 2343
2215 if (scanning_global_lru(sc)) 2344 if (global_reclaim(sc))
2216 count_vm_event(ALLOCSTALL); 2345 count_vm_event(ALLOCSTALL);
2217 2346
2218 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2347 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2219 sc->nr_scanned = 0; 2348 sc->nr_scanned = 0;
2220 if (!priority) 2349 if (!priority)
2221 disable_swap_token(sc->mem_cgroup); 2350 disable_swap_token(sc->target_mem_cgroup);
2222 if (shrink_zones(priority, zonelist, sc)) 2351 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2223 break;
2224 2352
2225 /* 2353 /*
2226 * Don't shrink slabs when reclaiming memory from 2354 * Don't shrink slabs when reclaiming memory from
2227 * over limit cgroups 2355 * over limit cgroups
2228 */ 2356 */
2229 if (scanning_global_lru(sc)) { 2357 if (global_reclaim(sc)) {
2230 unsigned long lru_pages = 0; 2358 unsigned long lru_pages = 0;
2231 for_each_zone_zonelist(zone, z, zonelist, 2359 for_each_zone_zonelist(zone, z, zonelist,
2232 gfp_zone(sc->gfp_mask)) { 2360 gfp_zone(sc->gfp_mask)) {
@@ -2287,8 +2415,12 @@ out:
2287 if (oom_killer_disabled) 2415 if (oom_killer_disabled)
2288 return 0; 2416 return 0;
2289 2417
2418 /* Aborted reclaim to try compaction? don't OOM, then */
2419 if (aborted_reclaim)
2420 return 1;
2421
2290 /* top priority shrink_zones still had more to do? don't OOM, then */ 2422 /* top priority shrink_zones still had more to do? don't OOM, then */
2291 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2423 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2292 return 1; 2424 return 1;
2293 2425
2294 return 0; 2426 return 0;
@@ -2305,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2305 .may_unmap = 1, 2437 .may_unmap = 1,
2306 .may_swap = 1, 2438 .may_swap = 1,
2307 .order = order, 2439 .order = order,
2308 .mem_cgroup = NULL, 2440 .target_mem_cgroup = NULL,
2309 .nodemask = nodemask, 2441 .nodemask = nodemask,
2310 }; 2442 };
2311 struct shrink_control shrink = { 2443 struct shrink_control shrink = {
@@ -2325,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2325 2457
2326#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2458#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2327 2459
2328unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2460unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2329 gfp_t gfp_mask, bool noswap, 2461 gfp_t gfp_mask, bool noswap,
2330 struct zone *zone, 2462 struct zone *zone,
2331 unsigned long *nr_scanned) 2463 unsigned long *nr_scanned)
@@ -2337,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2337 .may_unmap = 1, 2469 .may_unmap = 1,
2338 .may_swap = !noswap, 2470 .may_swap = !noswap,
2339 .order = 0, 2471 .order = 0,
2340 .mem_cgroup = mem, 2472 .target_mem_cgroup = memcg,
2473 };
2474 struct mem_cgroup_zone mz = {
2475 .mem_cgroup = memcg,
2476 .zone = zone,
2341 }; 2477 };
2342 2478
2343 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2479 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2354,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2354 * will pick up pages from other mem cgroup's as well. We hack 2490 * will pick up pages from other mem cgroup's as well. We hack
2355 * the priority and make it zero. 2491 * the priority and make it zero.
2356 */ 2492 */
2357 shrink_zone(0, zone, &sc); 2493 shrink_mem_cgroup_zone(0, &mz, &sc);
2358 2494
2359 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2495 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2360 2496
@@ -2362,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2362 return sc.nr_reclaimed; 2498 return sc.nr_reclaimed;
2363} 2499}
2364 2500
2365unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2501unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2366 gfp_t gfp_mask, 2502 gfp_t gfp_mask,
2367 bool noswap) 2503 bool noswap)
2368{ 2504{
@@ -2375,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2375 .may_swap = !noswap, 2511 .may_swap = !noswap,
2376 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2512 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2377 .order = 0, 2513 .order = 0,
2378 .mem_cgroup = mem_cont, 2514 .target_mem_cgroup = memcg,
2379 .nodemask = NULL, /* we don't care the placement */ 2515 .nodemask = NULL, /* we don't care the placement */
2380 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2516 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2381 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2517 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2389,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2389 * take care of from where we get pages. So the node where we start the 2525 * take care of from where we get pages. So the node where we start the
2390 * scan does not need to be the current node. 2526 * scan does not need to be the current node.
2391 */ 2527 */
2392 nid = mem_cgroup_select_victim_node(mem_cont); 2528 nid = mem_cgroup_select_victim_node(memcg);
2393 2529
2394 zonelist = NODE_DATA(nid)->node_zonelists; 2530 zonelist = NODE_DATA(nid)->node_zonelists;
2395 2531
@@ -2405,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2405} 2541}
2406#endif 2542#endif
2407 2543
2544static void age_active_anon(struct zone *zone, struct scan_control *sc,
2545 int priority)
2546{
2547 struct mem_cgroup *memcg;
2548
2549 if (!total_swap_pages)
2550 return;
2551
2552 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2553 do {
2554 struct mem_cgroup_zone mz = {
2555 .mem_cgroup = memcg,
2556 .zone = zone,
2557 };
2558
2559 if (inactive_anon_is_low(&mz))
2560 shrink_active_list(SWAP_CLUSTER_MAX, &mz,
2561 sc, priority, 0);
2562
2563 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2564 } while (memcg);
2565}
2566
2408/* 2567/*
2409 * pgdat_balanced is used when checking if a node is balanced for high-order 2568 * pgdat_balanced is used when checking if a node is balanced for high-order
2410 * allocations. Only zones that meet watermarks and are in a zone allowed 2569 * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2525,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2525 */ 2684 */
2526 .nr_to_reclaim = ULONG_MAX, 2685 .nr_to_reclaim = ULONG_MAX,
2527 .order = order, 2686 .order = order,
2528 .mem_cgroup = NULL, 2687 .target_mem_cgroup = NULL,
2529 }; 2688 };
2530 struct shrink_control shrink = { 2689 struct shrink_control shrink = {
2531 .gfp_mask = sc.gfp_mask, 2690 .gfp_mask = sc.gfp_mask,
@@ -2564,9 +2723,7 @@ loop_again:
2564 * Do some background aging of the anon list, to give 2723 * Do some background aging of the anon list, to give
2565 * pages a chance to be referenced before reclaiming. 2724 * pages a chance to be referenced before reclaiming.
2566 */ 2725 */
2567 if (inactive_anon_is_low(zone, &sc)) 2726 age_active_anon(zone, &sc, priority);
2568 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2569 &sc, priority, 0);
2570 2727
2571 if (!zone_watermark_ok_safe(zone, order, 2728 if (!zone_watermark_ok_safe(zone, order,
2572 high_wmark_pages(zone), 0, 0)) { 2729 high_wmark_pages(zone), 0, 0)) {
@@ -3355,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3355 */ 3512 */
3356static void check_move_unevictable_page(struct page *page, struct zone *zone) 3513static void check_move_unevictable_page(struct page *page, struct zone *zone)
3357{ 3514{
3358 VM_BUG_ON(PageActive(page)); 3515 struct lruvec *lruvec;
3359 3516
3517 VM_BUG_ON(PageActive(page));
3360retry: 3518retry:
3361 ClearPageUnevictable(page); 3519 ClearPageUnevictable(page);
3362 if (page_evictable(page, NULL)) { 3520 if (page_evictable(page, NULL)) {
3363 enum lru_list l = page_lru_base_type(page); 3521 enum lru_list l = page_lru_base_type(page);
3364 3522
3365 __dec_zone_state(zone, NR_UNEVICTABLE); 3523 __dec_zone_state(zone, NR_UNEVICTABLE);
3366 list_move(&page->lru, &zone->lru[l].list); 3524 lruvec = mem_cgroup_lru_move_lists(zone, page,
3367 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); 3525 LRU_UNEVICTABLE, l);
3526 list_move(&page->lru, &lruvec->lists[l]);
3368 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3527 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
3369 __count_vm_event(UNEVICTABLE_PGRESCUED); 3528 __count_vm_event(UNEVICTABLE_PGRESCUED);
3370 } else { 3529 } else {
@@ -3372,8 +3531,9 @@ retry:
3372 * rotate unevictable list 3531 * rotate unevictable list
3373 */ 3532 */
3374 SetPageUnevictable(page); 3533 SetPageUnevictable(page);
3375 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 3534 lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
3376 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); 3535 LRU_UNEVICTABLE);
3536 list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
3377 if (page_evictable(page, NULL)) 3537 if (page_evictable(page, NULL))
3378 goto retry; 3538 goto retry;
3379 } 3539 }
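
check_move_unevictable_page() above resolves races by clearing the unevictable state, re-testing, and retrying if the page flipped back in the meantime. A toy sketch of that retry shape, with a plain variable in place of the page flags and no locking or LRU movement:

#include <stdio.h>
#include <stdbool.h>

static int tests_done;

/* pretend the page becomes evictable only after the first test */
static bool page_evictable(void)
{
	return ++tests_done > 1;
}

int main(void)
{
	bool unevictable = true;

retry:
	unevictable = false;                    /* ClearPageUnevictable() */
	if (page_evictable()) {
		puts("rescued to the regular LRU");
	} else {
		unevictable = true;             /* SetPageUnevictable()  */
		if (page_evictable())
			goto retry;             /* raced: try again      */
		puts("kept on the unevictable list");
	}
	printf("final unevictable=%d after %d tests\n", unevictable, tests_done);
	return 0;
}
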
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8fd603b1665e..f600557a7659 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
295} 295}
296EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
297 297
298#ifdef CONFIG_CMPXCHG_LOCAL 298#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
299/* 299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead 300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. 301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.