author	Mel Gorman <mgorman@techsingularity.net>	2016-07-28 18:46:05 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-28 19:07:41 -0400
commit	ef8f2327996b5c20f11420f64e439e87c7a01604 (patch)
tree	0ea9bf78d88e1207005fc5310fe812d1edb0efc2
parent	a9dd0a83104c01269ea36a9b4ec42b51edf85427 (diff)
mm, memcg: move memcg limit enforcement from zones to nodes
Memcg needs adjustment after moving LRUs to the node. Limits are tracked per memcg but the soft-limit excess is tracked per zone. As global page reclaim is based on the node, it is easy to imagine a situation where a zone soft limit is exceeded even though the memcg limit is fine.

This patch moves the soft limit tree to the node. Technically, all the variable names should also change but people are already familiar with the meaning of "mz" even if "mn" would be a more appropriate name now.

Link: http://lkml.kernel.org/r/1467970510-21195-15-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/memcontrol.h	38
-rw-r--r--	include/linux/swap.h	2
-rw-r--r--	mm/memcontrol.c	190
-rw-r--r--	mm/vmscan.c	19
-rw-r--r--	mm/workingset.c	6
5 files changed, 111 insertions, 144 deletions
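In short, the patch collapses the per-zone soft-limit tracking structures into per-node ones. The sketch below is a condensed before/after view abridged from the hunks that follow; it is not the complete kernel definitions, and members the patch does not touch are elided.

/* Before this patch: one soft-limit rb-tree per (node, zone) pair. */
struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

/*
 * After this patch: one soft-limit rb-tree per node, matching the
 * node-based LRUs and node-based reclaim.  The old mem_cgroup_per_zone
 * likewise becomes mem_cgroup_per_node.
 */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	spinlock_t lock;
};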
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f4963ee4fdbc..b759827b2f1e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
 };
 
 struct mem_cgroup_reclaim_cookie {
-	struct zone *zone;
+	pg_data_t *pgdat;
 	int priority;
 	unsigned int generation;
 };
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
 /*
  * per-zone information in memory controller.
  */
-struct mem_cgroup_per_zone {
+struct mem_cgroup_per_node {
 	struct lruvec lruvec;
 	unsigned long lru_size[NR_LRU_LISTS];
 
@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
 	/* use container_of */
 };
 
-struct mem_cgroup_per_node {
-	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	unsigned long threshold;
@@ -314,19 +310,15 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
-static inline struct mem_cgroup_per_zone *
-mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
+static struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
 {
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
-
-	return &memcg->nodeinfo[nid]->zoneinfo[zid];
+	return memcg->nodeinfo[nid];
 }
 
 /**
  * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
  * @node: node of the wanted lruvec
- * @zone: zone of the wanted lruvec
  * @memcg: memcg of the wanted lruvec
  *
  * Returns the lru list vector holding pages for a given @node or a given
@@ -334,9 +326,9 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
  * is disabled.
  */
 static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
-				struct zone *zone, struct mem_cgroup *memcg)
+				struct mem_cgroup *memcg)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
@@ -344,7 +336,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
 		goto out;
 	}
 
-	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
 	lruvec = &mz->lruvec;
 out:
 	/*
@@ -352,8 +344,8 @@ out:
 	 * we have to be prepared to initialize lruvec->pgdat here;
 	 * and if offlined then reonlined, we need to reinitialize it.
 	 */
-	if (unlikely(lruvec->pgdat != zone->zone_pgdat))
-		lruvec->pgdat = zone->zone_pgdat;
+	if (unlikely(lruvec->pgdat != pgdat))
+		lruvec->pgdat = pgdat;
 	return lruvec;
 }
 
@@ -446,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	return mz->lru_size[lru];
 }
 
@@ -519,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
@@ -611,7 +603,7 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 }
 
 static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
-				struct zone *zone, struct mem_cgroup *memcg)
+				struct mem_cgroup *memcg)
 {
 	return node_lruvec(pgdat);
 }
@@ -723,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 					    gfp_t gfp_mask,
 					    unsigned long *total_scanned)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ad616d7c381..2a23ddc96edd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -318,7 +318,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  bool may_swap);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
-						struct zone *zone,
+						pg_data_t *pgdat,
 						unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c9ebec98e92a..9cbd40ebccd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = {
  * their hierarchy representation
  */
 
-struct mem_cgroup_tree_per_zone {
+struct mem_cgroup_tree_per_node {
 	struct rb_root rb_root;
 	spinlock_t lock;
 };
 
-struct mem_cgroup_tree_per_node {
-	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
 struct mem_cgroup_tree {
 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 };
@@ -374,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page)
 	return ino;
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+static struct mem_cgroup_per_node *
+mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 {
 	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
 
-	return &memcg->nodeinfo[nid]->zoneinfo[zid];
+	return memcg->nodeinfo[nid];
 }
 
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
+static struct mem_cgroup_tree_per_node *
+soft_limit_tree_node(int nid)
 {
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+	return soft_limit_tree.rb_tree_per_node[nid];
 }
 
-static struct mem_cgroup_tree_per_zone *
+static struct mem_cgroup_tree_per_node *
 soft_limit_tree_from_page(struct page *page)
 {
 	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
 
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+	return soft_limit_tree.rb_tree_per_node[nid];
 }
 
-static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
-					 struct mem_cgroup_tree_per_zone *mctz,
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+					 struct mem_cgroup_tree_per_node *mctz,
 					 unsigned long new_usage_in_excess)
 {
 	struct rb_node **p = &mctz->rb_root.rb_node;
 	struct rb_node *parent = NULL;
-	struct mem_cgroup_per_zone *mz_node;
+	struct mem_cgroup_per_node *mz_node;
 
 	if (mz->on_tree)
 		return;
@@ -414,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
 		return;
 	while (*p) {
 		parent = *p;
-		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 					tree_node);
 		if (mz->usage_in_excess < mz_node->usage_in_excess)
 			p = &(*p)->rb_left;
@@ -430,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
 	mz->on_tree = true;
 }
 
-static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
-					 struct mem_cgroup_tree_per_zone *mctz)
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+					 struct mem_cgroup_tree_per_node *mctz)
 {
 	if (!mz->on_tree)
 		return;
@@ -439,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
 	mz->on_tree = false;
 }
 
-static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
-				       struct mem_cgroup_tree_per_zone *mctz)
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+				       struct mem_cgroup_tree_per_node *mctz)
 {
 	unsigned long flags;
 
@@ -464,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 {
 	unsigned long excess;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
+	struct mem_cgroup_per_node *mz;
+	struct mem_cgroup_tree_per_node *mctz;
 
 	mctz = soft_limit_tree_from_page(page);
 	/*
@@ -473,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 	 * because their event counter is not touched.
 	 */
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = mem_cgroup_page_zoneinfo(memcg, page);
+		mz = mem_cgroup_page_nodeinfo(memcg, page);
 		excess = soft_limit_excess(memcg);
 		/*
 		 * We have to update the tree if mz is on RB-tree or
@@ -498,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 
 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup_tree_per_zone *mctz;
-	struct mem_cgroup_per_zone *mz;
-	int nid, zid;
+	struct mem_cgroup_tree_per_node *mctz;
+	struct mem_cgroup_per_node *mz;
+	int nid;
 
 	for_each_node(nid) {
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-			mctz = soft_limit_tree_node_zone(nid, zid);
-			mem_cgroup_remove_exceeded(mz, mctz);
-		}
+		mz = mem_cgroup_nodeinfo(memcg, nid);
+		mctz = soft_limit_tree_node(nid);
+		mem_cgroup_remove_exceeded(mz, mctz);
 	}
 }
 
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
 	struct rb_node *rightmost = NULL;
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 
 retry:
 	mz = NULL;
@@ -523,7 +515,7 @@ retry:
 	if (!rightmost)
 		goto done;		/* Nothing to reclaim from */
 
-	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
 	/*
 	 * Remove the node now but someone else can add it back,
 	 * we will to add it back at the end of reclaim to its correct
@@ -537,10 +529,10 @@ done:
 	return mz;
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 
 	spin_lock_irq(&mctz->lock);
 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
@@ -634,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 					   int nid, unsigned int lru_mask)
 {
 	unsigned long nr = 0;
-	int zid;
+	struct mem_cgroup_per_node *mz;
+	enum lru_list lru;
 
 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
 
-	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-		struct mem_cgroup_per_zone *mz;
-		enum lru_list lru;
-
-		for_each_lru(lru) {
-			if (!(BIT(lru) & lru_mask))
-				continue;
-			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-			nr += mz->lru_size[lru];
-		}
+	for_each_lru(lru) {
+		if (!(BIT(lru) & lru_mask))
+			continue;
+		mz = mem_cgroup_nodeinfo(memcg, nid);
+		nr += mz->lru_size[lru];
 	}
 	return nr;
 }
@@ -800,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	rcu_read_lock();
 
 	if (reclaim) {
-		struct mem_cgroup_per_zone *mz;
+		struct mem_cgroup_per_node *mz;
 
-		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
 		iter = &mz->iter[reclaim->priority];
 
 		if (prev && reclaim->generation != iter->generation)
@@ -901,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 {
 	struct mem_cgroup *memcg = dead_memcg;
 	struct mem_cgroup_reclaim_iter *iter;
-	struct mem_cgroup_per_zone *mz;
-	int nid, zid;
+	struct mem_cgroup_per_node *mz;
+	int nid;
 	int i;
 
 	while ((memcg = parent_mem_cgroup(memcg))) {
 		for_each_node(nid) {
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-				for (i = 0; i <= DEF_PRIORITY; i++) {
-					iter = &mz->iter[i];
-					cmpxchg(&iter->position,
-						dead_memcg, NULL);
-				}
+			mz = mem_cgroup_nodeinfo(memcg, nid);
+			for (i = 0; i <= DEF_PRIORITY; i++) {
+				iter = &mz->iter[i];
+				cmpxchg(&iter->position,
+					dead_memcg, NULL);
 			}
 		}
 	}
@@ -945,7 +931,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  */
 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
@@ -962,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 	if (!memcg)
 		memcg = root_mem_cgroup;
 
-	mz = mem_cgroup_page_zoneinfo(memcg, page);
+	mz = mem_cgroup_page_nodeinfo(memcg, page);
 	lruvec = &mz->lruvec;
 out:
 	/*
@@ -989,7 +975,7 @@ out:
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 				enum zone_type zid, int nr_pages)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_per_node *mz;
 	unsigned long *lru_size;
 	long size;
 	bool empty;
@@ -999,7 +985,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	if (mem_cgroup_disabled())
 		return;
 
-	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	lru_size = mz->lru_size + lru;
 	empty = list_empty(lruvec->lists + lru);
 
@@ -1392,7 +1378,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 #endif
 
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-				   struct zone *zone,
+				   pg_data_t *pgdat,
 				   gfp_t gfp_mask,
 				   unsigned long *total_scanned)
 {
@@ -1402,7 +1388,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	unsigned long excess;
 	unsigned long nr_scanned;
 	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
+		.pgdat = pgdat,
 		.priority = 0,
 	};
 
@@ -1433,7 +1419,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 			continue;
 		}
 		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
-					zone, &nr_scanned);
+					pgdat, &nr_scanned);
 		*total_scanned += nr_scanned;
 		if (!soft_limit_excess(root_memcg))
 			break;
@@ -2560,22 +2546,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 					    gfp_t gfp_mask,
 					    unsigned long *total_scanned)
 {
 	unsigned long nr_reclaimed = 0;
-	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	struct mem_cgroup_per_node *mz, *next_mz = NULL;
 	unsigned long reclaimed;
 	int loop = 0;
-	struct mem_cgroup_tree_per_zone *mctz;
+	struct mem_cgroup_tree_per_node *mctz;
 	unsigned long excess;
 	unsigned long nr_scanned;
 
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	mctz = soft_limit_tree_node(pgdat->node_id);
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -2590,7 +2576,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 			break;
 
 		nr_scanned = 0;
-		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
@@ -3211,22 +3197,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_DEBUG_VM
 	{
-		int nid, zid;
-		struct mem_cgroup_per_zone *mz;
+		pg_data_t *pgdat;
+		struct mem_cgroup_per_node *mz;
 		struct zone_reclaim_stat *rstat;
 		unsigned long recent_rotated[2] = {0, 0};
 		unsigned long recent_scanned[2] = {0, 0};
 
-		for_each_online_node(nid)
-			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-				rstat = &mz->lruvec.reclaim_stat;
+		for_each_online_pgdat(pgdat) {
+			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+			rstat = &mz->lruvec.reclaim_stat;
 
-				recent_rotated[0] += rstat->recent_rotated[0];
-				recent_rotated[1] += rstat->recent_rotated[1];
-				recent_scanned[0] += rstat->recent_scanned[0];
-				recent_scanned[1] += rstat->recent_scanned[1];
-			}
+			recent_rotated[0] += rstat->recent_rotated[0];
+			recent_rotated[1] += rstat->recent_rotated[1];
+			recent_scanned[0] += rstat->recent_scanned[0];
+			recent_scanned[1] += rstat->recent_scanned[1];
+		}
 		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
 		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
 		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
@@ -4106,11 +4091,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return idr_find(&mem_cgroup_idr, id);
 }
 
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
-	struct mem_cgroup_per_zone *mz;
-	int zone, tmp = node;
+	int tmp = node;
 	/*
 	 * This routine is called against possible nodes.
 	 * But it's BUG to call kmalloc() against offline node.
@@ -4125,18 +4109,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec);
-		mz->usage_in_excess = 0;
-		mz->on_tree = false;
-		mz->memcg = memcg;
-	}
+	lruvec_init(&pn->lruvec);
+	pn->usage_in_excess = 0;
+	pn->on_tree = false;
+	pn->memcg = memcg;
+
 	memcg->nodeinfo[node] = pn;
 	return 0;
 }
 
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	kfree(memcg->nodeinfo[node]);
 }
@@ -4147,7 +4129,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 
 	memcg_wb_domain_exit(memcg);
 	for_each_node(node)
-		free_mem_cgroup_per_zone_info(memcg, node);
+		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->stat);
 	kfree(memcg);
 }
@@ -4176,7 +4158,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		goto fail;
 
 	for_each_node(node)
-		if (alloc_mem_cgroup_per_zone_info(memcg, node))
+		if (alloc_mem_cgroup_per_node_info(memcg, node))
 			goto fail;
 
 	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
@@ -5779,18 +5761,12 @@ static int __init mem_cgroup_init(void)
 
 	for_each_node(node) {
 		struct mem_cgroup_tree_per_node *rtpn;
-		int zone;
 
 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
 				    node_online(node) ? node : NUMA_NO_NODE);
 
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			struct mem_cgroup_tree_per_zone *rtpz;
-
-			rtpz = &rtpn->rb_tree_per_zone[zone];
-			rtpz->rb_root = RB_ROOT;
-			spin_lock_init(&rtpz->lock);
-		}
+		rtpn->rb_root = RB_ROOT;
+		spin_lock_init(&rtpn->lock);
 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 46f7a71ed13b..9f6e673efba7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2229,8 +2229,7 @@ static inline void init_tlb_ubc(void)
 static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
 			      struct scan_control *sc, unsigned long *lru_pages)
 {
-	struct zone *zone = &pgdat->node_zones[sc->reclaim_idx];
-	struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
+	struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long targets[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
@@ -2439,7 +2438,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
 		struct mem_cgroup_reclaim_cookie reclaim = {
-			.zone = &pgdat->node_zones[classzone_idx],
+			.pgdat = pgdat,
 			.priority = sc->priority,
 		};
 		unsigned long node_lru_pages = 0;
@@ -2647,7 +2646,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			 * and balancing, not for a memcg's limit.
 			 */
 			nr_soft_scanned = 0;
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
 						sc->order, sc->gfp_mask,
 						&nr_soft_scanned);
 			sc->nr_reclaimed += nr_soft_reclaimed;
@@ -2917,7 +2916,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 						gfp_t gfp_mask, bool noswap,
-						struct zone *zone,
+						pg_data_t *pgdat,
 						unsigned long *nr_scanned)
 {
 	struct scan_control sc = {
@@ -2944,7 +2943,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_node_memcg(zone->zone_pgdat, memcg, &sc, &lru_pages);
+	shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2994,7 +2993,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 #endif
 
 static void age_active_anon(struct pglist_data *pgdat,
-			    struct zone *zone, struct scan_control *sc)
+			    struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 
@@ -3003,7 +3002,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg);
+		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
 		if (inactive_list_is_low(lruvec, false))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
@@ -3193,7 +3192,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * pages are rotated regardless of classzone as this is
 		 * about consistent aging.
 		 */
-		age_active_anon(pgdat, &pgdat->node_zones[MAX_NR_ZONES - 1], &sc);
+		age_active_anon(pgdat, &sc);
 
 		/*
 		 * If we're getting trouble reclaiming, start doing writepage
@@ -3205,7 +3204,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		/* Call soft limit reclaim before calling shrink_node. */
 		sc.nr_scanned = 0;
 		nr_soft_scanned = 0;
-		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
+		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
 						sc.gfp_mask, &nr_soft_scanned);
 		sc.nr_reclaimed += nr_soft_reclaimed;
 
diff --git a/mm/workingset.c b/mm/workingset.c
index df0dacaf54ee..2af14bb5a349 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -218,7 +218,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
+	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, memcg);
 	eviction = atomic_long_inc_return(&lruvec->inactive_age);
 	return pack_shadow(memcgid, zone, eviction);
 }
@@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
 		rcu_read_unlock();
 		return false;
 	}
-	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg);
+	lruvec = mem_cgroup_lruvec(zone->zone_pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
 	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 	rcu_read_unlock();
@@ -319,7 +319,7 @@ void workingset_activation(struct page *page)
 	memcg = page_memcg_rcu(page);
 	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	lruvec = mem_cgroup_lruvec(page_pgdat(page), page_zone(page), memcg);
+	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
 	atomic_long_inc(&lruvec->inactive_age);
 out:
 	rcu_read_unlock();