diff options
-rw-r--r-- | mm/memcontrol.c | 166 |
1 files changed, 162 insertions, 4 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e72fb2b4a7d8..20e1d90b3363 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -143,6 +143,13 @@ struct mem_cgroup { | |||
143 | struct mem_cgroup_lru_info info; | 143 | struct mem_cgroup_lru_info info; |
144 | 144 | ||
145 | int prev_priority; /* for recording reclaim priority */ | 145 | int prev_priority; /* for recording reclaim priority */ |
146 | |||
147 | /* | ||
148 | * While reclaiming in a hierarchy, we cache the last child we | ||
149 | * reclaimed from. Protected by cgroup_lock() | ||
150 | */ | ||
151 | struct mem_cgroup *last_scanned_child; | ||
152 | |||
146 | int obsolete; | 153 | int obsolete; |
147 | atomic_t refcnt; | 154 | atomic_t refcnt; |
148 | /* | 155 | /* |
@@ -461,6 +468,149 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
461 | return nr_taken; | 468 | return nr_taken; |
462 | } | 469 | } |
463 | 470 | ||
/*
 * Map a res_counter pointer back to the struct mem_cgroup that embeds it.
 * @counter: pointer to a res_counter stored inside a struct mem_cgroup
 * @member:  name of the embedding field; the charge path in this file
 *           passes "res" or "memsw" depending on which counter failed
 */
471 | #define mem_cgroup_from_res_counter(counter, member) \ | ||
472 | container_of(counter, struct mem_cgroup, member) | ||
473 | |||
474 | /* | ||
475 | * This routine finds the DFS walk successor. This routine should be | ||
476 | * called with cgroup_mutex held | ||
477 | */ | ||
478 | static struct mem_cgroup * | ||
479 | mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) | ||
480 | { | ||
481 | struct cgroup *cgroup, *curr_cgroup, *root_cgroup; | ||
482 | |||
483 | curr_cgroup = curr->css.cgroup; | ||
484 | root_cgroup = root_mem->css.cgroup; | ||
485 | |||
486 | if (!list_empty(&curr_cgroup->children)) { | ||
487 | /* | ||
488 | * Walk down to children | ||
489 | */ | ||
490 | mem_cgroup_put(curr); | ||
491 | cgroup = list_entry(curr_cgroup->children.next, | ||
492 | struct cgroup, sibling); | ||
493 | curr = mem_cgroup_from_cont(cgroup); | ||
494 | mem_cgroup_get(curr); | ||
495 | goto done; | ||
496 | } | ||
497 | |||
498 | visit_parent: | ||
499 | if (curr_cgroup == root_cgroup) { | ||
500 | mem_cgroup_put(curr); | ||
501 | curr = root_mem; | ||
502 | mem_cgroup_get(curr); | ||
503 | goto done; | ||
504 | } | ||
505 | |||
506 | /* | ||
507 | * Goto next sibling | ||
508 | */ | ||
509 | if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { | ||
510 | mem_cgroup_put(curr); | ||
511 | cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, | ||
512 | sibling); | ||
513 | curr = mem_cgroup_from_cont(cgroup); | ||
514 | mem_cgroup_get(curr); | ||
515 | goto done; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Go up to next parent and next parent's sibling if need be | ||
520 | */ | ||
521 | curr_cgroup = curr_cgroup->parent; | ||
522 | goto visit_parent; | ||
523 | |||
524 | done: | ||
525 | root_mem->last_scanned_child = curr; | ||
526 | return curr; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * Visit the first child (need not be the first child as per the ordering | ||
531 | * of the cgroup list, since we track last_scanned_child) of @mem and use | ||
532 | * that to reclaim free pages from. | ||
533 | */ | ||
534 | static struct mem_cgroup * | ||
535 | mem_cgroup_get_first_node(struct mem_cgroup *root_mem) | ||
536 | { | ||
537 | struct cgroup *cgroup; | ||
538 | struct mem_cgroup *ret; | ||
539 | bool obsolete = (root_mem->last_scanned_child && | ||
540 | root_mem->last_scanned_child->obsolete); | ||
541 | |||
542 | /* | ||
543 | * Scan all children under the mem_cgroup mem | ||
544 | */ | ||
545 | cgroup_lock(); | ||
546 | if (list_empty(&root_mem->css.cgroup->children)) { | ||
547 | ret = root_mem; | ||
548 | goto done; | ||
549 | } | ||
550 | |||
551 | if (!root_mem->last_scanned_child || obsolete) { | ||
552 | |||
553 | if (obsolete) | ||
554 | mem_cgroup_put(root_mem->last_scanned_child); | ||
555 | |||
556 | cgroup = list_first_entry(&root_mem->css.cgroup->children, | ||
557 | struct cgroup, sibling); | ||
558 | ret = mem_cgroup_from_cont(cgroup); | ||
559 | mem_cgroup_get(ret); | ||
560 | } else | ||
561 | ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, | ||
562 | root_mem); | ||
563 | |||
564 | done: | ||
565 | root_mem->last_scanned_child = ret; | ||
566 | cgroup_unlock(); | ||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * Dance down the hierarchy if needed to reclaim memory. We remember the | ||
572 | * last child we reclaimed from, so that we don't end up penalizing | ||
573 | * one child extensively based on its position in the children list. | ||
574 | * | ||
575 | * root_mem is the original ancestor that we've been reclaim from. | ||
576 | */ | ||
577 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | ||
578 | gfp_t gfp_mask, bool noswap) | ||
579 | { | ||
580 | struct mem_cgroup *next_mem; | ||
581 | int ret = 0; | ||
582 | |||
583 | /* | ||
584 | * Reclaim unconditionally and don't check for return value. | ||
585 | * We need to reclaim in the current group and down the tree. | ||
586 | * One might think about checking for children before reclaiming, | ||
587 | * but there might be left over accounting, even after children | ||
588 | * have left. | ||
589 | */ | ||
590 | ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap); | ||
591 | if (res_counter_check_under_limit(&root_mem->res)) | ||
592 | return 0; | ||
593 | |||
594 | next_mem = mem_cgroup_get_first_node(root_mem); | ||
595 | |||
596 | while (next_mem != root_mem) { | ||
597 | if (next_mem->obsolete) { | ||
598 | mem_cgroup_put(next_mem); | ||
599 | cgroup_lock(); | ||
600 | next_mem = mem_cgroup_get_first_node(root_mem); | ||
601 | cgroup_unlock(); | ||
602 | continue; | ||
603 | } | ||
604 | ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap); | ||
605 | if (res_counter_check_under_limit(&root_mem->res)) | ||
606 | return 0; | ||
607 | cgroup_lock(); | ||
608 | next_mem = mem_cgroup_get_next_node(next_mem, root_mem); | ||
609 | cgroup_unlock(); | ||
610 | } | ||
611 | return ret; | ||
612 | } | ||
613 | |||
464 | /* | 614 | /* |
465 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 615 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
466 | * oom-killer can be invoked. | 616 | * oom-killer can be invoked. |
@@ -469,7 +619,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
469 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 619 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
470 | bool oom) | 620 | bool oom) |
471 | { | 621 | { |
472 | struct mem_cgroup *mem; | 622 | struct mem_cgroup *mem, *mem_over_limit; |
473 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 623 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
474 | struct res_counter *fail_res; | 624 | struct res_counter *fail_res; |
475 | /* | 625 | /* |
@@ -511,12 +661,18 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
511 | /* mem+swap counter fails */ | 661 | /* mem+swap counter fails */ |
512 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 662 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
513 | noswap = true; | 663 | noswap = true; |
514 | } | 664 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
665 | memsw); | ||
666 | } else | ||
667 | /* mem counter fails */ | ||
668 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | ||
669 | res); | ||
670 | |||
515 | if (!(gfp_mask & __GFP_WAIT)) | 671 | if (!(gfp_mask & __GFP_WAIT)) |
516 | goto nomem; | 672 | goto nomem; |
517 | 673 | ||
518 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap)) | 674 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, |
519 | continue; | 675 | noswap); |
520 | 676 | ||
521 | /* | 677 | /* |
522 | * try_to_free_mem_cgroup_pages() might not give us a full | 678 | * try_to_free_mem_cgroup_pages() might not give us a full |
@@ -1732,6 +1888,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1732 | res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL); | 1888 | res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL); |
1733 | 1889 | ||
1734 | 1890 | ||
1891 | mem->last_scanned_child = NULL; | ||
1892 | |||
1735 | return &mem->css; | 1893 | return &mem->css; |
1736 | free_out: | 1894 | free_out: |
1737 | for_each_node_state(node, N_POSSIBLE) | 1895 | for_each_node_state(node, N_POSSIBLE) |