author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>   2010-10-27 18:33:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>       2010-10-27 21:03:09 -0400
commit     711d3d2c9bc3fb7cb5116352fecdb5b4adb6db6e
tree       09979a5e3d7b2e1fe7b7de193d11d6f468a67e76
parent     7d74b06f240f1bd1b4b68dd6fe84164d8bf4e315
memcg: cpu hotplug aware percpu count updates
Currently, memcg's per-cpu counters are read with for_each_possible_cpu() to
get the value.  It's better to use for_each_online_cpu() together with a CPU
hotplug handler that folds a dead CPU's counts into a shared base, so no
updates are lost.

This patch only handles the statistics counters.  MEM_CGROUP_ON_MOVE will be
handled in another patch.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 mm/memcontrol.c | 102
 1 file changed, 93 insertions(+), 9 deletions(-)
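Before the diff, here is a minimal userspace sketch (plain C with pthreads, not
the kernel code) of the scheme this patch adopts: readers sum only "online"
CPUs, and an offline handler folds a dead CPU's counts into a shared base so
nothing is lost.  All names here (fake_cpu_counter, drain_dead_cpu,
read_counter, NR_CPUS value) are illustrative only and, except where the
comments say otherwise, do not appear in mm/memcontrol.c.

/*
 * Sketch only: the real patch uses per-cpu variables, get_online_cpus(),
 * mem->nocpu_base and mem->pcp_counter_lock, registered via a CPU hotplug
 * notifier.  This simulation uses plain arrays and a pthread mutex, and it
 * omits the get_online_cpus()-style protection of the online mask.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4	/* arbitrary for the sketch */

struct fake_cpu_counter {
	long percpu[NR_CPUS];		/* per-cpu counts */
	bool online[NR_CPUS];		/* which CPUs are "online" */
	long nocpu_base;		/* counts drained from dead CPUs */
	pthread_mutex_t lock;		/* protects nocpu_base and the drain */
};

/* Fold a dead CPU's count into the base, like mem_cgroup_drain_pcp_counter(). */
static void drain_dead_cpu(struct fake_cpu_counter *c, int cpu)
{
	pthread_mutex_lock(&c->lock);
	c->nocpu_base += c->percpu[cpu];
	c->percpu[cpu] = 0;
	c->online[cpu] = false;
	pthread_mutex_unlock(&c->lock);
}

/* Sum online CPUs plus the drained base, like mem_cgroup_read_stat(). */
static long read_counter(struct fake_cpu_counter *c)
{
	long val = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (c->online[cpu])
			val += c->percpu[cpu];
	pthread_mutex_lock(&c->lock);
	val += c->nocpu_base;
	pthread_mutex_unlock(&c->lock);
	return val;
}

int main(void)
{
	struct fake_cpu_counter c = {
		.percpu = { 10, 20, 30, 40 },
		.online = { true, true, true, true },
		.lock   = PTHREAD_MUTEX_INITIALIZER,
	};

	drain_dead_cpu(&c, 2);				/* "offline" CPU 2 */
	printf("total = %ld\n", read_counter(&c));	/* still 100 */
	return 0;
}

In the actual patch below, mem_cgroup_read_stat() plays the role of
read_counter() and mem_cgroup_drain_pcp_counter(), called from the hotplug
notifier on CPU_DEAD, plays the role of drain_dead_cpu().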
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5e7a14d117c7..31a1d3b71eee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,9 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+	/* incremented at every pagein/pageout */
+	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
 	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
@@ -255,6 +257,12 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
+	/*
+	 * used when a cpu is offlined or other synchronizations
+	 * See mem_cgroup_read_stat().
+	 */
+	struct mem_cgroup_stat_cpu nocpu_base;
+	spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -531,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 	return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both of vmstat[] and percpu_counter has threshold and do periodic
+ * synchronization to implement "quick" read. There are trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+ * a periodic synchronizion of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+ * common workload, threashold and synchonization as vmstat[] should be
+ * implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 		enum mem_cgroup_stat_index idx)
 {
 	int cpu;
 	s64 val = 0;
 
-	for_each_possible_cpu(cpu)
+	get_online_cpus();
+	for_each_online_cpu(cpu)
 		val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&mem->pcp_counter_lock);
+	val += mem->nocpu_base.count[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+#endif
+	put_online_cpus();
 	return val;
 }
 
@@ -663,9 +697,28 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 /* The caller has to guarantee "mem" exists before calling this */
 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-	if (mem && css_tryget(&mem->css))
-		return mem;
-	return NULL;
+	struct cgroup_subsys_state *css;
+	int found;
+
+	if (!mem) /* ROOT cgroup has the smallest ID */
+		return root_mem_cgroup; /*css_put/get against root is ignored*/
+	if (!mem->use_hierarchy) {
+		if (css_tryget(&mem->css))
+			return mem;
+		return NULL;
+	}
+	rcu_read_lock();
+	/*
+	 * searching a memory cgroup which has the smallest ID under given
+	 * ROOT cgroup. (ID >= 1)
+	 */
+	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+	if (css && css_tryget(css))
+		mem = container_of(css, struct mem_cgroup, css);
+	else
+		mem = NULL;
+	rcu_read_unlock();
+	return mem;
 }
 
 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -680,9 +733,13 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 	hierarchy_used = iter->use_hierarchy;
 
 	css_put(&iter->css);
-	if (!cond || !hierarchy_used)
+	/* If no ROOT, walk all, ignore hierarchy */
+	if (!cond || (root && !hierarchy_used))
 		return NULL;
 
+	if (!root)
+		root = root_mem_cgroup;
+
 	do {
 		iter = NULL;
 		rcu_read_lock();
@@ -711,6 +768,9 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 #define for_each_mem_cgroup_tree(iter, root) \
 	for_each_mem_cgroup_tree_cond(iter, root, true)
 
+#define for_each_mem_cgroup_all(iter) \
+	for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
@@ -1676,15 +1736,38 @@ static void drain_all_stock_sync(void)
 	atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains percpu counter value from DEAD cpu and
+ * move it to local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+	int i;
+
+	spin_lock(&mem->pcp_counter_lock);
+	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+		s64 x = per_cpu(mem->stat->count[i], cpu);
+
+		per_cpu(mem->stat->count[i], cpu) = 0;
+		mem->nocpu_base.count[i] += x;
+	}
+	spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *iter;
 
-	if (action != CPU_DEAD)
+	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
+
+	for_each_mem_cgroup_all(iter)
+		mem_cgroup_drain_pcp_counter(iter, cpu);
+
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
@@ -4098,6 +4181,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		vfree(mem);
 		mem = NULL;
 	}
+	spin_lock_init(&mem->pcp_counter_lock);
 	return mem;
 }
 
@@ -4224,7 +4308,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 						&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
-		hotcpu_notifier(memcg_stock_cpu_callback, 0);
+		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
