author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-10-27 18:33:42 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-27 21:03:09 -0400
commit	711d3d2c9bc3fb7cb5116352fecdb5b4adb6db6e (patch)
tree	09979a5e3d7b2e1fe7b7de193d11d6f468a67e76
parent	7d74b06f240f1bd1b4b68dd6fe84164d8bf4e315 (diff)
memcg: cpu hotplug aware percpu count updates
Currently, memcg's per-cpu counters are summed with for_each_possible_cpu() to
get the value. It's better to use for_each_online_cpu() plus a cpu hotplug
handler.

This patch only handles the statistics counters; MEM_CGROUP_ON_MOVE will be
handled in another patch.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/memcontrol.c	102
1 file changed, 93 insertions(+), 9 deletions(-)
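Before the diff, a minimal sketch of the scheme the patch adopts, written as a
self-contained userspace C program rather than kernel code; slot[], online[],
nocpu_base, and drain_slot() are illustrative names, not the kernel symbols.
Readers sum only online slots, and an offlined slot's value is folded into a
lock-protected base, so the total is never lost.

/*
 * Hedged userspace sketch of the patch's counting pattern.
 * NSLOTS stands in for NR_CPUS; the mutex plays the role of
 * pcp_counter_lock.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 4

static long slot[NSLOTS];		/* stand-in for per-cpu counters */
static bool online[NSLOTS] = { true, true, true, true };
static long nocpu_base;			/* counts drained from offline slots */
static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;

/* Like the patched mem_cgroup_read_stat(): online slots + drained base. */
static long read_stat(void)
{
	long val = 0;

	for (int i = 0; i < NSLOTS; i++)
		if (online[i])
			val += slot[i];
	pthread_mutex_lock(&base_lock);
	val += nocpu_base;
	pthread_mutex_unlock(&base_lock);
	return val;
}

/* Like mem_cgroup_drain_pcp_counter(): fold a dead slot into the base. */
static void drain_slot(int i)
{
	pthread_mutex_lock(&base_lock);
	nocpu_base += slot[i];
	slot[i] = 0;
	online[i] = false;
	pthread_mutex_unlock(&base_lock);
}

int main(void)
{
	slot[0] = 10; slot[1] = 20; slot[2] = 30; slot[3] = 40;
	printf("before offline: %ld\n", read_stat());	/* 100 */
	drain_slot(2);
	printf("after offline:  %ld\n", read_stat());	/* still 100 */
	return 0;
}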
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5e7a14d117c7..31a1d3b71eee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,9 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+	/* incremented at every pagein/pageout */
+	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
 	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
@@ -255,6 +257,12 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
+	/*
+	 * used when a cpu is offlined or for other synchronization.
+	 * See mem_cgroup_read_stat().
+	 */
+	struct mem_cgroup_stat_cpu nocpu_base;
+	spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -531,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 	return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter have thresholds and do periodic
+ * synchronization to implement "quick" reads. There is a trade-off
+ * between reading cost and precision of the value, so we may have a
+ * chance to implement periodic synchronization for memcg's counters too.
+ *
+ * But this _read() function is used for the user interface now, and the
+ * user accounts memory usage by memory cgroup and _always_ requires an
+ * exact value. Even with a quick-and-fuzzy read we would still have to
+ * visit all online cpus and make the sum, so for now the extra
+ * synchronization is not implemented (except for cpu hotplug).
+ *
+ * If kernel-internal users can make use of a not-exact value, and
+ * reading all cpu values becomes a performance bottleneck in some
+ * common workload, a threshold and synchronization as in vmstat[]
+ * should be implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 		enum mem_cgroup_stat_index idx)
 {
 	int cpu;
 	s64 val = 0;
 
-	for_each_possible_cpu(cpu)
+	get_online_cpus();
+	for_each_online_cpu(cpu)
 		val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&mem->pcp_counter_lock);
+	val += mem->nocpu_base.count[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+#endif
+	put_online_cpus();
 	return val;
 }
 
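The note above contrasts this exact read with the threshold-based scheme of
vmstat[] and percpu_counter, which the patch deliberately does not implement.
A hedged userspace C sketch of that fuzzy alternative follows; THRESHOLD,
local_delta, and count_event() are illustrative names, not kernel API. Each
writer batches updates locally and folds them into a shared total only when
the batch crosses a threshold, trading read precision for update cost.

/*
 * Hedged sketch: vmstat[]-style threshold counter in userspace C11.
 */
#include <stdatomic.h>
#include <stdio.h>

#define THRESHOLD 32

static atomic_long total;		/* cheap, slightly stale global value */
static _Thread_local long local_delta;	/* per-thread batched updates */

static void count_event(long n)
{
	local_delta += n;
	if (local_delta >= THRESHOLD || local_delta <= -THRESHOLD) {
		atomic_fetch_add(&total, local_delta);	/* fold into total */
		local_delta = 0;
	}
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		count_event(1);
	/* Prints 96: the last 4 events still sit in local_delta. */
	printf("fuzzy total = %ld\n", atomic_load(&total));
	return 0;
}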
@@ -663,9 +697,28 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 /* The caller has to guarantee "mem" exists before calling this */
 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-	if (mem && css_tryget(&mem->css))
-		return mem;
-	return NULL;
+	struct cgroup_subsys_state *css;
+	int found;
+
+	if (!mem) /* ROOT cgroup has the smallest ID */
+		return root_mem_cgroup; /* css_put/get against root is ignored */
+	if (!mem->use_hierarchy) {
+		if (css_tryget(&mem->css))
+			return mem;
+		return NULL;
+	}
+	rcu_read_lock();
+	/*
+	 * search for the memory cgroup which has the smallest ID under the
+	 * given ROOT cgroup. (ID >= 1)
+	 */
+	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+	if (css && css_tryget(css))
+		mem = container_of(css, struct mem_cgroup, css);
+	else
+		mem = NULL;
+	rcu_read_unlock();
+	return mem;
 }
 
 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -680,9 +733,13 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 	hierarchy_used = iter->use_hierarchy;
 
 	css_put(&iter->css);
-	if (!cond || !hierarchy_used)
+	/* If no ROOT, walk all, ignore hierarchy */
+	if (!cond || (root && !hierarchy_used))
 		return NULL;
 
+	if (!root)
+		root = root_mem_cgroup;
+
 	do {
 		iter = NULL;
 		rcu_read_lock();
@@ -711,6 +768,9 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 #define for_each_mem_cgroup_tree(iter, root) \
 	for_each_mem_cgroup_tree_cond(iter, root, true)
 
+#define for_each_mem_cgroup_all(iter) \
+	for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
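The two hunks above turn the hierarchy walk into an ID-ordered iteration that
can also walk every group when no root is given, which is what the new
for_each_mem_cgroup_all() relies on. A hedged userspace C sketch of that walk
follows; groups[], under_root, and get_next() are illustrative stand-ins for
the cgroup ID space and css_get_next(), not the kernel API.

/*
 * Hedged sketch: visit groups in ascending ID, optionally restricted
 * to those under a root, mirroring for_each_mem_cgroup_tree_cond().
 */
#include <stdio.h>

#define NGROUPS 5

struct group { int id; int under_root; };

/* ordered by id, as cgroup IDs are small integers from an allocator */
static struct group groups[NGROUPS] = {
	{1, 1}, {2, 1}, {3, 0}, {4, 1}, {5, 0},
};

/* stand-in for css_get_next(): smallest id >= want, or NULL */
static struct group *get_next(int want, int root_only)
{
	for (int i = 0; i < NGROUPS; i++)
		if (groups[i].id >= want &&
		    (!root_only || groups[i].under_root))
			return &groups[i];
	return NULL;
}

int main(void)
{
	/* root == NULL case: walk all groups, IDs 1 2 3 4 5 */
	for (struct group *g = get_next(1, 0); g; g = get_next(g->id + 1, 0))
		printf("visit id %d\n", g->id);
	return 0;
}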
@@ -1676,15 +1736,38 @@ static void drain_all_stock_sync(void)
 	atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains the percpu counter values of a DEAD cpu and
+ * moves them to the local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+	int i;
+
+	spin_lock(&mem->pcp_counter_lock);
+	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+		s64 x = per_cpu(mem->stat->count[i], cpu);
+
+		per_cpu(mem->stat->count[i], cpu) = 0;
+		mem->nocpu_base.count[i] += x;
+	}
+	spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *iter;
 
-	if (action != CPU_DEAD)
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
+
+	for_each_mem_cgroup_all(iter)
+		mem_cgroup_drain_pcp_counter(iter, cpu);
+
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
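A hedged userspace C sketch of the callback's shape: filter for DEAD events
(frozen or not), then drain every group's slot for the dead cpu. The enum
values, groups[], drain_group(), and hotplug_callback() are illustrative
stand-ins for the kernel's notifier actions and for_each_mem_cgroup_all(),
not the real API.

/*
 * Hedged sketch: a hotplug-style callback that drains all groups
 * when a cpu dies and ignores every other event.
 */
#include <stdio.h>

enum { CPU_ONLINE, CPU_DEAD, CPU_DEAD_FROZEN };

#define NGROUPS 2
#define NCPUS 4

struct group {
	long count[NCPUS];	/* per-cpu stat slots */
	long nocpu_base;	/* counts drained from dead cpus */
};

static struct group groups[NGROUPS] = {
	{ .count = { 1, 2, 3, 4 } },
	{ .count = { 5, 6, 7, 8 } },
};

static void drain_group(struct group *g, int cpu)
{
	g->nocpu_base += g->count[cpu];
	g->count[cpu] = 0;
}

static int hotplug_callback(int action, int cpu)
{
	/* Only a dead cpu needs draining; both comparisons must fail. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return 0;

	for (int i = 0; i < NGROUPS; i++)	/* "for_each_mem_cgroup_all" */
		drain_group(&groups[i], cpu);
	return 0;
}

int main(void)
{
	hotplug_callback(CPU_ONLINE, 1);	/* ignored */
	hotplug_callback(CPU_DEAD, 1);		/* drains cpu 1 everywhere */
	printf("group0 base=%ld group1 base=%ld\n",
	       groups[0].nocpu_base, groups[1].nocpu_base);	/* 2 and 6 */
	return 0;
}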
@@ -4098,6 +4181,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		vfree(mem);
 		mem = NULL;
 	}
+	if (mem)
+		spin_lock_init(&mem->pcp_counter_lock);
 	return mem;
 }
 
@@ -4224,7 +4308,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 					&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
-		hotcpu_notifier(memcg_stock_cpu_callback, 0);
+		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;