aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--mm/memcontrol.c102
1 files changed, 93 insertions, 9 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5e7a14d117c7..31a1d3b71eee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,9 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 92 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 /* incremented at every pagein/pageout */
94 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
93 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 95 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
94 96
95 MEM_CGROUP_STAT_NSTATS, 97 MEM_CGROUP_STAT_NSTATS,
@@ -255,6 +257,12 @@ struct mem_cgroup {
255 * percpu counter. 257 * percpu counter.
256 */ 258 */
257 struct mem_cgroup_stat_cpu *stat; 259 struct mem_cgroup_stat_cpu *stat;
260 /*
261 * used when a cpu is offlined or other synchronizations
262 * See mem_cgroup_read_stat().
263 */
264 struct mem_cgroup_stat_cpu nocpu_base;
265 spinlock_t pcp_counter_lock;
258}; 266};
259 267
260/* Stuffs for move charges at task migration. */ 268/* Stuffs for move charges at task migration. */
@@ -531,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
531 return mz; 539 return mz;
532} 540}
533 541
542/*
543 * Implementation Note: reading percpu statistics for memcg.
544 *
545 * Both vmstat[] and percpu_counter have a threshold and do periodic
546 * synchronization to implement "quick" read. There is a trade-off between
547 * reading cost and precision of value. Then, we may have a chance to implement
548 * a periodic synchronization of counter in memcg's counter.
549 *
550 * But this _read() function is used for user interface now. The user accounts
551 * memory usage by memory cgroup and he _always_ requires exact value because
552 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
553 * have to visit all online cpus and make sum. So, for now, unnecessary
554 * synchronization is not implemented. (just implemented for cpu hotplug)
555 *
556 * If there are kernel internal actions which can make use of some not-exact
557 * value, and reading all cpu value can be a performance bottleneck in some
558 * common workload, threshold and synchronization as in vmstat[] should be
559 * implemented.
560 */
534static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 561static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
535 enum mem_cgroup_stat_index idx) 562 enum mem_cgroup_stat_index idx)
536{ 563{
537 int cpu; 564 int cpu;
538 s64 val = 0; 565 s64 val = 0;
539 566
540 for_each_possible_cpu(cpu) 567 get_online_cpus();
568 for_each_online_cpu(cpu)
541 val += per_cpu(mem->stat->count[idx], cpu); 569 val += per_cpu(mem->stat->count[idx], cpu);
570#ifdef CONFIG_HOTPLUG_CPU
571 spin_lock(&mem->pcp_counter_lock);
572 val += mem->nocpu_base.count[idx];
573 spin_unlock(&mem->pcp_counter_lock);
574#endif
575 put_online_cpus();
542 return val; 576 return val;
543} 577}
544 578
@@ -663,9 +697,28 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
663/* The caller has to guarantee "mem" exists before calling this */ 697/* The caller has to guarantee "mem" exists before calling this */
664static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 698static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
665{ 699{
666 if (mem && css_tryget(&mem->css)) 700 struct cgroup_subsys_state *css;
667 return mem; 701 int found;
668 return NULL; 702
703 if (!mem) /* ROOT cgroup has the smallest ID */
704 return root_mem_cgroup; /*css_put/get against root is ignored*/
705 if (!mem->use_hierarchy) {
706 if (css_tryget(&mem->css))
707 return mem;
708 return NULL;
709 }
710 rcu_read_lock();
711 /*
712 * searching a memory cgroup which has the smallest ID under given
713 * ROOT cgroup. (ID >= 1)
714 */
715 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
716 if (css && css_tryget(css))
717 mem = container_of(css, struct mem_cgroup, css);
718 else
719 mem = NULL;
720 rcu_read_unlock();
721 return mem;
669} 722}
670 723
671static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 724static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -680,9 +733,13 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
680 hierarchy_used = iter->use_hierarchy; 733 hierarchy_used = iter->use_hierarchy;
681 734
682 css_put(&iter->css); 735 css_put(&iter->css);
683 if (!cond || !hierarchy_used) 736 /* If no ROOT, walk all, ignore hierarchy */
737 if (!cond || (root && !hierarchy_used))
684 return NULL; 738 return NULL;
685 739
740 if (!root)
741 root = root_mem_cgroup;
742
686 do { 743 do {
687 iter = NULL; 744 iter = NULL;
688 rcu_read_lock(); 745 rcu_read_lock();
@@ -711,6 +768,9 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
/* Walk from @root with the loop condition fixed to true. */
#define for_each_mem_cgroup_tree(iter, root)		\
	for_each_mem_cgroup_tree_cond(iter, root, true)

/* Same walk with a NULL root: no ROOT given, so all groups are walked. */
#define for_each_mem_cgroup_all(iter)			\
	for_each_mem_cgroup_tree_cond(iter, NULL, true)
773
714 774
715static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 775static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
716{ 776{
@@ -1676,15 +1736,38 @@ static void drain_all_stock_sync(void)
1676 atomic_dec(&memcg_drain_count); 1736 atomic_dec(&memcg_drain_count);
1677} 1737}
1678 1738
1679static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1739/*
1740 * This function drains percpu counter value from DEAD cpu and
1741 * moves it into mem->nocpu_base. Note that this function can be preempted.
1742 */
1743static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1744{
1745 int i;
1746
1747 spin_lock(&mem->pcp_counter_lock);
1748 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1749 s64 x = per_cpu(mem->stat->count[i], cpu);
1750
1751 per_cpu(mem->stat->count[i], cpu) = 0;
1752 mem->nocpu_base.count[i] += x;
1753 }
1754 spin_unlock(&mem->pcp_counter_lock);
1755}
1756
1757static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1680 unsigned long action, 1758 unsigned long action,
1681 void *hcpu) 1759 void *hcpu)
1682{ 1760{
1683 int cpu = (unsigned long)hcpu; 1761 int cpu = (unsigned long)hcpu;
1684 struct memcg_stock_pcp *stock; 1762 struct memcg_stock_pcp *stock;
1763 struct mem_cgroup *iter;
1685 1764
1686 if (action != CPU_DEAD) 1765 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
1687 return NOTIFY_OK; 1766 return NOTIFY_OK;
1767
1768 for_each_mem_cgroup_all(iter)
1769 mem_cgroup_drain_pcp_counter(iter, cpu);
1770
1688 stock = &per_cpu(memcg_stock, cpu); 1771 stock = &per_cpu(memcg_stock, cpu);
1689 drain_stock(stock); 1772 drain_stock(stock);
1690 return NOTIFY_OK; 1773 return NOTIFY_OK;
@@ -4098,6 +4181,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4098 vfree(mem); 4181 vfree(mem);
4099 mem = NULL; 4182 mem = NULL;
4100 } 4183 }
4184 spin_lock_init(&mem->pcp_counter_lock);
4101 return mem; 4185 return mem;
4102} 4186}
4103 4187
@@ -4224,7 +4308,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4224 &per_cpu(memcg_stock, cpu); 4308 &per_cpu(memcg_stock, cpu);
4225 INIT_WORK(&stock->work, drain_local_stock); 4309 INIT_WORK(&stock->work, drain_local_stock);
4226 } 4310 }
4227 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4311 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4228 } else { 4312 } else {
4229 parent = mem_cgroup_from_cont(cont->parent); 4313 parent = mem_cgroup_from_cont(cont->parent);
4230 mem->use_hierarchy = parent->use_hierarchy; 4314 mem->use_hierarchy = parent->use_hierarchy;