aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> 2010-03-10 18:22:29 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-03-12 18:52:37 -0500
commit: c62b1a3b31b5e27a6c5c2e91cc5ce05fdb6344d0 (patch)
tree: 6887141ac2afc2a8f3a1a7dfa39a56f6899a9864
parent: 6a6135b64fda39d931a79090f4da37f1c6da4a8c (diff)
memcg: use generic percpu instead of private implementation
When the per-cpu counter for memcg was implemented, the dynamic percpu
allocator was not very good. But now we have a good one and useful macros.
This patch replaces memcg's private percpu counter implementation with the
generic dynamic percpu allocator.

The benefits are
 - We can remove the private implementation.
 - The counters will be NUMA-aware. (The current one is not...)
 - This patch makes sizeof struct mem_cgroup smaller. Then,
   struct mem_cgroup may fit in page size on small configs.
 - About basic performance aspects, see below.

[Before]
# size mm/memcontrol.o
   text    data     bss     dec     hex filename
  24373    2528    4132   31033    7939 mm/memcontrol.o

[page-fault-throughput test on 8cpu/SMP in root cgroup]
# /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

Performance counter stats for './multi-fault-fork 8' (5 runs):

      45878618  page-faults    ( +- 0.110% )
     602635826  cache-misses   ( +- 0.105% )

  61.005373262  seconds time elapsed ( +- 0.004% )

Then cache-miss/page fault = 13.14

[After]
# size mm/memcontrol.o
   text    data     bss     dec     hex filename
  23913    2528    4132   30573    776d mm/memcontrol.o

# /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

Performance counter stats for './multi-fault-fork 8' (5 runs):

      48179400  page-faults    ( +- 0.271% )
     588628407  cache-misses   ( +- 0.136% )

  61.004615021  seconds time elapsed ( +- 0.004% )

Then cache-miss/page fault = 12.22

Text size is reduced. This performance improvement is not big and will be
invisible in real-world applications. But this result shows this patch has
some good effect even on (small) SMP.

Here is the test program I used.

1. fork() processes on each cpu.
2. do page faults repeatedly in each process.
3. after 60 secs, kill all children and exit.

(3 is necessary for getting stable data; this is an improvement over the
previous one.)
#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>

/*
 * For avoiding contention in page table lock, FAULT area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define FAULT_LENGTH (2 * 1024 * 1024)
#define PAGE_SIZE 4096
#define MAXNUM (128)

void alarm_handler(int sig)
{
}

void *worker(int cpu, int ppid)
{
	void *start, *end;
	char *c;
	cpu_set_t set;
	int i;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	start = mmap(NULL, FAULT_LENGTH, PROT_READ|PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	if (start == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	end = start + FAULT_LENGTH;

	pause();
	//fprintf(stderr, "run%d", cpu);
	while (1) {
		for (c = (char*)start; (void *)c < end; c += PAGE_SIZE)
			*c = 0;
		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int num, i, ret, pid, status;
	int pids[MAXNUM];

	if (argc < 2)
		return 0;

	setpgid(0, 0);
	signal(SIGALRM, alarm_handler);
	num = atoi(argv[1]);
	pid = getpid();

	for (i = 0; i < num; ++i) {
		ret = fork();
		if (!ret) {
			worker(i, pid);
			exit(0);
		}
		pids[i] = ret;
	}
	sleep(1);
	kill(-pid, SIGALRM);
	sleep(60);
	for (i = 0; i < num; i++)
		kill(pids[i], SIGKILL);
	for (i = 0; i < num; i++)
		waitpid(pids[i], &status, 0);
	return 0;
}

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- mm/memcontrol.c 184
1 files changed, 63 insertions, 121 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a82464b6e3d..9c9dfcf7a6d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,54 +89,8 @@ enum mem_cgroup_stat_index {
89 89
90struct mem_cgroup_stat_cpu { 90struct mem_cgroup_stat_cpu {
91 s64 count[MEM_CGROUP_STAT_NSTATS]; 91 s64 count[MEM_CGROUP_STAT_NSTATS];
92} ____cacheline_aligned_in_smp;
93
94struct mem_cgroup_stat {
95 struct mem_cgroup_stat_cpu cpustat[0];
96}; 92};
97 93
98static inline void
99__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
100 enum mem_cgroup_stat_index idx, s64 val)
101{
102 stat->count[idx] = val;
103}
104
105static inline s64
106__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
107 enum mem_cgroup_stat_index idx)
108{
109 return stat->count[idx];
110}
111
112/*
113 * For accounting under irq disable, no need for increment preempt count.
114 */
115static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
116 enum mem_cgroup_stat_index idx, int val)
117{
118 stat->count[idx] += val;
119}
120
121static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
122 enum mem_cgroup_stat_index idx)
123{
124 int cpu;
125 s64 ret = 0;
126 for_each_possible_cpu(cpu)
127 ret += stat->cpustat[cpu].count[idx];
128 return ret;
129}
130
131static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
132{
133 s64 ret;
134
135 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
136 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
137 return ret;
138}
139
140/* 94/*
141 * per-zone information in memory controller. 95 * per-zone information in memory controller.
142 */ 96 */
@@ -270,9 +224,9 @@ struct mem_cgroup {
270 unsigned long move_charge_at_immigrate; 224 unsigned long move_charge_at_immigrate;
271 225
272 /* 226 /*
273 * statistics. This must be placed at the end of memcg. 227 * percpu counter.
274 */ 228 */
275 struct mem_cgroup_stat stat; 229 struct mem_cgroup_stat_cpu *stat;
276}; 230};
277 231
278/* Stuffs for move charges at task migration. */ 232/* Stuffs for move charges at task migration. */
@@ -441,19 +395,14 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
441static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) 395static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
442{ 396{
443 bool ret = false; 397 bool ret = false;
444 int cpu;
445 s64 val; 398 s64 val;
446 struct mem_cgroup_stat_cpu *cpustat;
447 399
448 cpu = get_cpu(); 400 val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
449 cpustat = &mem->stat.cpustat[cpu];
450 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
451 if (unlikely(val < 0)) { 401 if (unlikely(val < 0)) {
452 __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, 402 this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT],
453 SOFTLIMIT_EVENTS_THRESH); 403 SOFTLIMIT_EVENTS_THRESH);
454 ret = true; 404 ret = true;
455 } 405 }
456 put_cpu();
457 return ret; 406 return ret;
458} 407}
459 408
@@ -549,17 +498,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
549 return mz; 498 return mz;
550} 499}
551 500
501static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
502 enum mem_cgroup_stat_index idx)
503{
504 int cpu;
505 s64 val = 0;
506
507 for_each_possible_cpu(cpu)
508 val += per_cpu(mem->stat->count[idx], cpu);
509 return val;
510}
511
512static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
513{
514 s64 ret;
515
516 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
517 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
518 return ret;
519}
520
552static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 521static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
553 bool charge) 522 bool charge)
554{ 523{
555 int val = (charge) ? 1 : -1; 524 int val = (charge) ? 1 : -1;
556 struct mem_cgroup_stat *stat = &mem->stat; 525 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
557 struct mem_cgroup_stat_cpu *cpustat;
558 int cpu = get_cpu();
559
560 cpustat = &stat->cpustat[cpu];
561 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
562 put_cpu();
563} 526}
564 527
565static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 528static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -567,26 +530,22 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
567 bool charge) 530 bool charge)
568{ 531{
569 int val = (charge) ? 1 : -1; 532 int val = (charge) ? 1 : -1;
570 struct mem_cgroup_stat *stat = &mem->stat;
571 struct mem_cgroup_stat_cpu *cpustat;
572 int cpu = get_cpu();
573 533
574 cpustat = &stat->cpustat[cpu]; 534 preempt_disable();
535
575 if (PageCgroupCache(pc)) 536 if (PageCgroupCache(pc))
576 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 537 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
577 else 538 else
578 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 539 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
579 540
580 if (charge) 541 if (charge)
581 __mem_cgroup_stat_add_safe(cpustat, 542 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
582 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
583 else 543 else
584 __mem_cgroup_stat_add_safe(cpustat, 544 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
585 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 545 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
586 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1); 546 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
587 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
588 547
589 put_cpu(); 548 preempt_enable();
590} 549}
591 550
592static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 551static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -1244,7 +1203,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1244 } 1203 }
1245 } 1204 }
1246 } 1205 }
1247 if (!mem_cgroup_local_usage(&victim->stat)) { 1206 if (!mem_cgroup_local_usage(victim)) {
1248 /* this cgroup's local usage == 0 */ 1207 /* this cgroup's local usage == 0 */
1249 css_put(&victim->css); 1208 css_put(&victim->css);
1250 continue; 1209 continue;
@@ -1310,9 +1269,6 @@ static void record_last_oom(struct mem_cgroup *mem)
1310void mem_cgroup_update_file_mapped(struct page *page, int val) 1269void mem_cgroup_update_file_mapped(struct page *page, int val)
1311{ 1270{
1312 struct mem_cgroup *mem; 1271 struct mem_cgroup *mem;
1313 struct mem_cgroup_stat *stat;
1314 struct mem_cgroup_stat_cpu *cpustat;
1315 int cpu;
1316 struct page_cgroup *pc; 1272 struct page_cgroup *pc;
1317 1273
1318 pc = lookup_page_cgroup(page); 1274 pc = lookup_page_cgroup(page);
@@ -1328,13 +1284,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
1328 goto done; 1284 goto done;
1329 1285
1330 /* 1286 /*
1331 * Preemption is already disabled, we don't need get_cpu() 1287 * Preemption is already disabled. We can use __this_cpu_xxx
1332 */ 1288 */
1333 cpu = smp_processor_id(); 1289 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
1334 stat = &mem->stat;
1335 cpustat = &stat->cpustat[cpu];
1336 1290
1337 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1338done: 1291done:
1339 unlock_page_cgroup(pc); 1292 unlock_page_cgroup(pc);
1340} 1293}
@@ -1761,9 +1714,6 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1761 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1714 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1762{ 1715{
1763 struct page *page; 1716 struct page *page;
1764 int cpu;
1765 struct mem_cgroup_stat *stat;
1766 struct mem_cgroup_stat_cpu *cpustat;
1767 1717
1768 VM_BUG_ON(from == to); 1718 VM_BUG_ON(from == to);
1769 VM_BUG_ON(PageLRU(pc->page)); 1719 VM_BUG_ON(PageLRU(pc->page));
@@ -1773,18 +1723,11 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1773 1723
1774 page = pc->page; 1724 page = pc->page;
1775 if (page_mapped(page) && !PageAnon(page)) { 1725 if (page_mapped(page) && !PageAnon(page)) {
1776 cpu = smp_processor_id(); 1726 /* Update mapped_file data for mem_cgroup */
1777 /* Update mapped_file data for mem_cgroup "from" */ 1727 preempt_disable();
1778 stat = &from->stat; 1728 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1779 cpustat = &stat->cpustat[cpu]; 1729 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1780 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, 1730 preempt_enable();
1781 -1);
1782
1783 /* Update mapped_file data for mem_cgroup "to" */
1784 stat = &to->stat;
1785 cpustat = &stat->cpustat[cpu];
1786 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1787 1);
1788 } 1731 }
1789 mem_cgroup_charge_statistics(from, pc, false); 1732 mem_cgroup_charge_statistics(from, pc, false);
1790 if (uncharge) 1733 if (uncharge)
@@ -2885,7 +2828,7 @@ static int
2885mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2828mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2886{ 2829{
2887 struct mem_cgroup_idx_data *d = data; 2830 struct mem_cgroup_idx_data *d = data;
2888 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2831 d->val += mem_cgroup_read_stat(mem, d->idx);
2889 return 0; 2832 return 0;
2890} 2833}
2891 2834
@@ -3134,18 +3077,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3134 s64 val; 3077 s64 val;
3135 3078
3136 /* per cpu stat */ 3079 /* per cpu stat */
3137 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 3080 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3138 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3081 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3139 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 3082 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3140 s->stat[MCS_RSS] += val * PAGE_SIZE; 3083 s->stat[MCS_RSS] += val * PAGE_SIZE;
3141 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); 3084 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3142 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3085 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3143 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 3086 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3144 s->stat[MCS_PGPGIN] += val; 3087 s->stat[MCS_PGPGIN] += val;
3145 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3088 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3146 s->stat[MCS_PGPGOUT] += val; 3089 s->stat[MCS_PGPGOUT] += val;
3147 if (do_swap_account) { 3090 if (do_swap_account) {
3148 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 3091 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3149 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3092 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3150 } 3093 }
3151 3094
@@ -3276,19 +3219,14 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3276static bool mem_cgroup_threshold_check(struct mem_cgroup *mem) 3219static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
3277{ 3220{
3278 bool ret = false; 3221 bool ret = false;
3279 int cpu;
3280 s64 val; 3222 s64 val;
3281 struct mem_cgroup_stat_cpu *cpustat;
3282 3223
3283 cpu = get_cpu(); 3224 val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
3284 cpustat = &mem->stat.cpustat[cpu];
3285 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
3286 if (unlikely(val < 0)) { 3225 if (unlikely(val < 0)) {
3287 __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, 3226 this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS],
3288 THRESHOLDS_EVENTS_THRESH); 3227 THRESHOLDS_EVENTS_THRESH);
3289 ret = true; 3228 ret = true;
3290 } 3229 }
3291 put_cpu();
3292 return ret; 3230 return ret;
3293} 3231}
3294 3232
@@ -3676,17 +3614,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3676 kfree(mem->info.nodeinfo[node]); 3614 kfree(mem->info.nodeinfo[node]);
3677} 3615}
3678 3616
3679static int mem_cgroup_size(void)
3680{
3681 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3682 return sizeof(struct mem_cgroup) + cpustat_size;
3683}
3684
3685static struct mem_cgroup *mem_cgroup_alloc(void) 3617static struct mem_cgroup *mem_cgroup_alloc(void)
3686{ 3618{
3687 struct mem_cgroup *mem; 3619 struct mem_cgroup *mem;
3688 int size = mem_cgroup_size(); 3620 int size = sizeof(struct mem_cgroup);
3689 3621
3622 /* Can be very big if MAX_NUMNODES is very big */
3690 if (size < PAGE_SIZE) 3623 if (size < PAGE_SIZE)
3691 mem = kmalloc(size, GFP_KERNEL); 3624 mem = kmalloc(size, GFP_KERNEL);
3692 else 3625 else
@@ -3694,6 +3627,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
3694 3627
3695 if (mem) 3628 if (mem)
3696 memset(mem, 0, size); 3629 memset(mem, 0, size);
3630 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3631 if (!mem->stat) {
3632 if (size < PAGE_SIZE)
3633 kfree(mem);
3634 else
3635 vfree(mem);
3636 mem = NULL;
3637 }
3697 return mem; 3638 return mem;
3698} 3639}
3699 3640
@@ -3718,7 +3659,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
3718 for_each_node_state(node, N_POSSIBLE) 3659 for_each_node_state(node, N_POSSIBLE)
3719 free_mem_cgroup_per_zone_info(mem, node); 3660 free_mem_cgroup_per_zone_info(mem, node);
3720 3661
3721 if (mem_cgroup_size() < PAGE_SIZE) 3662 free_percpu(mem->stat);
3663 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3722 kfree(mem); 3664 kfree(mem);
3723 else 3665 else
3724 vfree(mem); 3666 vfree(mem);