aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorBalbir Singh <balbir@linux.vnet.ibm.com>2009-09-23 18:56:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-24 10:20:59 -0400
commit0c3e73e84fe3f64cf1c2e8bb4e91e8901cbcdc38 (patch)
tree1c3e27fce3babd3703b34c9a353f253fcefb6325 /mm/memcontrol.c
parent4e41695356fb4e0b153be1440ad027e46e0a7ea2 (diff)
memcg: improve resource counter scalability
Reduce the resource counter overhead (mostly spinlock) associated with the root cgroup. This is a part of the several patches to reduce mem cgroup overhead. I had posted other approaches earlier (including using percpu counters). Those patches will be a natural addition and will be added iteratively on top of these. The patch stops resource counter accounting for the root cgroup. The data for display is derived from the statisitcs we maintain via mem_cgroup_charge_statistics (which is more scalable). What happens today is that, we do double accounting, once using res_counter_charge() and once using memory_cgroup_charge_statistics(). For the root, since we don't implement limits any more, we don't need to track every charge via res_counter_charge() and check for limit being exceeded and reclaim. The main mem->res usage_in_bytes can be derived by summing the cache and rss usage data from memory statistics (MEM_CGROUP_STAT_RSS and MEM_CGROUP_STAT_CACHE). However, for memsw->res usage_in_bytes, we need additional data about swapped out memory. This patch adds a MEM_CGROUP_STAT_SWAPOUT and uses that along with MEM_CGROUP_STAT_RSS and MEM_CGROUP_STAT_CACHE to derive the memsw data. This data is computed recursively when hierarchy is enabled. The tests results I see on a 24 way show that 1. The lock contention disappears from /proc/lock_stats 2. The results of the test are comparable to running with cgroup_disable=memory. Here is a sample of my program runs Without Patch Performance counter stats for '/home/balbir/parallel_pagefault': 7192804.124144 task-clock-msecs # 23.937 CPUs 424691 context-switches # 0.000 M/sec 267 CPU-migrations # 0.000 M/sec 28498113 page-faults # 0.004 M/sec 5826093739340 cycles # 809.989 M/sec 408883496292 instructions # 0.070 IPC 7057079452 cache-references # 0.981 M/sec 3036086243 cache-misses # 0.422 M/sec 300.485365680 seconds time elapsed With cgroup_disable=memory Performance counter stats for '/home/balbir/parallel_pagefault': 7182183.546587 task-clock-msecs # 23.915 CPUs 425458 context-switches # 0.000 M/sec 203 CPU-migrations # 0.000 M/sec 92545093 page-faults # 0.013 M/sec 6034363609986 cycles # 840.185 M/sec 437204346785 instructions # 0.072 IPC 6636073192 cache-references # 0.924 M/sec 2358117732 cache-misses # 0.328 M/sec 300.320905827 seconds time elapsed With this patch applied Performance counter stats for '/home/balbir/parallel_pagefault': 7191619.223977 task-clock-msecs # 23.955 CPUs 422579 context-switches # 0.000 M/sec 88 CPU-migrations # 0.000 M/sec 91946060 page-faults # 0.013 M/sec 5957054385619 cycles # 828.333 M/sec 1058117350365 instructions # 0.178 IPC 9161776218 cache-references # 1.274 M/sec 1920494280 cache-misses # 0.267 M/sec 300.218764862 seconds time elapsed Data from Prarit (kernel compile with make -j64 on a 64 CPU/32G machine) For a single run Without patch real 27m8.988s user 87m24.916s sys 382m6.037s With patch real 4m18.607s user 84m58.943s sys 50m52.682s With config turned off real 4m54.972s user 90m13.456s sys 50m19.711s NOTE: The data looks counterintuitive due to the increased performance with the patch, even over the config being turned off. We probably need more runs, but so far all testing has shown that the patches definitely help. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Prarit Bhargava <prarit@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c121
1 files changed, 100 insertions, 21 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 011aba6cad70..2011f15278fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@ enum mem_cgroup_stat_index {
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ 72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
73 74
74 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
75}; 76};
@@ -478,11 +479,24 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
478 return mz; 479 return mz;
479} 480}
480 481
482static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
483 bool charge)
484{
485 int val = (charge) ? 1 : -1;
486 struct mem_cgroup_stat *stat = &mem->stat;
487 struct mem_cgroup_stat_cpu *cpustat;
488 int cpu = get_cpu();
489
490 cpustat = &stat->cpustat[cpu];
491 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
492 put_cpu();
493}
494
481static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 495static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
482 struct page_cgroup *pc, 496 struct page_cgroup *pc,
483 bool charge) 497 bool charge)
484{ 498{
485 int val = (charge)? 1 : -1; 499 int val = (charge) ? 1 : -1;
486 struct mem_cgroup_stat *stat = &mem->stat; 500 struct mem_cgroup_stat *stat = &mem->stat;
487 struct mem_cgroup_stat_cpu *cpustat; 501 struct mem_cgroup_stat_cpu *cpustat;
488 int cpu = get_cpu(); 502 int cpu = get_cpu();
@@ -1285,9 +1299,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1285 VM_BUG_ON(css_is_removed(&mem->css)); 1299 VM_BUG_ON(css_is_removed(&mem->css));
1286 1300
1287 while (1) { 1301 while (1) {
1288 int ret; 1302 int ret = 0;
1289 unsigned long flags = 0; 1303 unsigned long flags = 0;
1290 1304
1305 if (mem_cgroup_is_root(mem))
1306 goto done;
1291 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, 1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1292 &soft_fail_res); 1308 &soft_fail_res);
1293 if (likely(!ret)) { 1309 if (likely(!ret)) {
@@ -1347,6 +1363,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1347 if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) 1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1348 mem_cgroup_update_tree(mem_over_soft_limit, page); 1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1349 } 1365 }
1366done:
1350 return 0; 1367 return 0;
1351nomem: 1368nomem:
1352 css_put(&mem->css); 1369 css_put(&mem->css);
@@ -1419,9 +1436,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1419 lock_page_cgroup(pc); 1436 lock_page_cgroup(pc);
1420 if (unlikely(PageCgroupUsed(pc))) { 1437 if (unlikely(PageCgroupUsed(pc))) {
1421 unlock_page_cgroup(pc); 1438 unlock_page_cgroup(pc);
1422 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1439 if (!mem_cgroup_is_root(mem)) {
1423 if (do_swap_account) 1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1424 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1441 if (do_swap_account)
1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE,
1443 NULL);
1444 }
1425 css_put(&mem->css); 1445 css_put(&mem->css);
1426 return; 1446 return;
1427 } 1447 }
@@ -1498,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1498 if (pc->mem_cgroup != from) 1518 if (pc->mem_cgroup != from)
1499 goto out; 1519 goto out;
1500 1520
1501 res_counter_uncharge(&from->res, PAGE_SIZE, NULL); 1521 if (!mem_cgroup_is_root(from))
1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
1502 mem_cgroup_charge_statistics(from, pc, false); 1523 mem_cgroup_charge_statistics(from, pc, false);
1503 1524
1504 page = pc->page; 1525 page = pc->page;
@@ -1517,7 +1538,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1517 1); 1538 1);
1518 } 1539 }
1519 1540
1520 if (do_swap_account) 1541 if (do_swap_account && !mem_cgroup_is_root(from))
1521 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); 1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
1522 css_put(&from->css); 1543 css_put(&from->css);
1523 1544
@@ -1588,9 +1609,11 @@ uncharge:
1588 /* drop extra refcnt by try_charge() */ 1609 /* drop extra refcnt by try_charge() */
1589 css_put(&parent->css); 1610 css_put(&parent->css);
1590 /* uncharge if move fails */ 1611 /* uncharge if move fails */
1591 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); 1612 if (!mem_cgroup_is_root(parent)) {
1592 if (do_swap_account) 1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
1593 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); 1614 if (do_swap_account)
1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
1616 }
1594 return ret; 1617 return ret;
1595} 1618}
1596 1619
@@ -1779,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1779 * This recorded memcg can be obsolete one. So, avoid 1802 * This recorded memcg can be obsolete one. So, avoid
1780 * calling css_tryget 1803 * calling css_tryget
1781 */ 1804 */
1782 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); 1805 if (!mem_cgroup_is_root(memcg))
1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1807 NULL);
1808 mem_cgroup_swap_statistics(memcg, false);
1783 mem_cgroup_put(memcg); 1809 mem_cgroup_put(memcg);
1784 } 1810 }
1785 rcu_read_unlock(); 1811 rcu_read_unlock();
@@ -1804,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1804 return; 1830 return;
1805 if (!mem) 1831 if (!mem)
1806 return; 1832 return;
1807 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1833 if (!mem_cgroup_is_root(mem)) {
1808 if (do_swap_account) 1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1809 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1835 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1837 }
1810 css_put(&mem->css); 1838 css_put(&mem->css);
1811} 1839}
1812 1840
@@ -1859,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1859 break; 1887 break;
1860 } 1888 }
1861 1889
1862 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); 1890 if (!mem_cgroup_is_root(mem)) {
1863 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1864 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1892 if (do_swap_account &&
1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1895 }
1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1897 mem_cgroup_swap_statistics(mem, true);
1865 mem_cgroup_charge_statistics(mem, pc, false); 1898 mem_cgroup_charge_statistics(mem, pc, false);
1866 1899
1867 ClearPageCgroupUsed(pc); 1900 ClearPageCgroupUsed(pc);
@@ -1952,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1952 * We uncharge this because swap is freed. 1985 * We uncharge this because swap is freed.
1953 * This memcg can be obsolete one. We avoid calling css_tryget 1986 * This memcg can be obsolete one. We avoid calling css_tryget
1954 */ 1987 */
1955 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); 1988 if (!mem_cgroup_is_root(memcg))
1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
1990 mem_cgroup_swap_statistics(memcg, false);
1956 mem_cgroup_put(memcg); 1991 mem_cgroup_put(memcg);
1957 } 1992 }
1958 rcu_read_unlock(); 1993 rcu_read_unlock();
@@ -2464,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2464 return retval; 2499 return retval;
2465} 2500}
2466 2501
2502struct mem_cgroup_idx_data {
2503 s64 val;
2504 enum mem_cgroup_stat_index idx;
2505};
2506
2507static int
2508mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2509{
2510 struct mem_cgroup_idx_data *d = data;
2511 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2512 return 0;
2513}
2514
2515static void
2516mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2517 enum mem_cgroup_stat_index idx, s64 *val)
2518{
2519 struct mem_cgroup_idx_data d;
2520 d.idx = idx;
2521 d.val = 0;
2522 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2523 *val = d.val;
2524}
2525
2467static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2526static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2468{ 2527{
2469 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2470 u64 val = 0; 2529 u64 idx_val, val;
2471 int type, name; 2530 int type, name;
2472 2531
2473 type = MEMFILE_TYPE(cft->private); 2532 type = MEMFILE_TYPE(cft->private);
2474 name = MEMFILE_ATTR(cft->private); 2533 name = MEMFILE_ATTR(cft->private);
2475 switch (type) { 2534 switch (type) {
2476 case _MEM: 2535 case _MEM:
2477 val = res_counter_read_u64(&mem->res, name); 2536 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2537 mem_cgroup_get_recursive_idx_stat(mem,
2538 MEM_CGROUP_STAT_CACHE, &idx_val);
2539 val = idx_val;
2540 mem_cgroup_get_recursive_idx_stat(mem,
2541 MEM_CGROUP_STAT_RSS, &idx_val);
2542 val += idx_val;
2543 val <<= PAGE_SHIFT;
2544 } else
2545 val = res_counter_read_u64(&mem->res, name);
2478 break; 2546 break;
2479 case _MEMSWAP: 2547 case _MEMSWAP:
2480 val = res_counter_read_u64(&mem->memsw, name); 2548 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2549 mem_cgroup_get_recursive_idx_stat(mem,
2550 MEM_CGROUP_STAT_CACHE, &idx_val);
2551 val = idx_val;
2552 mem_cgroup_get_recursive_idx_stat(mem,
2553 MEM_CGROUP_STAT_RSS, &idx_val);
2554 val += idx_val;
2555 mem_cgroup_get_recursive_idx_stat(mem,
2556 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2557 val <<= PAGE_SHIFT;
2558 } else
2559 val = res_counter_read_u64(&mem->memsw, name);
2481 break; 2560 break;
2482 default: 2561 default:
2483 BUG(); 2562 BUG();