author    KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2009-10-01 18:44:11 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>      2009-10-01 19:11:13 -0400
commit    4e649152cbaa1aedd01821d200ab9d597fe469e4 (patch)
tree      635fa7d75acda929e81b8b0db7e641b7d4e07b35
parent    3dece8347df6a16239fab10dadb370854f1c969c (diff)
memcg: some modification to softlimit under hierarchical memory reclaim.
This patch cleans up and fixes memcg's uncharge soft-limit path.

Currently, res_counter_charge()/uncharge() handles soft-limit information at
charge/uncharge time, and the soft-limit check is done when the per-memcg
event counter goes over its threshold. The event counter, however, is updated
only when memory usage is over the soft limit. With hierarchical memcg
management, ancestors also have to be taken care of: today ancestors (the
hierarchy) are handled in charge() but not in uncharge(), which is not good.

Problems:
1. The memcg's event counter is incremented only when the soft limit is hit.
   That is bad: it makes the event counter hard to reuse for other purposes.
2. At uncharge, only the lowest-level res_counter is handled. This is a bug:
   because an ancestor's event counter is not incremented, the children have
   to take care of the ancestors.
3. res_counter_uncharge()'s third argument is NULL in most cases. Operations
   under res_counter->lock should be kept small; having no "if" branch there
   is better.

Fixes:
* Remove the soft_limit_xx pointers and checks from charge and uncharge. The
  check-only-when-necessary scheme works well enough without them.
* Increment the memcg's event counter at every charge/uncharge (the per-cpu
  area will be accessed soon anyway).
* Check all ancestors at soft-limit-check time. This is necessary because an
  ancestor's event counter may never be modified on its own, so they must be
  checked at the same time.

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
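For orientation, the scheme this patch moves to can be sketched as: every
charge/uncharge bumps a per-memcg event counter, the soft-limit tree is only
touched once that counter crosses a threshold, and that check walks all
ancestors because their own counters never fire on their own. The sketch
below is a simplified illustration only, not the in-tree code: struct memcg,
parent_of(), SOFTLIMIT_EVENTS_THRESH and update_tree() are hypothetical
stand-ins for struct mem_cgroup, parent_mem_cgroup(), the per-cpu event
threshold and mem_cgroup_update_tree().

/* Illustrative sketch of the new bookkeeping; names are stand-ins. */
struct memcg {
	struct memcg *parent;
	unsigned long long usage;	/* bytes charged */
	unsigned long long soft_limit;	/* bytes allowed before pressure */
	unsigned long events;		/* bumped on every charge/uncharge */
};

#define SOFTLIMIT_EVENTS_THRESH	1000	/* hypothetical threshold */

static struct memcg *parent_of(struct memcg *m) { return m->parent; }

/* true once enough events accumulated to make a re-check worthwhile */
static int soft_limit_check(struct memcg *m)
{
	if (++m->events < SOFTLIMIT_EVENTS_THRESH)
		return 0;
	m->events = 0;
	return 1;
}

/* walk the whole hierarchy: an ancestor's counter may never fire on its own */
static void update_tree(struct memcg *m)
{
	for (; m; m = parent_of(m)) {
		unsigned long long excess = m->usage > m->soft_limit ?
					    m->usage - m->soft_limit : 0;
		/* (re)insert into or drop from the soft-limit RB-tree here */
		(void)excess;
	}
}

void uncharge(struct memcg *m, unsigned long long val)
{
	struct memcg *c;

	for (c = m; c; c = parent_of(c))
		c->usage -= val;	/* nothing soft-limit related under the lock */
	if (soft_limit_check(m))
		update_tree(m);		/* ancestors handled here, lazily */
}

The point of this shape is that the locked res_counter path stays minimal,
while the rarer threshold-triggered check does the hierarchical work.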
-rw-r--r--  include/linux/res_counter.h    6
-rw-r--r--  kernel/res_counter.c          18
-rw-r--r--  mm/memcontrol.c              113
3 files changed, 54 insertions, 83 deletions
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 731af71cddc9..fcb9884df618 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,8 +114,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
 		unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-		unsigned long val, struct res_counter **limit_fail_at,
-		struct res_counter **soft_limit_at);
+		unsigned long val, struct res_counter **limit_fail_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -128,8 +127,7 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
-		bool *was_soft_limit_excess);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 88faec23e833..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,27 +37,17 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at,
-			struct res_counter **soft_limit_fail_at)
+			struct res_counter **limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
 	*limit_fail_at = NULL;
-	if (soft_limit_fail_at)
-		*soft_limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
 		ret = res_counter_charge_locked(c, val);
-		/*
-		 * With soft limits, we return the highest ancestor
-		 * that exceeds its soft limit
-		 */
-		if (soft_limit_fail_at &&
-		    !res_counter_soft_limit_check_locked(c))
-			*soft_limit_fail_at = c;
 		spin_unlock(&c->lock);
 		if (ret < 0) {
 			*limit_fail_at = c;
@@ -85,8 +75,7 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
-			bool *was_soft_limit_excess)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 {
 	unsigned long flags;
 	struct res_counter *c;
@@ -94,9 +83,6 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val,
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		if (was_soft_limit_excess)
-			*was_soft_limit_excess =
-				!res_counter_soft_limit_check_locked(c);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
 	}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 21a30629ca80..1ae8c439584a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -353,16 +353,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_insert_exceeded(mem, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
-static void
 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
@@ -392,34 +382,40 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
 
 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 {
-	unsigned long long prev_usage_in_excess, new_usage_in_excess;
-	bool updated_tree = false;
+	unsigned long long new_usage_in_excess;
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup_tree_per_zone *mctz;
-
-	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
 	mctz = soft_limit_tree_from_page(page);
 
 	/*
-	 * We do updates in lazy mode, mem's are removed
-	 * lazily from the per-zone, per-node rb tree
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
 	 */
-	prev_usage_in_excess = mz->usage_in_excess;
-
-	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	if (prev_usage_in_excess) {
-		mem_cgroup_remove_exceeded(mem, mz, mctz);
-		updated_tree = true;
-	}
-	if (!new_usage_in_excess)
-		goto done;
-	mem_cgroup_insert_exceeded(mem, mz, mctz);
-
-done:
-	if (updated_tree) {
-		spin_lock(&mctz->lock);
-		mz->usage_in_excess = new_usage_in_excess;
-		spin_unlock(&mctz->lock);
+	for (; mem; mem = parent_mem_cgroup(mem)) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		new_usage_in_excess =
+			res_counter_soft_limit_excess(&mem->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (new_usage_in_excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mem, mz, mctz);
+			/*
+			 * if over soft limit, insert again. mz->usage_in_excess
+			 * will be updated properly.
+			 */
+			if (new_usage_in_excess)
+				__mem_cgroup_insert_exceeded(mem, mz, mctz);
+			else
+				mz->usage_in_excess = 0;
+			spin_unlock(&mctz->lock);
+		}
 	}
 }
 
@@ -1271,9 +1267,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 					gfp_t gfp_mask, struct mem_cgroup **memcg,
 					bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
+	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res, *soft_fail_res = NULL;
+	struct res_counter *fail_res;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1305,17 +1301,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
-						&soft_fail_res);
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res, NULL);
+							&fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1354,16 +1349,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 	}
 	/*
-	 * Insert just the ancestor, we should trickle down to the correct
-	 * cgroup for reclaim, since the other nodes will be below their
-	 * soft limit
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
-	if (soft_fail_res) {
-		mem_over_soft_limit =
-			mem_cgroup_from_res_counter(soft_fail_res, res);
-		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
-			mem_cgroup_update_tree(mem_over_soft_limit, page);
-	}
+	if (mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 done:
 	return 0;
 nomem:
@@ -1438,10 +1428,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE,
-							NULL);
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		}
 		css_put(&mem->css);
 		return;
@@ -1520,7 +1509,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		goto out;
 
 	if (!mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1540,7 +1529,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	}
 
 	if (do_swap_account && !mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1611,9 +1600,9 @@ uncharge:
 	css_put(&parent->css);
 	/* uncharge if move fails */
 	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
 	}
 	return ret;
 }
@@ -1804,8 +1793,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		 * calling css_tryget
 		 */
 		if (!mem_cgroup_is_root(memcg))
-			res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
-						NULL);
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
@@ -1832,9 +1820,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	css_put(&mem->css);
 }
@@ -1849,7 +1837,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
-	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1889,10 +1876,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	}
 
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account &&
 				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
@@ -1909,7 +1896,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
-	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+	if (mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1987,7 +1974,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 	 * This memcg can be obsolete one. We avoid calling css_tryget
 	 */
 	if (!mem_cgroup_is_root(memcg))
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 	mem_cgroup_swap_statistics(memcg, false);
 	mem_cgroup_put(memcg);
 }