author     Johannes Weiner <hannes@cmpxchg.org>            2014-12-10 18:42:31 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 20:41:04 -0500
commit     3e32cb2e0a12b6915056ff04601cf1bb9b44f967 (patch)
tree       75d312d531736fbb4281bfe0e80847d3ef9f8a4a /mm
parent     8df0c2dcf61781d2efa8e6e5b06870f6c6785735 (diff)
mm: memcontrol: lockless page counters
Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page.  The
counter interface is also convoluted and does too many things.

Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it.  The translation from and to bytes then
only happens when interfacing with userspace.

The removed locking overhead is noticeable when scaling beyond the
per-cpu charge caches - on a 4-socket machine with 144 threads, the
following test shows the performance differences of 288 memcgs
concurrently running a page fault benchmark:

vanilla:

    18631648.500498      task-clock (msec)        #  140.643 CPUs utilized    ( +-  0.33% )
          1,380,638      context-switches         #    0.074 K/sec            ( +-  0.75% )
             24,390      cpu-migrations           #    0.001 K/sec            ( +-  8.44% )
      1,843,305,768      page-faults              #    0.099 M/sec            ( +-  0.00% )
 50,134,994,088,218      cycles                   #    2.691 GHz              ( +-  0.33% )
    <not supported>      stalled-cycles-frontend
    <not supported>      stalled-cycles-backend
  8,049,712,224,651      instructions             #    0.16  insns per cycle  ( +-  0.04% )
  1,586,970,584,979      branches                 #   85.176 M/sec            ( +-  0.05% )
      1,724,989,949      branch-misses            #    0.11% of all branches  ( +-  0.48% )

      132.474343877 seconds time elapsed                                      ( +-  0.21% )

lockless:

    12195979.037525      task-clock (msec)        #  133.480 CPUs utilized    ( +-  0.18% )
            832,850      context-switches         #    0.068 K/sec            ( +-  0.54% )
             15,624      cpu-migrations           #    0.001 K/sec            ( +- 10.17% )
      1,843,304,774      page-faults              #    0.151 M/sec            ( +-  0.00% )
 32,811,216,801,141      cycles                   #    2.690 GHz              ( +-  0.18% )
    <not supported>      stalled-cycles-frontend
    <not supported>      stalled-cycles-backend
  9,999,265,091,727      instructions             #    0.30  insns per cycle  ( +-  0.10% )
  2,076,759,325,203      branches                 #  170.282 M/sec            ( +-  0.12% )
      1,656,917,214      branch-misses            #    0.08% of all branches  ( +-  0.55% )

       91.369330729 seconds time elapsed                                      ( +-  0.45% )

On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also
makes the code a lot more readable.

Notable differences between the old and new API:

- res_counter_charge() and res_counter_charge_nofail() become
  page_counter_try_charge() and page_counter_charge() resp. to match
  the more common kernel naming scheme of try_do()/do()

- res_counter_uncharge_until() is only ever used to cancel a local
  counter and never to uncharge bigger segments of a hierarchy, so
  it's replaced by the simpler page_counter_cancel()

- res_counter_set_limit() is replaced by page_counter_limit(), which
  expects its callers to serialize against themselves

- res_counter_memparse_write_strategy() is replaced by
  page_counter_memparse(), which rounds down to the nearest page size
  rather than up.  This is more reasonable for explicitly requested
  hard upper limits.

- to keep charging light-weight, page_counter_try_charge() charges
  speculatively, only to roll back if the result exceeds the limit.
  Because of this, a failing bigger charge can temporarily lock out
  smaller charges that would otherwise succeed.  The error is bounded
  to the difference between the smallest and the biggest possible
  charge size, so for memcg, this means that a failing THP charge can
  send base page charges into reclaim up to 2MB (4MB) before the limit
  would have been reached.  This should be acceptable.
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
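Editor's note: the speculative charge-and-roll-back behaviour of page_counter_try_charge()
described in the last bullet of the changelog is the essence of the lockless design: one
atomic add replaces the spinlock-protected byte counter.  The following is a minimal
illustrative sketch of that idea in plain C for a single, non-hierarchical counter.  It is
not the kernel's mm/page_counter.c; the names page_counter_sketch and try_charge_sketch are
invented for this example, and the real counter additionally walks the parent hierarchy and
tracks a watermark.

/*
 * Illustrative sketch only -- not the kernel's mm/page_counter.c.
 * It shows the "charge first, check the limit, roll back on failure"
 * scheme for a single non-hierarchical counter, using GCC atomic
 * builtins in place of the kernel's atomic_long_t operations.
 */
#include <stdbool.h>
#include <stdio.h>

struct page_counter_sketch {
	long count;            /* pages currently charged */
	long limit;            /* hard limit, in pages */
	unsigned long failcnt; /* failed charge attempts */
};

static bool try_charge_sketch(struct page_counter_sketch *c, long nr_pages)
{
	/* Charge unconditionally: one atomic add, no spinlock, no CAS loop. */
	long new_count = __atomic_add_fetch(&c->count, nr_pages, __ATOMIC_RELAXED);

	if (new_count > c->limit) {
		/*
		 * Over the limit: undo the speculative charge.  Until this
		 * rollback happens, a concurrent smaller charge may fail
		 * spuriously -- the bounded error discussed in the changelog.
		 */
		__atomic_sub_fetch(&c->count, nr_pages, __ATOMIC_RELAXED);
		__atomic_fetch_add(&c->failcnt, 1, __ATOMIC_RELAXED);
		return false;
	}
	return true;
}

static void uncharge_sketch(struct page_counter_sketch *c, long nr_pages)
{
	__atomic_sub_fetch(&c->count, nr_pages, __ATOMIC_RELAXED);
}

int main(void)
{
	struct page_counter_sketch c = { .count = 0, .limit = 512, .failcnt = 0 };

	printf("charge 512 pages: %s\n", try_charge_sketch(&c, 512) ? "ok" : "fail");
	printf("charge 1 page:    %s\n", try_charge_sketch(&c, 1) ? "ok" : "fail");
	printf("count=%ld failcnt=%lu\n", c.count, c.failcnt);
	uncharge_sketch(&c, 512);
	return 0;
}

Charging first and checking afterwards is what keeps the kernel's fast path to a single
atomic add; the cost is the bounded over-charge window noted in the changelog above.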
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile         1
-rw-r--r--  mm/memcontrol.c   633
-rw-r--r--  mm/page_counter.c 207
3 files changed, 503 insertions, 338 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..6d9f40e922f7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
+obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d6ac0e33e150..4129ad74e93b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28#include <linux/res_counter.h> 28#include <linux/page_counter.h>
29#include <linux/memcontrol.h> 29#include <linux/memcontrol.h>
30#include <linux/cgroup.h> 30#include <linux/cgroup.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
@@ -165,7 +165,7 @@ struct mem_cgroup_per_zone {
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
166 166
167 struct rb_node tree_node; /* RB tree node */ 167 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 168 unsigned long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 169 /* the soft limit is exceeded*/
170 bool on_tree; 170 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 171 struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +198,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 198
199struct mem_cgroup_threshold { 199struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 200 struct eventfd_ctx *eventfd;
201 u64 threshold; 201 unsigned long threshold;
202}; 202};
203 203
204/* For threshold */ 204/* For threshold */
@@ -284,10 +284,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
284 */ 284 */
285struct mem_cgroup { 285struct mem_cgroup {
286 struct cgroup_subsys_state css; 286 struct cgroup_subsys_state css;
287 /* 287
288 * the counter to account for memory usage 288 /* Accounted resources */
289 */ 289 struct page_counter memory;
290 struct res_counter res; 290 struct page_counter memsw;
291 struct page_counter kmem;
292
293 unsigned long soft_limit;
291 294
292 /* vmpressure notifications */ 295 /* vmpressure notifications */
293 struct vmpressure vmpressure; 296 struct vmpressure vmpressure;
@@ -296,15 +299,6 @@ struct mem_cgroup {
296 int initialized; 299 int initialized;
297 300
298 /* 301 /*
299 * the counter to account for mem+swap usage.
300 */
301 struct res_counter memsw;
302
303 /*
304 * the counter to account for kernel memory usage.
305 */
306 struct res_counter kmem;
307 /*
308 * Should the accounting and control be hierarchical, per subtree? 302 * Should the accounting and control be hierarchical, per subtree?
309 */ 303 */
310 bool use_hierarchy; 304 bool use_hierarchy;
@@ -650,7 +644,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
650 * This check can't live in kmem destruction function, 644 * This check can't live in kmem destruction function,
651 * since the charges will outlive the cgroup 645 * since the charges will outlive the cgroup
652 */ 646 */
653 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 647 WARN_ON(page_counter_read(&memcg->kmem));
654} 648}
655#else 649#else
656static void disarm_kmem_keys(struct mem_cgroup *memcg) 650static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -706,7 +700,7 @@ soft_limit_tree_from_page(struct page *page)
706 700
707static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 701static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
708 struct mem_cgroup_tree_per_zone *mctz, 702 struct mem_cgroup_tree_per_zone *mctz,
709 unsigned long long new_usage_in_excess) 703 unsigned long new_usage_in_excess)
710{ 704{
711 struct rb_node **p = &mctz->rb_root.rb_node; 705 struct rb_node **p = &mctz->rb_root.rb_node;
712 struct rb_node *parent = NULL; 706 struct rb_node *parent = NULL;
@@ -755,10 +749,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 spin_unlock_irqrestore(&mctz->lock, flags); 749 spin_unlock_irqrestore(&mctz->lock, flags);
756} 750}
757 751
752static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
753{
754 unsigned long nr_pages = page_counter_read(&memcg->memory);
755 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
756 unsigned long excess = 0;
757
758 if (nr_pages > soft_limit)
759 excess = nr_pages - soft_limit;
760
761 return excess;
762}
758 763
759static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 764static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
760{ 765{
761 unsigned long long excess; 766 unsigned long excess;
762 struct mem_cgroup_per_zone *mz; 767 struct mem_cgroup_per_zone *mz;
763 struct mem_cgroup_tree_per_zone *mctz; 768 struct mem_cgroup_tree_per_zone *mctz;
764 769
@@ -769,7 +774,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
769 */ 774 */
770 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 775 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
771 mz = mem_cgroup_page_zoneinfo(memcg, page); 776 mz = mem_cgroup_page_zoneinfo(memcg, page);
772 excess = res_counter_soft_limit_excess(&memcg->res); 777 excess = soft_limit_excess(memcg);
773 /* 778 /*
774 * We have to update the tree if mz is on RB-tree or 779 * We have to update the tree if mz is on RB-tree or
775 * mem is over its softlimit. 780 * mem is over its softlimit.
@@ -825,7 +830,7 @@ retry:
825 * position in the tree. 830 * position in the tree.
826 */ 831 */
827 __mem_cgroup_remove_exceeded(mz, mctz); 832 __mem_cgroup_remove_exceeded(mz, mctz);
828 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 833 if (!soft_limit_excess(mz->memcg) ||
829 !css_tryget_online(&mz->memcg->css)) 834 !css_tryget_online(&mz->memcg->css))
830 goto retry; 835 goto retry;
831done: 836done:
@@ -1492,7 +1497,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1492 return inactive * inactive_ratio < active; 1497 return inactive * inactive_ratio < active;
1493} 1498}
1494 1499
1495#define mem_cgroup_from_res_counter(counter, member) \ 1500#define mem_cgroup_from_counter(counter, member) \
1496 container_of(counter, struct mem_cgroup, member) 1501 container_of(counter, struct mem_cgroup, member)
1497 1502
1498/** 1503/**
@@ -1504,12 +1509,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 */ 1509 */
1505static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1510static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1506{ 1511{
1507 unsigned long long margin; 1512 unsigned long margin = 0;
1513 unsigned long count;
1514 unsigned long limit;
1508 1515
1509 margin = res_counter_margin(&memcg->res); 1516 count = page_counter_read(&memcg->memory);
1510 if (do_swap_account) 1517 limit = ACCESS_ONCE(memcg->memory.limit);
1511 margin = min(margin, res_counter_margin(&memcg->memsw)); 1518 if (count < limit)
1512 return margin >> PAGE_SHIFT; 1519 margin = limit - count;
1520
1521 if (do_swap_account) {
1522 count = page_counter_read(&memcg->memsw);
1523 limit = ACCESS_ONCE(memcg->memsw.limit);
1524 if (count <= limit)
1525 margin = min(margin, limit - count);
1526 }
1527
1528 return margin;
1513} 1529}
1514 1530
1515int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1531int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1644,18 +1660,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1644 1660
1645 rcu_read_unlock(); 1661 rcu_read_unlock();
1646 1662
1647 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1663 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1648 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1664 K((u64)page_counter_read(&memcg->memory)),
1649 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1665 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1650 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1666 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1651 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1667 K((u64)page_counter_read(&memcg->memsw)),
1652 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1668 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1653 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1669 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1654 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1670 K((u64)page_counter_read(&memcg->kmem)),
1655 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1671 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1656 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1657 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1658 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1659 1672
1660 for_each_mem_cgroup_tree(iter, memcg) { 1673 for_each_mem_cgroup_tree(iter, memcg) {
1661 pr_info("Memory cgroup stats for "); 1674 pr_info("Memory cgroup stats for ");
@@ -1695,28 +1708,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1695/* 1708/*
1696 * Return the memory (and swap, if configured) limit for a memcg. 1709 * Return the memory (and swap, if configured) limit for a memcg.
1697 */ 1710 */
1698static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1711static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1699{ 1712{
1700 u64 limit; 1713 unsigned long limit;
1701 1714
1702 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1715 limit = memcg->memory.limit;
1703
1704 /*
1705 * Do not consider swap space if we cannot swap due to swappiness
1706 */
1707 if (mem_cgroup_swappiness(memcg)) { 1716 if (mem_cgroup_swappiness(memcg)) {
1708 u64 memsw; 1717 unsigned long memsw_limit;
1709 1718
1710 limit += total_swap_pages << PAGE_SHIFT; 1719 memsw_limit = memcg->memsw.limit;
1711 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1720 limit = min(limit + total_swap_pages, memsw_limit);
1712
1713 /*
1714 * If memsw is finite and limits the amount of swap space
1715 * available to this memcg, return that limit.
1716 */
1717 limit = min(limit, memsw);
1718 } 1721 }
1719
1720 return limit; 1722 return limit;
1721} 1723}
1722 1724
@@ -1740,7 +1742,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1740 } 1742 }
1741 1743
1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1744 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1743 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1745 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1744 for_each_mem_cgroup_tree(iter, memcg) { 1746 for_each_mem_cgroup_tree(iter, memcg) {
1745 struct css_task_iter it; 1747 struct css_task_iter it;
1746 struct task_struct *task; 1748 struct task_struct *task;
@@ -1943,7 +1945,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1943 .priority = 0, 1945 .priority = 0,
1944 }; 1946 };
1945 1947
1946 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1948 excess = soft_limit_excess(root_memcg);
1947 1949
1948 while (1) { 1950 while (1) {
1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1951 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1974,7 +1976,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1976 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1975 zone, &nr_scanned); 1977 zone, &nr_scanned);
1976 *total_scanned += nr_scanned; 1978 *total_scanned += nr_scanned;
1977 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1979 if (!soft_limit_excess(root_memcg))
1978 break; 1980 break;
1979 } 1981 }
1980 mem_cgroup_iter_break(root_memcg, victim); 1982 mem_cgroup_iter_break(root_memcg, victim);
@@ -2316,33 +2318,31 @@ static DEFINE_MUTEX(percpu_charge_mutex);
2316static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2318static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2317{ 2319{
2318 struct memcg_stock_pcp *stock; 2320 struct memcg_stock_pcp *stock;
2319 bool ret = true; 2321 bool ret = false;
2320 2322
2321 if (nr_pages > CHARGE_BATCH) 2323 if (nr_pages > CHARGE_BATCH)
2322 return false; 2324 return ret;
2323 2325
2324 stock = &get_cpu_var(memcg_stock); 2326 stock = &get_cpu_var(memcg_stock);
2325 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2327 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2326 stock->nr_pages -= nr_pages; 2328 stock->nr_pages -= nr_pages;
2327 else /* need to call res_counter_charge */ 2329 ret = true;
2328 ret = false; 2330 }
2329 put_cpu_var(memcg_stock); 2331 put_cpu_var(memcg_stock);
2330 return ret; 2332 return ret;
2331} 2333}
2332 2334
2333/* 2335/*
2334 * Returns stocks cached in percpu to res_counter and reset cached information. 2336 * Returns stocks cached in percpu and reset cached information.
2335 */ 2337 */
2336static void drain_stock(struct memcg_stock_pcp *stock) 2338static void drain_stock(struct memcg_stock_pcp *stock)
2337{ 2339{
2338 struct mem_cgroup *old = stock->cached; 2340 struct mem_cgroup *old = stock->cached;
2339 2341
2340 if (stock->nr_pages) { 2342 if (stock->nr_pages) {
2341 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2343 page_counter_uncharge(&old->memory, stock->nr_pages);
2342
2343 res_counter_uncharge(&old->res, bytes);
2344 if (do_swap_account) 2344 if (do_swap_account)
2345 res_counter_uncharge(&old->memsw, bytes); 2345 page_counter_uncharge(&old->memsw, stock->nr_pages);
2346 stock->nr_pages = 0; 2346 stock->nr_pages = 0;
2347 } 2347 }
2348 stock->cached = NULL; 2348 stock->cached = NULL;
@@ -2371,7 +2371,7 @@ static void __init memcg_stock_init(void)
2371} 2371}
2372 2372
2373/* 2373/*
2374 * Cache charges(val) which is from res_counter, to local per_cpu area. 2374 * Cache charges(val) to local per_cpu area.
2375 * This will be consumed by consume_stock() function, later. 2375 * This will be consumed by consume_stock() function, later.
2376 */ 2376 */
2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2431,8 +2431,7 @@ out:
2431/* 2431/*
2432 * Tries to drain stocked charges in other cpus. This function is asynchronous 2432 * Tries to drain stocked charges in other cpus. This function is asynchronous
2433 * and just put a work per cpu for draining localy on each cpu. Caller can 2433 * and just put a work per cpu for draining localy on each cpu. Caller can
2434 * expects some charges will be back to res_counter later but cannot wait for 2434 * expects some charges will be back later but cannot wait for it.
2435 * it.
2436 */ 2435 */
2437static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2436static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2438{ 2437{
@@ -2506,9 +2505,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2506 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2505 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2506 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2508 struct mem_cgroup *mem_over_limit; 2507 struct mem_cgroup *mem_over_limit;
2509 struct res_counter *fail_res; 2508 struct page_counter *counter;
2510 unsigned long nr_reclaimed; 2509 unsigned long nr_reclaimed;
2511 unsigned long long size;
2512 bool may_swap = true; 2510 bool may_swap = true;
2513 bool drained = false; 2511 bool drained = false;
2514 int ret = 0; 2512 int ret = 0;
@@ -2519,16 +2517,15 @@ retry:
2519 if (consume_stock(memcg, nr_pages)) 2517 if (consume_stock(memcg, nr_pages))
2520 goto done; 2518 goto done;
2521 2519
2522 size = batch * PAGE_SIZE;
2523 if (!do_swap_account || 2520 if (!do_swap_account ||
2524 !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2521 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2525 if (!res_counter_charge(&memcg->res, size, &fail_res)) 2522 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2526 goto done_restock; 2523 goto done_restock;
2527 if (do_swap_account) 2524 if (do_swap_account)
2528 res_counter_uncharge(&memcg->memsw, size); 2525 page_counter_uncharge(&memcg->memsw, batch);
2529 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2526 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2530 } else { 2527 } else {
2531 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2528 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2532 may_swap = false; 2529 may_swap = false;
2533 } 2530 }
2534 2531
@@ -2611,32 +2608,12 @@ done:
2611 2608
2612static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2609static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2613{ 2610{
2614 unsigned long bytes = nr_pages * PAGE_SIZE;
2615
2616 if (mem_cgroup_is_root(memcg)) 2611 if (mem_cgroup_is_root(memcg))
2617 return; 2612 return;
2618 2613
2619 res_counter_uncharge(&memcg->res, bytes); 2614 page_counter_uncharge(&memcg->memory, nr_pages);
2620 if (do_swap_account) 2615 if (do_swap_account)
2621 res_counter_uncharge(&memcg->memsw, bytes); 2616 page_counter_uncharge(&memcg->memsw, nr_pages);
2622}
2623
2624/*
2625 * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
2626 * This is useful when moving usage to parent cgroup.
2627 */
2628static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 unsigned long bytes = nr_pages * PAGE_SIZE;
2632
2633 if (mem_cgroup_is_root(memcg))
2634 return;
2635
2636 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2637 if (do_swap_account)
2638 res_counter_uncharge_until(&memcg->memsw,
2639 memcg->memsw.parent, bytes);
2640} 2617}
2641 2618
2642/* 2619/*
@@ -2760,8 +2737,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2760 unlock_page_lru(page, isolated); 2737 unlock_page_lru(page, isolated);
2761} 2738}
2762 2739
2763static DEFINE_MUTEX(set_limit_mutex);
2764
2765#ifdef CONFIG_MEMCG_KMEM 2740#ifdef CONFIG_MEMCG_KMEM
2766/* 2741/*
2767 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2742 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2804,16 +2779,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2804} 2779}
2805#endif 2780#endif
2806 2781
2807static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2782static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2783 unsigned long nr_pages)
2808{ 2784{
2809 struct res_counter *fail_res; 2785 struct page_counter *counter;
2810 int ret = 0; 2786 int ret = 0;
2811 2787
2812 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2788 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2813 if (ret) 2789 if (ret < 0)
2814 return ret; 2790 return ret;
2815 2791
2816 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2792 ret = try_charge(memcg, gfp, nr_pages);
2817 if (ret == -EINTR) { 2793 if (ret == -EINTR) {
2818 /* 2794 /*
2819 * try_charge() chose to bypass to root due to OOM kill or 2795 * try_charge() chose to bypass to root due to OOM kill or
@@ -2830,25 +2806,25 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2830 * when the allocation triggers should have been already 2806 * when the allocation triggers should have been already
2831 * directed to the root cgroup in memcontrol.h 2807 * directed to the root cgroup in memcontrol.h
2832 */ 2808 */
2833 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2809 page_counter_charge(&memcg->memory, nr_pages);
2834 if (do_swap_account) 2810 if (do_swap_account)
2835 res_counter_charge_nofail(&memcg->memsw, size, 2811 page_counter_charge(&memcg->memsw, nr_pages);
2836 &fail_res);
2837 ret = 0; 2812 ret = 0;
2838 } else if (ret) 2813 } else if (ret)
2839 res_counter_uncharge(&memcg->kmem, size); 2814 page_counter_uncharge(&memcg->kmem, nr_pages);
2840 2815
2841 return ret; 2816 return ret;
2842} 2817}
2843 2818
2844static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2819static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2820 unsigned long nr_pages)
2845{ 2821{
2846 res_counter_uncharge(&memcg->res, size); 2822 page_counter_uncharge(&memcg->memory, nr_pages);
2847 if (do_swap_account) 2823 if (do_swap_account)
2848 res_counter_uncharge(&memcg->memsw, size); 2824 page_counter_uncharge(&memcg->memsw, nr_pages);
2849 2825
2850 /* Not down to 0 */ 2826 /* Not down to 0 */
2851 if (res_counter_uncharge(&memcg->kmem, size)) 2827 if (page_counter_uncharge(&memcg->kmem, nr_pages))
2852 return; 2828 return;
2853 2829
2854 /* 2830 /*
@@ -3124,19 +3100,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3124 3100
3125int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3101int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3126{ 3102{
3103 unsigned int nr_pages = 1 << order;
3127 int res; 3104 int res;
3128 3105
3129 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 3106 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3130 PAGE_SIZE << order);
3131 if (!res) 3107 if (!res)
3132 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 3108 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3133 return res; 3109 return res;
3134} 3110}
3135 3111
3136void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3112void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3137{ 3113{
3138 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 3114 unsigned int nr_pages = 1 << order;
3139 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 3115
3116 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
3117 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3140} 3118}
3141 3119
3142/* 3120/*
@@ -3257,7 +3235,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3257 return true; 3235 return true;
3258 } 3236 }
3259 3237
3260 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3238 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3261 if (!ret) 3239 if (!ret)
3262 *_memcg = memcg; 3240 *_memcg = memcg;
3263 3241
@@ -3274,7 +3252,7 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3274 3252
3275 /* The page allocation failed. Revert */ 3253 /* The page allocation failed. Revert */
3276 if (!page) { 3254 if (!page) {
3277 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3255 memcg_uncharge_kmem(memcg, 1 << order);
3278 return; 3256 return;
3279 } 3257 }
3280 /* 3258 /*
@@ -3307,7 +3285,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3307 return; 3285 return;
3308 3286
3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3287 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3310 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3288 memcg_uncharge_kmem(memcg, 1 << order);
3311} 3289}
3312#else 3290#else
3313static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3291static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3485,8 +3463,12 @@ static int mem_cgroup_move_parent(struct page *page,
3485 3463
3486 ret = mem_cgroup_move_account(page, nr_pages, 3464 ret = mem_cgroup_move_account(page, nr_pages,
3487 pc, child, parent); 3465 pc, child, parent);
3488 if (!ret) 3466 if (!ret) {
3489 __mem_cgroup_cancel_local_charge(child, nr_pages); 3467 /* Take charge off the local counters */
3468 page_counter_cancel(&child->memory, nr_pages);
3469 if (do_swap_account)
3470 page_counter_cancel(&child->memsw, nr_pages);
3471 }
3490 3472
3491 if (nr_pages > 1) 3473 if (nr_pages > 1)
3492 compound_unlock_irqrestore(page, flags); 3474 compound_unlock_irqrestore(page, flags);
@@ -3516,7 +3498,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3516 * 3498 *
3517 * Returns 0 on success, -EINVAL on failure. 3499 * Returns 0 on success, -EINVAL on failure.
3518 * 3500 *
3519 * The caller must have charged to @to, IOW, called res_counter_charge() about 3501 * The caller must have charged to @to, IOW, called page_counter_charge() about
3520 * both res and memsw, and called css_get(). 3502 * both res and memsw, and called css_get().
3521 */ 3503 */
3522static int mem_cgroup_move_swap_account(swp_entry_t entry, 3504static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3514,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3532 mem_cgroup_swap_statistics(to, true); 3514 mem_cgroup_swap_statistics(to, true);
3533 /* 3515 /*
3534 * This function is only called from task migration context now. 3516 * This function is only called from task migration context now.
3535 * It postpones res_counter and refcount handling till the end 3517 * It postpones page_counter and refcount handling till the end
3536 * of task migration(mem_cgroup_clear_mc()) for performance 3518 * of task migration(mem_cgroup_clear_mc()) for performance
3537 * improvement. But we cannot postpone css_get(to) because if 3519 * improvement. But we cannot postpone css_get(to) because if
3538 * the process that has been moved to @to does swap-in, the 3520 * the process that has been moved to @to does swap-in, the
@@ -3590,60 +3572,57 @@ void mem_cgroup_print_bad_page(struct page *page)
3590} 3572}
3591#endif 3573#endif
3592 3574
3575static DEFINE_MUTEX(memcg_limit_mutex);
3576
3593static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3577static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3594 unsigned long long val) 3578 unsigned long limit)
3595{ 3579{
3580 unsigned long curusage;
3581 unsigned long oldusage;
3582 bool enlarge = false;
3596 int retry_count; 3583 int retry_count;
3597 int ret = 0; 3584 int ret;
3598 int children = mem_cgroup_count_children(memcg);
3599 u64 curusage, oldusage;
3600 int enlarge;
3601 3585
3602 /* 3586 /*
3603 * For keeping hierarchical_reclaim simple, how long we should retry 3587 * For keeping hierarchical_reclaim simple, how long we should retry
3604 * is depends on callers. We set our retry-count to be function 3588 * is depends on callers. We set our retry-count to be function
3605 * of # of children which we should visit in this loop. 3589 * of # of children which we should visit in this loop.
3606 */ 3590 */
3607 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3591 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3592 mem_cgroup_count_children(memcg);
3608 3593
3609 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3594 oldusage = page_counter_read(&memcg->memory);
3610 3595
3611 enlarge = 0; 3596 do {
3612 while (retry_count) {
3613 if (signal_pending(current)) { 3597 if (signal_pending(current)) {
3614 ret = -EINTR; 3598 ret = -EINTR;
3615 break; 3599 break;
3616 } 3600 }
3617 /* 3601
3618 * Rather than hide all in some function, I do this in 3602 mutex_lock(&memcg_limit_mutex);
3619 * open coded manner. You see what this really does. 3603 if (limit > memcg->memsw.limit) {
3620 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3604 mutex_unlock(&memcg_limit_mutex);
3621 */
3622 mutex_lock(&set_limit_mutex);
3623 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3624 ret = -EINVAL; 3605 ret = -EINVAL;
3625 mutex_unlock(&set_limit_mutex);
3626 break; 3606 break;
3627 } 3607 }
3628 3608 if (limit > memcg->memory.limit)
3629 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3609 enlarge = true;
3630 enlarge = 1; 3610 ret = page_counter_limit(&memcg->memory, limit);
3631 3611 mutex_unlock(&memcg_limit_mutex);
3632 ret = res_counter_set_limit(&memcg->res, val);
3633 mutex_unlock(&set_limit_mutex);
3634 3612
3635 if (!ret) 3613 if (!ret)
3636 break; 3614 break;
3637 3615
3638 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3616 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3639 3617
3640 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3618 curusage = page_counter_read(&memcg->memory);
3641 /* Usage is reduced ? */ 3619 /* Usage is reduced ? */
3642 if (curusage >= oldusage) 3620 if (curusage >= oldusage)
3643 retry_count--; 3621 retry_count--;
3644 else 3622 else
3645 oldusage = curusage; 3623 oldusage = curusage;
3646 } 3624 } while (retry_count);
3625
3647 if (!ret && enlarge) 3626 if (!ret && enlarge)
3648 memcg_oom_recover(memcg); 3627 memcg_oom_recover(memcg);
3649 3628
@@ -3651,52 +3630,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3651} 3630}
3652 3631
3653static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3632static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3654 unsigned long long val) 3633 unsigned long limit)
3655{ 3634{
3635 unsigned long curusage;
3636 unsigned long oldusage;
3637 bool enlarge = false;
3656 int retry_count; 3638 int retry_count;
3657 u64 oldusage, curusage; 3639 int ret;
3658 int children = mem_cgroup_count_children(memcg);
3659 int ret = -EBUSY;
3660 int enlarge = 0;
3661 3640
3662 /* see mem_cgroup_resize_res_limit */ 3641 /* see mem_cgroup_resize_res_limit */
3663 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3642 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3664 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3643 mem_cgroup_count_children(memcg);
3665 while (retry_count) { 3644
3645 oldusage = page_counter_read(&memcg->memsw);
3646
3647 do {
3666 if (signal_pending(current)) { 3648 if (signal_pending(current)) {
3667 ret = -EINTR; 3649 ret = -EINTR;
3668 break; 3650 break;
3669 } 3651 }
3670 /* 3652
3671 * Rather than hide all in some function, I do this in 3653 mutex_lock(&memcg_limit_mutex);
3672 * open coded manner. You see what this really does. 3654 if (limit < memcg->memory.limit) {
3673 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3655 mutex_unlock(&memcg_limit_mutex);
3674 */
3675 mutex_lock(&set_limit_mutex);
3676 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3677 ret = -EINVAL; 3656 ret = -EINVAL;
3678 mutex_unlock(&set_limit_mutex);
3679 break; 3657 break;
3680 } 3658 }
3681 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3659 if (limit > memcg->memsw.limit)
3682 enlarge = 1; 3660 enlarge = true;
3683 ret = res_counter_set_limit(&memcg->memsw, val); 3661 ret = page_counter_limit(&memcg->memsw, limit);
3684 mutex_unlock(&set_limit_mutex); 3662 mutex_unlock(&memcg_limit_mutex);
3685 3663
3686 if (!ret) 3664 if (!ret)
3687 break; 3665 break;
3688 3666
3689 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3667 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3690 3668
3691 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3669 curusage = page_counter_read(&memcg->memsw);
3692 /* Usage is reduced ? */ 3670 /* Usage is reduced ? */
3693 if (curusage >= oldusage) 3671 if (curusage >= oldusage)
3694 retry_count--; 3672 retry_count--;
3695 else 3673 else
3696 oldusage = curusage; 3674 oldusage = curusage;
3697 } 3675 } while (retry_count);
3676
3698 if (!ret && enlarge) 3677 if (!ret && enlarge)
3699 memcg_oom_recover(memcg); 3678 memcg_oom_recover(memcg);
3679
3700 return ret; 3680 return ret;
3701} 3681}
3702 3682
@@ -3709,7 +3689,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3709 unsigned long reclaimed; 3689 unsigned long reclaimed;
3710 int loop = 0; 3690 int loop = 0;
3711 struct mem_cgroup_tree_per_zone *mctz; 3691 struct mem_cgroup_tree_per_zone *mctz;
3712 unsigned long long excess; 3692 unsigned long excess;
3713 unsigned long nr_scanned; 3693 unsigned long nr_scanned;
3714 3694
3715 if (order > 0) 3695 if (order > 0)
@@ -3763,7 +3743,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3763 } while (1); 3743 } while (1);
3764 } 3744 }
3765 __mem_cgroup_remove_exceeded(mz, mctz); 3745 __mem_cgroup_remove_exceeded(mz, mctz);
3766 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3746 excess = soft_limit_excess(mz->memcg);
3767 /* 3747 /*
3768 * One school of thought says that we should not add 3748 * One school of thought says that we should not add
3769 * back the node to the tree if reclaim returns 0. 3749 * back the node to the tree if reclaim returns 0.
@@ -3856,7 +3836,6 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3856static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3836static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3857{ 3837{
3858 int node, zid; 3838 int node, zid;
3859 u64 usage;
3860 3839
3861 do { 3840 do {
3862 /* This is for making all *used* pages to be on LRU. */ 3841 /* This is for making all *used* pages to be on LRU. */
@@ -3888,9 +3867,8 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3888 * right after the check. RES_USAGE should be safe as we always 3867 * right after the check. RES_USAGE should be safe as we always
3889 * charge before adding to the LRU. 3868 * charge before adding to the LRU.
3890 */ 3869 */
3891 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 3870 } while (page_counter_read(&memcg->memory) -
3892 res_counter_read_u64(&memcg->kmem, RES_USAGE); 3871 page_counter_read(&memcg->kmem) > 0);
3893 } while (usage > 0);
3894} 3872}
3895 3873
3896/* 3874/*
@@ -3930,7 +3908,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3930 /* we call try-to-free pages for make this cgroup empty */ 3908 /* we call try-to-free pages for make this cgroup empty */
3931 lru_add_drain_all(); 3909 lru_add_drain_all();
3932 /* try to free all pages in this cgroup */ 3910 /* try to free all pages in this cgroup */
3933 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3911 while (nr_retries && page_counter_read(&memcg->memory)) {
3934 int progress; 3912 int progress;
3935 3913
3936 if (signal_pending(current)) 3914 if (signal_pending(current))
@@ -4001,8 +3979,8 @@ out:
4001 return retval; 3979 return retval;
4002} 3980}
4003 3981
4004static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3982static unsigned long tree_stat(struct mem_cgroup *memcg,
4005 enum mem_cgroup_stat_index idx) 3983 enum mem_cgroup_stat_index idx)
4006{ 3984{
4007 struct mem_cgroup *iter; 3985 struct mem_cgroup *iter;
4008 long val = 0; 3986 long val = 0;
@@ -4020,55 +3998,72 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4020{ 3998{
4021 u64 val; 3999 u64 val;
4022 4000
4023 if (!mem_cgroup_is_root(memcg)) { 4001 if (mem_cgroup_is_root(memcg)) {
4002 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
4003 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
4004 if (swap)
4005 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
4006 } else {
4024 if (!swap) 4007 if (!swap)
4025 return res_counter_read_u64(&memcg->res, RES_USAGE); 4008 val = page_counter_read(&memcg->memory);
4026 else 4009 else
4027 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4010 val = page_counter_read(&memcg->memsw);
4028 } 4011 }
4029
4030 /*
4031 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4032 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4033 */
4034 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4035 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4036
4037 if (swap)
4038 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4039
4040 return val << PAGE_SHIFT; 4012 return val << PAGE_SHIFT;
4041} 4013}
4042 4014
4015enum {
4016 RES_USAGE,
4017 RES_LIMIT,
4018 RES_MAX_USAGE,
4019 RES_FAILCNT,
4020 RES_SOFT_LIMIT,
4021};
4043 4022
4044static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4023static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4045 struct cftype *cft) 4024 struct cftype *cft)
4046{ 4025{
4047 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4026 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4048 enum res_type type = MEMFILE_TYPE(cft->private); 4027 struct page_counter *counter;
4049 int name = MEMFILE_ATTR(cft->private);
4050 4028
4051 switch (type) { 4029 switch (MEMFILE_TYPE(cft->private)) {
4052 case _MEM: 4030 case _MEM:
4053 if (name == RES_USAGE) 4031 counter = &memcg->memory;
4054 return mem_cgroup_usage(memcg, false); 4032 break;
4055 return res_counter_read_u64(&memcg->res, name);
4056 case _MEMSWAP: 4033 case _MEMSWAP:
4057 if (name == RES_USAGE) 4034 counter = &memcg->memsw;
4058 return mem_cgroup_usage(memcg, true); 4035 break;
4059 return res_counter_read_u64(&memcg->memsw, name);
4060 case _KMEM: 4036 case _KMEM:
4061 return res_counter_read_u64(&memcg->kmem, name); 4037 counter = &memcg->kmem;
4062 break; 4038 break;
4063 default: 4039 default:
4064 BUG(); 4040 BUG();
4065 } 4041 }
4042
4043 switch (MEMFILE_ATTR(cft->private)) {
4044 case RES_USAGE:
4045 if (counter == &memcg->memory)
4046 return mem_cgroup_usage(memcg, false);
4047 if (counter == &memcg->memsw)
4048 return mem_cgroup_usage(memcg, true);
4049 return (u64)page_counter_read(counter) * PAGE_SIZE;
4050 case RES_LIMIT:
4051 return (u64)counter->limit * PAGE_SIZE;
4052 case RES_MAX_USAGE:
4053 return (u64)counter->watermark * PAGE_SIZE;
4054 case RES_FAILCNT:
4055 return counter->failcnt;
4056 case RES_SOFT_LIMIT:
4057 return (u64)memcg->soft_limit * PAGE_SIZE;
4058 default:
4059 BUG();
4060 }
4066} 4061}
4067 4062
4068#ifdef CONFIG_MEMCG_KMEM 4063#ifdef CONFIG_MEMCG_KMEM
4069/* should be called with activate_kmem_mutex held */ 4064/* should be called with activate_kmem_mutex held */
4070static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4065static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4071 unsigned long long limit) 4066 unsigned long nr_pages)
4072{ 4067{
4073 int err = 0; 4068 int err = 0;
4074 int memcg_id; 4069 int memcg_id;
@@ -4115,7 +4110,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4115 * We couldn't have accounted to this cgroup, because it hasn't got the 4110 * We couldn't have accounted to this cgroup, because it hasn't got the
4116 * active bit set yet, so this should succeed. 4111 * active bit set yet, so this should succeed.
4117 */ 4112 */
4118 err = res_counter_set_limit(&memcg->kmem, limit); 4113 err = page_counter_limit(&memcg->kmem, nr_pages);
4119 VM_BUG_ON(err); 4114 VM_BUG_ON(err);
4120 4115
4121 static_key_slow_inc(&memcg_kmem_enabled_key); 4116 static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -4131,25 +4126,27 @@ out:
4131} 4126}
4132 4127
4133static int memcg_activate_kmem(struct mem_cgroup *memcg, 4128static int memcg_activate_kmem(struct mem_cgroup *memcg,
4134 unsigned long long limit) 4129 unsigned long nr_pages)
4135{ 4130{
4136 int ret; 4131 int ret;
4137 4132
4138 mutex_lock(&activate_kmem_mutex); 4133 mutex_lock(&activate_kmem_mutex);
4139 ret = __memcg_activate_kmem(memcg, limit); 4134 ret = __memcg_activate_kmem(memcg, nr_pages);
4140 mutex_unlock(&activate_kmem_mutex); 4135 mutex_unlock(&activate_kmem_mutex);
4141 return ret; 4136 return ret;
4142} 4137}
4143 4138
4144static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4139static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4145 unsigned long long val) 4140 unsigned long limit)
4146{ 4141{
4147 int ret; 4142 int ret;
4148 4143
4144 mutex_lock(&memcg_limit_mutex);
4149 if (!memcg_kmem_is_active(memcg)) 4145 if (!memcg_kmem_is_active(memcg))
4150 ret = memcg_activate_kmem(memcg, val); 4146 ret = memcg_activate_kmem(memcg, limit);
4151 else 4147 else
4152 ret = res_counter_set_limit(&memcg->kmem, val); 4148 ret = page_counter_limit(&memcg->kmem, limit);
4149 mutex_unlock(&memcg_limit_mutex);
4153 return ret; 4150 return ret;
4154} 4151}
4155 4152
@@ -4167,13 +4164,13 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4167 * after this point, because it has at least one child already. 4164 * after this point, because it has at least one child already.
4168 */ 4165 */
4169 if (memcg_kmem_is_active(parent)) 4166 if (memcg_kmem_is_active(parent))
4170 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 4167 ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4171 mutex_unlock(&activate_kmem_mutex); 4168 mutex_unlock(&activate_kmem_mutex);
4172 return ret; 4169 return ret;
4173} 4170}
4174#else 4171#else
4175static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4172static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4176 unsigned long long val) 4173 unsigned long limit)
4177{ 4174{
4178 return -EINVAL; 4175 return -EINVAL;
4179} 4176}
@@ -4187,110 +4184,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4187 char *buf, size_t nbytes, loff_t off) 4184 char *buf, size_t nbytes, loff_t off)
4188{ 4185{
4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4186 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4190 enum res_type type; 4187 unsigned long nr_pages;
4191 int name;
4192 unsigned long long val;
4193 int ret; 4188 int ret;
4194 4189
4195 buf = strstrip(buf); 4190 buf = strstrip(buf);
4196 type = MEMFILE_TYPE(of_cft(of)->private); 4191 ret = page_counter_memparse(buf, &nr_pages);
4197 name = MEMFILE_ATTR(of_cft(of)->private); 4192 if (ret)
4193 return ret;
4198 4194
4199 switch (name) { 4195 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4200 case RES_LIMIT: 4196 case RES_LIMIT:
4201 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4197 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4202 ret = -EINVAL; 4198 ret = -EINVAL;
4203 break; 4199 break;
4204 } 4200 }
4205 /* This function does all necessary parse...reuse it */ 4201 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4206 ret = res_counter_memparse_write_strategy(buf, &val); 4202 case _MEM:
4207 if (ret) 4203 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4208 break; 4204 break;
4209 if (type == _MEM) 4205 case _MEMSWAP:
4210 ret = mem_cgroup_resize_limit(memcg, val); 4206 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4211 else if (type == _MEMSWAP)
4212 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4213 else if (type == _KMEM)
4214 ret = memcg_update_kmem_limit(memcg, val);
4215 else
4216 return -EINVAL;
4217 break;
4218 case RES_SOFT_LIMIT:
4219 ret = res_counter_memparse_write_strategy(buf, &val);
4220 if (ret)
4221 break; 4207 break;
4222 /* 4208 case _KMEM:
4223 * For memsw, soft limits are hard to implement in terms 4209 ret = memcg_update_kmem_limit(memcg, nr_pages);
4224 * of semantics, for now, we support soft limits for 4210 break;
4225 * control without swap 4211 }
4226 */
4227 if (type == _MEM)
4228 ret = res_counter_set_soft_limit(&memcg->res, val);
4229 else
4230 ret = -EINVAL;
4231 break; 4212 break;
4232 default: 4213 case RES_SOFT_LIMIT:
4233 ret = -EINVAL; /* should be BUG() ? */ 4214 memcg->soft_limit = nr_pages;
4215 ret = 0;
4234 break; 4216 break;
4235 } 4217 }
4236 return ret ?: nbytes; 4218 return ret ?: nbytes;
4237} 4219}
4238 4220
4239static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4240 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4241{
4242 unsigned long long min_limit, min_memsw_limit, tmp;
4243
4244 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4245 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4246 if (!memcg->use_hierarchy)
4247 goto out;
4248
4249 while (memcg->css.parent) {
4250 memcg = mem_cgroup_from_css(memcg->css.parent);
4251 if (!memcg->use_hierarchy)
4252 break;
4253 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4254 min_limit = min(min_limit, tmp);
4255 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4256 min_memsw_limit = min(min_memsw_limit, tmp);
4257 }
4258out:
4259 *mem_limit = min_limit;
4260 *memsw_limit = min_memsw_limit;
4261}
4262
4263static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4221static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4264 size_t nbytes, loff_t off) 4222 size_t nbytes, loff_t off)
4265{ 4223{
4266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4224 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4267 int name; 4225 struct page_counter *counter;
4268 enum res_type type;
4269 4226
4270 type = MEMFILE_TYPE(of_cft(of)->private); 4227 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4271 name = MEMFILE_ATTR(of_cft(of)->private); 4228 case _MEM:
4229 counter = &memcg->memory;
4230 break;
4231 case _MEMSWAP:
4232 counter = &memcg->memsw;
4233 break;
4234 case _KMEM:
4235 counter = &memcg->kmem;
4236 break;
4237 default:
4238 BUG();
4239 }
4272 4240
4273 switch (name) { 4241 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4274 case RES_MAX_USAGE: 4242 case RES_MAX_USAGE:
4275 if (type == _MEM) 4243 page_counter_reset_watermark(counter);
4276 res_counter_reset_max(&memcg->res);
4277 else if (type == _MEMSWAP)
4278 res_counter_reset_max(&memcg->memsw);
4279 else if (type == _KMEM)
4280 res_counter_reset_max(&memcg->kmem);
4281 else
4282 return -EINVAL;
4283 break; 4244 break;
4284 case RES_FAILCNT: 4245 case RES_FAILCNT:
4285 if (type == _MEM) 4246 counter->failcnt = 0;
4286 res_counter_reset_failcnt(&memcg->res);
4287 else if (type == _MEMSWAP)
4288 res_counter_reset_failcnt(&memcg->memsw);
4289 else if (type == _KMEM)
4290 res_counter_reset_failcnt(&memcg->kmem);
4291 else
4292 return -EINVAL;
4293 break; 4247 break;
4248 default:
4249 BUG();
4294 } 4250 }
4295 4251
4296 return nbytes; 4252 return nbytes;
@@ -4387,6 +4343,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4387static int memcg_stat_show(struct seq_file *m, void *v) 4343static int memcg_stat_show(struct seq_file *m, void *v)
4388{ 4344{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4345 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4346 unsigned long memory, memsw;
4390 struct mem_cgroup *mi; 4347 struct mem_cgroup *mi;
4391 unsigned int i; 4348 unsigned int i;
4392 4349
@@ -4406,14 +4363,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4363 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4407 4364
4408 /* Hierarchical information */ 4365 /* Hierarchical information */
4409 { 4366 memory = memsw = PAGE_COUNTER_MAX;
4410 unsigned long long limit, memsw_limit; 4367 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4411 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4368 memory = min(memory, mi->memory.limit);
4412 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 4369 memsw = min(memsw, mi->memsw.limit);
4413 if (do_swap_account)
4414 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4415 memsw_limit);
4416 } 4370 }
4371 seq_printf(m, "hierarchical_memory_limit %llu\n",
4372 (u64)memory * PAGE_SIZE);
4373 if (do_swap_account)
4374 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4375 (u64)memsw * PAGE_SIZE);
4417 4376
4418 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4377 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4419 long long val = 0; 4378 long long val = 0;
@@ -4497,7 +4456,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4497static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4456static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4498{ 4457{
4499 struct mem_cgroup_threshold_ary *t; 4458 struct mem_cgroup_threshold_ary *t;
4500 u64 usage; 4459 unsigned long usage;
4501 int i; 4460 int i;
4502 4461
4503 rcu_read_lock(); 4462 rcu_read_lock();
@@ -4596,10 +4555,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4596{ 4555{
4597 struct mem_cgroup_thresholds *thresholds; 4556 struct mem_cgroup_thresholds *thresholds;
4598 struct mem_cgroup_threshold_ary *new; 4557 struct mem_cgroup_threshold_ary *new;
4599 u64 threshold, usage; 4558 unsigned long threshold;
4559 unsigned long usage;
4600 int i, size, ret; 4560 int i, size, ret;
4601 4561
4602 ret = res_counter_memparse_write_strategy(args, &threshold); 4562 ret = page_counter_memparse(args, &threshold);
4603 if (ret) 4563 if (ret)
4604 return ret; 4564 return ret;
4605 4565
@@ -4689,7 +4649,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4689{ 4649{
4690 struct mem_cgroup_thresholds *thresholds; 4650 struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-        u64 usage;
+        unsigned long usage;
         int i, j, size;
 
         mutex_lock(&memcg->thresholds_lock);
@@ -4883,7 +4843,7 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 
         memcg_kmem_mark_dead(memcg);
 
-        if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+        if (page_counter_read(&memcg->kmem))
                 return;
 
         if (memcg_kmem_test_and_clear_dead(memcg))
@@ -5363,9 +5323,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
  */
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
-        if (!memcg->res.parent)
+        if (!memcg->memory.parent)
                 return NULL;
-        return mem_cgroup_from_res_counter(memcg->res.parent, res);
+        return mem_cgroup_from_counter(memcg->memory.parent, memory);
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
@@ -5410,9 +5370,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
         /* root ? */
         if (parent_css == NULL) {
                 root_mem_cgroup = memcg;
-                res_counter_init(&memcg->res, NULL);
-                res_counter_init(&memcg->memsw, NULL);
-                res_counter_init(&memcg->kmem, NULL);
+                page_counter_init(&memcg->memory, NULL);
+                page_counter_init(&memcg->memsw, NULL);
+                page_counter_init(&memcg->kmem, NULL);
         }
 
         memcg->last_scanned_node = MAX_NUMNODES;
@@ -5451,18 +5411,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
         memcg->swappiness = mem_cgroup_swappiness(parent);
 
         if (parent->use_hierarchy) {
-                res_counter_init(&memcg->res, &parent->res);
-                res_counter_init(&memcg->memsw, &parent->memsw);
-                res_counter_init(&memcg->kmem, &parent->kmem);
+                page_counter_init(&memcg->memory, &parent->memory);
+                page_counter_init(&memcg->memsw, &parent->memsw);
+                page_counter_init(&memcg->kmem, &parent->kmem);
 
                 /*
                  * No need to take a reference to the parent because cgroup
                  * core guarantees its existence.
                  */
         } else {
-                res_counter_init(&memcg->res, NULL);
-                res_counter_init(&memcg->memsw, NULL);
-                res_counter_init(&memcg->kmem, NULL);
+                page_counter_init(&memcg->memory, NULL);
+                page_counter_init(&memcg->memsw, NULL);
+                page_counter_init(&memcg->kmem, NULL);
                 /*
                  * Deeper hierachy with use_hierarchy == false doesn't make
                  * much sense so let cgroup subsystem know about this
@@ -5544,7 +5504,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
         /*
          * XXX: css_offline() would be where we should reparent all
          * memory to prepare the cgroup for destruction. However,
-         * memcg does not do css_tryget_online() and res_counter charging
+         * memcg does not do css_tryget_online() and page_counter charging
          * under the same RCU lock region, which means that charging
          * could race with offlining. Offlining only happens to
         * cgroups with no tasks in them but charges can show up
@@ -5564,7 +5524,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
          * call_rcu()
          *   offline_css()
          *     reparent_charges()
-         * res_counter_charge()
+         * page_counter_try_charge()
          * css_put()
          *   css_free()
         * pc->mem_cgroup = dead memcg
@@ -5599,10 +5559,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-        mem_cgroup_resize_limit(memcg, ULLONG_MAX);
-        mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
-        memcg_update_kmem_limit(memcg, ULLONG_MAX);
-        res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+        mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
+        mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
+        memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+        memcg->soft_limit = 0;
 }
 
 #ifdef CONFIG_MMU
@@ -5916,19 +5876,18 @@ static void __mem_cgroup_clear_mc(void)
         if (mc.moved_swap) {
                 /* uncharge swap account from the old cgroup */
                 if (!mem_cgroup_is_root(mc.from))
-                        res_counter_uncharge(&mc.from->memsw,
-                                             PAGE_SIZE * mc.moved_swap);
-
-                for (i = 0; i < mc.moved_swap; i++)
-                        css_put(&mc.from->css);
+                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
 
                 /*
-                 * we charged both to->res and to->memsw, so we should
-                 * uncharge to->res.
+                 * we charged both to->memory and to->memsw, so we
+                 * should uncharge to->memory.
                  */
                 if (!mem_cgroup_is_root(mc.to))
-                        res_counter_uncharge(&mc.to->res,
-                                             PAGE_SIZE * mc.moved_swap);
+                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
+
+                for (i = 0; i < mc.moved_swap; i++)
+                        css_put(&mc.from->css);
+
                 /* we've already done css_get(mc.to) */
                 mc.moved_swap = 0;
         }
@@ -6294,7 +6253,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
         memcg = mem_cgroup_lookup(id);
         if (memcg) {
                 if (!mem_cgroup_is_root(memcg))
-                        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                        page_counter_uncharge(&memcg->memsw, 1);
                 mem_cgroup_swap_statistics(memcg, false);
                 css_put(&memcg->css);
         }
@@ -6460,11 +6419,9 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 
         if (!mem_cgroup_is_root(memcg)) {
                 if (nr_mem)
-                        res_counter_uncharge(&memcg->res,
-                                             nr_mem * PAGE_SIZE);
+                        page_counter_uncharge(&memcg->memory, nr_mem);
                 if (nr_memsw)
-                        res_counter_uncharge(&memcg->memsw,
-                                             nr_memsw * PAGE_SIZE);
+                        page_counter_uncharge(&memcg->memsw, nr_memsw);
                 memcg_oom_recover(memcg);
         }
 
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000000..f0cbc0825426
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,207 @@
+/*
+ * Lockless hierarchical page accounting & limiting
+ *
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/page_counter.h>
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/page.h>
+
+/**
+ * page_counter_cancel - take pages out of the local counter
+ * @counter: counter
+ * @nr_pages: number of pages to cancel
+ *
+ * Returns whether there are remaining pages in the counter.
+ */
+int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+{
+        long new;
+
+        new = atomic_long_sub_return(nr_pages, &counter->count);
+
+        /* More uncharges than charges? */
+        WARN_ON_ONCE(new < 0);
+
+        return new > 0;
+}
+
+/**
+ * page_counter_charge - hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ *
+ * NOTE: This does not consider any configured counter limits.
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+        struct page_counter *c;
+
+        for (c = counter; c; c = c->parent) {
+                long new;
+
+                new = atomic_long_add_return(nr_pages, &c->count);
+                /*
+                 * This is indeed racy, but we can live with some
+                 * inaccuracy in the watermark.
+                 */
+                if (new > c->watermark)
+                        c->watermark = new;
+        }
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ * @fail: points first counter to hit its limit, if any
+ *
+ * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
+ * its ancestors has hit its configured limit.
+ */
+int page_counter_try_charge(struct page_counter *counter,
+                            unsigned long nr_pages,
+                            struct page_counter **fail)
+{
+        struct page_counter *c;
+
+        for (c = counter; c; c = c->parent) {
+                long new;
+                /*
+                 * Charge speculatively to avoid an expensive CAS. If
+                 * a bigger charge fails, it might falsely lock out a
+                 * racing smaller charge and send it into reclaim
+                 * early, but the error is limited to the difference
+                 * between the two sizes, which is less than 2M/4M in
+                 * case of a THP locking out a regular page charge.
+                 *
+                 * The atomic_long_add_return() implies a full memory
+                 * barrier between incrementing the count and reading
+                 * the limit. When racing with page_counter_limit(),
+                 * we either see the new limit or the setter sees the
+                 * counter has changed and retries.
+                 */
+                new = atomic_long_add_return(nr_pages, &c->count);
+                if (new > c->limit) {
+                        atomic_long_sub(nr_pages, &c->count);
+                        /*
+                         * This is racy, but we can live with some
+                         * inaccuracy in the failcnt.
+                         */
+                        c->failcnt++;
+                        *fail = c;
+                        goto failed;
+                }
+                /*
+                 * Just like with failcnt, we can live with some
+                 * inaccuracy in the watermark.
+                 */
+                if (new > c->watermark)
+                        c->watermark = new;
+        }
+        return 0;
+
+failed:
+        for (c = counter; c != *fail; c = c->parent)
+                page_counter_cancel(c, nr_pages);
+
+        return -ENOMEM;
+}
+
+/**
+ * page_counter_uncharge - hierarchically uncharge pages
+ * @counter: counter
+ * @nr_pages: number of pages to uncharge
+ *
+ * Returns whether there are remaining charges in @counter.
+ */
+int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+        struct page_counter *c;
+        int ret = 1;
+
+        for (c = counter; c; c = c->parent) {
+                int remainder;
+
+                remainder = page_counter_cancel(c, nr_pages);
+                if (c == counter && !remainder)
+                        ret = 0;
+        }
+
+        return ret;
+}
+
+/**
+ * page_counter_limit - limit the number of pages allowed
+ * @counter: counter
+ * @limit: limit to set
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_limit(struct page_counter *counter, unsigned long limit)
+{
+        for (;;) {
+                unsigned long old;
+                long count;
+
+                /*
+                 * Update the limit while making sure that it's not
+                 * below the concurrently-changing counter value.
+                 *
+                 * The xchg implies two full memory barriers before
+                 * and after, so the read-swap-read is ordered and
+                 * ensures coherency with page_counter_try_charge():
+                 * that function modifies the count before checking
+                 * the limit, so if it sees the old limit, we see the
+                 * modified counter and retry.
+                 */
+                count = atomic_long_read(&counter->count);
+
+                if (count > limit)
+                        return -EBUSY;
+
+                old = xchg(&counter->limit, limit);
+
+                if (atomic_long_read(&counter->count) <= count)
+                        return 0;
+
+                counter->limit = old;
+                cond_resched();
+        }
+}
+
+/**
+ * page_counter_memparse - memparse() for page counter limits
+ * @buf: string to parse
+ * @nr_pages: returns the result in number of pages
+ *
+ * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
+ * limited to %PAGE_COUNTER_MAX.
+ */
+int page_counter_memparse(const char *buf, unsigned long *nr_pages)
+{
+        char unlimited[] = "-1";
+        char *end;
+        u64 bytes;
+
+        if (!strncmp(buf, unlimited, sizeof(unlimited))) {
+                *nr_pages = PAGE_COUNTER_MAX;
+                return 0;
+        }
+
+        bytes = memparse(buf, &end);
+        if (*end != '\0')
+                return -EINVAL;
+
+        *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+        return 0;
+}
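
Taken as a whole, the new API is small enough to exercise in one place. The sketch below is illustrative only (page_counter_example() is a made-up name, not part of this patch); it wires up a parent/child pair the way the memcg hunks above do, sets a limit parsed from a userspace-style string, and then charges and uncharges against it. page_counter_init() and page_counter_read() are the inline helpers from linux/page_counter.h, the header included at the top of this file:

#include <linux/page_counter.h>
#include <linux/printk.h>

static struct page_counter parent_counter;
static struct page_counter child_counter;

static int page_counter_example(void)
{
        struct page_counter *fail;
        unsigned long limit;

        /* Hierarchy: charges to the child propagate into the parent. */
        page_counter_init(&parent_counter, NULL);
        page_counter_init(&child_counter, &parent_counter);

        /* "64M" -> pages, rounded down; "-1" would mean PAGE_COUNTER_MAX. */
        if (page_counter_memparse("64M", &limit))
                return -EINVAL;

        /* Fails with -EBUSY if usage already exceeds the requested limit. */
        if (page_counter_limit(&child_counter, limit))
                return -EBUSY;

        /* Speculative charge of 512 pages; *fail names the limiting counter. */
        if (page_counter_try_charge(&child_counter, 512, &fail))
                return -ENOMEM;

        pr_info("child usage: %lu pages\n", page_counter_read(&child_counter));

        /* Uncharges the child and, transitively, the parent. */
        page_counter_uncharge(&child_counter, 512);

        return 0;
}

As the kerneldoc above notes, only page_counter_limit() requires its callers to serialize against each other; charging, uncharging and reading the counter are designed to run concurrently without locks.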