author	Balbir Singh <balbir@linux.vnet.ibm.com>	2008-02-07 03:13:57 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:18 -0500
commit	0eea10301708c64a6b793894c156e21ddd15eb64 (patch)
tree	a0dcbe47d48d35ec0554faa5f86068cfab94ca6e
parent	66e1707bc34609f626e2e7b4fe7e454c9748bad5 (diff)
Memory controller: improve user interface
Change the interface to use bytes instead of pages.  Page sizes can vary
across platforms and configurations.  A new strategy routine has been added
to the resource counters infrastructure to format the data as desired.

Suggested by David Rientjes, Andrew Morton and Herbert Poetzl.

Tested on a UML setup with the config for memory control enabled.

[kamezawa.hiroyu@jp.fujitsu.com: possible race fix in res_counter]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/controllers/memory.txt  |  29
-rw-r--r--  include/linux/res_counter.h           |  12
-rw-r--r--  kernel/res_counter.c                  |  36
-rw-r--r--  mm/memcontrol.c                       |  35
4 files changed, 81 insertions, 31 deletions
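The strategy routine mentioned in the commit message is a per-controller callback that res_counter_read()/res_counter_write() invoke to format or parse values; the memory controller uses it to report bytes. The sketch below is a plain userspace approximation of that dispatch, not kernel code: demo_read() and show_in_bytes() are hypothetical names, and only the callback signature is taken from the patch.

/*
 * Userspace sketch of the "strategy" callback added by this patch: a
 * generic read path formats a counter with a caller-supplied routine,
 * falling back to plain "%llu" when no strategy is given, mirroring the
 * new res_counter_read().  All names here are hypothetical.
 */
#include <stdio.h>

/* matches: int (*read_strategy)(unsigned long long val, char *s) */
typedef int (*read_strategy_t)(unsigned long long val, char *s);

static int show_in_bytes(unsigned long long val, char *s)
{
	return sprintf(s, "%llu Bytes\n", val);
}

static void demo_read(unsigned long long usage, read_strategy_t read_strategy)
{
	char buf[64], *s = buf;

	if (read_strategy)
		s += read_strategy(usage, s);		/* controller-specific format */
	else
		s += sprintf(s, "%llu\n", usage);	/* default format */
	fwrite(buf, 1, s - buf, stdout);
}

int main(void)
{
	demo_read(1216512, NULL);		/* prints "1216512" */
	demo_read(1216512, show_in_bytes);	/* prints "1216512 Bytes" */
	return 0;
}

In the patch itself, mem_cgroup_read() passes NULL for the read side, and mem_cgroup_write() supplies mem_cgroup_write_strategy for the write side.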
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 7e27baacca7b..61df8f81c803 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -165,11 +165,30 @@ c. Enable CONFIG_CGROUP_MEM_CONT
 
 Since now we're in the 0 cgroup,
 We can alter the memory limit:
-# echo -n 6000 > /cgroups/0/memory.limit
+# echo -n 4M > /cgroups/0/memory.limit_in_bytes
+
+NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+mega or gigabytes.
+
+# cat /cgroups/0/memory.limit_in_bytes
+4194304 Bytes
+
+NOTE: The interface has now changed to display the usage in bytes
+instead of pages
 
 We can check the usage:
-# cat /cgroups/0/memory.usage
-25
+# cat /cgroups/0/memory.usage_in_bytes
+1216512 Bytes
+
+A successful write to this file does not guarantee a successful set of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel.
+
+# echo -n 1 > memory.limit_in_bytes
+# cat memory.limit_in_bytes
+4096 Bytes
 
 The memory.failcnt field gives the number of times that the cgroup limit was
 exceeded.
@@ -206,8 +225,8 @@ cgroup might have some charge associated with it, even though all
 tasks have migrated away from it. If some pages are still left, after following
 the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in
 /proc/meminfo to see if the Swap Cache usage is showing up in the
-cgroups memory.usage counter. A simple test of swapoff -a and swapon -a
-should free any pending Swap Cache usage.
+cgroups memory.usage_in_bytes counter. A simple test of swapoff -a and
+swapon -a should free any pending Swap Cache usage.
 
 4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)?
 
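The re-read requirement documented above exists because the kernel rounds the written value up to a page boundary before committing it, which is why writing 1 reads back as 4096. The following is a minimal userspace sketch of that arithmetic, assuming 4 KiB pages; parse_size() is a simplified, hypothetical stand-in for the kernel's memparse() suffix handling, covering only the k/K, m/M and g/G suffixes the documentation mentions.

/*
 * Sketch of why "echo -n 1 > memory.limit_in_bytes" reads back as 4096:
 * the parsed value is rounded up to the next page boundary.
 * parse_size() is a simplified stand-in for memparse(); PAGE_SIZE is
 * assumed to be 4096 here.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE  4096ULL
#define PAGE_SHIFT 12

/* accept an optional k/K, m/M or g/G suffix, as the documentation notes */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 10);

	switch (*end) {
	case 'k': case 'K': v <<= 10; break;
	case 'm': case 'M': v <<= 20; break;
	case 'g': case 'G': v <<= 30; break;
	}
	return v;
}

/* the same round-up the patch applies in mem_cgroup_write_strategy() */
static unsigned long long round_to_page(unsigned long long v)
{
	return ((v + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
}

int main(void)
{
	printf("%llu\n", round_to_page(parse_size("1")));    /* 4096 */
	printf("%llu\n", round_to_page(parse_size("6000"))); /* 8192 */
	printf("%llu\n", round_to_page(parse_size("4M")));   /* 4194304 */
	return 0;
}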
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 5e60a4f34243..61363ce896d5 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -23,15 +23,15 @@ struct res_counter {
 	/*
 	 * the current resource consumption level
 	 */
-	unsigned long usage;
+	unsigned long long usage;
 	/*
 	 * the limit that usage cannot exceed
 	 */
-	unsigned long limit;
+	unsigned long long limit;
 	/*
 	 * the number of unsuccessful attempts to consume the resource
 	 */
-	unsigned long failcnt;
+	unsigned long long failcnt;
 	/*
 	 * the lock to protect all of the above.
 	 * the routines below consider this to be IRQ-safe
@@ -52,9 +52,11 @@ struct res_counter {
  */
 
 ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos);
+		const char __user *buf, size_t nbytes, loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *s));
 ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos);
+		const char __user *buf, size_t nbytes, loff_t *pos,
+		int (*write_strategy)(char *buf, unsigned long long *val));
 
 /*
  * the field descriptors. one for each member of res_counter
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 722c484b068b..16cbec2d5d60 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -16,7 +16,7 @@
 void res_counter_init(struct res_counter *counter)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = (unsigned long)LONG_MAX;
+	counter->limit = (unsigned long long)LLONG_MAX;
 }
 
 int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -59,8 +59,8 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 }
 
 
-static inline unsigned long *res_counter_member(struct res_counter *counter,
-						 int member)
+static inline unsigned long long *
+res_counter_member(struct res_counter *counter, int member)
 {
 	switch (member) {
 	case RES_USAGE:
@@ -76,24 +76,30 @@ static inline unsigned long *res_counter_member(struct res_counter *counter,
 }
 
 ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos)
+		const char __user *userbuf, size_t nbytes, loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *st_buf))
 {
-	unsigned long *val;
+	unsigned long long *val;
 	char buf[64], *s;
 
 	s = buf;
 	val = res_counter_member(counter, member);
-	s += sprintf(s, "%lu\n", *val);
+	if (read_strategy)
+		s += read_strategy(*val, s);
+	else
+		s += sprintf(s, "%llu\n", *val);
 	return simple_read_from_buffer((void __user *)userbuf, nbytes,
 			pos, buf, s - buf);
 }
 
 ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos)
+		const char __user *userbuf, size_t nbytes, loff_t *pos,
+		int (*write_strategy)(char *st_buf, unsigned long long *val))
 {
 	int ret;
 	char *buf, *end;
-	unsigned long tmp, *val;
+	unsigned long flags;
+	unsigned long long tmp, *val;
 
 	buf = kmalloc(nbytes + 1, GFP_KERNEL);
 	ret = -ENOMEM;
@@ -106,12 +112,20 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
 		goto out_free;
 
 	ret = -EINVAL;
-	tmp = simple_strtoul(buf, &end, 10);
-	if (*end != '\0')
-		goto out_free;
 
+	if (write_strategy) {
+		if (write_strategy(buf, &tmp)) {
+			goto out_free;
+		}
+	} else {
+		tmp = simple_strtoull(buf, &end, 10);
+		if (*end != '\0')
+			goto out_free;
+	}
+	spin_lock_irqsave(&counter->lock, flags);
 	val = res_counter_member(counter, member);
 	*val = tmp;
+	spin_unlock_irqrestore(&counter->lock, flags);
 	ret = nbytes;
 out_free:
 	kfree(buf);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9e9ff914c0f1..d73692279ab1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -302,7 +302,7 @@ retry:
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	while (res_counter_charge(&mem->res, 1)) {
+	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (try_to_free_mem_cgroup_pages(mem))
 			continue;
 
@@ -341,7 +341,7 @@ retry:
 		kfree(pc);
 		pc = race_pc;
 		atomic_inc(&pc->ref_cnt);
-		res_counter_uncharge(&mem->res, 1);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		goto done;
 	}
@@ -384,7 +384,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 	css_put(&mem->css);
 	page_assign_page_cgroup(page, NULL);
 	unlock_page_cgroup(page);
-	res_counter_uncharge(&mem->res, 1);
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
 
 	spin_lock_irqsave(&mem->lru_lock, flags);
 	list_del_init(&pc->lru);
@@ -393,12 +393,26 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 	}
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
-			struct file *file, char __user *userbuf, size_t nbytes,
-			loff_t *ppos)
+int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+	*tmp = memparse(buf, &buf);
+	if (*buf != '\0')
+		return -EINVAL;
+
+	/*
+	 * Round up the value to the closest page size
+	 */
+	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+	return 0;
+}
+
+static ssize_t mem_cgroup_read(struct cgroup *cont,
+			struct cftype *cft, struct file *file,
+			char __user *userbuf, size_t nbytes, loff_t *ppos)
 {
 	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos);
+				cft->private, userbuf, nbytes, ppos,
+				NULL);
 }
 
 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -406,17 +420,18 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			size_t nbytes, loff_t *ppos)
 {
 	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos);
+				cft->private, userbuf, nbytes, ppos,
+				mem_cgroup_write_strategy);
 }
 
 static struct cftype mem_cgroup_files[] = {
 	{
-		.name = "usage",
+		.name = "usage_in_bytes",
 		.private = RES_USAGE,
 		.read = mem_cgroup_read,
 	},
 	{
-		.name = "limit",
+		.name = "limit_in_bytes",
 		.private = RES_LIMIT,
 		.write = mem_cgroup_write,
 		.read = mem_cgroup_read,
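As a usage-level illustration of the renamed control files, the hedged sketch below writes a human-readable limit and reads back the value the kernel committed. The /cgroups/0 path is taken from the documentation example above and is an assumption about where the cgroup filesystem is mounted; error handling is deliberately minimal.

/*
 * Minimal userspace sketch: set a memory limit through the renamed
 * memory.limit_in_bytes file and read back what the kernel committed.
 * The /cgroups/0 mount point follows the memory.txt example and may
 * differ on a real system.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/cgroups/0/memory.limit_in_bytes";
	char buf[64];
	ssize_t n;
	int fd;

	/* write a value with a suffix; the kernel parses it via memparse() */
	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, "4M", 2) < 0)
		perror("set limit");
	if (fd >= 0)
		close(fd);

	/* re-read the file: the committed value may be rounded to a page */
	fd = open(path, O_RDONLY);
	if (fd >= 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		printf("limit now: %s", buf);	/* e.g. "4194304 Bytes" */
	}
	if (fd >= 0)
		close(fd);
	return 0;
}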