author     Balbir Singh <balbir@linux.vnet.ibm.com>              2008-02-07 03:13:57 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:18 -0500
commit     0eea10301708c64a6b793894c156e21ddd15eb64 (patch)
tree       a0dcbe47d48d35ec0554faa5f86068cfab94ca6e
parent     66e1707bc34609f626e2e7b4fe7e454c9748bad5 (diff)
Memory controller: improve user interface
Change the interface to use bytes instead of pages. Page sizes can vary
across platforms and configurations. A new strategy routine has been added
to the resource counters infrastructure to format the data as desired.
Suggested by David Rientjes, Andrew Morton and Herbert Poetzl
Tested on a UML setup with the config for memory control enabled.
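
In outline, the strategy hooks are plain function pointers passed at the
read/write call sites; a NULL strategy keeps the old behaviour (plain decimal
input, "%llu\n" output). The sketch below is illustrative only -- the "foo"
names are hypothetical and not part of this patch; the concrete hookup for the
memory controller is in the mm/memcontrol.c hunk further down.

	/* Illustrative wiring only; "foo" names are hypothetical. */
	static int foo_write_strategy(char *buf, unsigned long long *val)
	{
		*val = memparse(buf, &buf);	/* accepts k/K/m/M/g/G suffixes */
		return (*buf == '\0') ? 0 : -EINVAL;
	}

	static ssize_t foo_write_limit(struct res_counter *cnt,
				       const char __user *userbuf,
				       size_t nbytes, loff_t *ppos)
	{
		return res_counter_write(cnt, RES_LIMIT, userbuf, nbytes, ppos,
					 foo_write_strategy);
	}

	static ssize_t foo_read_limit(struct res_counter *cnt,
				      const char __user *userbuf,
				      size_t nbytes, loff_t *ppos)
	{
		/* NULL read strategy: default "%llu\n" formatting */
		return res_counter_read(cnt, RES_LIMIT, userbuf, nbytes, ppos,
					NULL);
	}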
[kamezawa.hiroyu@jp.fujitsu.com: possible race fix in res_counter]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/controllers/memory.txt | 29
-rw-r--r--  include/linux/res_counter.h          | 12
-rw-r--r--  kernel/res_counter.c                 | 36
-rw-r--r--  mm/memcontrol.c                      | 35
4 files changed, 81 insertions, 31 deletions
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 7e27baacca7b..61df8f81c803 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -165,11 +165,30 @@ c. Enable CONFIG_CGROUP_MEM_CONT
 
 Since now we're in the 0 cgroup,
 We can alter the memory limit:
-# echo -n 6000 > /cgroups/0/memory.limit
+# echo -n 4M > /cgroups/0/memory.limit_in_bytes
+
+NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+mega or gigabytes.
+
+# cat /cgroups/0/memory.limit_in_bytes
+4194304 Bytes
+
+NOTE: The interface has now changed to display the usage in bytes
+instead of pages
 
 We can check the usage:
-# cat /cgroups/0/memory.usage
-25
+# cat /cgroups/0/memory.usage_in_bytes
+1216512 Bytes
+
+A successful write to this file does not guarantee a successful set of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel.
+
+# echo -n 1 > memory.limit_in_bytes
+# cat memory.limit_in_bytes
+4096 Bytes
 
 The memory.failcnt field gives the number of times that the cgroup limit was
 exceeded.
@@ -206,8 +225,8 @@ cgroup might have some charge associated with it, even though all
 tasks have migrated away from it. If some pages are still left, after following
 the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in
 /proc/meminfo to see if the Swap Cache usage is showing up in the
-cgroups memory.usage counter. A simple test of swapoff -a and swapon -a
-should free any pending Swap Cache usage.
+cgroups memory.usage_in_bytes counter. A simple test of swapoff -a and
+swapon -a should free any pending Swap Cache usage.
 
 4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)?
 
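
On the rounding rule documented above: the kernel rounds the requested limit
up to the next page boundary, which is why writing 1 reads back as 4096. A
minimal userspace-style C sketch of that arithmetic, assuming a 4 KiB page as
in the examples:

	#include <stdio.h>

	#define PAGE_SHIFT 12			/* assume 4 KiB pages */
	#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

	/* Round a requested limit up to the next page boundary. */
	static unsigned long long round_up_to_page(unsigned long long bytes)
	{
		return ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
	}

	int main(void)
	{
		printf("%llu\n", round_up_to_page(4ULL << 20));	/* 4M -> 4194304 */
		printf("%llu\n", round_up_to_page(1));		/* 1  -> 4096 */
		return 0;
	}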
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 5e60a4f34243..61363ce896d5 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -23,15 +23,15 @@ struct res_counter {
 	/*
 	 * the current resource consumption level
 	 */
-	unsigned long usage;
+	unsigned long long usage;
 	/*
 	 * the limit that usage cannot exceed
 	 */
-	unsigned long limit;
+	unsigned long long limit;
 	/*
 	 * the number of unsuccessful attempts to consume the resource
 	 */
-	unsigned long failcnt;
+	unsigned long long failcnt;
 	/*
 	 * the lock to protect all of the above.
 	 * the routines below consider this to be IRQ-safe
@@ -52,9 +52,11 @@ struct res_counter {
  */
 
 ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos);
+		const char __user *buf, size_t nbytes, loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *s));
 ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos);
+		const char __user *buf, size_t nbytes, loff_t *pos,
+		int (*write_strategy)(char *buf, unsigned long long *val));
 
 /*
  * the field descriptors. one for each member of res_counter
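
For reads the memory controller passes NULL below, so the default "%llu\n"
formatting is used; a read strategy only needs to return the number of
characters it wrote into the buffer. A hypothetical formatter (not part of
this patch) could look like:

	static int bytes_read_strategy(unsigned long long val, char *buf)
	{
		/* e.g. produce "4194304 Bytes" style output */
		return sprintf(buf, "%llu Bytes\n", val);
	}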
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 722c484b068b..16cbec2d5d60 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -16,7 +16,7 @@
 void res_counter_init(struct res_counter *counter)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = (unsigned long)LONG_MAX;
+	counter->limit = (unsigned long long)LLONG_MAX;
 }
 
 int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -59,8 +59,8 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 }
 
 
-static inline unsigned long *res_counter_member(struct res_counter *counter,
-						int member)
+static inline unsigned long long *
+res_counter_member(struct res_counter *counter, int member)
 {
 	switch (member) {
 	case RES_USAGE:
@@ -76,24 +76,30 @@ static inline unsigned long *res_counter_member(struct res_counter *counter,
 }
 
 ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos)
+		const char __user *userbuf, size_t nbytes, loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *st_buf))
 {
-	unsigned long *val;
+	unsigned long long *val;
 	char buf[64], *s;
 
 	s = buf;
 	val = res_counter_member(counter, member);
-	s += sprintf(s, "%lu\n", *val);
+	if (read_strategy)
+		s += read_strategy(*val, s);
+	else
+		s += sprintf(s, "%llu\n", *val);
 	return simple_read_from_buffer((void __user *)userbuf, nbytes,
 			pos, buf, s - buf);
 }
 
 ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos)
+		const char __user *userbuf, size_t nbytes, loff_t *pos,
+		int (*write_strategy)(char *st_buf, unsigned long long *val))
 {
 	int ret;
 	char *buf, *end;
-	unsigned long tmp, *val;
+	unsigned long flags;
+	unsigned long long tmp, *val;
 
 	buf = kmalloc(nbytes + 1, GFP_KERNEL);
 	ret = -ENOMEM;
@@ -106,12 +112,20 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
 		goto out_free;
 
 	ret = -EINVAL;
-	tmp = simple_strtoul(buf, &end, 10);
-	if (*end != '\0')
-		goto out_free;
 
+	if (write_strategy) {
+		if (write_strategy(buf, &tmp)) {
+			goto out_free;
+		}
+	} else {
+		tmp = simple_strtoull(buf, &end, 10);
+		if (*end != '\0')
+			goto out_free;
+	}
+	spin_lock_irqsave(&counter->lock, flags);
 	val = res_counter_member(counter, member);
 	*val = tmp;
+	spin_unlock_irqrestore(&counter->lock, flags);
 	ret = nbytes;
 out_free:
 	kfree(buf);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9e9ff914c0f1..d73692279ab1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -302,7 +302,7 @@ retry:
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	while (res_counter_charge(&mem->res, 1)) {
+	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (try_to_free_mem_cgroup_pages(mem))
 			continue;
 
@@ -341,7 +341,7 @@ retry:
 		kfree(pc);
 		pc = race_pc;
 		atomic_inc(&pc->ref_cnt);
-		res_counter_uncharge(&mem->res, 1);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
 		goto done;
 	}
@@ -384,7 +384,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 		css_put(&mem->css);
 		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
-		res_counter_uncharge(&mem->res, 1);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 
 		spin_lock_irqsave(&mem->lru_lock, flags);
 		list_del_init(&pc->lru);
@@ -393,12 +393,26 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 	}
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
-		struct file *file, char __user *userbuf, size_t nbytes,
-		loff_t *ppos)
+int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+	*tmp = memparse(buf, &buf);
+	if (*buf != '\0')
+		return -EINVAL;
+
+	/*
+	 * Round up the value to the closest page size
+	 */
+	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+	return 0;
+}
+
+static ssize_t mem_cgroup_read(struct cgroup *cont,
+			struct cftype *cft, struct file *file,
+			char __user *userbuf, size_t nbytes, loff_t *ppos)
 {
 	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos);
+				cft->private, userbuf, nbytes, ppos,
+				NULL);
 }
 
 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -406,17 +420,18 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 				size_t nbytes, loff_t *ppos)
 {
 	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos);
+				cft->private, userbuf, nbytes, ppos,
+				mem_cgroup_write_strategy);
 }
 
 static struct cftype mem_cgroup_files[] = {
 	{
-		.name = "usage",
+		.name = "usage_in_bytes",
 		.private = RES_USAGE,
 		.read = mem_cgroup_read,
 	},
 	{
-		.name = "limit",
+		.name = "limit_in_bytes",
 		.private = RES_LIMIT,
 		.write = mem_cgroup_write,
 		.read = mem_cgroup_read,