author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2009-12-15 19:47:03 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>        2009-12-16 10:20:07 -0500
commit     569b846df54ffb2827b83ce3244c5f032394cba4 (patch)
tree       77c5d373a5edf97710fab8777912971b99e84828 /mm/truncate.c
parent     cd9b45b78a61e8df250e69385c74e729e5b66abf (diff)
memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance
bottleneck.  One strong technique for reducing lock contention is to reduce
the number of calls by coalescing some amount of calls into one.

Considering the charge/uncharge characteristics:
 - charge is done one by one via demand paging.
 - uncharge is done
   - in chunks at munmap, truncate, exit, execve...
   - one by one via vmscan/paging.

It seems we have a chance to coalesce uncharges and improve scalability
at unmap/truncation.
This patch implements coalescing of uncharges.  To avoid scattering memcg
structures into functions under /mm, it adds memcg batch-uncharge
information to the task.  The reason for per-task batching is to make use
of the caller's context information: we do batched (delayed) uncharge when
truncation/unmap occurs, but direct uncharge when the uncharge is called
by memory reclaim (vmscan.c).
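Roughly, the mechanism is the one sketched below.  This is an illustration
only, not the exact code in this patch; the field names (do_batch, bytes,
memsw_bytes) follow the description and changelog here, and the struct is
assumed to be the batch record embedded in task_struct:

/* Illustrative sketch -- simplified, not the patch's actual code. */
struct memcg_batch_info {
        bool do_batch;                  /* a batch is open for this task */
        struct mem_cgroup *memcg;       /* memcg being uncharged in this batch */
        unsigned long bytes;            /* accumulated, not yet uncharged */
        unsigned long memsw_bytes;      /* same, for the mem+swap counter */
};                                      /* embedded in task_struct as ->memcg_batch */

void mem_cgroup_uncharge_start(void)
{
        current->memcg_batch.do_batch = true;
        current->memcg_batch.memcg = NULL;
        current->memcg_batch.bytes = 0;
        current->memcg_batch.memsw_bytes = 0;
}

void mem_cgroup_uncharge_end(void)
{
        struct memcg_batch_info *batch = &current->memcg_batch;

        batch->do_batch = false;
        if (!batch->memcg)
                return;
        /* one res_counter update for the whole truncate/unmap chunk */
        if (batch->bytes)
                res_counter_uncharge(&batch->memcg->res, batch->bytes);
        if (batch->memsw_bytes)
                res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
        batch->memcg = NULL;
}

Callers bracket a bounded loop of page frees with these two calls; every
uncharge issued inside the bracket then folds into the task-local record
instead of taking the res_counter lock.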
The degree of coalescing depends on the caller:
 - at invalidate/truncate... the pagevec size
 - at unmap...               ZAP_BLOCK_SIZE
(the memory itself is freed at this granularity), so we will not coalesce
too much.  The accumulation step is sketched just below.
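For illustration, the accumulation on the uncharge side could look like the
following; batched_uncharge() is a hypothetical stand-in for the real
uncharge path in mm/memcontrol.c, shown only to make the bounded batching
explicit:

/* Hypothetical helper -- sketch of the accumulation step. */
static void batched_uncharge(struct mem_cgroup *mem, unsigned long bytes,
                             bool uncharge_memsw)
{
        struct memcg_batch_info *batch = &current->memcg_batch;

        if (batch->do_batch) {
                if (!batch->memcg)
                        batch->memcg = mem;     /* first page of this batch */
                if (batch->memcg == mem) {
                        batch->bytes += bytes;
                        if (uncharge_memsw)
                                batch->memsw_bytes += bytes;
                        return;                 /* res_counter not touched */
                }
                /* uncharge against a different memcg: fall through */
        }
        res_counter_uncharge(&mem->res, bytes);         /* direct, as vmscan does */
        if (uncharge_memsw)
                res_counter_uncharge(&mem->memsw, bytes);
}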
On an x86-64 8-CPU server, I tested the overhead of memcg at page fault by
running a program which does map/fault/unmap in a loop, one task per CPU
pinned with taskset, and summing the number of page faults over 60 seconds
(a sketch of such a loop appears after the results below).
[without memcg config]
40156968 page-faults # 0.085 M/sec ( +- 0.046% )
27.67 cache-miss/faults
[root cgroup]
36659599 page-faults # 0.077 M/sec ( +- 0.247% )
31.58 miss/faults
[in a child cgroup]
18444157 page-faults # 0.039 M/sec ( +- 0.133% )
69.96 miss/faults
[child with this patch]
27133719 page-faults # 0.057 M/sec ( +- 0.155% )
47.16 miss/faults
We can see some amount of improvement.
(The root cgroup is not affected by this patch.)
Another patch for "charge" will follow this, and the numbers above will
improve further.
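The test program itself is not part of this commit; a minimal sketch of
such a map/fault/unmap loop (illustrative only -- the mapping size, binary
name and perf invocation in the comments are assumptions, not the author's
actual setup):

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define MAP_SIZE        (8UL << 20)     /* 8MB of anonymous memory per pass */

int main(void)
{
        /* Run one instance per CPU, e.g. "taskset -c N ./fault-loop &",
         * and count faults with "perf stat -e page-faults -a -- sleep 60". */
        for (;;) {
                char *p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (p == MAP_FAILED)
                        return 1;
                for (size_t off = 0; off < MAP_SIZE; off += 4096)
                        p[off] = 1;     /* one minor fault per page */
                munmap(p, MAP_SIZE);    /* batched uncharge happens here */
        }
}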
Changelog (since 2009/10/02):
 - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes)
 - some cleanup and commentary/description updates.
 - added initialization code to copy_process(). (possible bug fix)
Changelog (old):
 - fixed the !CONFIG_MEM_CGROUP case.
 - rebased onto the latest mmotm + softlimit fix patches.
 - unified patch for callers.
 - added comments.
 - made ->do_batch a bool.
 - removed css_get() et al.  We don't need it.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/truncate.c')
-rw-r--r--   mm/truncate.c   6
1 files changed, 6 insertions, 0 deletions
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 	}
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	pagevec_init(&pvec, 0);
 	while (next <= end &&
 		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	while (next <= end && !wrapped &&
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;