diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-12-15 19:47:03 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-16 10:20:07 -0500 |
commit | 569b846df54ffb2827b83ce3244c5f032394cba4 (patch) | |
tree | 77c5d373a5edf97710fab8777912971b99e84828 | |
parent | cd9b45b78a61e8df250e69385c74e729e5b66abf (diff) |
memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance
bottleneck. One strong technique to reduce lock contention is reducing
calls by coalescing some number of calls into one.
Considering the characteristics of charge/uncharge,
- charge is done one by one via demand-paging.
- uncharge is done by
- in chunk at munmap, truncate, exit, execve...
- one by one via vmscan/paging.
It seems we have a chance to coalesce uncharges for improving scalability
at unmap/truncation.
This patch is for coalescing uncharges. To avoid scattering memcg's
structure across functions under /mm, this patch adds memcg batch uncharge
information to the task. The reason for per-task batching is to make use
of the caller's context information. We do batched uncharge (delayed
uncharge) when truncation/unmap occurs but do direct uncharge when
uncharge is called by memory reclaim (vmscan.c).
The degree of coalescing depends on the caller:
- at invalidate/truncate... pagevec size
- at unmap ....ZAP_BLOCK_SIZE
(memory itself will be freed in units of this size.)
So we will not coalesce too much.
On an x86-64 8-CPU server, I tested the overhead of memcg at page fault by
running a program which does map/fault/unmap in a loop, running
one task per CPU via taskset and looking at the sum of the number of page
faults in 60 seconds.
[without memcg config]
40156968 page-faults # 0.085 M/sec ( +- 0.046% )
27.67 cache-miss/faults
[root cgroup]
36659599 page-faults # 0.077 M/sec ( +- 0.247% )
31.58 miss/faults
[in a child cgroup]
18444157 page-faults # 0.039 M/sec ( +- 0.133% )
69.96 miss/faults
[child with this patch]
27133719 page-faults # 0.057 M/sec ( +- 0.155% )
47.16 miss/faults
We can see some amount of improvement.
(The root cgroup isn't affected by this patch.)
Another patch for "charge" will follow this one, and the numbers above will be improved further.
Changelog(since 2009/10/02):
- renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes)
- some clean up and commentary/description updates.
- added initialize code to copy_process(). (possible bug fix)
Changelog(old):
- fixed !CONFIG_MEM_CGROUP case.
- rebased onto the latest mmotm + softlimit fix patches.
- unified patch for callers
- added comments.
- make ->do_batch as bool.
- removed css_get() et al. We don't need it.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/memcontrol.h | 13 | ||||
-rw-r--r-- | include/linux/sched.h | 8 | ||||
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 96 | ||||
-rw-r--r-- | mm/memory.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 6 |
6 files changed, 123 insertions, 6 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bf9213b2db8f..91300c972e76 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); | |||
54 | extern void mem_cgroup_del_lru(struct page *page); | 54 | extern void mem_cgroup_del_lru(struct page *page); |
55 | extern void mem_cgroup_move_lists(struct page *page, | 55 | extern void mem_cgroup_move_lists(struct page *page, |
56 | enum lru_list from, enum lru_list to); | 56 | enum lru_list from, enum lru_list to); |
57 | |||
58 | /* For coalescing uncharges to reduce memcg's overhead */ | ||
59 | extern void mem_cgroup_uncharge_start(void); | ||
60 | extern void mem_cgroup_uncharge_end(void); | ||
61 | |||
57 | extern void mem_cgroup_uncharge_page(struct page *page); | 62 | extern void mem_cgroup_uncharge_page(struct page *page); |
58 | extern void mem_cgroup_uncharge_cache_page(struct page *page); | 63 | extern void mem_cgroup_uncharge_cache_page(struct page *page); |
59 | extern int mem_cgroup_shmem_charge_fallback(struct page *page, | 64 | extern int mem_cgroup_shmem_charge_fallback(struct page *page, |
@@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) | |||
151 | { | 156 | { |
152 | } | 157 | } |
153 | 158 | ||
159 | static inline void mem_cgroup_uncharge_start(void) | ||
160 | { | ||
161 | } | ||
162 | |||
163 | static inline void mem_cgroup_uncharge_end(void) | ||
164 | { | ||
165 | } | ||
166 | |||
154 | static inline void mem_cgroup_uncharge_page(struct page *page) | 167 | static inline void mem_cgroup_uncharge_page(struct page *page) |
155 | { | 168 | { |
156 | } | 169 | } |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c858f38e81a..f4c145410a8d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1544,6 +1544,14 @@ struct task_struct { | |||
1544 | unsigned long trace_recursion; | 1544 | unsigned long trace_recursion; |
1545 | #endif /* CONFIG_TRACING */ | 1545 | #endif /* CONFIG_TRACING */ |
1546 | unsigned long stack_start; | 1546 | unsigned long stack_start; |
1547 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ | ||
1548 | struct memcg_batch_info { | ||
1549 | int do_batch; /* incremented when batch uncharge started */ | ||
1550 | struct mem_cgroup *memcg; /* target memcg of uncharge */ | ||
1551 | unsigned long bytes; /* uncharged usage */ | ||
1552 | unsigned long memsw_bytes; /* uncharged mem+swap usage */ | ||
1553 | } memcg_batch; | ||
1554 | #endif | ||
1547 | }; | 1555 | }; |
1548 | 1556 | ||
1549 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1557 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
diff --git a/kernel/fork.c b/kernel/fork.c index 9bd91447e052..b6cbd33dde80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1127 | #ifdef CONFIG_DEBUG_MUTEXES | 1127 | #ifdef CONFIG_DEBUG_MUTEXES |
1128 | p->blocked_on = NULL; /* not blocked yet */ | 1128 | p->blocked_on = NULL; /* not blocked yet */ |
1129 | #endif | 1129 | #endif |
1130 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
1131 | p->memcg_batch.do_batch = 0; | ||
1132 | p->memcg_batch.memcg = NULL; | ||
1133 | #endif | ||
1130 | 1134 | ||
1131 | p->bts = NULL; | 1135 | p->bts = NULL; |
1132 | 1136 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b5b108c1c6b..a730c91b8e69 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1827 | css_put(&mem->css); | 1827 | css_put(&mem->css); |
1828 | } | 1828 | } |
1829 | 1829 | ||
1830 | static void | ||
1831 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
1832 | { | ||
1833 | struct memcg_batch_info *batch = NULL; | ||
1834 | bool uncharge_memsw = true; | ||
1835 | /* If swapout, usage of swap doesn't decrease */ | ||
1836 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1837 | uncharge_memsw = false; | ||
1838 | /* | ||
1839 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
1840 | * In those cases, all pages freed continuously can be expected to be in | ||
1841 | * the same cgroup and we have chance to coalesce uncharges. | ||
1842 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
1843 | * because we want to do uncharge as soon as possible. | ||
1844 | */ | ||
1845 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
1846 | goto direct_uncharge; | ||
1847 | |||
1848 | batch = &current->memcg_batch; | ||
1849 | /* | ||
1850 | * Usually, we do css_get() when we remember memcg pointer. | ||
1851 | * But in this case, we keep res->usage until end of a series of | ||
1852 | * uncharges. Then, it's ok to ignore memcg's refcnt. | ||
1853 | */ | ||
1854 | if (!batch->memcg) | ||
1855 | batch->memcg = mem; | ||
1856 | /* | ||
1857 | * In typical case, batch->memcg == mem. This means we can | ||
1858 | * merge a series of uncharges to an uncharge of res_counter. | ||
1859 | * If not, we uncharge res_counter one by one. | ||
1860 | */ | ||
1861 | if (batch->memcg != mem) | ||
1862 | goto direct_uncharge; | ||
1863 | /* remember freed charge and uncharge it later */ | ||
1864 | batch->bytes += PAGE_SIZE; | ||
1865 | if (uncharge_memsw) | ||
1866 | batch->memsw_bytes += PAGE_SIZE; | ||
1867 | return; | ||
1868 | direct_uncharge: | ||
1869 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1870 | if (uncharge_memsw) | ||
1871 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1872 | return; | ||
1873 | } | ||
1830 | 1874 | ||
1831 | /* | 1875 | /* |
1832 | * uncharge if !page_mapped(page) | 1876 | * uncharge if !page_mapped(page) |
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1875 | break; | 1919 | break; |
1876 | } | 1920 | } |
1877 | 1921 | ||
1878 | if (!mem_cgroup_is_root(mem)) { | 1922 | if (!mem_cgroup_is_root(mem)) |
1879 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1923 | __do_uncharge(mem, ctype); |
1880 | if (do_swap_account && | ||
1881 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1882 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1883 | } | ||
1884 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1924 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1885 | mem_cgroup_swap_statistics(mem, true); | 1925 | mem_cgroup_swap_statistics(mem, true); |
1886 | mem_cgroup_charge_statistics(mem, pc, false); | 1926 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1926 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 1966 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1927 | } | 1967 | } |
1928 | 1968 | ||
1969 | /* | ||
1970 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. | ||
1971 | * In those cases, pages are freed continuously and we can expect pages | ||
1972 | * are in the same memcg. Each of these callers itself limits the number of | ||
1973 | * pages freed at once, so uncharge_start/end() is called properly. | ||
1974 | * This may be called multiple times (nested) in a context. | ||
1975 | */ | ||
1976 | |||
1977 | void mem_cgroup_uncharge_start(void) | ||
1978 | { | ||
1979 | current->memcg_batch.do_batch++; | ||
1980 | /* We can do nest. */ | ||
1981 | if (current->memcg_batch.do_batch == 1) { | ||
1982 | current->memcg_batch.memcg = NULL; | ||
1983 | current->memcg_batch.bytes = 0; | ||
1984 | current->memcg_batch.memsw_bytes = 0; | ||
1985 | } | ||
1986 | } | ||
1987 | |||
1988 | void mem_cgroup_uncharge_end(void) | ||
1989 | { | ||
1990 | struct memcg_batch_info *batch = &current->memcg_batch; | ||
1991 | |||
1992 | if (!batch->do_batch) | ||
1993 | return; | ||
1994 | |||
1995 | batch->do_batch--; | ||
1996 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
1997 | return; | ||
1998 | |||
1999 | if (!batch->memcg) | ||
2000 | return; | ||
2001 | /* | ||
2002 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2003 | * because we hide charges behind us. | ||
2004 | */ | ||
2005 | if (batch->bytes) | ||
2006 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2007 | if (batch->memsw_bytes) | ||
2008 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2009 | /* forget this pointer (for sanity check) */ | ||
2010 | batch->memcg = NULL; | ||
2011 | } | ||
2012 | |||
1929 | #ifdef CONFIG_SWAP | 2013 | #ifdef CONFIG_SWAP |
1930 | /* | 2014 | /* |
1931 | * called after __delete_from_swap_cache() and drop "page" account. | 2015 | * called after __delete_from_swap_cache() and drop "page" account. |
diff --git a/mm/memory.c b/mm/memory.c index a54b2c498444..aed45eaf8ac9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
956 | details = NULL; | 956 | details = NULL; |
957 | 957 | ||
958 | BUG_ON(addr >= end); | 958 | BUG_ON(addr >= end); |
959 | mem_cgroup_uncharge_start(); | ||
959 | tlb_start_vma(tlb, vma); | 960 | tlb_start_vma(tlb, vma); |
960 | pgd = pgd_offset(vma->vm_mm, addr); | 961 | pgd = pgd_offset(vma->vm_mm, addr); |
961 | do { | 962 | do { |
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
968 | zap_work, details); | 969 | zap_work, details); |
969 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 970 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
970 | tlb_end_vma(tlb, vma); | 971 | tlb_end_vma(tlb, vma); |
972 | mem_cgroup_uncharge_end(); | ||
971 | 973 | ||
972 | return addr; | 974 | return addr; |
973 | } | 975 | } |
diff --git a/mm/truncate.c b/mm/truncate.c index 2c147a7e5f2c..342deee22684 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
272 | pagevec_release(&pvec); | 272 | pagevec_release(&pvec); |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | mem_cgroup_uncharge_start(); | ||
275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 276 | for (i = 0; i < pagevec_count(&pvec); i++) { |
276 | struct page *page = pvec.pages[i]; | 277 | struct page *page = pvec.pages[i]; |
277 | 278 | ||
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
286 | unlock_page(page); | 287 | unlock_page(page); |
287 | } | 288 | } |
288 | pagevec_release(&pvec); | 289 | pagevec_release(&pvec); |
290 | mem_cgroup_uncharge_end(); | ||
289 | } | 291 | } |
290 | } | 292 | } |
291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 293 | EXPORT_SYMBOL(truncate_inode_pages_range); |
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
327 | pagevec_init(&pvec, 0); | 329 | pagevec_init(&pvec, 0); |
328 | while (next <= end && | 330 | while (next <= end && |
329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 331 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
332 | mem_cgroup_uncharge_start(); | ||
330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 333 | for (i = 0; i < pagevec_count(&pvec); i++) { |
331 | struct page *page = pvec.pages[i]; | 334 | struct page *page = pvec.pages[i]; |
332 | pgoff_t index; | 335 | pgoff_t index; |
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
354 | break; | 357 | break; |
355 | } | 358 | } |
356 | pagevec_release(&pvec); | 359 | pagevec_release(&pvec); |
360 | mem_cgroup_uncharge_end(); | ||
357 | cond_resched(); | 361 | cond_resched(); |
358 | } | 362 | } |
359 | return ret; | 363 | return ret; |
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
428 | while (next <= end && !wrapped && | 432 | while (next <= end && !wrapped && |
429 | pagevec_lookup(&pvec, mapping, next, | 433 | pagevec_lookup(&pvec, mapping, next, |
430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 434 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
435 | mem_cgroup_uncharge_start(); | ||
431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 436 | for (i = 0; i < pagevec_count(&pvec); i++) { |
432 | struct page *page = pvec.pages[i]; | 437 | struct page *page = pvec.pages[i]; |
433 | pgoff_t page_index; | 438 | pgoff_t page_index; |
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
477 | unlock_page(page); | 482 | unlock_page(page); |
478 | } | 483 | } |
479 | pagevec_release(&pvec); | 484 | pagevec_release(&pvec); |
485 | mem_cgroup_uncharge_end(); | ||
480 | cond_resched(); | 486 | cond_resched(); |
481 | } | 487 | } |
482 | return ret; | 488 | return ret; |