aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-12-15 19:47:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-16 10:20:07 -0500
commit569b846df54ffb2827b83ce3244c5f032394cba4 (patch)
tree77c5d373a5edf97710fab8777912971b99e84828
parentcd9b45b78a61e8df250e69385c74e729e5b66abf (diff)
memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is to reduce the number of calls by coalescing several calls into one. Considering the characteristics of charge/uncharge, - charge is done one by one via demand-paging. - uncharge is done - in chunks at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges to improve scalability at unmap/truncation. This patch is for coalescing uncharges. To avoid scattering memcg's structure across functions under /mm, this patch adds memcg batch uncharge information to the task. The reason for per-task batching is to make use of the caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) So we will not coalesce too much. On an x86-64 8cpu server, I tested the overhead of memcg at page fault by running a program which does map/fault/unmap in a loop, running one task per cpu via taskset and summing the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amount of improvement. (The root cgroup isn't affected by this patch.) Another patch for "charge" will follow this one, and the above will be improved further. Changelog(since 2009/10/02): - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialization code to copy_process(). 
(possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - made ->do_batch a bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/memcontrol.h13
-rw-r--r--include/linux/sched.h8
-rw-r--r--kernel/fork.c4
-rw-r--r--mm/memcontrol.c96
-rw-r--r--mm/memory.c2
-rw-r--r--mm/truncate.c6
6 files changed, 123 insertions, 6 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bf9213b2db8f..91300c972e76 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
54extern void mem_cgroup_del_lru(struct page *page); 54extern void mem_cgroup_del_lru(struct page *page);
55extern void mem_cgroup_move_lists(struct page *page, 55extern void mem_cgroup_move_lists(struct page *page,
56 enum lru_list from, enum lru_list to); 56 enum lru_list from, enum lru_list to);
57
58/* For coalescing uncharges to reduce memcg's overhead */
59extern void mem_cgroup_uncharge_start(void);
60extern void mem_cgroup_uncharge_end(void);
61
57extern void mem_cgroup_uncharge_page(struct page *page); 62extern void mem_cgroup_uncharge_page(struct page *page);
58extern void mem_cgroup_uncharge_cache_page(struct page *page); 63extern void mem_cgroup_uncharge_cache_page(struct page *page);
59extern int mem_cgroup_shmem_charge_fallback(struct page *page, 64extern int mem_cgroup_shmem_charge_fallback(struct page *page,
@@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
151{ 156{
152} 157}
153 158
159static inline void mem_cgroup_uncharge_start(void)
160{
161}
162
163static inline void mem_cgroup_uncharge_end(void)
164{
165}
166
154static inline void mem_cgroup_uncharge_page(struct page *page) 167static inline void mem_cgroup_uncharge_page(struct page *page)
155{ 168{
156} 169}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c858f38e81a..f4c145410a8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,14 @@ struct task_struct {
1544 unsigned long trace_recursion; 1544 unsigned long trace_recursion;
1545#endif /* CONFIG_TRACING */ 1545#endif /* CONFIG_TRACING */
1546 unsigned long stack_start; 1546 unsigned long stack_start;
1547#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
1548 struct memcg_batch_info {
1549 int do_batch; /* incremented when batch uncharge started */
1550 struct mem_cgroup *memcg; /* target memcg of uncharge */
1551 unsigned long bytes; /* uncharged usage */
1552 unsigned long memsw_bytes; /* uncharged mem+swap usage */
1553 } memcg_batch;
1554#endif
1547}; 1555};
1548 1556
1549/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1557/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd91447e052..b6cbd33dde80 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1128 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1129#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1130 1134
1131 p->bts = NULL; 1135 p->bts = NULL;
1132 1136
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b5b108c1c6b..a730c91b8e69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1827 css_put(&mem->css); 1827 css_put(&mem->css);
1828} 1828}
1829 1829
1830static void
1831__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1832{
1833 struct memcg_batch_info *batch = NULL;
1834 bool uncharge_memsw = true;
1835 /* If swapout, usage of swap doesn't decrease */
1836 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1837 uncharge_memsw = false;
1838 /*
1839 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
1840 * In those cases, all pages freed continuously can be expected to be in
1841 * the same cgroup and we have a chance to coalesce uncharges.
1842 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
1843 * because we want to do uncharge as soon as possible.
1844 */
1845 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1846 goto direct_uncharge;
1847
1848 batch = &current->memcg_batch;
1849 /*
1850 * In usual, we do css_get() when we remember memcg pointer.
1851 * But in this case, we keep res->usage until end of a series of
1852 * uncharges. Then, it's ok to ignore memcg's refcnt.
1853 */
1854 if (!batch->memcg)
1855 batch->memcg = mem;
1856 /*
1857 * In typical case, batch->memcg == mem. This means we can
1858 * merge a series of uncharges to an uncharge of res_counter.
1859 * If not, we uncharge res_counter one by one.
1860 */
1861 if (batch->memcg != mem)
1862 goto direct_uncharge;
1863 /* remember freed charge and uncharge it later */
1864 batch->bytes += PAGE_SIZE;
1865 if (uncharge_memsw)
1866 batch->memsw_bytes += PAGE_SIZE;
1867 return;
1868direct_uncharge:
1869 res_counter_uncharge(&mem->res, PAGE_SIZE);
1870 if (uncharge_memsw)
1871 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1872 return;
1873}
1830 1874
1831/* 1875/*
1832 * uncharge if !page_mapped(page) 1876 * uncharge if !page_mapped(page)
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1875 break; 1919 break;
1876 } 1920 }
1877 1921
1878 if (!mem_cgroup_is_root(mem)) { 1922 if (!mem_cgroup_is_root(mem))
1879 res_counter_uncharge(&mem->res, PAGE_SIZE); 1923 __do_uncharge(mem, ctype);
1880 if (do_swap_account &&
1881 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1882 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1883 }
1884 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1924 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1885 mem_cgroup_swap_statistics(mem, true); 1925 mem_cgroup_swap_statistics(mem, true);
1886 mem_cgroup_charge_statistics(mem, pc, false); 1926 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1926 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1966 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1927} 1967}
1928 1968
1969/*
1970 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate.
1971 * In those cases, pages are freed continuously and we can expect the pages
1972 * to be in the same memcg. Each of these callers itself limits the number of
1973 * pages freed at once, so uncharge_start/end() is called properly.
1974 * These may be called multiple times (nested) in one context.
1975 */
1976
1977void mem_cgroup_uncharge_start(void)
1978{
1979 current->memcg_batch.do_batch++;
1980 /* We can do nest. */
1981 if (current->memcg_batch.do_batch == 1) {
1982 current->memcg_batch.memcg = NULL;
1983 current->memcg_batch.bytes = 0;
1984 current->memcg_batch.memsw_bytes = 0;
1985 }
1986}
1987
1988void mem_cgroup_uncharge_end(void)
1989{
1990 struct memcg_batch_info *batch = &current->memcg_batch;
1991
1992 if (!batch->do_batch)
1993 return;
1994
1995 batch->do_batch--;
1996 if (batch->do_batch) /* If stacked, do nothing. */
1997 return;
1998
1999 if (!batch->memcg)
2000 return;
2001 /*
2002 * This "batch->memcg" is valid without any css_get/put etc...
2003 * because we hide charges behind us.
2004 */
2005 if (batch->bytes)
2006 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2007 if (batch->memsw_bytes)
2008 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2009 /* forget this pointer (for sanity check) */
2010 batch->memcg = NULL;
2011}
2012
1929#ifdef CONFIG_SWAP 2013#ifdef CONFIG_SWAP
1930/* 2014/*
1931 * called after __delete_from_swap_cache() and drop "page" account. 2015 * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/memory.c b/mm/memory.c
index a54b2c498444..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
956 details = NULL; 956 details = NULL;
957 957
958 BUG_ON(addr >= end); 958 BUG_ON(addr >= end);
959 mem_cgroup_uncharge_start();
959 tlb_start_vma(tlb, vma); 960 tlb_start_vma(tlb, vma);
960 pgd = pgd_offset(vma->vm_mm, addr); 961 pgd = pgd_offset(vma->vm_mm, addr);
961 do { 962 do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
968 zap_work, details); 969 zap_work, details);
969 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 970 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
970 tlb_end_vma(tlb, vma); 971 tlb_end_vma(tlb, vma);
972 mem_cgroup_uncharge_end();
971 973
972 return addr; 974 return addr;
973} 975}
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 272 pagevec_release(&pvec);
273 break; 273 break;
274 } 274 }
275 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 276 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 277 struct page *page = pvec.pages[i];
277 278
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 287 unlock_page(page);
287 } 288 }
288 pagevec_release(&pvec); 289 pagevec_release(&pvec);
290 mem_cgroup_uncharge_end();
289 } 291 }
290} 292}
291EXPORT_SYMBOL(truncate_inode_pages_range); 293EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 329 pagevec_init(&pvec, 0);
328 while (next <= end && 330 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 331 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
332 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 333 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 334 struct page *page = pvec.pages[i];
332 pgoff_t index; 335 pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 357 break;
355 } 358 }
356 pagevec_release(&pvec); 359 pagevec_release(&pvec);
360 mem_cgroup_uncharge_end();
357 cond_resched(); 361 cond_resched();
358 } 362 }
359 return ret; 363 return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 432 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 433 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 434 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
435 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 436 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 437 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 438 pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 482 unlock_page(page);
478 } 483 }
479 pagevec_release(&pvec); 484 pagevec_release(&pvec);
485 mem_cgroup_uncharge_end();
480 cond_resched(); 486 cond_resched();
481 } 487 }
482 return ret; 488 return ret;