aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-12-15 19:47:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-16 10:20:07 -0500
commit569b846df54ffb2827b83ce3244c5f032394cba4 (patch)
tree77c5d373a5edf97710fab8777912971b99e84828 /mm
parentcd9b45b78a61e8df250e69385c74e729e5b66abf (diff)
memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge characteristics, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is for coalescing uncharges. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is to make use of the caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Thus, we'll not coalesce too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amount of improvement. (The root cgroup isn't affected by this patch.) Another patch for "charge" will follow this and the above will be improved more. Changelog (since 2009/10/02): - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes) - some cleanup and commentary/description updates. - added initialization code to copy_process(). 
(possible bug fix) Changelog (old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - made ->do_batch a bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c96
-rw-r--r--mm/memory.c2
-rw-r--r--mm/truncate.c6
3 files changed, 98 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b5b108c1c6b..a730c91b8e69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1827 css_put(&mem->css); 1827 css_put(&mem->css);
1828} 1828}
1829 1829
1830static void
1831__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1832{
1833 struct memcg_batch_info *batch = NULL;
1834 bool uncharge_memsw = true;
1835 /* If swapout, usage of swap doesn't decrease */
1836 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1837 uncharge_memsw = false;
1838 /*
1839 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
1840 * In those cases, all pages freed continously can be expected to be in
1841 * the same cgroup and we have chance to coalesce uncharges.
1842 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
1843 * because we want to do uncharge as soon as possible.
1844 */
1845 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1846 goto direct_uncharge;
1847
1848 batch = &current->memcg_batch;
1849 /*
1850 * In usual, we do css_get() when we remember memcg pointer.
1851 * But in this case, we keep res->usage until end of a series of
1852 * uncharges. Then, it's ok to ignore memcg's refcnt.
1853 */
1854 if (!batch->memcg)
1855 batch->memcg = mem;
1856 /*
1857 * In typical case, batch->memcg == mem. This means we can
1858 * merge a series of uncharges to an uncharge of res_counter.
1859 * If not, we uncharge res_counter ony by one.
1860 */
1861 if (batch->memcg != mem)
1862 goto direct_uncharge;
1863 /* remember freed charge and uncharge it later */
1864 batch->bytes += PAGE_SIZE;
1865 if (uncharge_memsw)
1866 batch->memsw_bytes += PAGE_SIZE;
1867 return;
1868direct_uncharge:
1869 res_counter_uncharge(&mem->res, PAGE_SIZE);
1870 if (uncharge_memsw)
1871 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1872 return;
1873}
1830 1874
1831/* 1875/*
1832 * uncharge if !page_mapped(page) 1876 * uncharge if !page_mapped(page)
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1875 break; 1919 break;
1876 } 1920 }
1877 1921
1878 if (!mem_cgroup_is_root(mem)) { 1922 if (!mem_cgroup_is_root(mem))
1879 res_counter_uncharge(&mem->res, PAGE_SIZE); 1923 __do_uncharge(mem, ctype);
1880 if (do_swap_account &&
1881 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1882 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1883 }
1884 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1924 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1885 mem_cgroup_swap_statistics(mem, true); 1925 mem_cgroup_swap_statistics(mem, true);
1886 mem_cgroup_charge_statistics(mem, pc, false); 1926 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1926 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1966 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1927} 1967}
1928 1968
1969/*
1970 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
1971 * In that cases, pages are freed continuously and we can expect pages
1972 * are in the same memcg. All these calls itself limits the number of
1973 * pages freed at once, then uncharge_start/end() is called properly.
1974 * This may be called prural(2) times in a context,
1975 */
1976
1977void mem_cgroup_uncharge_start(void)
1978{
1979 current->memcg_batch.do_batch++;
1980 /* We can do nest. */
1981 if (current->memcg_batch.do_batch == 1) {
1982 current->memcg_batch.memcg = NULL;
1983 current->memcg_batch.bytes = 0;
1984 current->memcg_batch.memsw_bytes = 0;
1985 }
1986}
1987
1988void mem_cgroup_uncharge_end(void)
1989{
1990 struct memcg_batch_info *batch = &current->memcg_batch;
1991
1992 if (!batch->do_batch)
1993 return;
1994
1995 batch->do_batch--;
1996 if (batch->do_batch) /* If stacked, do nothing. */
1997 return;
1998
1999 if (!batch->memcg)
2000 return;
2001 /*
2002 * This "batch->memcg" is valid without any css_get/put etc...
2003 * bacause we hide charges behind us.
2004 */
2005 if (batch->bytes)
2006 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2007 if (batch->memsw_bytes)
2008 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2009 /* forget this pointer (for sanity check) */
2010 batch->memcg = NULL;
2011}
2012
1929#ifdef CONFIG_SWAP 2013#ifdef CONFIG_SWAP
1930/* 2014/*
1931 * called after __delete_from_swap_cache() and drop "page" account. 2015 * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/memory.c b/mm/memory.c
index a54b2c498444..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
956 details = NULL; 956 details = NULL;
957 957
958 BUG_ON(addr >= end); 958 BUG_ON(addr >= end);
959 mem_cgroup_uncharge_start();
959 tlb_start_vma(tlb, vma); 960 tlb_start_vma(tlb, vma);
960 pgd = pgd_offset(vma->vm_mm, addr); 961 pgd = pgd_offset(vma->vm_mm, addr);
961 do { 962 do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
968 zap_work, details); 969 zap_work, details);
969 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 970 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
970 tlb_end_vma(tlb, vma); 971 tlb_end_vma(tlb, vma);
972 mem_cgroup_uncharge_end();
971 973
972 return addr; 974 return addr;
973} 975}
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 272 pagevec_release(&pvec);
273 break; 273 break;
274 } 274 }
275 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 276 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 277 struct page *page = pvec.pages[i];
277 278
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 287 unlock_page(page);
287 } 288 }
288 pagevec_release(&pvec); 289 pagevec_release(&pvec);
290 mem_cgroup_uncharge_end();
289 } 291 }
290} 292}
291EXPORT_SYMBOL(truncate_inode_pages_range); 293EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 329 pagevec_init(&pvec, 0);
328 while (next <= end && 330 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 331 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
332 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 333 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 334 struct page *page = pvec.pages[i];
332 pgoff_t index; 335 pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 357 break;
355 } 358 }
356 pagevec_release(&pvec); 359 pagevec_release(&pvec);
360 mem_cgroup_uncharge_end();
357 cond_resched(); 361 cond_resched();
358 } 362 }
359 return ret; 363 return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 432 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 433 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 434 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
435 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 436 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 437 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 438 pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 482 unlock_page(page);
478 } 483 }
479 pagevec_release(&pvec); 484 pagevec_release(&pvec);
485 mem_cgroup_uncharge_end();
480 cond_resched(); 486 cond_resched();
481 } 487 }
482 return ret; 488 return ret;