aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-12-15 19:47:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-16 10:20:07 -0500
commit569b846df54ffb2827b83ce3244c5f032394cba4 (patch)
tree77c5d373a5edf97710fab8777912971b99e84828 /mm
parentcd9b45b78a61e8df250e69385c74e729e5b66abf (diff)
memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge characteristics, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is for coalescing uncharges. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is to make use of the caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Thus, we'll not coalesce too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amount of improvement. (The root cgroup isn't affected by this patch.) Another patch for "charge" will follow this and the above will be improved more. Changelog (since 2009/10/02): - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes) - some cleanup and commentary/description updates. - added initialization code to copy_process(). 
(possible bug fix) Changelog (old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - made ->do_batch a bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c96
-rw-r--r--mm/memory.c2
-rw-r--r--mm/truncate.c6
3 files changed, 98 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b5b108c1c6b..a730c91b8e69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1827 css_put(&mem->css); 1827 css_put(&mem->css);
1828} 1828}
1829 1829
1830static void
1831__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1832{
1833 struct memcg_batch_info *batch = NULL;
1834 bool uncharge_memsw = true;
1835 /* If swapout, usage of swap doesn't decrease */
1836 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1837 uncharge_memsw = false;
1838 /*
1839 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
1840 * In those cases, all pages freed continously can be expected to be in
1841 * the same cgroup and we have chance to coalesce uncharges.
1842 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
1843 * because we want to do uncharge as soon as possible.
1844 */
1845 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1846 goto direct_uncharge;
1847
1848 batch = &current->memcg_batch;
1849 /*
1850 * In usual, we do css_get() when we remember memcg pointer.
1851 * But in this case, we keep res->usage until end of a series of
1852 * uncharges. Then, it's ok to ignore memcg's refcnt.
1853 */
1854 if (!batch->memcg)
1855 batch->memcg = mem;
1856 /*
1857 * In typical case, batch->memcg == mem. This means we can
1858 * merge a series of uncharges to an uncharge of res_counter.
1859 * If not, we uncharge res_counter ony by one.
1860 */
1861 if (batch->memcg != mem)
1862 goto direct_uncharge;
1863 /* remember freed charge and uncharge it later */
1864 batch->bytes += PAGE_SIZE;
1865 if (uncharge_memsw)
1866 batch->memsw_bytes += PAGE_SIZE;
1867 return;
1868direct_uncharge:
1869 res_counter_uncharge(&mem->res, PAGE_SIZE);
1870 if (uncharge_memsw)
1871 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1872 return;
1873}
1830 1874
1831/* 1875/*
1832 * uncharge if !page_mapped(page) 1876 * uncharge if !page_mapped(page)
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1875 break; 1919 break;
1876 } 1920 }
1877 1921
1878 if (!mem_cgroup_is_root(mem)) { 1922 if (!mem_cgroup_is_root(mem))
1879 res_counter_uncharge(&mem->res, PAGE_SIZE); 1923 __do_uncharge(mem, ctype);
1880 if (do_swap_account &&
1881 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1882 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1883 }
1884 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1924 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1885 mem_cgroup_swap_statistics(mem, true); 1925 mem_cgroup_swap_statistics(mem, true);
1886 mem_cgroup_charge_statistics(mem, pc, false); 1926 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1926 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1966 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1927} 1967}
1928 1968
1969/*
1970 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
1971 * In that cases, pages are freed continuously and we can expect pages
1972 * are in the same memcg. All these calls itself limits the number of
1973 * pages freed at once, then uncharge_start/end() is called properly.
1974 * This may be called prural(2) times in a context,
1975 */
1976
1977void mem_cgroup_uncharge_start(void)
1978{
1979 current->memcg_batch.do_batch++;
1980 /* We can do nest. */
1981 if (current->memcg_batch.do_batch == 1) {
1982 current->memcg_batch.memcg = NULL;
1983 current->memcg_batch.bytes = 0;
1984 current->memcg_batch.memsw_bytes = 0;
1985 }
1986}
1987
1988void mem_cgroup_uncharge_end(void)
1989{
1990 struct memcg_batch_info *batch = &current->memcg_batch;
1991
1992 if (!batch->do_batch)
1993 return;
1994
1995 batch->do_batch--;
1996 if (batch->do_batch) /* If stacked, do nothing. */
1997 return;
1998
1999 if (!batch->memcg)
2000 return;
2001 /*
2002 * This "batch->memcg" is valid without any css_get/put etc...
2003 * bacause we hide charges behind us.
2004 */
2005 if (batch->bytes)
2006 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2007 if (batch->memsw_bytes)
2008 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2009 /* forget this pointer (for sanity check) */
2010 batch->memcg = NULL;
2011}
2012
1929#ifdef CONFIG_SWAP 2013#ifdef CONFIG_SWAP
1930/* 2014/*
1931 * called after __delete_from_swap_cache() and drop "page" account. 2015 * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/memory.c b/mm/memory.c
index a54b2c498444..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
956 details = NULL; 956 details = NULL;
957 957
958 BUG_ON(addr >= end); 958 BUG_ON(addr >= end);
959 mem_cgroup_uncharge_start();
959 tlb_start_vma(tlb, vma); 960 tlb_start_vma(tlb, vma);
960 pgd = pgd_offset(vma->vm_mm, addr); 961 pgd = pgd_offset(vma->vm_mm, addr);
961 do { 962 do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
968 zap_work, details); 969 zap_work, details);
969 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 970 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
970 tlb_end_vma(tlb, vma); 971 tlb_end_vma(tlb, vma);
972 mem_cgroup_uncharge_end();
971 973
972 return addr; 974 return addr;
973} 975}
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 272 pagevec_release(&pvec);
273 break; 273 break;
274 } 274 }
275 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 276 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 277 struct page *page = pvec.pages[i];
277 278
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 287 unlock_page(page);
287 } 288 }
288 pagevec_release(&pvec); 289 pagevec_release(&pvec);
290 mem_cgroup_uncharge_end();
289 } 291 }
290} 292}
291EXPORT_SYMBOL(truncate_inode_pages_range); 293EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 329 pagevec_init(&pvec, 0);
328 while (next <= end && 330 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 331 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
332 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 333 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 334 struct page *page = pvec.pages[i];
332 pgoff_t index; 335 pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 357 break;
355 } 358 }
356 pagevec_release(&pvec); 359 pagevec_release(&pvec);
360 mem_cgroup_uncharge_end();
357 cond_resched(); 361 cond_resched();
358 } 362 }
359 return ret; 363 return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 432 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 433 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 434 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
435 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 436 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 437 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 438 pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 482 unlock_page(page);
478 } 483 }
479 pagevec_release(&pvec); 484 pagevec_release(&pvec);
485 mem_cgroup_uncharge_end();
480 cond_resched(); 486 cond_resched();
481 } 487 }
482 return ret; 488 return ret;