author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2010-03-05 16:41:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>        2010-03-06 14:26:24 -0500
commit     34e55232e59f7b19050267a05ff1226e5cd122a5 (patch)
tree       6b94e776e87d2a2fe1ceca7c5606901575323900 /mm/memory.c
parent     d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 (diff)
mm: avoid false sharing of mm_counter
Per-mm statistics are by nature shared among all the threads of a process, which makes them a cache-miss point in the page fault path.
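For context, since the parent commit (d559db08, "mm: clean up mm_counter") the shared side lives in mm_struct and every update is an atomic read-modify-write on memory touched by all threads. A simplified sketch of that shared state, not a verbatim copy of the headers:

/*
 * Simplified sketch of the shared per-mm counters (as of parent commit
 * d559db08).  Every thread's fault path does an atomic add on the same
 * cache line, which is the cache-miss point described above.
 */
struct mm_rss_stat {
	atomic_long_t count[NR_MM_COUNTERS];
};

static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
	atomic_long_add(value, &mm->rss_stat.count[member]);
}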
This patch adds a per-thread cache for the mm counters: RSS deltas are accumulated in a small struct embedded in task_struct and synchronized back into the mm's counters at certain events.
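The cache itself is declared outside mm/memory.c, so it does not appear in the diffstat below. Roughly, per the mm_types.h side of this commit (a sketch, not verbatim):

/*
 * Sketch of the per-thread cache added by this commit (the mm_types.h
 * part, not shown in this diffstat).  Plain ints, private to the
 * thread: fault-path updates touch only task-local memory.
 */
#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU)
#define SPLIT_RSS_COUNTING
/* per-thread cached information */
struct task_rss_stat {
	int events;	/* for synchronization threshold */
	int count[NR_MM_COUNTERS];
};
#endif

task_struct then carries a 'struct task_rss_stat rss_stat' member.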
In this patch, the synchronization event is the number of calls to handle_mm_fault: each thread folds its cached values back into the mm every 64 calls.
A rough estimate with a small benchmark running two parallel threads shows:

[before]
4.5 cache-misses/fault
[after]
4.0 cache-misses/fault

i.e. roughly an 11% reduction in cache misses per fault.
In any case, once the number of threads grows, the most contended object becomes mmap_sem rather than these counters.
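A note on why get_mm_counter() in the diff below clamps negative values: a reader can observe the shared counter before some thread has folded its cached deltas back in, so the value may be transiently negative. Consumers such as /proc/<pid>/statm read the counters through helpers along the lines of get_mm_rss() in include/linux/mm.h, essentially:

/*
 * Essentially what get_mm_rss() in include/linux/mm.h does: sum the
 * clamped per-mm counters.  A reader may run before a thread has
 * synced its per-thread deltas, hence the clamp-to-zero read.
 */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	return get_mm_counter(mm, MM_FILEPAGES) +
		get_mm_counter(mm, MM_ANONPAGES);
}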
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c | 94
1 file changed, 86 insertions(+), 8 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c57678478801..a4597614f18d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		if (task->rss_stat.count[i]) {
+			add_mm_counter(mm, i, task->rss_stat.count[i]);
+			task->rss_stat.count[i] = 0;
+		}
+	}
+	task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+	struct task_struct *task = current;
+
+	if (likely(task->mm == mm))
+		task->rss_stat.count[member] += val;
+	else
+		add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH	(64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+	if (unlikely(task != current))
+		return;
+	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+		__sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+	long val = 0;
+
+	/*
+	 * Don't use task->mm here, to avoid needing get_task_mm();
+	 * the caller must guarantee that mm remains valid.
+	 */
+	val = atomic_long_read(&mm->rss_stat.count[member]);
+	/*
+	 * The counter is updated asynchronously and may transiently go
+	 * negative, which is never a meaningful value for users; clamp to 0.
+	 */
+	if (val < 0)
+		return 0;
+	return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+	__sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
+	if (current->mm == mm)
+		sync_mm_rss(current, mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, MM_FILEPAGES);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -2175,11 +2250,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, MM_FILEPAGES);
-				inc_mm_counter(mm, MM_ANONPAGES);
+				dec_mm_counter_fast(mm, MM_FILEPAGES);
+				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
 		} else
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * discarded at swap_free().
 	 */
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, MM_FILEPAGES);
+			inc_mm_counter_fast(mm, MM_FILEPAGES);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	count_vm_event(PGFAULT);
 
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 