author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2010-03-05 16:41:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>      2010-03-06 14:26:24 -0500
commit     34e55232e59f7b19050267a05ff1226e5cd122a5 (patch)
tree       6b94e776e87d2a2fe1ceca7c5606901575323900 /mm/memory.c
parent     d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 (diff)
mm: avoid false sharing of mm_counter
Considering the nature of the per-mm stats, they are objects shared among threads and can become a cache-miss point in the page fault path.

This patch adds a per-thread cache for the mm_counters: RSS values are accumulated in a struct inside task_struct and synchronized into the mm's counters at certain events. In this patch, the event is the number of calls to handle_mm_fault; the per-thread values are folded into the mm once every 64 calls.

A rough estimation with a small benchmark on parallel threads (2 threads) shows:

  [before] 4.5 cache-misses/fault
  [after]  4.0 cache-misses/fault

In any case, the most contended object remains mmap_sem once the number of threads grows.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
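As a rough illustration of the technique outside the kernel, here is a minimal user-space sketch of the same pattern: each thread batches counter updates in a thread-local cache and folds them into a shared atomic counter every 64 events, mirroring what add_mm_counter_fast() and check_sync_rss_stat() do in the patch below. The names used here (counter_add_fast, SYNC_THRESH, worker) are invented for the sketch and do not exist in the kernel.

/*
 * Illustration only (not kernel code): a user-space analogue of
 * SPLIT_RSS_COUNTING.  Each thread accumulates updates in a private,
 * thread-local cache and folds them into the shared atomic counter
 * every SYNC_THRESH events, so the shared cache line is written far
 * less often than once per update.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define SYNC_THRESH 64			/* flush the private cache every 64 updates */
#define UPDATES_PER_THREAD 1000000
#define NTHREADS 2

static atomic_long shared_counter;	/* analogue of mm->rss_stat */
static __thread long cached;		/* analogue of task->rss_stat.count[] */
static __thread int events;		/* analogue of task->rss_stat.events */

static void counter_add_fast(long val)
{
	cached += val;			/* cheap, thread-local update */
	if (++events >= SYNC_THRESH) {	/* periodic sync, like check_sync_rss_stat() */
		atomic_fetch_add(&shared_counter, cached);
		cached = 0;
		events = 0;
	}
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < UPDATES_PER_THREAD; i++)
		counter_add_fast(1);
	/* final flush so nothing stays cached, like sync_mm_rss() */
	atomic_fetch_add(&shared_counter, cached);
	cached = 0;
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	printf("total = %ld\n", atomic_load(&shared_counter));
	return 0;
}

The trade-off is the same as in the patch: a reader of the shared counter may observe a value that lags by up to SYNC_THRESH updates per thread, which is why get_mm_counter() in the patch clamps a transiently negative value to 0.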
Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  94
1 files changed, 86 insertions(+), 8 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c57678478801..a4597614f18d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		if (task->rss_stat.count[i]) {
+			add_mm_counter(mm, i, task->rss_stat.count[i]);
+			task->rss_stat.count[i] = 0;
+		}
+	}
+	task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+	struct task_struct *task = current;
+
+	if (likely(task->mm == mm))
+		task->rss_stat.count[member] += val;
+	else
+		add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH	(64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+	if (unlikely(task != current))
+		return;
+	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+		__sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+	long val = 0;
+
+	/*
+	 * Don't use task->mm here...for avoiding to use task_get_mm()..
+	 * The caller must guarantee task->mm is not invalid.
+	 */
+	val = atomic_long_read(&mm->rss_stat.count[member]);
+	/*
+	 * counter is updated in asynchronous manner and may go to minus.
+	 * But it's never be expected number for users.
+	 */
+	if (val < 0)
+		return 0;
+	return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+	__sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member)	inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member)	dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
+	if (current->mm == mm)
+		sync_mm_rss(current, mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, MM_FILEPAGES);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -2175,11 +2250,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, MM_FILEPAGES);
-				inc_mm_counter(mm, MM_ANONPAGES);
+				dec_mm_counter_fast(mm, MM_FILEPAGES);
+				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
 		} else
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * discarded at swap_free().
 	 */
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (flags & FAULT_FLAG_WRITE)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	if (anon) {
-		inc_mm_counter(mm, MM_ANONPAGES);
+		inc_mm_counter_fast(mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
-		inc_mm_counter(mm, MM_FILEPAGES);
+		inc_mm_counter_fast(mm, MM_FILEPAGES);
 		page_add_file_rmap(page);
 		if (flags & FAULT_FLAG_WRITE) {
 			dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	count_vm_event(PGFAULT);
 
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 