diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2010-03-05 16:41:40 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-06 14:26:24 -0500 |
commit | 34e55232e59f7b19050267a05ff1226e5cd122a5 (patch) | |
tree | 6b94e776e87d2a2fe1ceca7c5606901575323900 /mm | |
parent | d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 (diff) |
mm: avoid false sharing of mm_counter
Per-mm statistics are, by nature, an object shared among threads, so they
can be a cache-miss point in the page fault path.
This patch adds a per-thread cache for mm_counter. RSS changes are counted
in a struct inside task_struct and synchronized with the mm's counters when
an event threshold is reached.
In this patch, the event is a call to handle_mm_fault(): the per-thread
values are added to the mm roughly once every 64 calls.
A rough estimate from a small benchmark running two threads in parallel shows:
[before]
4.5 cache-miss/faults
[after]
4.0 cache-miss/faults
In any case, once the number of threads grows, the most contended object is mmap_sem.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memory.c | 94 |
1 files changed, 86 insertions, 8 deletions
diff --git a/mm/memory.c b/mm/memory.c index c57678478801..a4597614f18d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) | |||
122 | core_initcall(init_zero_pfn); | 122 | core_initcall(init_zero_pfn); |
123 | 123 | ||
124 | 124 | ||
125 | #if defined(SPLIT_RSS_COUNTING) | ||
126 | |||
127 | void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
132 | if (task->rss_stat.count[i]) { | ||
133 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
134 | task->rss_stat.count[i] = 0; | ||
135 | } | ||
136 | } | ||
137 | task->rss_stat.events = 0; | ||
138 | } | ||
139 | |||
140 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
141 | { | ||
142 | struct task_struct *task = current; | ||
143 | |||
144 | if (likely(task->mm == mm)) | ||
145 | task->rss_stat.count[member] += val; | ||
146 | else | ||
147 | add_mm_counter(mm, member, val); | ||
148 | } | ||
149 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
150 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
151 | |||
152 | /* sync counter once per 64 page faults */ | ||
153 | #define TASK_RSS_EVENTS_THRESH (64) | ||
154 | static void check_sync_rss_stat(struct task_struct *task) | ||
155 | { | ||
156 | if (unlikely(task != current)) | ||
157 | return; | ||
158 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
159 | __sync_task_rss_stat(task, task->mm); | ||
160 | } | ||
161 | |||
162 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
163 | { | ||
164 | long val = 0; | ||
165 | |||
166 | /* | ||
167 | * Don't use task->mm here, so that get_task_mm() is not needed. | |
168 | * The caller must guarantee that task->mm is valid. | |
169 | */ | ||
170 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
171 | /* | ||
172 | * The counter is updated asynchronously and may transiently go negative, | |
173 | * but a negative value is never what users expect to see. | |
174 | */ | ||
175 | if (val < 0) | ||
176 | return 0; | ||
177 | return (unsigned long)val; | ||
178 | } | ||
179 | |||
180 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
181 | { | ||
182 | __sync_task_rss_stat(task, mm); | ||
183 | } | ||
184 | #else | ||
185 | |||
186 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
187 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
188 | |||
189 | static void check_sync_rss_stat(struct task_struct *task) | ||
190 | { | ||
191 | } | ||
192 | |||
193 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
194 | { | ||
195 | } | ||
196 | #endif | ||
197 | |||
125 | /* | 198 | /* |
126 | * If a p?d_bad entry is found while walking page tables, report | 199 | * If a p?d_bad entry is found while walking page tables, report |
127 | * the error, before resetting entry to p?d_none. Usually (but | 200 | * the error, before resetting entry to p?d_none. Usually (but |
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) | |||
386 | { | 459 | { |
387 | int i; | 460 | int i; |
388 | 461 | ||
462 | if (current->mm == mm) | ||
463 | sync_mm_rss(current, mm); | ||
389 | for (i = 0; i < NR_MM_COUNTERS; i++) | 464 | for (i = 0; i < NR_MM_COUNTERS; i++) |
390 | if (rss[i]) | 465 | if (rss[i]) |
391 | add_mm_counter(mm, i, rss[i]); | 466 | add_mm_counter(mm, i, rss[i]); |
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1539 | 1614 | ||
1540 | /* Ok, finally just insert the thing.. */ | 1615 | /* Ok, finally just insert the thing.. */ |
1541 | get_page(page); | 1616 | get_page(page); |
1542 | inc_mm_counter(mm, MM_FILEPAGES); | 1617 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
1543 | page_add_file_rmap(page); | 1618 | page_add_file_rmap(page); |
1544 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1619 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1545 | 1620 | ||
@@ -2175,11 +2250,11 @@ gotten: | |||
2175 | if (likely(pte_same(*page_table, orig_pte))) { | 2250 | if (likely(pte_same(*page_table, orig_pte))) { |
2176 | if (old_page) { | 2251 | if (old_page) { |
2177 | if (!PageAnon(old_page)) { | 2252 | if (!PageAnon(old_page)) { |
2178 | dec_mm_counter(mm, MM_FILEPAGES); | 2253 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
2179 | inc_mm_counter(mm, MM_ANONPAGES); | 2254 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2180 | } | 2255 | } |
2181 | } else | 2256 | } else |
2182 | inc_mm_counter(mm, MM_ANONPAGES); | 2257 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2183 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2258 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2184 | entry = mk_pte(new_page, vma->vm_page_prot); | 2259 | entry = mk_pte(new_page, vma->vm_page_prot); |
2185 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2260 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2616 | * discarded at swap_free(). | 2691 | * discarded at swap_free(). |
2617 | */ | 2692 | */ |
2618 | 2693 | ||
2619 | inc_mm_counter(mm, MM_ANONPAGES); | 2694 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2620 | pte = mk_pte(page, vma->vm_page_prot); | 2695 | pte = mk_pte(page, vma->vm_page_prot); |
2621 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2696 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2622 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2697 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2700 | if (!pte_none(*page_table)) | 2775 | if (!pte_none(*page_table)) |
2701 | goto release; | 2776 | goto release; |
2702 | 2777 | ||
2703 | inc_mm_counter(mm, MM_ANONPAGES); | 2778 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2704 | page_add_new_anon_rmap(page, vma, address); | 2779 | page_add_new_anon_rmap(page, vma, address); |
2705 | setpte: | 2780 | setpte: |
2706 | set_pte_at(mm, address, page_table, entry); | 2781 | set_pte_at(mm, address, page_table, entry); |
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2854 | if (flags & FAULT_FLAG_WRITE) | 2929 | if (flags & FAULT_FLAG_WRITE) |
2855 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2930 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2856 | if (anon) { | 2931 | if (anon) { |
2857 | inc_mm_counter(mm, MM_ANONPAGES); | 2932 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2858 | page_add_new_anon_rmap(page, vma, address); | 2933 | page_add_new_anon_rmap(page, vma, address); |
2859 | } else { | 2934 | } else { |
2860 | inc_mm_counter(mm, MM_FILEPAGES); | 2935 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
2861 | page_add_file_rmap(page); | 2936 | page_add_file_rmap(page); |
2862 | if (flags & FAULT_FLAG_WRITE) { | 2937 | if (flags & FAULT_FLAG_WRITE) { |
2863 | dirty_page = page; | 2938 | dirty_page = page; |
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3035 | 3110 | ||
3036 | count_vm_event(PGFAULT); | 3111 | count_vm_event(PGFAULT); |
3037 | 3112 | ||
3113 | /* do counter updates before entering really critical section. */ | ||
3114 | check_sync_rss_stat(current); | ||
3115 | |||
3038 | if (unlikely(is_vm_hugetlb_page(vma))) | 3116 | if (unlikely(is_vm_hugetlb_page(vma))) |
3039 | return hugetlb_fault(mm, vma, address, flags); | 3117 | return hugetlb_fault(mm, vma, address, flags); |
3040 | 3118 | ||