author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>   2010-03-05 16:41:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>       2010-03-06 14:26:24 -0500
commit     34e55232e59f7b19050267a05ff1226e5cd122a5 (patch)
tree       6b94e776e87d2a2fe1ceca7c5606901575323900
parent     d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 (diff)
mm: avoid false sharing of mm_counter
Per-mm statistics are, by their nature, objects shared among all threads of a
process, and updating them can be a cache-miss point in the page fault path.

This patch adds a per-thread cache for the mm counters: RSS deltas are
accumulated in a structure inside task_struct and synchronized back into the
mm's counters at certain events. In this patch, the event is the number of
calls to handle_mm_fault: each thread's cached values are folded into the mm
once every 64 calls.

A rough estimate with a small benchmark on two parallel threads shows:

[before]
4.5 cache-misses/fault
[after]
4.0 cache-misses/fault

(As the number of threads grows, the most contended object becomes mmap_sem
anyway.)
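For illustration, here is a minimal userspace analogue of the scheme (a
sketch, not kernel code): each thread batches counter updates privately and
folds them into the shared atomic counter every 64 events, so the shared
cache line is touched far less often. All names (account_page, flush_cache,
EVENTS_THRESH) are hypothetical stand-ins for the patch's
add_mm_counter_fast()/check_sync_rss_stat()/TASK_RSS_EVENTS_THRESH. Numbers
like the cache-miss/fault figures above could presumably be gathered with
something like "perf stat -e cache-misses,page-faults" on a perf-enabled
kernel.

/*
 * Userspace sketch of split counter caching -- NOT the kernel code,
 * just the idea behind it. Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define EVENTS_THRESH 64
#define NEVENTS 1000000

static atomic_long shared_rss;          /* plays the role of the mm counter */
static _Thread_local long cached_rss;   /* plays the role of the task cache */
static _Thread_local int events;

static void flush_cache(void)           /* analogue of __sync_task_rss_stat() */
{
	if (cached_rss) {
		atomic_fetch_add(&shared_rss, cached_rss);
		cached_rss = 0;
	}
	events = 0;
}

static void account_page(long delta)    /* analogue of add_mm_counter_fast() */
{
	cached_rss += delta;
	if (++events > EVENTS_THRESH)   /* analogue of check_sync_rss_stat() */
		flush_cache();
}

static void *worker(void *arg)
{
	for (int i = 0; i < NEVENTS; i++)
		account_page(1);
	flush_cache();                  /* analogue of sync_mm_rss() at exit */
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, NULL);
	pthread_create(&t2, NULL, worker, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	/* exact total: every thread flushes its residue before exiting */
	printf("shared_rss = %ld\n", atomic_load(&shared_rss));
	return 0;
}

The design trade-off is the same as in the patch: the shared counter is only
approximately up to date between flushes, in exchange for far fewer writes to
the shared cache line.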
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--   Documentation/filesystems/proc.txt |  6
-rw-r--r--   fs/exec.c                          |  1
-rw-r--r--   include/linux/mm.h                 |  8
-rw-r--r--   include/linux/mm_types.h           |  6
-rw-r--r--   include/linux/sched.h              |  4
-rw-r--r--   kernel/exit.c                      |  3
-rw-r--r--   mm/memory.c                        | 94
7 files changed, 107 insertions(+), 15 deletions(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d07513a67a6..e418f3d8f427 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
 contains details information about the process itself. Its fields are
 explained in Table 1-4.
 
+(for SMP CONFIG users)
+To keep accounting scalable, RSS-related information is handled in an
+asynchronous manner and the value may not be very precise. To see a
+precise snapshot of a moment, you can look at the /proc/<pid>/smaps
+file and scan the page table. It is slow but very precise.
+
 Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 ..............................................................................
  Field    Content
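The precise path this new paragraph points at is easy to script. Below is a
small sketch (not part of the patch; the program and its output format are
illustrative) that sums the Rss: fields of /proc/<pid>/smaps, which forces
the kernel to walk the page tables:

/*
 * Precise RSS via smaps: sum all "Rss: <n> kB" lines.
 * Usage: ./rss [pid]   (defaults to "self")
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	long kb, total = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/smaps",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Rss: %ld kB", &kb) == 1)
			total += kb;
	fclose(f);
	printf("precise RSS: %ld kB\n", total);
	return 0;
}

Unlike the cached statm/status values, this reflects the page tables at the
moment of the read, at the cost of a full scan.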
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
+	sync_mm_rss(tsk, old_mm);
 	mm_release(tsk, old_mm);
 
 	if (old_mm) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2124cdb2d1d0..8e580c07d171 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 /*
  * per-process(per-mm_struct) statistics.
  */
-#if USE_SPLIT_PTLOCKS
+#if defined(SPLIT_RSS_COUNTING)
 /*
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
@@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
 	atomic_long_set(&mm->rss_stat.count[member], value);
 }
 
-static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
-	return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]);
-}
+unsigned long get_mm_counter(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
@@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 	*maxrss = hiwater_rss;
 }
 
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
 
 /*
  * A callback you can register to apply pressure to ageable caches.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e1ca64be6678..21861239ab0c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -202,9 +202,15 @@ enum {
 };
 
 #if USE_SPLIT_PTLOCKS
+#define SPLIT_RSS_COUNTING
 struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
+/* per-thread cached information */
+struct task_rss_stat {
+	int events;	/* for synchronization threshold */
+	int count[NR_MM_COUNTERS];
+};
 #else	/* !USE_SPLIT_PTLOCKS */
 struct mm_rss_stat {
 	unsigned long count[NR_MM_COUNTERS];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbeafa49a53b..46c6f8d5dc06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,9 @@ struct task_struct {
 	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
-
+#if defined(SPLIT_RSS_COUNTING)
+	struct task_rss_stat	rss_stat;
+#endif
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043b8bf5..10d3c5d5ae44 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code)
 			preempt_count());
 
 	acct_update_integrals(tsk);
-
+	/* sync mm's RSS info before statistics gathering */
+	sync_mm_rss(tsk, tsk->mm);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/mm/memory.c b/mm/memory.c
index c57678478801..a4597614f18d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		if (task->rss_stat.count[i]) {
+			add_mm_counter(mm, i, task->rss_stat.count[i]);
+			task->rss_stat.count[i] = 0;
+		}
+	}
+	task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+	struct task_struct *task = current;
+
+	if (likely(task->mm == mm))
+		task->rss_stat.count[member] += val;
+	else
+		add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH	(64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+	if (unlikely(task != current))
+		return;
+	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+		__sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+	long val = 0;
+
+	/*
+	 * Don't use task->mm here, to avoid having to use task_get_mm().
+	 * The caller must guarantee that task->mm is valid.
+	 */
+	val = atomic_long_read(&mm->rss_stat.count[member]);
+	/*
+	 * The counter is updated asynchronously and may temporarily go
+	 * negative; that is never a value users expect, so clamp it.
+	 */
+	if (val < 0)
+		return 0;
+	return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+	__sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
+	if (current->mm == mm)
+		sync_mm_rss(current, mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, MM_FILEPAGES);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -2175,11 +2250,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, MM_FILEPAGES);
-				inc_mm_counter(mm, MM_ANONPAGES);
+				dec_mm_counter_fast(mm, MM_FILEPAGES);
+				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
 		} else
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * discarded at swap_free().
 	 */
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (flags & FAULT_FLAG_WRITE)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	if (anon) {
-		inc_mm_counter(mm, MM_ANONPAGES);
+		inc_mm_counter_fast(mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
-		inc_mm_counter(mm, MM_FILEPAGES);
+		inc_mm_counter_fast(mm, MM_FILEPAGES);
 		page_add_file_rmap(page);
 		if (flags & FAULT_FLAG_WRITE) {
 			dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	count_vm_event(PGFAULT);
 
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 