author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-03-05 16:41:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>	2010-03-06 14:26:24 -0500
commit     34e55232e59f7b19050267a05ff1226e5cd122a5 (patch)
tree       6b94e776e87d2a2fe1ceca7c5606901575323900
parent     d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 (diff)
mm: avoid false sharing of mm_counter
Considering the nature of per-mm statistics, they are objects shared among
threads and can be a cache-miss point in the page-fault path.

This patch adds a per-thread cache for the mm counters. RSS values are
accumulated in a structure in task_struct and synchronized with the mm's
counters at certain events. For now, the event is the number of calls to
handle_mm_fault: the per-thread values are folded into the mm once every
64 calls.

A rough estimate with a small benchmark using two parallel threads shows:

  [before] 4.5 cache-misses/fault
  [after]  4.0 cache-misses/fault

In any case, the most contended object becomes mmap_sem once the number of
threads grows.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
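The scheme in miniature: the following is a stand-alone user-space sketch of
the same batching idea, not the kernel code (that follows in the diff below).
Names such as rss_cache, FLUSH_THRESH and worker are invented for
illustration. Each thread batches its updates locally and folds them into
the shared atomic counter only once every 64 events, so the shared cache
line is touched far less often.

/* Stand-alone sketch of per-thread counter batching; names are illustrative. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define FLUSH_THRESH		64	/* analogous to TASK_RSS_EVENTS_THRESH */
#define EVENTS_PER_THREAD	100000

static atomic_long shared_rss;		/* the counter shared by all threads */

struct rss_cache {			/* per-thread analogue of task_rss_stat */
	long count;
	int events;
};

static void rss_add(struct rss_cache *c, long val)
{
	c->count += val;
	if (++c->events >= FLUSH_THRESH) {
		/* periodic synchronization: one atomic op per 64 updates */
		atomic_fetch_add(&shared_rss, c->count);
		c->count = 0;
		c->events = 0;
	}
}

static void *worker(void *arg)
{
	struct rss_cache cache = { 0, 0 };
	int i;

	(void)arg;
	for (i = 0; i < EVENTS_PER_THREAD; i++)
		rss_add(&cache, 1);

	/* final sync, like sync_mm_rss() at exit/exec in the patch */
	atomic_fetch_add(&shared_rss, cache.count);
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	int i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (i = 0; i < 2; i++)
		pthread_join(t[i], NULL);

	printf("shared_rss = %ld\n", atomic_load(&shared_rss));
	return 0;
}

Compile with "cc -std=c11 -pthread"; the final count is exact because each
thread flushes its remainder before exiting, just as the patch syncs the
cached values at exit() and exec().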
-rw-r--r--	Documentation/filesystems/proc.txt	6
-rw-r--r--	fs/exec.c	1
-rw-r--r--	include/linux/mm.h	8
-rw-r--r--	include/linux/mm_types.h	6
-rw-r--r--	include/linux/sched.h	4
-rw-r--r--	kernel/exit.c	3
-rw-r--r--	mm/memory.c	94
7 files changed, 107 insertions(+), 15 deletions(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d07513a67a6..e418f3d8f427 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
 contains details information about the process itself.  Its fields are
 explained in Table 1-4.
 
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information are handled in
+asynchronous manner and the vaule may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
+
 Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
 ..............................................................................
  Field    Content
diff --git a/fs/exec.c b/fs/exec.c
index cce6bbdbdbb1..ea7861727efd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
+	sync_mm_rss(tsk, old_mm);
 	mm_release(tsk, old_mm);
 
 	if (old_mm) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2124cdb2d1d0..8e580c07d171 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 /*
  * per-process(per-mm_struct) statistics.
  */
-#if USE_SPLIT_PTLOCKS
+#if defined(SPLIT_RSS_COUNTING)
 /*
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
@@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
 	atomic_long_set(&mm->rss_stat.count[member], value);
 }
 
-static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
-	return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]);
-}
+unsigned long get_mm_counter(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
@@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 	*maxrss = hiwater_rss;
 }
 
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
 
 /*
  * A callback you can register to apply pressure to ageable caches.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e1ca64be6678..21861239ab0c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -202,9 +202,15 @@ enum {
 };
 
 #if USE_SPLIT_PTLOCKS
+#define SPLIT_RSS_COUNTING
 struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
+/* per-thread cached information, */
+struct task_rss_stat {
+	int events;	/* for synchronization threshold */
+	int count[NR_MM_COUNTERS];
+};
 #else /* !USE_SPLIT_PTLOCKS */
 struct mm_rss_stat {
 	unsigned long count[NR_MM_COUNTERS];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbeafa49a53b..46c6f8d5dc06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,9 @@ struct task_struct {
 	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
-
+#if defined(SPLIT_RSS_COUNTING)
+	struct task_rss_stat	rss_stat;
+#endif
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043b8bf5..10d3c5d5ae44 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code)
 			preempt_count());
 
 	acct_update_integrals(tsk);
-
+	/* sync mm's RSS info before statistics gathering */
+	sync_mm_rss(tsk, tsk->mm);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/mm/memory.c b/mm/memory.c
index c57678478801..a4597614f18d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		if (task->rss_stat.count[i]) {
+			add_mm_counter(mm, i, task->rss_stat.count[i]);
+			task->rss_stat.count[i] = 0;
+		}
+	}
+	task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+	struct task_struct *task = current;
+
+	if (likely(task->mm == mm))
+		task->rss_stat.count[member] += val;
+	else
+		add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH	(64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+	if (unlikely(task != current))
+		return;
+	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+		__sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+	long val = 0;
+
+	/*
+	 * Don't use task->mm here...for avoiding to use task_get_mm()..
+	 * The caller must guarantee task->mm is not invalid.
+	 */
+	val = atomic_long_read(&mm->rss_stat.count[member]);
+	/*
+	 * counter is updated in asynchronous manner and may go to minus.
+	 * But it's never be expected number for users.
+	 */
+	if (val < 0)
+		return 0;
+	return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+	__sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
+	if (current->mm == mm)
+		sync_mm_rss(current, mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, MM_FILEPAGES);
+	inc_mm_counter_fast(mm, MM_FILEPAGES);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -2175,11 +2250,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, MM_FILEPAGES);
-				inc_mm_counter(mm, MM_ANONPAGES);
+				dec_mm_counter_fast(mm, MM_FILEPAGES);
+				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
 		} else
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * discarded at swap_free().
 	 */
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 
-	inc_mm_counter(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, MM_FILEPAGES);
+			inc_mm_counter_fast(mm, MM_FILEPAGES);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	count_vm_event(PGFAULT);
 
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 