about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/mmzone.h 1
-rw-r--r-- mm/vmstat.c 139
2 files changed, 120 insertions, 20 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 656b588a9f96..f45163c528e8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -77,6 +77,7 @@ struct per_cpu_pages {
77struct per_cpu_pageset { 77struct per_cpu_pageset {
78 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 78 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
79#ifdef CONFIG_SMP 79#ifdef CONFIG_SMP
80 s8 stat_threshold;
80 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; 81 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
81#endif 82#endif
82} ____cacheline_aligned_in_smp; 83} ____cacheline_aligned_in_smp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3799a0f7543a..c1b5f4106b38 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h>
15 16
16void __get_zone_counts(unsigned long *active, unsigned long *inactive, 17void __get_zone_counts(unsigned long *active, unsigned long *inactive,
17 unsigned long *free, struct pglist_data *pgdat) 18 unsigned long *free, struct pglist_data *pgdat)
@@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat);
114 115
115#ifdef CONFIG_SMP 116#ifdef CONFIG_SMP
116 117
117#define STAT_THRESHOLD 32 118static int calculate_threshold(struct zone *zone)
119{
120 int threshold;
121 int mem; /* memory in 128 MB units */
122
123 /*
124 * The threshold scales with the number of processors and the amount
125 * of memory per zone. More memory means that we can defer updates for
126 * longer, more processors could lead to more contention.
127 * fls() is used to have a cheap way of logarithmic scaling.
128 *
129 * Some sample thresholds:
130 *
131 * Threshold Processors (fls) Zonesize 1+fls(mem)
132 * ------------------------------------------------------------------
133 * 8 1 1 0.9-1 GB 4
134 * 16 2 2 0.9-1 GB 4
135 * 20 2 2 1-2 GB 5
136 * 24 2 2 2-4 GB 6
137 * 28 2 2 4-8 GB 7
138 * 32 2 2 8-16 GB 8
139 * 4 2 2 <128M 1
140 * 30 4 3 1-2 GB 5
141 * 48 4 3 8-16 GB 8
142 * 40 8 4 1-2 GB 5
143 * 32 8 4 0.9-1GB 4
144 * 10 16 5 <128M 1
145 * 40 16 5 900M 4
146 * 70 64 7 1-2 GB 5
147 * 84 64 7 2-4 GB 6
148 * 108 256 9 2-4 GB 6
149 * 125 1024 11 8-16 GB 8
150 * 125 1024 11 16-32 GB 9
151 */
152
153 mem = zone->present_pages >> (27 - PAGE_SHIFT);
154
155 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
156
157 /*
158 * Maximum threshold is 125
159 */
160 threshold = min(125, threshold);
161
162 return threshold;
163}
118 164
119/* 165/*
120 * Determine pointer to currently valid differential byte given a zone and 166 * Refresh the thresholds for each zone.
121 * the item number.
122 *
123 * Preemption must be off
124 */ 167 */
125static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) 168static void refresh_zone_stat_thresholds(void)
126{ 169{
127 return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; 170 struct zone *zone;
171 int cpu;
172 int threshold;
173
174 for_each_zone(zone) {
175
176 if (!zone->present_pages)
177 continue;
178
179 threshold = calculate_threshold(zone);
180
181 for_each_online_cpu(cpu)
182 zone_pcp(zone, cpu)->stat_threshold = threshold;
183 }
128} 184}
129 185
130/* 186/*
@@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
133void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 189void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
134 int delta) 190 int delta)
135{ 191{
136 s8 *p; 192 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
193 s8 *p = pcp->vm_stat_diff + item;
137 long x; 194 long x;
138 195
139 p = diff_pointer(zone, item);
140 x = delta + *p; 196 x = delta + *p;
141 197
142 if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { 198 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
143 zone_page_state_add(x, zone, item); 199 zone_page_state_add(x, zone, item);
144 x = 0; 200 x = 0;
145 } 201 }
146
147 *p = x; 202 *p = x;
148} 203}
149EXPORT_SYMBOL(__mod_zone_page_state); 204EXPORT_SYMBOL(__mod_zone_page_state);
@@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state);
172 * No overflow check is necessary and therefore the differential can be 227 * No overflow check is necessary and therefore the differential can be
173 * incremented or decremented in place which may allow the compilers to 228 * incremented or decremented in place which may allow the compilers to
174 * generate better code. 229 * generate better code.
175 *
176 * The increment or decrement is known and therefore one boundary check can 230 * The increment or decrement is known and therefore one boundary check can
177 * be omitted. 231 * be omitted.
178 * 232 *
233 * NOTE: These functions are very performance sensitive. Change only
234 * with care.
235 *
179 * Some processors have inc/dec instructions that are atomic vs an interrupt. 236 * Some processors have inc/dec instructions that are atomic vs an interrupt.
180 * However, the code must first determine the differential location in a zone 237 * However, the code must first determine the differential location in a zone
181 * based on the processor number and then inc/dec the counter. There is no 238 * based on the processor number and then inc/dec the counter. There is no
@@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state);
185 */ 242 */
186static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 243static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
187{ 244{
188 s8 *p = diff_pointer(zone, item); 245 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
246 s8 *p = pcp->vm_stat_diff + item;
189 247
190 (*p)++; 248 (*p)++;
191 249
192 if (unlikely(*p > STAT_THRESHOLD)) { 250 if (unlikely(*p > pcp->stat_threshold)) {
193 zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); 251 int overstep = pcp->stat_threshold / 2;
194 *p = -STAT_THRESHOLD / 2; 252
253 zone_page_state_add(*p + overstep, zone, item);
254 *p = -overstep;
195 } 255 }
196} 256}
197 257
@@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state);
204void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 264void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
205{ 265{
206 struct zone *zone = page_zone(page); 266 struct zone *zone = page_zone(page);
207 s8 *p = diff_pointer(zone, item); 267 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
268 s8 *p = pcp->vm_stat_diff + item;
208 269
209 (*p)--; 270 (*p)--;
210 271
211 if (unlikely(*p < -STAT_THRESHOLD)) { 272 if (unlikely(*p < - pcp->stat_threshold)) {
212 zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); 273 int overstep = pcp->stat_threshold / 2;
213 *p = STAT_THRESHOLD /2; 274
275 zone_page_state_add(*p - overstep, zone, item);
276 *p = overstep;
214 } 277 }
215} 278}
216EXPORT_SYMBOL(__dec_zone_page_state); 279EXPORT_SYMBOL(__dec_zone_page_state);
@@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
515 pageset->pcp[j].high, 578 pageset->pcp[j].high,
516 pageset->pcp[j].batch); 579 pageset->pcp[j].batch);
517 } 580 }
581#ifdef CONFIG_SMP
582 seq_printf(m, "\n vm stats threshold: %d",
583 pageset->stat_threshold);
584#endif
518 } 585 }
519 seq_printf(m, 586 seq_printf(m,
520 "\n all_unreclaimable: %u" 587 "\n all_unreclaimable: %u"
@@ -603,3 +670,35 @@ struct seq_operations vmstat_op = {
603 670
604#endif /* CONFIG_PROC_FS */ 671#endif /* CONFIG_PROC_FS */
605 672
673#ifdef CONFIG_SMP
674/*
675 * Use the cpu notifier to ensure that the thresholds are recalculated
676 * when necessary.
677 */
678static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
679 unsigned long action,
680 void *hcpu)
681{
682 switch (action) {
683 case CPU_UP_PREPARE:
684 case CPU_UP_CANCELED:
685 case CPU_DEAD:
686 refresh_zone_stat_thresholds();
687 break;
688 default:
689 break;
690 }
691 return NOTIFY_OK;
692}
693
694static struct notifier_block __cpuinitdata vmstat_notifier =
695 { &vmstat_cpuup_callback, NULL, 0 };
696
697int __init setup_vmstat(void)
698{
699 refresh_zone_stat_thresholds();
700 register_cpu_notifier(&vmstat_notifier);
701 return 0;
702}
703module_init(setup_vmstat)
704#endif