Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--  mm/vmstat.c  202
1 file changed, 162 insertions(+), 40 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17ee1c7..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
 {
 	int threshold;
 	int mem;	/* memory in 128 MB units */
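
The pressure threshold added above exists so that per-cpu counter drift can never hide a breached min watermark: with the threshold clamped to (low - min) / num_online_cpus(), the total error across all CPUs stays within the low-to-min gap. A small stand-alone sketch of that arithmetic; the watermark gap and CPU count are made-up example numbers, while the clamp to [1, 125] mirrors the patch:

#include <stdio.h>

/* Stand-alone illustration of the calculate_pressure_threshold() arithmetic;
 * watermark_distance and online_cpus are invented example values. */
int main(void)
{
	int watermark_distance = 1024;	/* low_wmark_pages() - min_wmark_pages() */
	int online_cpus = 8;
	int threshold = watermark_distance / online_cpus;

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)
		threshold = 125;	/* same cap as in the patch */

	/* even with every CPU's diff at its limit, the summed drift cannot
	 * exceed the low-to-min watermark gap */
	printf("threshold=%d, worst-case drift=%d, gap=%d\n",
	       threshold, threshold * online_cpus, watermark_distance);
	return 0;
}
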
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
 	for_each_populated_zone(zone) {
 		unsigned long max_drift, tolerate_drift;
 
-		threshold = calculate_threshold(zone);
+		threshold = calculate_normal_threshold(zone);
 
 		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *))
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = (*calculate_pressure)(zone);
+		for_each_possible_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
 	long x;
+	long t;
+
+	x = delta + __this_cpu_read(*p);
 
-	x = delta + *p;
+	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+	if (unlikely(x > t || x < -t)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-	*p = x;
+	__this_cpu_write(*p, x);
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-					int delta)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_zone_page_state(zone, item, delta);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
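
set_pgdat_percpu_threshold() above installs, on every possible CPU, the threshold returned by the supplied callback for each zone that has a percpu_drift_mark set. Its callers live outside this file, so the following is only a hypothetical caller sketch: the wrapper name and the under_pressure flag are invented, while set_pgdat_percpu_threshold(), calculate_pressure_threshold() and calculate_normal_threshold() come from the patch.

/* Hypothetical caller sketch, not part of this diff. */
static void example_set_reclaim_thresholds(pg_data_t *pgdat, bool under_pressure)
{
	if (under_pressure)
		/* tighter thresholds: drift can no longer mask a breached min watermark */
		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	else
		/* cheaper, coarser thresholds for normal operation */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
}
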
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;
 
-	(*p)++;
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
 
-	if (unlikely(*p > pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
-
-		zone_page_state_add(*p + overstep, zone, item);
-		*p = -overstep;
+		zone_page_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
 	}
 }
 
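
When the per-cpu delta in __inc_zone_state() crosses the threshold, the delta plus half a threshold is folded into the zone counter and the per-cpu delta restarts at minus half a threshold, so a monotonic stream of increments touches the shared counter only about once every one and a half thresholds. A user-space simulation of that folding; the threshold and iteration count are arbitrary example values:

#include <stdio.h>

/* Simulation of the overstep folding used by __inc_zone_state(). */
int main(void)
{
	long global = 0;
	signed char diff = 0;		/* plays the role of the s8 per-cpu delta */
	const signed char t = 32;	/* example stat_threshold */
	int global_updates = 0;

	for (int i = 0; i < 1000; i++) {
		if (++diff > t) {
			signed char overstep = t >> 1;

			global += diff + overstep;	/* fold into the shared counter */
			diff = -overstep;		/* leave headroom for more increments */
			global_updates++;
		}
	}
	/* global + diff always equals the number of increments applied */
	printf("total=%ld pending=%d shared-counter updates=%d\n",
	       global + diff, (int)diff, global_updates);
	return 0;
}
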
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
-
-	(*p)--;
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;
 
-	if (unlikely(*p < - pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
+	v = __this_cpu_dec_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v < - t)) {
+		s8 overstep = t >> 1;
 
-		zone_page_state_add(*p - overstep, zone, item);
-		*p = overstep;
+		zone_page_state_add(v - overstep, zone, item);
+		__this_cpu_write(*p, overstep);
 	}
 }
 
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ *        0       No overstepping
+ *        1       Overstepping half of threshold
+ *        -1      Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong the cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1) ;
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif
 
 /*
  * Update the zone counters for one cpu.
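
mod_state() above avoids disabling interrupts by recomputing the new per-cpu delta and retrying with this_cpu_cmpxchg() until the store lands unopposed; as its comment notes, the threshold read is racy, but a later update will apply the threshold again and bring the counter back under it. A user-space analogue of that retry loop, with a C11 compare-and-swap standing in for this_cpu_cmpxchg(); the mod_state_like() name, the threshold and the delta values are illustrative only:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long vm_diff;	/* stand-in for the per-cpu vm_stat_diff slot */
static _Atomic long zone_count;	/* stand-in for the global zone counter */

/* Analogue of mod_state(): fold into the global counter only when the local
 * delta leaves [-t, t], retrying if another update raced with ours. */
static void mod_state_like(long delta, long t, int overstep_mode)
{
	long o, n, z;

	do {
		z = 0;			/* overflow destined for the global counter */
		o = atomic_load(&vm_diff);
		n = delta + o;
		if (n > t || n < -t) {
			long os = overstep_mode * (t >> 1);

			z = n + os;	/* overflow must go to the global counter */
			n = -os;
		}
	} while (!atomic_compare_exchange_weak(&vm_diff, &o, n));

	if (z)
		atomic_fetch_add(&zone_count, z);
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_state_like(1, 32, 1);	/* behaves like an increment with overstep */

	printf("global=%ld pending=%ld\n",
	       atomic_load(&zone_count), atomic_load(&vm_diff));
	return 0;
}
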
@@ -759,6 +880,7 @@ static const char * const vmstat_text[] = {
759 "numa_local", 880 "numa_local",
760 "numa_other", 881 "numa_other",
761#endif 882#endif
883 "nr_anon_transparent_hugepages",
762 "nr_dirty_threshold", 884 "nr_dirty_threshold",
763 "nr_dirty_background_threshold", 885 "nr_dirty_background_threshold",
764 886
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
834 "\n scanned %lu" 956 "\n scanned %lu"
835 "\n spanned %lu" 957 "\n spanned %lu"
836 "\n present %lu", 958 "\n present %lu",
837 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
838 min_wmark_pages(zone), 960 min_wmark_pages(zone),
839 low_wmark_pages(zone), 961 low_wmark_pages(zone),
840 high_wmark_pages(zone), 962 high_wmark_pages(zone),
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
 		per_cpu(vmstat_work, cpu).work.func = NULL;
 		break;
 	case CPU_DOWN_FAILED: