Diffstat (limited to 'mm/vmstat.c')

 -rw-r--r--   mm/vmstat.c   202
 1 file changed, 162 insertions(+), 40 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17ee1c7..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+        int threshold;
+        int watermark_distance;
+
+        /*
+         * As vmstats are not up to date, there is drift between the estimated
+         * and real values. For high thresholds and a high number of CPUs, it
+         * is possible for the min watermark to be breached while the estimated
+         * value looks fine. The pressure threshold is a reduced value such
+         * that even the maximum amount of drift will not accidentally breach
+         * the min watermark
+         */
+        watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+        threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+        /*
+         * Maximum threshold is 125
+         */
+        threshold = min(125, threshold);
+
+        return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
 {
         int threshold;
         int mem;        /* memory in 128 MB units */
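
The new calculate_pressure_threshold() sizes the per-CPU threshold from the slack between the low and min watermarks rather than from zone size: with each of N online CPUs allowed to batch up to threshold events locally, the estimated counter can drift from the real value by up to N * threshold pages, so dividing the watermark gap by the CPU count keeps the worst-case drift from silently breaching the min watermark. A minimal userspace sketch of the same arithmetic, with hypothetical watermark values and CPU count standing in for the struct zone accessors:

    #include <stdio.h>

    /* Hypothetical stand-ins for low_wmark_pages(), min_wmark_pages()
     * and num_online_cpus(); the real values come from struct zone. */
    #define LOW_WMARK_PAGES  1536
    #define MIN_WMARK_PAGES  1024
    #define ONLINE_CPUS      16

    static int pressure_threshold(void)
    {
        int watermark_distance = LOW_WMARK_PAGES - MIN_WMARK_PAGES;
        int threshold = watermark_distance / ONLINE_CPUS;

        if (threshold < 1)      /* max(1, ...) in the patch */
            threshold = 1;
        if (threshold > 125)    /* min(125, ...) in the patch */
            threshold = 125;
        return threshold;
    }

    int main(void)
    {
        /* 512 pages of slack across 16 CPUs -> threshold of 32,
         * bounding worst-case drift to 16 * 32 = 512 pages. */
        printf("pressure threshold: %d\n", pressure_threshold());
        return 0;
    }
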
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
         for_each_populated_zone(zone) {
                 unsigned long max_drift, tolerate_drift;
 
-                threshold = calculate_threshold(zone);
+                threshold = calculate_normal_threshold(zone);
 
                 for_each_online_cpu(cpu)
                         per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
         }
 }
 
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+                                int (*calculate_pressure)(struct zone *))
+{
+        struct zone *zone;
+        int cpu;
+        int threshold;
+        int i;
+
+        for (i = 0; i < pgdat->nr_zones; i++) {
+                zone = &pgdat->node_zones[i];
+                if (!zone->percpu_drift_mark)
+                        continue;
+
+                threshold = (*calculate_pressure)(zone);
+                for_each_possible_cpu(cpu)
+                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+                                                        = threshold;
+        }
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                 int delta)
 {
-        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
-        s8 *p = pcp->vm_stat_diff + item;
+        struct per_cpu_pageset __percpu *pcp = zone->pageset;
+        s8 __percpu *p = pcp->vm_stat_diff + item;
         long x;
+        long t;
+
+        x = delta + __this_cpu_read(*p);
 
-        x = delta + *p;
+        t = __this_cpu_read(pcp->stat_threshold);
 
-        if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+        if (unlikely(x > t || x < -t)) {
                 zone_page_state_add(x, zone, item);
                 x = 0;
         }
-        *p = x;
+        __this_cpu_write(*p, x);
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-                                        int delta)
-{
-        unsigned long flags;
-
-        local_irq_save(flags);
-        __mod_zone_page_state(zone, item, delta);
-        local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
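
set_pgdat_percpu_threshold() lets a caller switch every zone of a node to a tighter threshold (and back) by passing calculate_pressure_threshold or calculate_normal_threshold as the callback. The __mod_zone_page_state() rewrite in the same hunk drops the explicit this_cpu_ptr() dereference in favour of __this_cpu_read()/__this_cpu_write(), which avoid computing a per-CPU address separately. A rough userspace analogue of the batch-then-fold pattern, with thread-local variables standing in for per-CPU data and a C11 atomic for the shared zone counter; the kernel's interrupt-safety rules have no equivalent here, so this only models the batching:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long zone_counter;            /* stands in for the zone counter */
    static _Thread_local long vm_stat_diff;     /* stands in for the per-CPU diff */
    static _Thread_local long stat_threshold = 32;

    static void mod_counter(long delta)
    {
        long x = vm_stat_diff + delta;

        if (x > stat_threshold || x < -stat_threshold) {
            /* Fold the batched delta into the shared counter. */
            atomic_fetch_add(&zone_counter, x);
            x = 0;
        }
        vm_stat_diff = x;
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            mod_counter(1);
        /* Folds at 33, 66 and 99 events: prints zone=99 diff=1 */
        printf("zone=%ld diff=%ld\n",
               (long)atomic_load(&zone_counter), vm_stat_diff);
        return 0;
    }
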
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-        s8 *p = pcp->vm_stat_diff + item;
+        struct per_cpu_pageset __percpu *pcp = zone->pageset;
+        s8 __percpu *p = pcp->vm_stat_diff + item;
+        s8 v, t;
 
-        (*p)++;
+        v = __this_cpu_inc_return(*p);
+        t = __this_cpu_read(pcp->stat_threshold);
+        if (unlikely(v > t)) {
+                s8 overstep = t >> 1;
 
-        if (unlikely(*p > pcp->stat_threshold)) {
-                int overstep = pcp->stat_threshold / 2;
-
-                zone_page_state_add(*p + overstep, zone, item);
-                *p = -overstep;
+                zone_page_state_add(v + overstep, zone, item);
+                __this_cpu_write(*p, -overstep);
         }
 }
 
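
The overstep in __inc_zone_state() is what keeps a counter hovering at the threshold from hitting the slow path on every increment: once v exceeds t, the code folds v + t/2 into the zone counter and parks the per-CPU diff at -t/2, so with t = 32 the first fold costs 33 increments and every later fold another 49. A small standalone simulation of that batching (a sketch, not kernel code), whose final zone + p always re-adds to the true count:

    #include <stdio.h>

    int main(void)
    {
        long zone = 0;          /* shared zone counter */
        int p = 0;              /* per-CPU diff, as in vm_stat_diff */
        const int t = 32;       /* stat_threshold */
        int folds = 0;

        for (int i = 0; i < 1000; i++) {
            if (++p > t) {
                zone += p + t / 2;      /* fold with overstep */
                p = -(t / 2);
                folds++;
            }
        }
        /* Prints: zone=980 p=20 folds=20 true=1000 */
        printf("zone=%ld p=%d folds=%d true=%ld\n",
               zone, p, folds, zone + p);
        return 0;
    }
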
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-        struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-        s8 *p = pcp->vm_stat_diff + item;
-
-        (*p)--;
+        struct per_cpu_pageset __percpu *pcp = zone->pageset;
+        s8 __percpu *p = pcp->vm_stat_diff + item;
+        s8 v, t;
 
-        if (unlikely(*p < - pcp->stat_threshold)) {
-                int overstep = pcp->stat_threshold / 2;
+        v = __this_cpu_dec_return(*p);
+        t = __this_cpu_read(pcp->stat_threshold);
+        if (unlikely(v < - t)) {
+                s8 overstep = t >> 1;
 
-                zone_page_state_add(*p - overstep, zone, item);
-                *p = overstep;
+                zone_page_state_add(v - overstep, zone, item);
+                __this_cpu_write(*p, overstep);
         }
 }
 
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ *        0       No overstepping
+ *        1       Overstepping half of threshold
+ *        -1      Overstepping minus half of threshold
+ */
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+        struct per_cpu_pageset __percpu *pcp = zone->pageset;
+        s8 __percpu *p = pcp->vm_stat_diff + item;
+        long o, n, t, z;
+
+        do {
+                z = 0;  /* overflow to zone counters */
+
+                /*
+                 * The fetching of the stat_threshold is racy. We may apply
+                 * a counter threshold to the wrong cpu if we get
+                 * rescheduled while executing here. However, the following
+                 * will apply the threshold again and therefore bring the
+                 * counter under the threshold.
+                 */
+                t = this_cpu_read(pcp->stat_threshold);
+
+                o = this_cpu_read(*p);
+                n = delta + o;
+
+                if (n > t || n < -t) {
+                        int os = overstep_mode * (t >> 1);
+
+                        /* Overflow must be added to zone counters */
+                        z = n + os;
+                        n = -os;
+                }
+        } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+        if (z)
+                zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+                                        int delta)
+{
+        mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+        mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+        mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+        mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+                                        int delta)
+{
+        unsigned long flags;
+
+        local_irq_save(flags);
+        __mod_zone_page_state(zone, item, delta);
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
         unsigned long flags;
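
The CONFIG_CMPXCHG_LOCAL path above removes the local_irq_save()/local_irq_restore() pair entirely: mod_state() recomputes the new per-CPU diff and retries this_cpu_cmpxchg() until the swap happens atomically, touching the global counter only afterwards. A userspace sketch of the same retry shape using a C11 compare-exchange; in the kernel the race being closed is with interrupts and preemption on the same CPU, which this single-threaded analogue can only illustrate:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long zone_counter;
    static _Thread_local _Atomic long vm_stat_diff;   /* per-CPU stand-in */
    static _Thread_local long stat_threshold = 32;

    /* overstep_mode: 0 none, 1 plus t/2, -1 minus t/2, as in mod_state() */
    static void mod_state_sketch(long delta, int overstep_mode)
    {
        long o, n, t, z;

        do {
            z = 0;              /* overflow to the zone counter */
            t = stat_threshold;
            o = atomic_load(&vm_stat_diff);
            n = o + delta;
            if (n > t || n < -t) {
                long os = overstep_mode * (t >> 1);

                z = n + os;     /* fold, with overstep */
                n = -os;
            }
        } while (!atomic_compare_exchange_strong(&vm_stat_diff, &o, n));

        if (z)
            atomic_fetch_add(&zone_counter, z);
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            mod_state_sketch(1, 1);     /* like inc_zone_state() */
        /* Prints: zone=98 diff=2 */
        printf("zone=%ld diff=%ld\n",
               (long)atomic_load(&zone_counter),
               (long)atomic_load(&vm_stat_diff));
        return 0;
    }
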
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif
 
 /*
  * Update the zone counters for one cpu.
@@ -759,6 +880,7 @@ static const char * const vmstat_text[] = {
         "numa_local",
         "numa_other",
 #endif
+        "nr_anon_transparent_hugepages",
         "nr_dirty_threshold",
         "nr_dirty_background_threshold",
 
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                    "\n scanned %lu"
                    "\n spanned %lu"
                    "\n present %lu",
-                   zone_nr_free_pages(zone),
+                   zone_page_state(zone, NR_FREE_PAGES),
                    min_wmark_pages(zone),
                    low_wmark_pages(zone),
                    high_wmark_pages(zone),
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                 break;
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
-                cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+                cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
                 per_cpu(vmstat_work, cpu).work.func = NULL;
                 break;
         case CPU_DOWN_FAILED: