 include/linux/mmzone.h |  10 +++-------
 include/linux/vmstat.h |   5 +++++
 mm/mmzone.c            |  21 ---------------------
 mm/page_alloc.c        |  35 +++++++++++++++++++++++++++--------
 mm/vmscan.c            |  23 +++++++++++++----------
 mm/vmstat.c            |  68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 115 insertions(+), 47 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..48906629335c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -661,7 +655,9 @@ typedef struct pglist_data {
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 enum memmap_context {
 	MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..e4cc21cf5870 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
 #else /* CONFIG_SMP */
 
 /*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
 
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-	/*
-	 * While kswapd is awake, it is considered the zone is under some
-	 * memory pressure. Under pressure, there is a risk that
-	 * per-cpu-counter-drift will allow the min watermark to be breached
-	 * potentially causing a live-lock. While kswapd is awake and
-	 * free pages are low, get a better estimate for free pages
-	 */
-	if (nr_free_pages < zone->percpu_drift_mark &&
-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-	return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
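For context on the helper being deleted here: its accuracy came from zone_page_state_snapshot(), which bypasses the cached global counter and folds in every CPU's unflushed delta. A simplified sketch of that reader, reconstructed from this era of the vmstat code rather than copied from the tree being patched, looks like this:

	/*
	 * Simplified sketch of zone_page_state_snapshot() -- illustrative
	 * only, not verbatim from the kernel being patched.
	 */
	static unsigned long zone_page_state_snapshot(struct zone *zone,
						      enum zone_stat_item item)
	{
		/* Start from the global counter, which may lag reality... */
		long x = atomic_long_read(&zone->vm_stat[item]);
	#ifdef CONFIG_SMP
		int cpu;

		/* ...and fold in each CPU's not-yet-flushed delta. */
		for_each_online_cpu(cpu)
			x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];

		if (x < 0)
			x = 0;
	#endif
		return x;
	}

The walk over every online CPU is what makes the snapshot expensive, and it is why the rest of this patch arranges to take it only when free pages are near the drift mark.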
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..22a1bb7723e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
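The split above leaves the order-by-order accounting unchanged, so it is worth running once with concrete numbers. The userspace toy below re-implements the loop with invented buddy-list counts and marks; it deliberately omits lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER discounts, and every name and figure in it is hypothetical:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_ORDER 11

	/* Userspace toy of __zone_watermark_ok(); invented numbers throughout. */
	static bool toy_watermark_ok(long free_pages, long mark, int order,
				     const long nr_free[MAX_ORDER])
	{
		long min = mark;
		int o;

		free_pages -= (1 << order) + 1;	/* as written in the patch */
		if (free_pages <= min)
			return false;
		for (o = 0; o < order; o++) {
			/* pages of order o cannot satisfy an order-'order' request */
			free_pages -= nr_free[o] << o;
			/* demand half as much slack at each higher order */
			min >>= 1;
			if (free_pages <= min)
				return false;
		}
		return true;
	}

	int main(void)
	{
		/* invented buddy counts: 400 order-0, 100 order-1, 20 order-2 */
		const long nr_free[MAX_ORDER] = { 400, 100, 20 };

		/* 1000 free pages, mark 256, order-2 request:
		 * 995 > 256, then 595 > 128, then 395 > 64 -> ok */
		printf("order-2 ok? %d\n", toy_watermark_ok(1000, 256, 2, nr_free));

		/* same zone, order-3: 991 > 256, 591 > 128, 391 > 64,
		 * then 391 - (20 << 2) = 311 > 32 -> still ok */
		printf("order-3 ok? %d\n", toy_watermark_ok(1000, 256, 3, nr_free));
		return 0;
	}

The value of the refactor is that this arithmetic now runs against whichever free-page figure the caller can afford: the cheap cached counter in zone_watermark_ok(), or the drift-corrected snapshot in zone_watermark_ok_safe().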
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..5da4295e7d67 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
 								0, 0))
 			return 1;
 	}
@@ -2230,7 +2230,7 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 						high_wmark_pages(zone), 0, 0)) {
 					end_zone = i;
 					break;
@@ -2276,7 +2276,7 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
@@ -2297,7 +2297,7 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
 				/*
@@ -2305,7 +2305,7 @@ loop_again:
 				 * means that we have a GFP_ATOMIC allocation
 				 * failure risk. Hurry up!
 				 */
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 						min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
 			} else {
@@ -2448,7 +2448,9 @@ static int kswapd(void *p)
 				 */
 				if (!sleeping_prematurely(pgdat, order, remaining)) {
 					trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+					restore_pgdat_percpu_threshold(pgdat);
 					schedule();
+					reduce_pgdat_percpu_threshold(pgdat);
 				} else {
 					if (remaining)
 						count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
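The pairing added around schedule() is the heart of the patch: the per-cpu stat thresholds stay small for as long as kswapd is awake, so NR_FREE_PAGES remains trustworthy while the zone is under pressure, and the large, cheap thresholds come back only for the duration of the sleep. A back-of-envelope toy (machine size and the pressure value are invented; the 125 cap comes from calculate_threshold() below) shows what is being bounded:

	#include <stdio.h>

	/*
	 * Each CPU may sit on up to 'threshold' uncounted pages before folding
	 * them into the global NR_FREE_PAGES, so the global counter can be
	 * stale by as much as cpus * threshold. Numbers invented.
	 */
	int main(void)
	{
		int cpus = 16;			/* invented machine size */
		int normal_threshold = 125;	/* cap used by calculate_threshold() */
		int pressure_threshold = 16;	/* plausible pressure-formula result */

		printf("worst-case drift while kswapd sleeps: %d pages\n",
		       cpus * normal_threshold);	/* 2000 */
		printf("worst-case drift while kswapd works: %d pages\n",
		       cpus * pressure_threshold);	/* 256 */
		return 0;
	}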
@@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order)
 	if (!populated_zone(zone))
 		return;
 
-	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
+	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..bc0f095791b4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
+static int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
 static int calculate_threshold(struct zone *zone)
 {
 	int threshold;
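To make the formula concrete: the pressure threshold is the largest per-cpu batch size whose worst-case combined drift still fits inside the gap between the low and min watermarks. A runnable rerun of the arithmetic with invented zone numbers:

	#include <stdio.h>

	/* Rerun of calculate_pressure_threshold() with invented inputs. */
	int main(void)
	{
		long low_wmark = 1256, min_wmark = 1000;	/* invented watermarks */
		int online_cpus = 16;				/* invented CPU count */
		int watermark_distance = (int)(low_wmark - min_wmark);	/* 256 pages */
		int threshold = watermark_distance / online_cpus;	/* 16 */

		if (threshold < 1)
			threshold = 1;		/* max(1, ...) */
		if (threshold > 125)
			threshold = 125;	/* min(125, ...) */

		/*
		 * Worst-case drift is online_cpus * threshold = 256 pages,
		 * exactly the low-to-min gap, so stale per-cpu counters alone
		 * cannot push the zone through the min watermark.
		 */
		printf("pressure threshold = %d pages per cpu\n", threshold);
		return 0;
	}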
@@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_pressure_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
 /*
  * For use when we know that interrupts are disabled.
 */
@@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
-		   zone_nr_free_pages(zone),
+		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),