 include/linux/mmzone.h | 10
 include/linux/vmstat.h |  5
 mm/mmzone.c            | 21
 mm/page_alloc.c        | 35
 mm/vmscan.c            | 23
 mm/vmstat.c            | 68
 6 files changed, 115 insertions(+), 47 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..48906629335c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -661,7 +655,9 @@ typedef struct pglist_data {
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 enum memmap_context {
 	MEMMAP_EARLY,
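The split into zone_watermark_ok() and zone_watermark_ok_safe() lets each caller choose how much accuracy it pays for. A minimal caller sketch, not part of this patch (try_this_zone() and keep_reclaiming() are hypothetical helpers): the allocator fast path keeps the cheap per-cpu estimate of NR_FREE_PAGES, while drift-sensitive decisions use the _safe variant.

	/* Fast path: the per-cpu estimate of NR_FREE_PAGES is good enough. */
	if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
				classzone_idx, alloc_flags))
		return try_this_zone(zone);		/* hypothetical helper */

	/* Drift-sensitive decision (e.g. "may kswapd go back to sleep?"):
	 * pay for an accurate reading instead. */
	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0))
		keep_reclaiming(zone);			/* hypothetical helper */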
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..e4cc21cf5870 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
 #else /* CONFIG_SMP */
 
 /*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
 
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-	/*
-	 * While kswapd is awake, it is considered the zone is under some
-	 * memory pressure. Under pressure, there is a risk that
-	 * per-cpu-counter-drift will allow the min watermark to be breached
-	 * potentially causing a live-lock. While kswapd is awake and
-	 * free pages are low, get a better estimate for free pages
-	 */
-	if (nr_free_pages < zone->percpu_drift_mark &&
-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-	return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
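The comment removed above describes the underlying problem this series tackles: the global NR_FREE_PAGES counter can lag reality by up to one stat_threshold per online CPU, because each CPU batches its updates. As a rough worked bound (125 is the cap calculate_threshold() already uses in mm/vmstat.c, not a value introduced here; the CPU count is illustrative):

	max drift per zone = stat_threshold * num_online_cpus()
	                   = 125 * 16              (16-CPU machine)
	                   = 2000 pages, about 8MB with 4KB pages

On a small zone that much phantom free memory is enough to let the min watermark be breached while the estimate still looks healthy, which is the live-lock risk the deleted helper worked around and the rest of this patch addresses by tightening the thresholds while kswapd is awake.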
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..22a1bb7723e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
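zone_watermark_ok_safe() only falls back to zone_page_state_snapshot() when the cheap reading is already below zone->percpu_drift_mark. The snapshot helper is not part of this patch (it exists in include/linux/vmstat.h from earlier in this series); roughly, it folds every CPU's unflushed delta back into the global counter, which is why it is accurate but too costly for the allocator fast path. A sketch of its shape, for reference:

	static inline unsigned long zone_page_state_snapshot(struct zone *zone,
						enum zone_stat_item item)
	{
		long x = atomic_long_read(&zone->vm_stat[item]);
	#ifdef CONFIG_SMP
		int cpu;

		/* Fold in deltas each CPU has accumulated but not yet flushed */
		for_each_online_cpu(cpu)
			x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];

		if (x < 0)
			x = 0;
	#endif
		return x;
	}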
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2461 " all_unreclaimable? %s"
2443 "\n", 2462 "\n",
2444 zone->name, 2463 zone->name,
2445 K(zone_nr_free_pages(zone)), 2464 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2465 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2466 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2467 K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..5da4295e7d67 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
 								0, 0))
 			return 1;
 	}
@@ -2230,7 +2230,7 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
@@ -2276,7 +2276,7 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
@@ -2297,7 +2297,7 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
 				/*
@@ -2305,7 +2305,7 @@ loop_again:
 				 * means that we have a GFP_ATOMIC allocation
 				 * failure risk. Hurry up!
 				 */
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 						min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
 			} else {
@@ -2448,7 +2448,9 @@ static int kswapd(void *p)
 			 */
 			if (!sleeping_prematurely(pgdat, order, remaining)) {
 				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+				restore_pgdat_percpu_threshold(pgdat);
 				schedule();
+				reduce_pgdat_percpu_threshold(pgdat);
 			} else {
 				if (remaining)
 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order)
 	if (!populated_zone(zone))
 		return;
 
-	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
+	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
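The reordering in wakeup_kswapd() is deliberate: the tests now run roughly from cheapest to most expensive, so the watermark check, which in the _safe form may sum per-cpu deltas, is only reached when kswapd is actually asleep and might need waking. A sketch of the resulting order, as an annotation rather than code from the patch:

	/* wakeup_kswapd() check order after this patch:
	 *  1. populated_zone(zone)                  - trivial
	 *  2. cpuset_zone_allowed_hardwall()        - cheap
	 *  3. waitqueue_active(&pgdat->kswapd_wait) - cheap; bail out if kswapd
	 *                                             is already awake
	 *  4. zone_watermark_ok_safe()              - may take the per-cpu snapshot
	 * The tracepoint also moves below the checks, so it fires only when a
	 * wakeup is actually issued.
	 */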
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..bc0f095791b4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
+static int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
 static int calculate_threshold(struct zone *zone)
 {
 	int threshold;
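A worked example of the pressure threshold, using made-up but representative numbers: for a zone with a min watermark of 1024 pages and a low watermark of 1280 pages on a 16-CPU machine,

	watermark_distance = 1280 - 1024      = 256 pages
	threshold          = max(1, 256 / 16) = 16
	threshold          = min(125, 16)     = 16

The worst-case drift while this threshold is in effect is then 16 * 16 CPUs = 256 pages, exactly the low-to-min gap, so per-cpu counter drift alone can no longer carry the zone below the min watermark while the estimate still reads at or above the low watermark.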
@@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_pressure_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
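Both helpers walk every zone of the node and every online CPU under get_online_cpus(), so they are slow-path operations by design; the only caller added by this patch is kswapd, which switches thresholds once per sleep/wake transition (see the mm/vmscan.c hunk above). A minimal sketch of the intended pairing, with a hypothetical reclaim helper standing in for kswapd's balancing loop:

	reduce_pgdat_percpu_threshold(pgdat);	/* tight thresholds while reclaiming */
	balance_node(pgdat);			/* hypothetical: do the reclaim work */
	restore_pgdat_percpu_threshold(pgdat);	/* cheap, batched counters while idle */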
@@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
911 "\n scanned %lu" 977 "\n scanned %lu"
912 "\n spanned %lu" 978 "\n spanned %lu"
913 "\n present %lu", 979 "\n present %lu",
914 zone_nr_free_pages(zone), 980 zone_page_state(zone, NR_FREE_PAGES),
915 min_wmark_pages(zone), 981 min_wmark_pages(zone),
916 low_wmark_pages(zone), 982 low_wmark_pages(zone),
917 high_wmark_pages(zone), 983 high_wmark_pages(zone),