author		Mel Gorman <mel@csn.ul.ie>	2011-01-13 18:45:41 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-13 20:32:31 -0500
commit		88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 (patch)
tree		6f39beef8cf918eb2ca9f64ae1bcd1ea79ca487a
parent		43bb40c9e3aa51a3b038c9df2c9afb4d4685614d (diff)
mm: page allocator: adjust the per-cpu counter threshold when memory is low
Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is
low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid
synchronization overhead, these counters are maintained on a per-cpu basis and
drained both periodically and when the per-cpu delta exceeds a threshold. On
large CPU systems, the difference between the estimate and the real value of
NR_FREE_PAGES can be very high. The system can get into a state where pages
are allocated far below the min watermark, potentially causing livelock
issues. That commit solved the problem by taking a better reading of
NR_FREE_PAGES when memory was low.

Unfortunately, as reported by Shaohua Li, this accurate reading can consume a
large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. For large machines where
counter drift might be unsafe and while kswapd is awake, the per-cpu
thresholds for the target pgdat are reduced to limit the level of drift to
what should be a safe level. This incurs a performance penalty under heavy
memory pressure by a factor that depends on the workload and the machine, but
the machine should function correctly without accidentally exhausting all
memory on a node. There is an additional cost when kswapd wakes and sleeps,
but the event is not expected to be frequent - in Shaohua's test case, there
was at least one recorded sleep and wake event.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely() and when deciding if a zone is really balanced
or not in balance_pgdat(). We are still using an expensive function, but
limiting how often it is called.

When the test case is reproduced, the time spent in the watermark functions is
reduced. The following report is on the percentage of time cumulatively spent
in the functions zone_nr_free_pages(), zone_watermark_ok(),
__zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot()
and zone_page_state():

vanilla                     11.6615%
disable-threshold            0.2584%

David said:

: We had to pull aa454840 "mm: page allocator: calculate a better estimate
: of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36
: internally because tests showed that it would cause the machine to stall
: as the result of heavy kswapd activity.  I merged it back with this fix as
: it is pending in the -mm tree and it solves the issue we were seeing, so I
: definitely think this should be pushed to -stable (and I would seriously
: consider it for 2.6.37 inclusion even at this late date).

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Tested-by: Nicolas Bareil <nico@chdir.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: <stable@kernel.org>		[2.6.37.1, 2.6.36.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/mmzone.h	10
-rw-r--r--	include/linux/vmstat.h	5
-rw-r--r--	mm/mmzone.c	21
-rw-r--r--	mm/page_alloc.c	35
-rw-r--r--	mm/vmscan.c	23
-rw-r--r--	mm/vmstat.c	68
6 files changed, 115 insertions(+), 47 deletions(-)
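Editor's note (not part of the patch): to make the counter-drift arithmetic concrete, the standalone sketch below reuses the same calculation as calculate_pressure_threshold() in the mm/vmstat.c hunk further down. The idea is that each CPU may hold up to stat_threshold uncommitted pages per counter, so the global NR_FREE_PAGES estimate can be off by roughly nr_cpus * threshold pages; the reduced "pressure" threshold is sized so that this worst-case drift fits inside the low-to-min watermark gap. The CPU count and watermark values used here are hypothetical, chosen only for illustration.

/*
 * Illustrative sketch, not kernel code. Mirrors the arithmetic of
 * calculate_pressure_threshold() from this patch; all numbers in main()
 * are hypothetical.
 */
#include <stdio.h>

#define MAX_THRESHOLD 125	/* upper bound used by the vmstat threshold code */

static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
{
	long distance = low_wmark - min_wmark;
	int threshold = (int)(distance / online_cpus);

	if (threshold < 1)
		threshold = 1;
	if (threshold > MAX_THRESHOLD)
		threshold = MAX_THRESHOLD;
	return threshold;
}

int main(void)
{
	int cpus = 64;		/* hypothetical large machine */
	long min_wmark = 11240;	/* hypothetical zone watermarks, in pages */
	long low_wmark = 14050;

	/*
	 * Each CPU can accumulate up to 'threshold' pages of uncommitted
	 * delta per counter, so the NR_FREE_PAGES estimate may drift by
	 * about cpus * threshold pages in the worst case.
	 */
	printf("worst-case drift at default threshold: %d pages\n",
	       cpus * MAX_THRESHOLD);

	int t = pressure_threshold(low_wmark, min_wmark, cpus);
	printf("reduced threshold under pressure: %d (max drift %d pages)\n",
	       t, cpus * t);
	return 0;
}

With these illustrative values the default drift (64 * 125 = 8000 pages) would dwarf the low-to-min gap, while the reduced threshold keeps the worst case below it, which is why the patch only pays the snapshot cost when free pages fall under percpu_drift_mark.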
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24ebe9cfd..48906629335c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -661,7 +655,9 @@ typedef struct pglist_data {
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 enum memmap_context {
 	MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37b3b75..e4cc21cf5870 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
 #else /* CONFIG_SMP */
 
 /*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
 
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-	/*
-	 * While kswapd is awake, it is considered the zone is under some
-	 * memory pressure. Under pressure, there is a risk that
-	 * per-cpu-counter-drift will allow the min watermark to be breached
-	 * potentially causing a live-lock. While kswapd is awake and
-	 * free pages are low, get a better estimate for free pages
-	 */
-	if (nr_free_pages < zone->percpu_drift_mark &&
-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-	return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..22a1bb7723e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) - 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..5da4295e7d67 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
 								0, 0))
 			return 1;
 	}
@@ -2230,7 +2230,7 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
@@ -2276,7 +2276,7 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
@@ -2297,7 +2297,7 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
 				/*
@@ -2305,7 +2305,7 @@ loop_again:
 				 * means that we have a GFP_ATOMIC allocation
 				 * failure risk. Hurry up!
 				 */
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 					    min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
 			} else {
@@ -2448,7 +2448,9 @@ static int kswapd(void *p)
 		 */
 		if (!sleeping_prematurely(pgdat, order, remaining)) {
 			trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+			restore_pgdat_percpu_threshold(pgdat);
 			schedule();
+			reduce_pgdat_percpu_threshold(pgdat);
 		} else {
 			if (remaining)
 				count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order)
 	if (!populated_zone(zone))
 		return;
 
-	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
+	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..bc0f095791b4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
+static int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
 static int calculate_threshold(struct zone *zone)
 {
 	int threshold;
@@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_pressure_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
@@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
-		   zone_nr_free_pages(zone),
+		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),