aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Christoph Lameter <cl@linux.com> 2010-09-09 19:38:17 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-09-09 21:57:25 -0400
commit: aa45484031ddee09b06350ab8528bfe5b2c76d1c (patch)
tree: 6758072232db9a54453022ec3e6cede35d52001c
parent: 72853e2991a2702ae93aaf889ac7db743a415dd3 (diff)
mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is low and kswapd is awake
Ordinarily, watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On systems with many CPUs, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than the number of real free pages in the buddy allocator, the VM can allocate pages below the min watermark, at worst reducing the real number of free pages to zero. Even if the OOM killer kills some victim to free memory, it may not free memory if the exit path requires a new page, resulting in livelock. This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces, but it is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake.
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/mmzone.h 13
-rw-r--r-- include/linux/vmstat.h 22
-rw-r--r-- mm/mmzone.c 21
-rw-r--r-- mm/page_alloc.c 4
-rw-r--r-- mm/vmstat.c 15
5 files changed, 72 insertions, 3 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6e6e62648a4d..3984c4eb41fd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -284,6 +284,13 @@ struct zone {
284 unsigned long watermark[NR_WMARK]; 284 unsigned long watermark[NR_WMARK];
285 285
286 /* 286 /*
287 * When free pages are below this point, additional steps are taken
288 * when reading the number of free pages to avoid per-cpu counter
289 * drift allowing watermarks to be breached
290 */
291 unsigned long percpu_drift_mark;
292
293 /*
287 * We don't know if the memory that we're going to allocate will be freeable 294 * We don't know if the memory that we're going to allocate will be freeable
288 * or/and it will be released eventually, so to avoid totally wasting several 295 * or/and it will be released eventually, so to avoid totally wasting several
289 * GB of ram we must reserve some of the lower zone memory (otherwise we risk 296 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone)
441 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 448 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
442} 449}
443 450
451#ifdef CONFIG_SMP
452unsigned long zone_nr_free_pages(struct zone *zone);
453#else
454#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
455#endif /* CONFIG_SMP */
456
444/* 457/*
445 * The "priority" of VM scanning is how much of the queues we will scan in one 458 * The "priority" of VM scanning is how much of the queues we will scan in one
446 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 459 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 7f43ccdc1d38..eaaea37b3b75 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone,
170 return x; 170 return x;
171} 171}
172 172
173/*
174 * More accurate version that also considers the currently pending
175 * deltas. For that we need to loop over all cpus to find the current
176 * deltas. There is no synchronization so the result cannot be
177 * exactly accurate either.
178 */
179static inline unsigned long zone_page_state_snapshot(struct zone *zone,
180 enum zone_stat_item item)
181{
182 long x = atomic_long_read(&zone->vm_stat[item]);
183
184#ifdef CONFIG_SMP
185 int cpu;
186 for_each_online_cpu(cpu)
187 x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
188
189 if (x < 0)
190 x = 0;
191#endif
192 return x;
193}
194
173extern unsigned long global_reclaimable_pages(void); 195extern unsigned long global_reclaimable_pages(void);
174extern unsigned long zone_reclaimable_pages(struct zone *zone); 196extern unsigned long zone_reclaimable_pages(struct zone *zone);
175 197
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..e35bfb82c855 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452e2ba06c7c..b2d21e06d45d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1462,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1462{ 1462{
1463 /* free_pages my go negative - that's OK */ 1463 /* free_pages my go negative - that's OK */
1464 long min = mark; 1464 long min = mark;
1465 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1466 int o;
1467 1467
1468 if (alloc_flags & ALLOC_HIGH) 1468 if (alloc_flags & ALLOC_HIGH)
@@ -2424,7 +2424,7 @@ void show_free_areas(void)
2424 " all_unreclaimable? %s" 2424 " all_unreclaimable? %s"
2425 "\n", 2425 "\n",
2426 zone->name, 2426 zone->name,
2427 K(zone_page_state(zone, NR_FREE_PAGES)), 2427 K(zone_nr_free_pages(zone)),
2428 K(min_wmark_pages(zone)), 2428 K(min_wmark_pages(zone)),
2429 K(low_wmark_pages(zone)), 2429 K(low_wmark_pages(zone)),
2430 K(high_wmark_pages(zone)), 2430 K(high_wmark_pages(zone)),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a8d6b59e609a..355a9e669aaa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
138 int threshold; 138 int threshold;
139 139
140 for_each_populated_zone(zone) { 140 for_each_populated_zone(zone) {
141 unsigned long max_drift, tolerate_drift;
142
141 threshold = calculate_threshold(zone); 143 threshold = calculate_threshold(zone);
142 144
143 for_each_online_cpu(cpu) 145 for_each_online_cpu(cpu)
144 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 146 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
145 = threshold; 147 = threshold;
148
149 /*
150 * Only set percpu_drift_mark if there is a danger that
151 * NR_FREE_PAGES reports the low watermark is ok when in fact
152 * the min watermark could be breached by an allocation
153 */
154 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
155 max_drift = num_online_cpus() * threshold;
156 if (max_drift > tolerate_drift)
157 zone->percpu_drift_mark = high_wmark_pages(zone) +
158 max_drift;
146 } 159 }
147} 160}
148 161
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
813 "\n scanned %lu" 826 "\n scanned %lu"
814 "\n spanned %lu" 827 "\n spanned %lu"
815 "\n present %lu", 828 "\n present %lu",
816 zone_page_state(zone, NR_FREE_PAGES), 829 zone_nr_free_pages(zone),
817 min_wmark_pages(zone), 830 min_wmark_pages(zone),
818 low_wmark_pages(zone), 831 low_wmark_pages(zone),
819 high_wmark_pages(zone), 832 high_wmark_pages(zone),