mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is low and kswapd is awake

Ordinarily watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On systems with a large number of CPUs, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than the number of pages that are really free in the buddy allocator, the VM can allocate pages below the min watermark, at worst reducing the real number of free pages to zero. Even if the OOM killer kills a victim to free memory, it may not free any memory if the exit path requires a new page, resulting in livelock.

This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces but is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Christoph Lameter <cl@linux.com> 2010-09-09 19:38:17 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-09-09 21:57:25 -0400
commit: aa45484031ddee09b06350ab8528bfe5b2c76d1c (patch)
tree: 6758072232db9a54453022ec3e6cede35d52001c /include
parent: 72853e2991a2702ae93aaf889ac7db743a415dd3 (diff)
2 files changed, 35 insertions, 0 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6e6e62648a4d..3984c4eb41fd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -284,6 +284,13 @@ struct zone {
        unsigned long watermark[NR_WMARK];
        /*
+         * When free pages are below this point, additional steps are taken
+         * when reading the number of free pages to avoid per-cpu counter
+         * drift allowing watermarks to be breached
+         */
+        unsigned long percpu_drift_mark;
+        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
         * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone)
        return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
+#ifdef CONFIG_SMP
+unsigned long zone_nr_free_pages(struct zone *zone);
+#else
+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+#endif /* CONFIG_SMP */
 /*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 7f43ccdc1d38..eaaea37b3b75 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone,
        return x;
 }
+/*
+ * More accurate version that also considers the currently pending
+ * deltas. For that we need to loop over all cpus to find the current
+ * deltas. There is no synchronization so the result cannot be
+ * exactly accurate either.
+ */
+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+                                        enum zone_stat_item item)
+{
+        long x = atomic_long_read(&zone->vm_stat[item]);
+#ifdef CONFIG_SMP
+        int cpu;
+        for_each_online_cpu(cpu)
+                x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+        if (x < 0)
+                x = 0;
+#endif
+        return x;
+}
 extern unsigned long global_reclaimable_pages(void);
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
author	Christoph Lameter <cl@linux.com>	2010-09-09 19:38:17 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-09-09 21:57:25 -0400
commit	aa45484031ddee09b06350ab8528bfe5b2c76d1c (patch)
tree	6758072232db9a54453022ec3e6cede35d52001c /include
parent	72853e2991a2702ae93aaf889ac7db743a415dd3 (diff)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6e6e62648a4d..3984c4eb41fd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h
@@ -284,6 +284,13 @@ struct zone {
284	unsigned long watermark[NR_WMARK];	284	unsigned long watermark[NR_WMARK];
285		285
286	/*	286	/*
		287	* When free pages are below this point, additional steps are taken
		288	* when reading the number of free pages to avoid per-cpu counter
		289	* drift allowing watermarks to be breached
		290	*/
		291	unsigned long percpu_drift_mark;
		292
		293	/*
287	* We don't know if the memory that we're going to allocate will be freeable	294	* We don't know if the memory that we're going to allocate will be freeable
288	* or/and it will be released eventually, so to avoid totally wasting several	295	* or/and it will be released eventually, so to avoid totally wasting several
289	* GB of ram we must reserve some of the lower zone memory (otherwise we risk	296	* GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone)
441	return test_bit(ZONE_OOM_LOCKED, &zone->flags);	448	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
442	}	449	}
443		450
		451	#ifdef CONFIG_SMP
		452	unsigned long zone_nr_free_pages(struct zone *zone);
		453	#else
		454	#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
		455	#endif /* CONFIG_SMP */
		456
444	/*	457	/*
445	* The "priority" of VM scanning is how much of the queues we will scan in one	458	* The "priority" of VM scanning is how much of the queues we will scan in one
446	* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the	459	* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the


diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7f43ccdc1d38..eaaea37b3b75 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone,
170	return x;	170	return x;
171	}	171	}
172		172
		173	/*
		174	* More accurate version that also considers the currently pending
		175	* deltas. For that we need to loop over all cpus to find the current
		176	* deltas. There is no synchronization so the result cannot be
		177	* exactly accurate either.
		178	*/
		179	static inline unsigned long zone_page_state_snapshot(struct zone *zone,
		180	enum zone_stat_item item)
		181	{
		182	long x = atomic_long_read(&zone->vm_stat[item]);
		183
		184	#ifdef CONFIG_SMP
		185	int cpu;
		186	for_each_online_cpu(cpu)
		187	x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
		188
		189	if (x < 0)
		190	x = 0;
		191	#endif
		192	return x;
		193	}
		194
173	extern unsigned long global_reclaimable_pages(void);	195	extern unsigned long global_reclaimable_pages(void);
174	extern unsigned long zone_reclaimable_pages(struct zone *zone);	196	extern unsigned long zone_reclaimable_pages(struct zone *zone);
175		197