aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmstat.c
diff options
context:
space:
mode:
authorMel Gorman <mel@csn.ul.ie>2011-01-13 18:45:41 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 20:32:31 -0500
commit88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 (patch)
tree6f39beef8cf918eb2ca9f64ae1bcd1ea79ca487a /mm/vmstat.c
parent43bb40c9e3aa51a3b038c9df2c9afb4d4685614d (diff)
mm: page allocator: adjust the per-cpu counter threshold when memory is low
Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when a threshold is above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortately, as reported by Shaohua Li this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps but the event is not expected to be frequent - in Shaohua's test case, there was one recorded sleep and wake event at least. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reported-by: Shaohua Li <shaohua.li@intel.com> Reviewed-by: Christoph Lameter <cl@linux.com> Tested-by: Nicolas Bareil <nico@chdir.org> Cc: David Rientjes <rientjes@google.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: <stable@kernel.org> [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--mm/vmstat.c68
1 files changed, 67 insertions, 1 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 312d728976f1..bc0f095791b4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
86static int calculate_threshold(struct zone *zone) 110static int calculate_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
@@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
189{
190 struct zone *zone;
191 int cpu;
192 int threshold;
193 int i;
194
195 get_online_cpus();
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = calculate_pressure_threshold(zone);
202 for_each_online_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206 put_online_cpus();
207}
208
209void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
210{
211 struct zone *zone;
212 int cpu;
213 int threshold;
214 int i;
215
216 get_online_cpus();
217 for (i = 0; i < pgdat->nr_zones; i++) {
218 zone = &pgdat->node_zones[i];
219 if (!zone->percpu_drift_mark)
220 continue;
221
222 threshold = calculate_threshold(zone);
223 for_each_online_cpu(cpu)
224 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
225 = threshold;
226 }
227 put_online_cpus();
228}
229
164/* 230/*
165 * For use when we know that interrupts are disabled. 231 * For use when we know that interrupts are disabled.
166 */ 232 */
@@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
911 "\n scanned %lu" 977 "\n scanned %lu"
912 "\n spanned %lu" 978 "\n spanned %lu"
913 "\n present %lu", 979 "\n present %lu",
914 zone_nr_free_pages(zone), 980 zone_page_state(zone, NR_FREE_PAGES),
915 min_wmark_pages(zone), 981 min_wmark_pages(zone),
916 low_wmark_pages(zone), 982 low_wmark_pages(zone),
917 high_wmark_pages(zone), 983 high_wmark_pages(zone),