aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/mmzone.h13
-rw-r--r--include/linux/vmstat.h22
-rw-r--r--mm/mmzone.c21
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/vmstat.c15
5 files changed, 72 insertions, 3 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6e6e62648a4d..3984c4eb41fd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -284,6 +284,13 @@ struct zone {
284 unsigned long watermark[NR_WMARK]; 284 unsigned long watermark[NR_WMARK];
285 285
286 /* 286 /*
287 * When free pages are below this point, additional steps are taken
288 * when reading the number of free pages to avoid per-cpu counter
289 * drift allowing watermarks to be breached
290 */
291 unsigned long percpu_drift_mark;
292
293 /*
287 * We don't know if the memory that we're going to allocate will be freeable 294 * We don't know if the memory that we're going to allocate will be freeable
288 * or/and it will be released eventually, so to avoid totally wasting several 295 * or/and it will be released eventually, so to avoid totally wasting several
289 * GB of ram we must reserve some of the lower zone memory (otherwise we risk 296 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone)
441 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 448 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
442} 449}
443 450
451#ifdef CONFIG_SMP
452unsigned long zone_nr_free_pages(struct zone *zone);
453#else
454#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
455#endif /* CONFIG_SMP */
456
444/* 457/*
445 * The "priority" of VM scanning is how much of the queues we will scan in one 458 * The "priority" of VM scanning is how much of the queues we will scan in one
446 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 459 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 7f43ccdc1d38..eaaea37b3b75 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone,
170 return x; 170 return x;
171} 171}
172 172
173/*
174 * More accurate version that also considers the currently pending
175 * deltas. For that we need to loop over all cpus to find the current
176 * deltas. There is no synchronization so the result cannot be
177 * exactly accurate either.
178 */
179static inline unsigned long zone_page_state_snapshot(struct zone *zone,
180 enum zone_stat_item item)
181{
182 long x = atomic_long_read(&zone->vm_stat[item]);
183
184#ifdef CONFIG_SMP
185 int cpu;
186 for_each_online_cpu(cpu)
187 x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
188
189 if (x < 0)
190 x = 0;
191#endif
192 return x;
193}
194
173extern unsigned long global_reclaimable_pages(void); 195extern unsigned long global_reclaimable_pages(void);
174extern unsigned long zone_reclaimable_pages(struct zone *zone); 196extern unsigned long zone_reclaimable_pages(struct zone *zone);
175 197
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..e35bfb82c855 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452e2ba06c7c..b2d21e06d45d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1462,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1462{ 1462{
1463 /* free_pages my go negative - that's OK */ 1463 /* free_pages my go negative - that's OK */
1464 long min = mark; 1464 long min = mark;
1465 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1466 int o;
1467 1467
1468 if (alloc_flags & ALLOC_HIGH) 1468 if (alloc_flags & ALLOC_HIGH)
@@ -2424,7 +2424,7 @@ void show_free_areas(void)
2424 " all_unreclaimable? %s" 2424 " all_unreclaimable? %s"
2425 "\n", 2425 "\n",
2426 zone->name, 2426 zone->name,
2427 K(zone_page_state(zone, NR_FREE_PAGES)), 2427 K(zone_nr_free_pages(zone)),
2428 K(min_wmark_pages(zone)), 2428 K(min_wmark_pages(zone)),
2429 K(low_wmark_pages(zone)), 2429 K(low_wmark_pages(zone)),
2430 K(high_wmark_pages(zone)), 2430 K(high_wmark_pages(zone)),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a8d6b59e609a..355a9e669aaa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
138 int threshold; 138 int threshold;
139 139
140 for_each_populated_zone(zone) { 140 for_each_populated_zone(zone) {
141 unsigned long max_drift, tolerate_drift;
142
141 threshold = calculate_threshold(zone); 143 threshold = calculate_threshold(zone);
142 144
143 for_each_online_cpu(cpu) 145 for_each_online_cpu(cpu)
144 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 146 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
145 = threshold; 147 = threshold;
148
149 /*
150 * Only set percpu_drift_mark if there is a danger that
151 * NR_FREE_PAGES reports the low watermark is ok when in fact
152 * the min watermark could be breached by an allocation
153 */
154 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
155 max_drift = num_online_cpus() * threshold;
156 if (max_drift > tolerate_drift)
157 zone->percpu_drift_mark = high_wmark_pages(zone) +
158 max_drift;
146 } 159 }
147} 160}
148 161
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
813 "\n scanned %lu" 826 "\n scanned %lu"
814 "\n spanned %lu" 827 "\n spanned %lu"
815 "\n present %lu", 828 "\n present %lu",
816 zone_page_state(zone, NR_FREE_PAGES), 829 zone_nr_free_pages(zone),
817 min_wmark_pages(zone), 830 min_wmark_pages(zone),
818 low_wmark_pages(zone), 831 low_wmark_pages(zone),
819 high_wmark_pages(zone), 832 high_wmark_pages(zone),