diff options
-rw-r--r-- | include/linux/mmzone.h | 13 | ||||
-rw-r--r-- | include/linux/vmstat.h | 22 | ||||
-rw-r--r-- | mm/mmzone.c | 21 | ||||
-rw-r--r-- | mm/page_alloc.c | 4 | ||||
-rw-r--r-- | mm/vmstat.c | 15 |
5 files changed, 72 insertions, 3 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6e6e62648a4d..3984c4eb41fd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -284,6 +284,13 @@ struct zone { | |||
284 | unsigned long watermark[NR_WMARK]; | 284 | unsigned long watermark[NR_WMARK]; |
285 | 285 | ||
286 | /* | 286 | /* |
287 | * When free pages are below this point, additional steps are taken | ||
288 | * when reading the number of free pages to avoid per-cpu counter | ||
289 | * drift allowing watermarks to be breached | ||
290 | */ | ||
291 | unsigned long percpu_drift_mark; | ||
292 | |||
293 | /* | ||
287 | * We don't know if the memory that we're going to allocate will be freeable | 294 | * We don't know if the memory that we're going to allocate will be freeable |
288 | * or/and it will be released eventually, so to avoid totally wasting several | 295 | * or/and it will be released eventually, so to avoid totally wasting several |
289 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk | 296 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk |
@@ -441,6 +448,12 @@ static inline int zone_is_oom_locked(const struct zone *zone) | |||
441 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | 448 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); |
442 | } | 449 | } |
443 | 450 | ||
451 | #ifdef CONFIG_SMP | ||
452 | unsigned long zone_nr_free_pages(struct zone *zone); | ||
453 | #else | ||
454 | #define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) | ||
455 | #endif /* CONFIG_SMP */ | ||
456 | |||
444 | /* | 457 | /* |
445 | * The "priority" of VM scanning is how much of the queues we will scan in one | 458 | * The "priority" of VM scanning is how much of the queues we will scan in one |
446 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the | 459 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7f43ccdc1d38..eaaea37b3b75 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -170,6 +170,28 @@ static inline unsigned long zone_page_state(struct zone *zone, | |||
170 | return x; | 170 | return x; |
171 | } | 171 | } |
172 | 172 | ||
173 | /* | ||
174 | * More accurate version that also considers the currently pending | ||
175 | * deltas. For that we need to loop over all cpus to find the current | ||
176 | * deltas. There is no synchronization so the result cannot be | ||
177 | * exactly accurate either. | ||
178 | */ | ||
179 | static inline unsigned long zone_page_state_snapshot(struct zone *zone, | ||
180 | enum zone_stat_item item) | ||
181 | { | ||
182 | long x = atomic_long_read(&zone->vm_stat[item]); | ||
183 | |||
184 | #ifdef CONFIG_SMP | ||
185 | int cpu; | ||
186 | for_each_online_cpu(cpu) | ||
187 | x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; | ||
188 | |||
189 | if (x < 0) | ||
190 | x = 0; | ||
191 | #endif | ||
192 | return x; | ||
193 | } | ||
194 | |||
173 | extern unsigned long global_reclaimable_pages(void); | 195 | extern unsigned long global_reclaimable_pages(void); |
174 | extern unsigned long zone_reclaimable_pages(struct zone *zone); | 196 | extern unsigned long zone_reclaimable_pages(struct zone *zone); |
175 | 197 | ||
diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..e35bfb82c855 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, | |||
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
90 | |||
91 | #ifdef CONFIG_SMP | ||
92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
94 | { | ||
95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
96 | |||
97 | /* | ||
98 | * While kswapd is awake, it is considered the zone is under some | ||
99 | * memory pressure. Under pressure, there is a risk that | ||
100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
101 | * potentially causing a live-lock. While kswapd is awake and | ||
102 | * free pages are low, get a better estimate for free pages | ||
103 | */ | ||
104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
107 | |||
108 | return nr_free_pages; | ||
109 | } | ||
110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 452e2ba06c7c..b2d21e06d45d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1462,7 +1462,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1462 | { | 1462 | { |
1463 | /* free_pages my go negative - that's OK */ | 1463 | /* free_pages my go negative - that's OK */ |
1464 | long min = mark; | 1464 | long min = mark; |
1465 | long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; | 1465 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; |
1466 | int o; | 1466 | int o; |
1467 | 1467 | ||
1468 | if (alloc_flags & ALLOC_HIGH) | 1468 | if (alloc_flags & ALLOC_HIGH) |
@@ -2424,7 +2424,7 @@ void show_free_areas(void) | |||
2424 | " all_unreclaimable? %s" | 2424 | " all_unreclaimable? %s" |
2425 | "\n", | 2425 | "\n", |
2426 | zone->name, | 2426 | zone->name, |
2427 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2427 | K(zone_nr_free_pages(zone)), |
2428 | K(min_wmark_pages(zone)), | 2428 | K(min_wmark_pages(zone)), |
2429 | K(low_wmark_pages(zone)), | 2429 | K(low_wmark_pages(zone)), |
2430 | K(high_wmark_pages(zone)), | 2430 | K(high_wmark_pages(zone)), |
diff --git a/mm/vmstat.c b/mm/vmstat.c index a8d6b59e609a..355a9e669aaa 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void) | |||
138 | int threshold; | 138 | int threshold; |
139 | 139 | ||
140 | for_each_populated_zone(zone) { | 140 | for_each_populated_zone(zone) { |
141 | unsigned long max_drift, tolerate_drift; | ||
142 | |||
141 | threshold = calculate_threshold(zone); | 143 | threshold = calculate_threshold(zone); |
142 | 144 | ||
143 | for_each_online_cpu(cpu) | 145 | for_each_online_cpu(cpu) |
144 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 146 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
145 | = threshold; | 147 | = threshold; |
148 | |||
149 | /* | ||
150 | * Only set percpu_drift_mark if there is a danger that | ||
151 | * NR_FREE_PAGES reports the low watermark is ok when in fact | ||
152 | * the min watermark could be breached by an allocation | ||
153 | */ | ||
154 | tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
155 | max_drift = num_online_cpus() * threshold; | ||
156 | if (max_drift > tolerate_drift) | ||
157 | zone->percpu_drift_mark = high_wmark_pages(zone) + | ||
158 | max_drift; | ||
146 | } | 159 | } |
147 | } | 160 | } |
148 | 161 | ||
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
813 | "\n scanned %lu" | 826 | "\n scanned %lu" |
814 | "\n spanned %lu" | 827 | "\n spanned %lu" |
815 | "\n present %lu", | 828 | "\n present %lu", |
816 | zone_page_state(zone, NR_FREE_PAGES), | 829 | zone_nr_free_pages(zone), |
817 | min_wmark_pages(zone), | 830 | min_wmark_pages(zone), |
818 | low_wmark_pages(zone), | 831 | low_wmark_pages(zone), |
819 | high_wmark_pages(zone), | 832 | high_wmark_pages(zone), |