author     Mel Gorman <mel@csn.ul.ie>                      2010-05-24 17:32:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2010-05-25 11:06:59 -0400
commit     748446bb6b5a9390b546af38ec899c868a9dbcf0
tree       4c27d0805a5e094b39ff938ad60dd270b953a79f
parent     c175a0ce7584e5b498fff8cbdb9aa7912aa9fbba
mm: compaction: memory compaction core
This patch is the core of a mechanism which compacts memory in a zone by
relocating movable pages towards the end of the zone.

A single compaction run involves a migration scanner and a free scanner.
Both scanners operate on pageblock-sized areas in the zone.  The migration
scanner starts at the bottom of the zone and searches for all movable pages
within each area, isolating them onto a private list called migratelist.
The free scanner starts at the top of the zone and searches for suitable
areas, consuming the free pages within them and making them available to
the migration scanner.  The pages isolated for migration are then migrated
to the newly isolated free pages.

[aarcange@redhat.com: Fix unsafe optimisation]
[mel@csn.ul.ie: do not schedule work on other CPUs for compaction]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
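To make the run described above easier to picture before diving into the diff, here is a minimal user-space sketch of how the two scanners converge. Everything in it (the block predicates, the printed messages, NR_BLOCKS) is invented for illustration and is not kernel code; the actual mechanism is compact_zone(), isolate_migratepages() and isolate_freepages() in the new mm/compaction.c below.

/*
 * Illustrative user-space model of a compaction run -- NOT kernel code.
 * A "zone" is modelled as an array of pageblocks; the migrate scanner
 * walks up from the start, the free scanner walks down from the end,
 * and the run completes when the two meet.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_BLOCKS 16

static bool block_has_movable_pages(int block) { return block % 3 == 0; }
static bool block_has_free_pages(int block)    { return block % 2 == 0; }

int main(void)
{
	int migrate_scan = 0;		/* plays the role of migrate_pfn */
	int free_scan = NR_BLOCKS - 1;	/* plays the role of free_pfn */

	/* The run completes when the two scanners meet */
	while (migrate_scan < free_scan) {
		/* Migration scanner: isolate movable pages low in the zone */
		if (block_has_movable_pages(migrate_scan))
			printf("isolate movable pages from block %d\n",
			       migrate_scan);
		migrate_scan++;

		/* Free scanner: find free pages high in the zone as targets */
		while (free_scan > migrate_scan &&
		       !block_has_free_pages(free_scan))
			free_scan--;
		if (free_scan > migrate_scan)
			printf("migrate to free pages in block %d\n",
			       free_scan);
	}
	puts("scanners met: compaction run complete");
	return 0;
}

The real implementation additionally throttles on too_many_isolated() and hands the isolated lists to migrate_pages() via the compaction_alloc() callback, as shown in the diff below.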
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |   1
-rw-r--r--  mm/compaction.c | 393
-rw-r--r--  mm/migrate.c    |  11
-rw-r--r--  mm/page_alloc.c |  45
-rw-r--r--  mm/vmstat.c     |   7
5 files changed, 456 insertions(+), 1 deletion(-)
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a43..8982504bd03b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_COMPACTION) += compaction.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..be1ff3f7552b
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,393 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+	struct list_head freepages;	/* List of free pages to migrate to */
+	struct list_head migratepages;	/* List of pages being migrated */
+	unsigned long nr_freepages;	/* Number of isolated free pages */
+	unsigned long nr_migratepages;	/* Number of pages to migrate */
+	unsigned long free_pfn;		/* isolate_freepages search base */
+	unsigned long migrate_pfn;	/* isolate_migratepages search base */
+
+	/* Account for isolated anon and file pages */
+	unsigned long nr_anon;
+	unsigned long nr_file;
+
+	struct zone *zone;
+};
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+	struct page *page, *next;
+	unsigned long count = 0;
+
+	list_for_each_entry_safe(page, next, freelist, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+		count++;
+	}
+
+	return count;
+}
+
+/* Isolate free pages onto a private freelist. Must hold zone->lock */
+static unsigned long isolate_freepages_block(struct zone *zone,
+				unsigned long blockpfn,
+				struct list_head *freelist)
+{
+	unsigned long zone_end_pfn, end_pfn;
+	int total_isolated = 0;
+	struct page *cursor;
+
+	/* Get the last PFN we should scan for free pages at */
+	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
+
+	/* Find the first usable PFN in the block to initialise page cursor */
+	for (; blockpfn < end_pfn; blockpfn++) {
+		if (pfn_valid_within(blockpfn))
+			break;
+	}
+	cursor = pfn_to_page(blockpfn);
+
+	/* Isolate free pages. This assumes the block is valid */
+	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+		int isolated, i;
+		struct page *page = cursor;
+
+		if (!pfn_valid_within(blockpfn))
+			continue;
+
+		if (!PageBuddy(page))
+			continue;
+
+		/* Found a free page, break it into order-0 pages */
+		isolated = split_free_page(page);
+		total_isolated += isolated;
+		for (i = 0; i < isolated; i++) {
+			list_add(&page->lru, freelist);
+			page++;
+		}
+
+		/* If a page was split, advance to the end of it */
+		if (isolated) {
+			blockpfn += isolated - 1;
+			cursor += isolated - 1;
+		}
+	}
+
+	return total_isolated;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+
+	int migratetype = get_pageblock_migratetype(page);
+
+	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+		return false;
+
+	/* If the page is a large free page, then allow migration */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* If the block is MIGRATE_MOVABLE, allow migration */
+	if (migratetype == MIGRATE_MOVABLE)
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct zone *zone,
+				struct compact_control *cc)
+{
+	struct page *page;
+	unsigned long high_pfn, low_pfn, pfn;
+	unsigned long flags;
+	int nr_freepages = cc->nr_freepages;
+	struct list_head *freelist = &cc->freepages;
+
+	pfn = cc->free_pfn;
+	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+	high_pfn = low_pfn;
+
+	/*
+	 * Isolate free pages until enough are available to migrate the
+	 * pages on cc->migratepages. We stop searching if the migrate
+	 * and free page scanners meet or enough free pages are isolated.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+					pfn -= pageblock_nr_pages) {
+		unsigned long isolated;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		/*
+		 * Check for overlapping nodes/zones. It's possible on some
+		 * configurations to have a setup like
+		 * node0 node1 node0
+		 * i.e. it's possible that all pages within a zone's range of
+		 * pages do not belong to a single zone.
+		 */
+		page = pfn_to_page(pfn);
+		if (page_zone(page) != zone)
+			continue;
+
+		/* Check the block is suitable for migration */
+		if (!suitable_migration_target(page))
+			continue;
+
+		/* Found a block suitable for isolating free pages from */
+		isolated = isolate_freepages_block(zone, pfn, freelist);
+		nr_freepages += isolated;
+
+		/*
+		 * Record the highest PFN we isolated pages from. When next
+		 * looking for free pages, the search will restart here as
+		 * page migration may have returned some pages to the allocator
+		 */
+		if (isolated)
+			high_pfn = max(high_pfn, pfn);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	/* split_free_page does not map the pages */
+	list_for_each_entry(page, freelist, lru) {
+		arch_alloc_page(page, 0);
+		kernel_map_pages(page, 1, 1);
+	}
+
+	cc->free_pfn = high_pfn;
+	cc->nr_freepages = nr_freepages;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
+{
+	struct page *page;
+	unsigned int count[NR_LRU_LISTS] = { 0, };
+
+	list_for_each_entry(page, &cc->migratepages, lru) {
+		int lru = page_lru_base_type(page);
+		count[lru]++;
+	}
+
+	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+
+	unsigned long inactive, isolated;
+
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+					zone_page_state(zone, NR_INACTIVE_ANON);
+	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+					zone_page_state(zone, NR_ISOLATED_ANON);
+
+	return isolated > inactive;
+}
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ */
+static unsigned long isolate_migratepages(struct zone *zone,
+					struct compact_control *cc)
+{
+	unsigned long low_pfn, end_pfn;
+	struct list_head *migratelist = &cc->migratepages;
+
+	/* Do not scan outside zone boundaries */
+	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+	/* Only scan within a pageblock boundary */
+	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+
+	/* Do not cross the free scanner or scan within a memory hole */
+	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+		cc->migrate_pfn = end_pfn;
+		return 0;
+	}
+
+	/*
+	 * Ensure that there are not too many pages isolated from the LRU
+	 * list by either parallel reclaimers or compaction. If there are,
+	 * delay for some time until fewer pages are isolated
+	 */
+	while (unlikely(too_many_isolated(zone))) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+		if (fatal_signal_pending(current))
+			return 0;
+	}
+
+	/* Time to isolate some pages for migration */
+	spin_lock_irq(&zone->lru_lock);
+	for (; low_pfn < end_pfn; low_pfn++) {
+		struct page *page;
+		if (!pfn_valid_within(low_pfn))
+			continue;
+
+		/* Get the page and skip if free */
+		page = pfn_to_page(low_pfn);
+		if (PageBuddy(page))
+			continue;
+
+		/* Try to isolate the page */
+		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+			continue;
+
+		/* Successfully isolated */
+		del_page_from_lru_list(zone, page, page_lru(page));
+		list_add(&page->lru, migratelist);
+		mem_cgroup_del_lru(page);
+		cc->nr_migratepages++;
+
+		/* Avoid isolating too much */
+		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+			break;
+	}
+
+	acct_isolated(zone, cc);
+
+	spin_unlock_irq(&zone->lru_lock);
+	cc->migrate_pfn = low_pfn;
+
+	return cc->nr_migratepages;
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+					unsigned long data,
+					int **result)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+	struct page *freepage;
+
+	/* Isolate free pages if necessary */
+	if (list_empty(&cc->freepages)) {
+		isolate_freepages(cc->zone, cc);
+
+		if (list_empty(&cc->freepages))
+			return NULL;
+	}
+
+	freepage = list_entry(cc->freepages.next, struct page, lru);
+	list_del(&freepage->lru);
+	cc->nr_freepages--;
+
+	return freepage;
+}
+
+/*
+ * We cannot control nr_migratepages and nr_freepages fully when migration is
+ * running as migrate_pages() has no knowledge of compact_control. When
+ * migration is complete, we count the number of pages on the lists by hand.
+ */
+static void update_nr_listpages(struct compact_control *cc)
+{
+	int nr_migratepages = 0;
+	int nr_freepages = 0;
+	struct page *page;
+
+	list_for_each_entry(page, &cc->migratepages, lru)
+		nr_migratepages++;
+	list_for_each_entry(page, &cc->freepages, lru)
+		nr_freepages++;
+
+	cc->nr_migratepages = nr_migratepages;
+	cc->nr_freepages = nr_freepages;
+}
+
+static int compact_finished(struct zone *zone,
+				struct compact_control *cc)
+{
+	if (fatal_signal_pending(current))
+		return COMPACT_PARTIAL;
+
+	/* Compaction run completes if the migrate and free scanner meet */
+	if (cc->free_pfn <= cc->migrate_pfn)
+		return COMPACT_COMPLETE;
+
+	return COMPACT_CONTINUE;
+}
+
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{
+	int ret;
+
+	/* Setup to move all movable pages to the end of the zone */
+	cc->migrate_pfn = zone->zone_start_pfn;
+	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
+	cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+	migrate_prep_local();
+
+	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+		unsigned long nr_migrate, nr_remaining;
+
+		if (!isolate_migratepages(zone, cc))
+			continue;
+
+		nr_migrate = cc->nr_migratepages;
+		migrate_pages(&cc->migratepages, compaction_alloc,
+					(unsigned long)cc, 0);
+		update_nr_listpages(cc);
+		nr_remaining = cc->nr_migratepages;
+
+		count_vm_event(COMPACTBLOCKS);
+		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
+		if (nr_remaining)
+			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+
+		/* Release LRU pages not migrated */
+		if (!list_empty(&cc->migratepages)) {
+			putback_lru_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
+		}
+
+	}
+
+	/* Release free pages and check accounting */
+	cc->nr_freepages -= release_freepages(&cc->freepages);
+	VM_BUG_ON(cc->nr_freepages != 0);
+
+	return ret;
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 4afd6fe3c074..09e2471afa0f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,7 +40,8 @@
 
 /*
  * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page().
+ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
+ * undesirable, use migrate_prep_local()
  */
 int migrate_prep(void)
 {
@@ -55,6 +56,14 @@ int migrate_prep(void)
 	return 0;
 }
 
+/* Do the necessary work of migrate_prep but not if it involves other CPUs */
+int migrate_prep_local(void)
+{
+	lru_add_drain();
+
+	return 0;
+}
+
 /*
  * Add isolated pages on the list back to the LRU under page lock
  * to avoid leaking evictable pages back onto unevictable list.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cefe6fe8d991..c54376a09f30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1208,6 +1208,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_pages() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 23a5899c7461..c6aacf51b554 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -766,6 +766,13 @@ static const char * const vmstat_text[] = {
766 "allocstall", 766 "allocstall",
767 767
768 "pgrotated", 768 "pgrotated",
769
770#ifdef CONFIG_COMPACTION
771 "compact_blocks_moved",
772 "compact_pages_moved",
773 "compact_pagemigrate_failed",
774#endif
775
769#ifdef CONFIG_HUGETLB_PAGE 776#ifdef CONFIG_HUGETLB_PAGE
770 "htlb_buddy_alloc_success", 777 "htlb_buddy_alloc_success",
771 "htlb_buddy_alloc_fail", 778 "htlb_buddy_alloc_fail",