author     Mel Gorman <mel@csn.ul.ie>  2011-01-13 18:45:56 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-01-13 20:32:33 -0500
commit     3e7d344970673c5334cf7b5bb27c8c0942b06126 (patch)
tree       832ecb4da5fd27efa5a503df5b96bfdee2a52ffd /mm
parent     ee64fc9354e515a79c7232cfde65c88ec627308b (diff)
mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim
Lumpy reclaim is disruptive.  It reclaims a large number of pages and
ignores the age of the pages it reclaims.  This can incur significant
stalls and potentially increase the number of major faults.

Compaction has reached the point where it is considered reasonably stable
(meaning it has passed a lot of testing) and is a potential candidate for
displacing lumpy reclaim.  This patch introduces an alternative to lumpy
reclaim, used when compaction is available, called reclaim/compaction.
The basic operation is very simple: instead of selecting a contiguous
range of pages to reclaim, a number of order-0 pages are reclaimed and
then compaction is run later, either by kswapd (compact_zone_order()) or
by direct compaction (__alloc_pages_direct_compact()).

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: use conventional task_struct naming]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
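To illustrate the decision flow described above, here is a minimal userspace
C sketch.  It is not kernel code: zone_model, suitable() and
extfrag_threshold are simplified stand-ins for struct zone,
compaction_suitable() and sysctl_extfrag_threshold, and reclaim is modelled
by simply bumping a free-page counter.

/*
 * Toy model of reclaim/compaction: "reclaim" order-0 pages until a
 * compaction_suitable()-style check says compaction can go ahead.
 * All names and numbers here are illustrative, not kernel APIs.
 */
#include <stdio.h>

enum compact_result { COMPACT_SKIPPED, COMPACT_PARTIAL, COMPACT_CONTINUE };

struct zone_model {
        unsigned long free_pages;   /* order-0 free pages in the zone */
        unsigned long low_wmark;    /* low watermark for the zone */
        int fragindex;              /* -1, or 0..1000 as in fragmentation_index() */
};

static int extfrag_threshold = 500;  /* stands in for sysctl_extfrag_threshold */

/* Simplified version of the checks this patch adds in compaction_suitable() */
static enum compact_result suitable(const struct zone_model *z, int order)
{
        /* Extra 2UL << order: migration briefly needs copies of the pages */
        unsigned long watermark = z->low_wmark + (2UL << order);

        if (z->free_pages < watermark)
                return COMPACT_SKIPPED;   /* too few free pages to migrate into */
        if (z->fragindex >= 0 && z->fragindex <= extfrag_threshold)
                return COMPACT_SKIPPED;   /* failure is low memory, not fragmentation */
        if (z->fragindex == -1)
                return COMPACT_PARTIAL;   /* (simplified) allocation would already succeed */
        return COMPACT_CONTINUE;
}

int main(void)
{
        struct zone_model z = { .free_pages = 900, .low_wmark = 1024, .fragindex = 800 };
        int order = 4;

        /* Reclaim/compaction: free order-0 pages until compaction looks viable */
        while (suitable(&z, order) == COMPACT_SKIPPED)
                z.free_pages += 32;       /* stand-in for reclaiming order-0 pages */

        if (suitable(&z, order) == COMPACT_CONTINUE)
                printf("would compact the zone with %lu free pages\n", z.free_pages);
        else
                printf("allocation would succeed without compaction\n");
        return 0;
}

With the values above, the loop "reclaims" a few batches of order-0 pages
before the check returns COMPACT_CONTINUE, mirroring how
should_continue_reclaim() keeps reclaim going until compaction_suitable()
is satisfied.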
Diffstat (limited to 'mm')
-rw-r--r--   mm/compaction.c    89
-rw-r--r--   mm/migrate.c       17
-rw-r--r--   mm/page_alloc.c    16
-rw-r--r--   mm/vmscan.c       102
4 files changed, 175 insertions(+), 49 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 20011a850fef..8fe917ec7c11 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone,
         return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+        int fragindex;
+        unsigned long watermark;
+
+        /*
+         * Watermarks for order-0 must be met for compaction. Note the 2UL.
+         * This is because during migration, copies of pages need to be
+         * allocated and for a short time, the footprint is higher
+         */
+        watermark = low_wmark_pages(zone) + (2UL << order);
+        if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+                return COMPACT_SKIPPED;
+
+        /*
+         * fragmentation index determines if allocation failures are due to
+         * low memory or external fragmentation
+         *
+         * index of -1 implies allocations might succeed depending on watermarks
+         * index towards 0 implies failure is due to lack of memory
+         * index towards 1000 implies failure is due to fragmentation
+         *
+         * Only compact if a failure would be due to fragmentation.
+         */
+        fragindex = fragmentation_index(zone, order);
+        if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+                return COMPACT_SKIPPED;
+
+        if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+                return COMPACT_PARTIAL;
+
+        return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
         int ret;
 
+        ret = compaction_suitable(zone, cc->order);
+        switch (ret) {
+        case COMPACT_PARTIAL:
+        case COMPACT_SKIPPED:
+                /* Compaction is likely to fail */
+                return ret;
+        case COMPACT_CONTINUE:
+                /* Fall through to compaction */
+                ;
+        }
+
         /* Setup to move all movable pages to the end of the zone */
         cc->migrate_pfn = zone->zone_start_pfn;
         cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
         return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
                                                 int order, gfp_t gfp_mask)
 {
         struct compact_control cc = {
@@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         int may_enter_fs = gfp_mask & __GFP_FS;
         int may_perform_io = gfp_mask & __GFP_IO;
-        unsigned long watermark;
         struct zoneref *z;
         struct zone *zone;
         int rc = COMPACT_SKIPPED;
@@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
         /* Compact each zone in the list */
         for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                                                 nodemask) {
-                int fragindex;
                 int status;
 
-                /*
-                 * Watermarks for order-0 must be met for compaction. Note
-                 * the 2UL. This is because during migration, copies of
-                 * pages need to be allocated and for a short time, the
-                 * footprint is higher
-                 */
-                watermark = low_wmark_pages(zone) + (2UL << order);
-                if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-                        continue;
-
-                /*
-                 * fragmentation index determines if allocation failures are
-                 * due to low memory or external fragmentation
-                 *
-                 * index of -1 implies allocations might succeed depending
-                 * on watermarks
-                 * index towards 0 implies failure is due to lack of memory
-                 * index towards 1000 implies failure is due to fragmentation
-                 *
-                 * Only compact if a failure would be due to fragmentation.
-                 */
-                fragindex = fragmentation_index(zone, order);
-                if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-                        continue;
-
-                if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-                        rc = COMPACT_PARTIAL;
-                        break;
-                }
-
                 status = compact_zone_order(zone, order, gfp_mask);
                 rc = max(status, rc);
 
-                if (zone_watermark_ok(zone, order, watermark, 0, 0))
+                /* If a normal allocation would succeed, stop compacting */
+                if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
                         break;
         }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..94875b265928 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
         if (!trylock_page(page)) {
                 if (!force)
                         goto move_newpage;
+
+                /*
+                 * It's not safe for direct compaction to call lock_page.
+                 * For example, during page readahead pages are added locked
+                 * to the LRU. Later, when the IO completes the pages are
+                 * marked uptodate and unlocked. However, the queueing
+                 * could be merging multiple pages for one bio (e.g.
+                 * mpage_readpages). If an allocation happens for the
+                 * second or third page, the process can end up locking
+                 * the same page twice and deadlocking. Rather than
+                 * trying to be clever about what pages can be locked,
+                 * avoid the use of lock_page for direct compaction
+                 * altogether.
+                 */
+                if (current->flags & PF_MEMALLOC)
+                        goto move_newpage;
+
                 lock_page(page);
         }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22a1bb7723e4..03a66a31bfcd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         int migratetype, unsigned long *did_some_progress)
 {
         struct page *page;
+        struct task_struct *tsk = current;
 
         if (!order || compaction_deferred(preferred_zone))
                 return NULL;
 
+        tsk->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                                 nodemask);
+        tsk->flags &= ~PF_MEMALLOC;
         if (*did_some_progress != COMPACT_SKIPPED) {
 
                 /* Page migration frees to the PCP lists but we want merging */
@@ -2121,6 +2124,19 @@ rebalance:
                 /* Wait for some write requests to complete then retry */
                 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
+        } else {
+                /*
+                 * High-order allocations do not necessarily loop after
+                 * direct reclaim and reclaim/compaction depends on compaction
+                 * being called after reclaim so call directly if necessary
+                 */
+                page = __alloc_pages_direct_compact(gfp_mask, order,
+                                        zonelist, high_zoneidx,
+                                        nodemask,
+                                        alloc_flags, preferred_zone,
+                                        migratetype, &did_some_progress);
+                if (page)
+                        goto got_pg;
         }
 
 nopage:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3464312bde07..10ebd74a423c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
@@ -59,12 +60,15 @@
  * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference
  *                      page from the LRU and reclaim all pages within a
  *                      naturally aligned range
+ * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *                      order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ lumpy_mode;
 #define LUMPY_MODE_SINGLE               ((__force lumpy_mode)0x01u)
 #define LUMPY_MODE_ASYNC                ((__force lumpy_mode)0x02u)
 #define LUMPY_MODE_SYNC                 ((__force lumpy_mode)0x04u)
 #define LUMPY_MODE_CONTIGRECLAIM        ((__force lumpy_mode)0x08u)
+#define LUMPY_MODE_COMPACTION           ((__force lumpy_mode)0x10u)
 
 struct scan_control {
         /* Incremented by the number of inactive pages that were scanned */
@@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
         lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
 
         /*
-         * Some reclaim have alredy been failed. No worth to try synchronous
-         * lumpy reclaim.
+         * Initially assume we are entering either lumpy reclaim or
+         * reclaim/compaction. Depending on the order, we will either set the
+         * sync mode or just reclaim order-0 pages later.
          */
-        if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE)
-                return;
+        if (COMPACTION_BUILD)
+                sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION;
+        else
+                sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 
         /*
-         * If we need a large contiguous chunk of memory, or have
-         * trouble getting a small set of contiguous pages, we
-         * will reclaim both active and inactive pages.
+         * Avoid using lumpy reclaim or reclaim/compaction if possible by
+         * restricting when its set to either costly allocations or when
+         * under memory pressure
          */
-        sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
         if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
                 sc->lumpy_reclaim_mode |= syncmode;
         else if (sc->order && priority < DEF_PRIORITY - 2)
@@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         if (scanning_global_lru(sc)) {
                 nr_taken = isolate_pages_global(nr_to_scan,
                         &page_list, &nr_scanned, sc->order,
-                        sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                        ISOLATE_INACTIVE : ISOLATE_BOTH,
+                        sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                         zone, 0, file);
                 zone->pages_scanned += nr_scanned;
                 if (current_is_kswapd())
@@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         } else {
                 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                         &page_list, &nr_scanned, sc->order,
-                        sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-                                        ISOLATE_INACTIVE : ISOLATE_BOTH,
+                        sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                         zone, sc->mem_cgroup,
                         0, file);
                 /*
@@ -1815,6 +1821,57 @@ out:
 }
 
 /*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+                                        unsigned long nr_reclaimed,
+                                        unsigned long nr_scanned,
+                                        struct scan_control *sc)
+{
+        unsigned long pages_for_compaction;
+        unsigned long inactive_lru_pages;
+
+        /* If not in reclaim/compaction mode, stop */
+        if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION))
+                return false;
+
+        /*
+         * If we failed to reclaim and have scanned the full list, stop.
+         * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+         *       faster but obviously would be less likely to succeed
+         *       allocation. If this is desirable, use GFP_REPEAT to decide
+         *       if both reclaimed and scanned should be checked or just
+         *       reclaimed
+         */
+        if (!nr_reclaimed && !nr_scanned)
+                return false;
+
+        /*
+         * If we have not reclaimed enough pages for compaction and the
+         * inactive lists are large enough, continue reclaiming
+         */
+        pages_for_compaction = (2UL << sc->order);
+        inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+                                zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+        if (sc->nr_reclaimed < pages_for_compaction &&
+                        inactive_lru_pages > pages_for_compaction)
+                return true;
+
+        /* If compaction would go ahead or the allocation would succeed, stop */
+        switch (compaction_suitable(zone, sc->order)) {
+        case COMPACT_PARTIAL:
+        case COMPACT_CONTINUE:
+                return false;
+        default:
+                return true;
+        }
+}
+
+/*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
 static void shrink_zone(int priority, struct zone *zone,
@@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone,
         unsigned long nr[NR_LRU_LISTS];
         unsigned long nr_to_scan;
         enum lru_list l;
-        unsigned long nr_reclaimed = sc->nr_reclaimed;
+        unsigned long nr_reclaimed;
         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+        unsigned long nr_scanned = sc->nr_scanned;
 
+restart:
+        nr_reclaimed = 0;
         get_scan_count(zone, sc, nr, priority);
 
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone,
                 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                         break;
         }
-
-        sc->nr_reclaimed = nr_reclaimed;
+        sc->nr_reclaimed += nr_reclaimed;
 
         /*
          * Even if we did not try to evict anon pages at all, we want to
@@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone,
         if (inactive_anon_is_low(zone, sc))
                 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+        /* reclaim/compaction might need reclaim to continue */
+        if (should_continue_reclaim(zone, nr_reclaimed,
+                                        sc->nr_scanned - nr_scanned, sc))
+                goto restart;
+
         throttle_vm_writeout(sc->gfp_mask);
 }
 
@@ -2307,6 +2371,14 @@ loop_again:
                             total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                 sc.may_writepage = 1;
 
+                        /*
+                         * Compact the zone for higher orders to reduce
+                         * latencies for higher-order allocations that
+                         * would ordinarily call try_to_compact_pages()
+                         */
+                        if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
+                                compact_zone_order(zone, sc.order, sc.gfp_mask);
+
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), end_zone, 0)) {
                                 all_zones_ok = 0;