author		Andy Whitcroft <apw@shadowen.org>	2007-07-17 07:03:16 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-17 13:22:59 -0400
commit		5ad333eb66ff1e52a87639822ae088577669dcf9 (patch)
tree		addae6bbd19585f19328f309924d06d647e8f2b7
parent		7e63efef857575320fb413fbc3d0ee704b72845f (diff)
Lumpy Reclaim V4
When we are out of memory of a suitable size we enter reclaim.  The current
reclaim algorithm targets pages in LRU order, which is great for fairness at
order-0 but highly unsuitable if you desire pages at higher orders.  To get
pages of higher order we must shoot down a very high proportion of memory;
>95% in a lot of cases.

This patch set adds a lumpy reclaim algorithm to the allocator.  It targets
groups of pages at the specified order anchored at the end of the active and
inactive lists.  This encourages groups of pages at the requested orders to
move from active to inactive, and active to free lists.  This behaviour is
only triggered out of direct reclaim when higher order pages have been
requested.

This patch set is particularly effective when utilised with an
anti-fragmentation scheme which groups pages of similar reclaimability
together.

This patch set is based on Peter Zijlstra's lumpy reclaim V2 patch, which
forms the foundation.  Credit to Mel Gorman for sanity checking.

Mel said:

  The patches have an application with hugepage pool resizing.

  When lumpy-reclaim is used with ZONE_MOVABLE, the hugepage pool can be
  resized with greater reliability.  Testing on a desktop machine with 2GB of
  RAM showed that growing the hugepage pool with ZONE_MOVABLE on its own was
  very slow as the success rate was quite low.  Without lumpy-reclaim, each
  attempt to grow the pool by 100 pages would yield 1 or 2 hugepages.  With
  lumpy-reclaim, getting 40 to 70 hugepages on each attempt was typical.

[akpm@osdl.org: ia64 pfn_to_nid fixes and loop cleanup]
[bunk@stusta.de: static declarations for internal functions]
[a.p.zijlstra@chello.nl: initial lumpy V2 implementation]
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
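The core of the lumpy behaviour is the order-aligned block scan added to
isolate_lru_pages() in mm/vmscan.c below: the pfn of the tag page is rounded
down to the start of its 2^order block, and the whole block is then scanned.
A minimal stand-alone sketch of just that pfn arithmetic (the helper name
block_bounds() and the example pfn are illustrative only, not part of the
patch):

/*
 * User-space sketch of the order-aligned block calculation used by lumpy
 * reclaim.  block_bounds() is a hypothetical name; the real logic lives
 * inline in isolate_lru_pages() in mm/vmscan.c below.
 */
#include <stdio.h>

static void block_bounds(unsigned long page_pfn, int order,
			 unsigned long *start_pfn, unsigned long *end_pfn)
{
	/* Round the tag page's pfn down to the start of its 2^order block. */
	*start_pfn = page_pfn & ~((1UL << order) - 1);
	/* The block spans exactly 2^order pages. */
	*end_pfn = *start_pfn + (1UL << order);
}

int main(void)
{
	unsigned long start, end;

	/* A tag page at pfn 1000003 for an order-3 (8 page) request. */
	block_bounds(1000003, 3, &start, &end);
	printf("scan pfns [%lu, %lu)\n", start, end);	/* [1000000, 1000008) */
	return 0;
}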
-rw-r--r--	fs/buffer.c	2
-rw-r--r--	include/linux/mmzone.h	8
-rw-r--r--	include/linux/swap.h	3
-rw-r--r--	mm/page_alloc.c	5
-rw-r--r--	mm/vmscan.c	171
5 files changed, 163 insertions, 26 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 94344b2e0b46..d654a3b6209e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
 	for_each_online_pgdat(pgdat) {
 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 		if (*zones)
-			try_to_free_pages(zones, GFP_NOFS);
+			try_to_free_pages(zones, 0, GFP_NOFS);
 	}
 }
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d71ff763c9df..da8eb8ad9e9b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,6 +24,14 @@
 #endif
 #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
 
+/*
+ * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
+ * costly to service.  That is between allocation orders which should
+ * coelesce naturally under reasonable reclaim pressure and those which
+ * will not.
+ */
+#define PAGE_ALLOC_COSTLY_ORDER 3
+
 struct free_area {
 	struct list_head free_list;
 	unsigned long nr_free;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 006868881346..665f85f2a3af 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,7 +188,8 @@ extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long try_to_free_pages(struct zone **zones, int order,
+					gfp_t gfp_mask);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac4f8c6b5c10..1a889c3fec59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1333,7 +1333,7 @@ nofail_alloc:
 		reclaim_state.reclaimed_slab = 0;
 		p->reclaim_state = &reclaim_state;
 
-		did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+		did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
 
 		p->reclaim_state = NULL;
 		p->flags &= ~PF_MEMALLOC;
@@ -1370,7 +1370,8 @@ nofail_alloc:
 		 */
 		do_retry = 0;
 		if (!(gfp_mask & __GFP_NORETRY)) {
-			if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+			if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
+						(gfp_mask & __GFP_REPEAT))
 				do_retry = 1;
 			if (gfp_mask & __GFP_NOFAIL)
 				do_retry = 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1be5a6376ef0..1d9971d8924b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,6 +66,8 @@ struct scan_control {
 	int swappiness;
 
 	int all_unreclaimable;
+
+	int order;
 };
 
 /*
@@ -481,7 +483,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable?  Activate it. */
-		if (referenced && page_mapping_inuse(page))
+		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+					referenced && page_mapping_inuse(page))
 			goto activate_locked;
 
 #ifdef CONFIG_SWAP
@@ -514,7 +517,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
-			if (referenced)
+			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
 				goto keep_locked;
 			if (!may_enter_fs)
 				goto keep_locked;
@@ -598,6 +601,51 @@ keep:
 	return nr_reclaimed;
 }
 
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
+#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU.  Only take this page
+ * if it is of the appropriate PageActive status.  Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page:	page to consider
+ * mode:	one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+	int ret = -EINVAL;
+
+	/* Only take pages on the LRU. */
+	if (!PageLRU(page))
+		return ret;
+
+	/*
+	 * When checking the active state, we need to be sure we are
+	 * dealing with comparible boolean values.  Take the logical not
+	 * of each.
+	 */
+	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+		return ret;
+
+	ret = -EBUSY;
+	if (likely(get_page_unless_zero(page))) {
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		ClearPageLRU(page);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
@@ -612,38 +660,90 @@ keep:
  * @src:	The LRU list to pull pages off.
  * @dst:	The temp list to put pages on to.
  * @scanned:	The number of pages that were scanned.
+ * @order:	The caller's attempted allocation order
+ * @mode:	One of the LRU isolation modes
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned)
+		unsigned long *scanned, int order, int mode)
 {
 	unsigned long nr_taken = 0;
-	struct page *page;
 	unsigned long scan;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-		struct list_head *target;
+		struct page *page;
+		unsigned long pfn;
+		unsigned long end_pfn;
+		unsigned long page_pfn;
+		int zone_id;
+
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		VM_BUG_ON(!PageLRU(page));
 
-		list_del(&page->lru);
-		target = src;
-		if (likely(get_page_unless_zero(page))) {
-			/*
-			 * Be careful not to clear PageLRU until after we're
-			 * sure the page is not being freed elsewhere -- the
-			 * page release code relies on it.
-			 */
-			ClearPageLRU(page);
-			target = dst;
+		switch (__isolate_lru_page(page, mode)) {
+		case 0:
+			list_move(&page->lru, dst);
 			nr_taken++;
-		} /* else it is being freed elsewhere */
+			break;
 
-		list_add(&page->lru, target);
+		case -EBUSY:
+			/* else it is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+
+		if (!order)
+			continue;
+
+		/*
+		 * Attempt to take all pages in the order aligned region
+		 * surrounding the tag page.  Only take those pages of
+		 * the same active state as that tag page.  We may safely
+		 * round the target page pfn down to the requested order
+		 * as the mem_map is guarenteed valid out to MAX_ORDER,
+		 * where that page is in a different zone we will detect
+		 * it from its zone id and abort this block scan.
+		 */
+		zone_id = page_zone_id(page);
+		page_pfn = page_to_pfn(page);
+		pfn = page_pfn & ~((1 << order) - 1);
+		end_pfn = pfn + (1 << order);
+		for (; pfn < end_pfn; pfn++) {
+			struct page *cursor_page;
+
+			/* The target page is in the block, ignore it. */
+			if (unlikely(pfn == page_pfn))
+				continue;
+
+			/* Avoid holes within the zone. */
+			if (unlikely(!pfn_valid_within(pfn)))
+				break;
+
+			cursor_page = pfn_to_page(pfn);
+			/* Check that we have not crossed a zone boundary. */
+			if (unlikely(page_zone_id(cursor_page) != zone_id))
+				continue;
+			switch (__isolate_lru_page(cursor_page, mode)) {
+			case 0:
+				list_move(&cursor_page->lru, dst);
+				nr_taken++;
+				scan++;
+				break;
+
+			case -EBUSY:
+				/* else it is being freed elsewhere */
+				list_move(&cursor_page->lru, src);
+			default:
+				break;
+			}
+		}
 	}
 
 	*scanned = scan;
@@ -651,6 +751,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
+ * clear_active_flags() is a helper for shrink_active_list(), clearing
+ * any active bits from the pages in the list.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+	int nr_active = 0;
+	struct page *page;
+
+	list_for_each_entry(page, page_list, lru)
+		if (PageActive(page)) {
+			ClearPageActive(page);
+			nr_active++;
+		}
+
+	return nr_active;
+}
+
+/*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
@@ -671,11 +789,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_taken;
 		unsigned long nr_scan;
 		unsigned long nr_freed;
+		unsigned long nr_active;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 			     &zone->inactive_list,
-			     &page_list, &nr_scan);
-		__mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+			     &page_list, &nr_scan, sc->order,
+			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+					     ISOLATE_BOTH : ISOLATE_INACTIVE);
+		nr_active = clear_active_flags(&page_list);
+
+		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+		__mod_zone_page_state(zone, NR_INACTIVE,
+						-(nr_taken - nr_active));
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
@@ -820,7 +945,7 @@ force_reclaim_mapped:
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-			    &l_hold, &pgscanned);
+			    &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
 	zone->pages_scanned += pgscanned;
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
@@ -1011,7 +1136,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
@@ -1026,6 +1151,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
 		.swappiness = vm_swappiness,
+		.order = order,
 	};
 
 	count_vm_event(ALLOCSTALL);
@@ -1131,6 +1257,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
+		.order = order,
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which