aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mel@csn.ul.ie>2007-10-16 04:25:58 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-16 12:43:00 -0400
commit56fd56b868f19385c50af8941a4c78df433b2d32 (patch)
tree5ea8362e6e141e2d1124d4640811c76489567bc5
parent5c0e3066474b57c56ff0d88ca31d95bd14232fee (diff)
Bias the location of pages freed for min_free_kbytes in the same MAX_ORDER_NR_PAGES blocks
The standard buddy allocator always favours the smallest block of pages. The effect of this is that the pages kept free to satisfy min_free_kbytes tend to be preserved since boot time at the same location in memory for a very long time and as a contiguous block. When an administrator sets the reserve at 16384 at boot time, it tends to be the same MAX_ORDER blocks that remain free. This allows the occasional high-order atomic allocation to succeed up until the point the blocks are split. In practice, it is difficult to split these blocks, but when they do split, the benefit of having min_free_kbytes for contiguous blocks disappears. Additionally, increasing min_free_kbytes once the system has been running for some time has no guarantee of creating contiguous blocks. On the other hand, CONFIG_PAGE_GROUP_BY_MOBILITY favours splitting large blocks when there are no free pages of the appropriate type available. A side-effect of this is that all blocks in memory tend to be used up and the contiguous free blocks from boot time are not preserved as they are in the vanilla allocator. This can cause a problem if a new caller is unwilling to reclaim or does not reclaim for long enough. A failure scenario was found for a wireless network device allocating order-1 atomic allocations, but the allocations were not intense or frequent enough for a whole block of pages to be preserved for MIGRATE_HIGHATOMIC. This was reproduced on a desktop by booting with mem=256mb, forcing the driver to allocate at order-1, running a BitTorrent client (downloading a Debian ISO) and building a kernel with -j2. This patch addresses the problem on the desktop machine booted with mem=256mb. It works by setting aside a reserve of MAX_ORDER_NR_PAGES blocks, the number of which depends on the value of min_free_kbytes. These blocks are only fallen back to when there are no other free pages. 
Then the smallest possible page is used, just like the normal buddy allocator, instead of the largest possible page, to preserve contiguous pages. The pages in free lists in the reserve blocks are never taken for another migrate type. The result is that even if min_free_kbytes is set to a low value, contiguous blocks will be preserved in the MIGRATE_RESERVE blocks. This works better than the vanilla allocator because if min_free_kbytes is increased, a new reserve block will be chosen based on the location of reclaimable pages and the block will free up as contiguous pages. In the vanilla allocator, no effort is made to target a block of pages to free as contiguous pages, and min_free_kbytes pages are scattered randomly. This effect has been observed on the test machine. min_free_kbytes was initially set low but it was kept as a contiguous free block within MIGRATE_RESERVE. min_free_kbytes was then set to a higher value and, over a period of time, the free blocks were within the reserve and coalescing. How long it takes to free up depends on how quickly the LRU is rotating. Amusingly, this means that more activity will free the blocks faster. This mechanism potentially replaces MIGRATE_HIGHATOMIC as it may be more effective than grouping contiguous free pages together. It all depends on whether the number of active high-order atomic allocations exceeds min_free_kbytes or not. If the number of active allocations exceeds min_free_kbytes, it's worth it, but maybe in that situation min_free_kbytes should be set higher. Once there are no more reports of allocation failures, a patch will be submitted that backs out MIGRATE_HIGHATOMIC to see whether the reports stay away. Credit to Mariusz Kozlowski for discovering the problem, describing the failure scenario and testing patches and scenarios. 
[akpm@linux-foundation.org: cleanups] Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Andy Whitcroft <apw@shadowen.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/mmzone.h4
-rw-r--r--include/linux/pageblock-flags.h2
-rw-r--r--mm/page_alloc.c138
3 files changed, 114 insertions, 30 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9a5d5590bd39..afdec8117458 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,12 +38,14 @@
38#define MIGRATE_RECLAIMABLE 1 38#define MIGRATE_RECLAIMABLE 1
39#define MIGRATE_MOVABLE 2 39#define MIGRATE_MOVABLE 2
40#define MIGRATE_HIGHATOMIC 3 40#define MIGRATE_HIGHATOMIC 3
41#define MIGRATE_TYPES 4 41#define MIGRATE_RESERVE 4
42#define MIGRATE_TYPES 5
42#else 43#else
43#define MIGRATE_UNMOVABLE 0 44#define MIGRATE_UNMOVABLE 0
44#define MIGRATE_UNRECLAIMABLE 0 45#define MIGRATE_UNRECLAIMABLE 0
45#define MIGRATE_MOVABLE 0 46#define MIGRATE_MOVABLE 0
46#define MIGRATE_HIGHATOMIC 0 47#define MIGRATE_HIGHATOMIC 0
48#define MIGRATE_RESERVE 0
47#define MIGRATE_TYPES 1 49#define MIGRATE_TYPES 1
48#endif 50#endif
49 51
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5456da6b4ade..fa3b1001894b 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -31,7 +31,7 @@
31 31
32/* Bit indices that affect a whole block of pages */ 32/* Bit indices that affect a whole block of pages */
33enum pageblock_bits { 33enum pageblock_bits {
34 PB_range(PB_migrate, 2), /* 2 bits required for migrate types */ 34 PB_range(PB_migrate, 3), /* 3 bits required for migrate types */
35 NR_PAGEBLOCK_BITS 35 NR_PAGEBLOCK_BITS
36}; 36};
37 37
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b864584c92b4..f7873a47fa8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -687,16 +687,48 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
687 return 0; 687 return 0;
688} 688}
689 689
690/*
691 * Go through the free lists for the given migratetype and remove
692 * the smallest available page from the freelists
693 */
694static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
695 int migratetype)
696{
697 unsigned int current_order;
698 struct free_area * area;
699 struct page *page;
700
701 /* Find a page of the appropriate size in the preferred list */
702 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
703 area = &(zone->free_area[current_order]);
704 if (list_empty(&area->free_list[migratetype]))
705 continue;
706
707 page = list_entry(area->free_list[migratetype].next,
708 struct page, lru);
709 list_del(&page->lru);
710 rmv_page_order(page);
711 area->nr_free--;
712 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
713 expand(zone, page, order, current_order, area, migratetype);
714 return page;
715 }
716
717 return NULL;
718}
719
720
690#ifdef CONFIG_PAGE_GROUP_BY_MOBILITY 721#ifdef CONFIG_PAGE_GROUP_BY_MOBILITY
691/* 722/*
692 * This array describes the order lists are fallen back to when 723 * This array describes the order lists are fallen back to when
693 * the free lists for the desirable migrate type are depleted 724 * the free lists for the desirable migrate type are depleted
694 */ 725 */
695static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 726static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
696 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_HIGHATOMIC }, 727 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_HIGHATOMIC, MIGRATE_RESERVE },
697 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_HIGHATOMIC }, 728 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_HIGHATOMIC, MIGRATE_RESERVE },
698 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,MIGRATE_HIGHATOMIC }, 729 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_HIGHATOMIC, MIGRATE_RESERVE },
699 [MIGRATE_HIGHATOMIC] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,MIGRATE_MOVABLE}, 730 [MIGRATE_HIGHATOMIC] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
731 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
700}; 732};
701 733
702/* 734/*
@@ -799,6 +831,9 @@ retry:
799 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 831 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
800 migratetype = fallbacks[start_migratetype][i]; 832 migratetype = fallbacks[start_migratetype][i];
801 833
834 /* MIGRATE_RESERVE handled later if necessary */
835 if (migratetype == MIGRATE_RESERVE)
836 continue;
802 /* 837 /*
803 * Make it hard to fallback to blocks used for 838 * Make it hard to fallback to blocks used for
804 * high-order atomic allocations 839 * high-order atomic allocations
@@ -861,7 +896,8 @@ retry:
861 goto retry; 896 goto retry;
862 } 897 }
863 898
864 return NULL; 899 /* Use MIGRATE_RESERVE rather than fail an allocation */
900 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
865} 901}
866#else 902#else
867static struct page *__rmqueue_fallback(struct zone *zone, int order, 903static struct page *__rmqueue_fallback(struct zone *zone, int order,
@@ -871,36 +907,19 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
871} 907}
872#endif /* CONFIG_PAGE_GROUP_BY_MOBILITY */ 908#endif /* CONFIG_PAGE_GROUP_BY_MOBILITY */
873 909
874/* 910/*
875 * Do the hard work of removing an element from the buddy allocator. 911 * Do the hard work of removing an element from the buddy allocator.
876 * Call me with the zone->lock already held. 912 * Call me with the zone->lock already held.
877 */ 913 */
878static struct page *__rmqueue(struct zone *zone, unsigned int order, 914static struct page *__rmqueue(struct zone *zone, unsigned int order,
879 int migratetype) 915 int migratetype)
880{ 916{
881 struct free_area * area;
882 unsigned int current_order;
883 struct page *page; 917 struct page *page;
884 918
885 /* Find a page of the appropriate size in the preferred list */ 919 page = __rmqueue_smallest(zone, order, migratetype);
886 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
887 area = &(zone->free_area[current_order]);
888 if (list_empty(&area->free_list[migratetype]))
889 continue;
890
891 page = list_entry(area->free_list[migratetype].next,
892 struct page, lru);
893 list_del(&page->lru);
894 rmv_page_order(page);
895 area->nr_free--;
896 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
897 expand(zone, page, order, current_order, area, migratetype);
898 goto got_page;
899 }
900
901 page = __rmqueue_fallback(zone, order, migratetype);
902 920
903got_page: 921 if (unlikely(!page))
922 page = __rmqueue_fallback(zone, order, migratetype);
904 923
905 return page; 924 return page;
906} 925}
@@ -2506,6 +2525,65 @@ static inline unsigned long wait_table_bits(unsigned long size)
2506 2525
2507#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2526#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2508 2527
2528#ifdef CONFIG_PAGE_GROUP_BY_MOBILITY
2529/*
2530 * Mark a number of MAX_ORDER_NR_PAGES blocks as MIGRATE_RESERVE. The number
2531 * of blocks reserved is based on zone->pages_min. The memory within the
2532 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2533 * higher will lead to a bigger reserve which will get freed as contiguous
2534 * blocks as reclaim kicks in
2535 */
2536static void setup_zone_migrate_reserve(struct zone *zone)
2537{
2538 unsigned long start_pfn, pfn, end_pfn;
2539 struct page *page;
2540 unsigned long reserve, block_migratetype;
2541
2542 /* Get the start pfn, end pfn and the number of blocks to reserve */
2543 start_pfn = zone->zone_start_pfn;
2544 end_pfn = start_pfn + zone->spanned_pages;
2545 reserve = roundup(zone->pages_min, MAX_ORDER_NR_PAGES) >> (MAX_ORDER-1);
2546
2547 for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) {
2548 if (!pfn_valid(pfn))
2549 continue;
2550 page = pfn_to_page(pfn);
2551
2552 /* Blocks with reserved pages will never free, skip them. */
2553 if (PageReserved(page))
2554 continue;
2555
2556 block_migratetype = get_pageblock_migratetype(page);
2557
2558 /* If this block is reserved, account for it */
2559 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2560 reserve--;
2561 continue;
2562 }
2563
2564 /* Suitable for reserving if this block is movable */
2565 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2566 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2567 move_freepages_block(zone, page, MIGRATE_RESERVE);
2568 reserve--;
2569 continue;
2570 }
2571
2572 /*
2573 * If the reserve is met and this is a previous reserved block,
2574 * take it back
2575 */
2576 if (block_migratetype == MIGRATE_RESERVE) {
2577 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2578 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2579 }
2580 }
2581}
2582#else
2583static inline void setup_zone_migrate_reserve(struct zone *zone)
2584{
2585}
2586#endif /* CONFIG_PAGE_GROUP_BY_MOBILITY */
2509/* 2587/*
2510 * Initially all pages are reserved - free ones are freed 2588 * Initially all pages are reserved - free ones are freed
2511 * up by free_all_bootmem() once the early boot process is 2589 * up by free_all_bootmem() once the early boot process is
@@ -2541,9 +2619,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2541 * movable at startup. This will force kernel allocations 2619 * movable at startup. This will force kernel allocations
2542 * to reserve their blocks rather than leaking throughout 2620 * to reserve their blocks rather than leaking throughout
2543 * the address space during boot when many long-lived 2621 * the address space during boot when many long-lived
2544 * kernel allocations are made 2622 * kernel allocations are made. Later some blocks near
2623 * the start are marked MIGRATE_RESERVE by
2624 * setup_zone_migrate_reserve()
2545 */ 2625 */
2546 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2626 if ((pfn & (MAX_ORDER_NR_PAGES-1)))
2627 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2547 2628
2548 INIT_LIST_HEAD(&page->lru); 2629 INIT_LIST_HEAD(&page->lru);
2549#ifdef WANT_PAGE_VIRTUAL 2630#ifdef WANT_PAGE_VIRTUAL
@@ -4078,6 +4159,7 @@ void setup_per_zone_pages_min(void)
4078 4159
4079 zone->pages_low = zone->pages_min + (tmp >> 2); 4160 zone->pages_low = zone->pages_min + (tmp >> 2);
4080 zone->pages_high = zone->pages_min + (tmp >> 1); 4161 zone->pages_high = zone->pages_min + (tmp >> 1);
4162 setup_zone_migrate_reserve(zone);
4081 spin_unlock_irqrestore(&zone->lru_lock, flags); 4163 spin_unlock_irqrestore(&zone->lru_lock, flags);
4082 } 4164 }
4083 4165