path: root/mm/page_alloc.c
author	Jeff Garzik <jgarzik@pobox.com>	2005-11-15 04:51:40 -0500
committer	Jeff Garzik <jgarzik@pobox.com>	2005-11-15 04:51:40 -0500
commit	f055408957750cf759162c364c2a4dfe19765844 (patch)
tree	aecc0a13c582d310902e6fa95d8853c627828fcc	/mm/page_alloc.c
parent	83cbd33aae2c3cd14f80a8abf733033a57aa4923 (diff)
parent	4060994c3e337b40e0f6fa8ce2cc178e021baf3d (diff)
Merge branch 'master'
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	245
1 file changed, 117 insertions(+), 128 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd661..104e69ca55e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,8 +60,11 @@ long nr_swap_pages;
  * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
  * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
  * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 
@@ -72,7 +75,7 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
@@ -124,7 +127,7 @@ static void bad_page(const char *function, struct page *page)
 	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
 		function, current->comm, page);
 	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+		(int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
 		page->mapping, page_mapcount(page), page_count(page));
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -732,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -754,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 	return page;
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER		0x02 /* try to alloc harder */
+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int can_try_harder, gfp_t gfp_high)
+		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
-	if (gfp_high)
+	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
-	if (can_try_harder)
+	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -785,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
 {
-	if (!z->reclaim_pages)
-		return 0;
-	if (gfp_mask & __GFP_NORECLAIM)
-		return 0;
-	return 1;
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+					classzone_idx, alloc_flags))
+				continue;
+		}
+
+		page = buffered_rmqueue(*z, order, gfp_mask);
+		if (page) {
+			zone_statistics(zonelist, *z);
+			break;
+		}
+	} while (*(++z) != NULL);
+	return page;
 }
 
 /*
@@ -803,105 +835,75 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
-	int classzone_idx;
 	int do_retry;
-	int can_try_harder;
+	int alloc_flags;
 	int did_some_progress;
 
 	might_sleep_if(wait);
 
-	/*
-	 * The caller may dip into page reserves a bit more if the caller
-	 * cannot run direct reclaim, or is the caller has realtime scheduling
-	 * policy
-	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
-
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-	if (unlikely(zones[0] == NULL)) {
+	if (unlikely(*z == NULL)) {
 		/* Should this ever happen?? */
 		return NULL;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
 
-	classzone_idx = zone_idx(zones[0]);
+	do
+		wakeup_kswapd(*z, order);
+	while (*(++z));
 
-restart:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so set up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
-
-	for (i = 0; (z = zones[i]) != NULL; i++)
-		wakeup_kswapd(z, order);
+	alloc_flags = 0;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	if (wait)
+		alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
-	 * coming from realtime tasks to go deeper into reserves
+	 * coming from realtime tasks go deeper into reserves.
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				classzone_idx, can_try_harder,
-				gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
+			page = get_page_from_freelist(gfp_mask, order,
+				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+			if (page)
+				goto got_pg;
+			if (gfp_mask & __GFP_NOFAIL) {
+				blk_congestion_wait(WRITE, HZ/50);
+				goto nofail_alloc;
 			}
 		}
 		goto nopage;
@@ -919,7 +921,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -927,19 +929,10 @@ rebalance:
 	cond_resched();
 
 	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					classzone_idx, can_try_harder,
-					gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
 		 * Go through the zonelist yet one more time, keep
@@ -947,18 +940,10 @@ rebalance:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+			zonelist, ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
 
 		out_of_memory(gfp_mask, order);
 		goto restart;
@@ -991,9 +976,7 @@ nopage:
 		dump_stack();
 		show_mem();
 	}
-	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
 	return page;
 }
 
@@ -1441,6 +1424,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
 		zone = pgdat->node_zones + ZONE_NORMAL;
 		if (zone->present_pages)
 			zonelist->zones[j++] = zone;
+	case ZONE_DMA32:
+		zone = pgdat->node_zones + ZONE_DMA32;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
 	case ZONE_DMA:
 		zone = pgdat->node_zones + ZONE_DMA;
 		if (zone->present_pages)
@@ -1455,6 +1442,8 @@ static inline int highest_zone(int zone_bits)
 	int res = ZONE_NORMAL;
 	if (zone_bits & (__force int)__GFP_HIGHMEM)
 		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
 	if (zone_bits & (__force int)__GFP_DMA)
 		res = ZONE_DMA;
 	return res;
@@ -1866,11 +1855,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
 		if (process_zones(cpu))
 			ret = NOTIFY_BAD;
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
 	case CPU_DEAD:
 		free_zone_pagesets(cpu);
 		break;
-#endif
 	default:
 		break;
 	}
@@ -1975,7 +1963,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		if (j == ZONE_DMA || j == ZONE_NORMAL)
+		if (j < ZONE_HIGHMEM)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
@@ -2417,13 +2405,18 @@ void setup_per_zone_pages_min(void)
 	}
 
 	for_each_zone(zone) {
+		unsigned long tmp;
 		spin_lock_irqsave(&zone->lru_lock, flags);
+		tmp = (pages_min * zone->present_pages) / lowmem_pages;
 		if (is_highmem(zone)) {
 			/*
-			 * Often, highmem doesn't need to reserve any pages.
-			 * But the pages_min/low/high values are also used for
-			 * batching up page reclaim activity so we need a
-			 * decent value here.
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The (pages_high-pages_low) and (pages_low-pages_min)
+			 * deltas controls asynch page reclaim, and so should
+			 * not be capped for highmem.
 			 */
 			int min_pages;
 
@@ -2434,19 +2427,15 @@ void setup_per_zone_pages_min(void)
 				min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) /
-						lowmem_pages;
+			zone->pages_min = tmp;
 		}
 
-		/*
-		 * When interpreting these watermarks, just keep in mind that:
-		 * zone->pages_min == (zone->pages_min * 4) / 4;
-		 */
-		zone->pages_low = (zone->pages_min * 5) / 4;
-		zone->pages_high = (zone->pages_min * 6) / 4;
+		zone->pages_low   = zone->pages_min + tmp / 4;
+		zone->pages_high  = zone->pages_min + tmp / 2;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }