path: root/mm/page_alloc.c
author	Jeff Garzik <jgarzik@pobox.com>	2005-11-15 04:51:40 -0500
committer	Jeff Garzik <jgarzik@pobox.com>	2005-11-15 04:51:40 -0500
commit	f055408957750cf759162c364c2a4dfe19765844 (patch)
tree	aecc0a13c582d310902e6fa95d8853c627828fcc	/mm/page_alloc.c
parent	83cbd33aae2c3cd14f80a8abf733033a57aa4923 (diff)
parent	4060994c3e337b40e0f6fa8ce2cc178e021baf3d (diff)
Merge branch 'master'
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	245
1 file changed, 117 insertions(+), 128 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd661..104e69ca55e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,8 +60,11 @@ long nr_swap_pages;
  * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
  * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
  * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 
@@ -72,7 +75,7 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
@@ -124,7 +127,7 @@ static void bad_page(const char *function, struct page *page)
 	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
 		function, current->comm, page);
 	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+		(int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
 		page->mapping, page_mapcount(page), page_count(page));
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -732,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -754,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 	return page;
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER		0x02 /* try to alloc harder */
+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int can_try_harder, gfp_t gfp_high)
+		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
-	if (gfp_high)
+	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
-	if (can_try_harder)
+	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -785,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
 {
-	if (!z->reclaim_pages)
-		return 0;
-	if (gfp_mask & __GFP_NORECLAIM)
-		return 0;
-	return 1;
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+					classzone_idx, alloc_flags))
+				continue;
+		}
+
+		page = buffered_rmqueue(*z, order, gfp_mask);
+		if (page) {
+			zone_statistics(zonelist, *z);
+			break;
+		}
+	} while (*(++z) != NULL);
+	return page;
 }
 
 /*
@@ -803,105 +835,75 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
-	int classzone_idx;
 	int do_retry;
-	int can_try_harder;
+	int alloc_flags;
 	int did_some_progress;
 
 	might_sleep_if(wait);
 
-	/*
-	 * The caller may dip into page reserves a bit more if the caller
-	 * cannot run direct reclaim, or is the caller has realtime scheduling
-	 * policy
-	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
-
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-	if (unlikely(zones[0] == NULL)) {
+	if (unlikely(*z == NULL)) {
 		/* Should this ever happen?? */
 		return NULL;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
 
-	classzone_idx = zone_idx(zones[0]);
+	do
+		wakeup_kswapd(*z, order);
+	while (*(++z));
 
-restart:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so set up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
-
-	for (i = 0; (z = zones[i]) != NULL; i++)
-		wakeup_kswapd(z, order);
+	alloc_flags = 0;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	if (wait)
+		alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
-	 * coming from realtime tasks to go deeper into reserves
+	 * coming from realtime tasks go deeper into reserves.
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				classzone_idx, can_try_harder,
-				gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
+			page = get_page_from_freelist(gfp_mask, order,
+				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+			if (page)
+				goto got_pg;
+			if (gfp_mask & __GFP_NOFAIL) {
+				blk_congestion_wait(WRITE, HZ/50);
+				goto nofail_alloc;
 			}
 		}
 		goto nopage;
@@ -919,7 +921,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -927,19 +929,10 @@ rebalance:
 	cond_resched();
 
 	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					classzone_idx, can_try_harder,
-					gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
 		 * Go through the zonelist yet one more time, keep
@@ -947,18 +940,10 @@ rebalance:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+			zonelist, ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
 
 		out_of_memory(gfp_mask, order);
 		goto restart;
@@ -991,9 +976,7 @@ nopage:
 		dump_stack();
 		show_mem();
 	}
-	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
 	return page;
 }
 
@@ -1441,6 +1424,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
 		zone = pgdat->node_zones + ZONE_NORMAL;
 		if (zone->present_pages)
 			zonelist->zones[j++] = zone;
+	case ZONE_DMA32:
+		zone = pgdat->node_zones + ZONE_DMA32;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
 	case ZONE_DMA:
 		zone = pgdat->node_zones + ZONE_DMA;
 		if (zone->present_pages)
@@ -1455,6 +1442,8 @@ static inline int highest_zone(int zone_bits)
 	int res = ZONE_NORMAL;
 	if (zone_bits & (__force int)__GFP_HIGHMEM)
 		res = ZONE_HIGHMEM;
+	if (zone_bits & (__force int)__GFP_DMA32)
+		res = ZONE_DMA32;
 	if (zone_bits & (__force int)__GFP_DMA)
 		res = ZONE_DMA;
 	return res;
@@ -1866,11 +1855,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
 		if (process_zones(cpu))
 			ret = NOTIFY_BAD;
 		break;
-#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
 	case CPU_DEAD:
 		free_zone_pagesets(cpu);
 		break;
-#endif
 	default:
 		break;
 	}
@@ -1975,7 +1963,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		if (j == ZONE_DMA || j == ZONE_NORMAL)
+		if (j < ZONE_HIGHMEM)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
@@ -2417,13 +2405,18 @@ void setup_per_zone_pages_min(void)
 	}
 
 	for_each_zone(zone) {
+		unsigned long tmp;
 		spin_lock_irqsave(&zone->lru_lock, flags);
+		tmp = (pages_min * zone->present_pages) / lowmem_pages;
 		if (is_highmem(zone)) {
 			/*
-			 * Often, highmem doesn't need to reserve any pages.
-			 * But the pages_min/low/high values are also used for
-			 * batching up page reclaim activity so we need a
-			 * decent value here.
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The (pages_high-pages_low) and (pages_low-pages_min)
+			 * deltas controls asynch page reclaim, and so should
+			 * not be capped for highmem.
 			 */
 			int min_pages;
 
@@ -2434,19 +2427,15 @@ void setup_per_zone_pages_min(void)
 				min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) /
-						lowmem_pages;
+			zone->pages_min = tmp;
 		}
 
-		/*
-		 * When interpreting these watermarks, just keep in mind that:
-		 * zone->pages_min == (zone->pages_min * 4) / 4;
-		 */
-		zone->pages_low = (zone->pages_min * 5) / 4;
-		zone->pages_high = (zone->pages_min * 6) / 4;
+		zone->pages_low   = zone->pages_min + tmp / 4;
+		zone->pages_high  = zone->pages_min + tmp / 2;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }