Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 207
 1 file changed, 125 insertions(+), 82 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94c864eac9c4..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+        int ret = 0;
+        unsigned seq;
+        unsigned long pfn = page_to_pfn(page);
+
+        do {
+                seq = zone_span_seqbegin(zone);
+                if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+                        ret = 1;
+                else if (pfn < zone->zone_start_pfn)
+                        ret = 1;
+        } while (zone_span_seqretry(zone, seq));
+
+        return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+        if (!pfn_valid(page_to_pfn(page)))
+                return 0;
+#endif
+        if (zone != page_zone(page))
+                return 0;
+
+        return 1;
+}
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
 static int bad_range(struct zone *zone, struct page *page)
 {
-        if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
-                return 1;
-        if (page_to_pfn(page) < zone->zone_start_pfn)
-                return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
-        if (!pfn_valid(page_to_pfn(page)))
-                return 1;
-#endif
-        if (zone != page_zone(page))
+        if (page_outside_zone_boundaries(zone, page))
+                return 1;
+        if (!page_is_consistent(zone, page))
                 return 1;
+
         return 0;
 }
 
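The new page_outside_zone_boundaries() helper reads zone_start_pfn and spanned_pages under the zone span seqlock added for memory hotplug, retrying whenever a concurrent resize raced with the check, while the CONFIG_HOLES_IN_ZONE and page_zone() tests move into page_is_consistent(). The userspace sketch below models only the seqcount read-retry pattern; the names (span_seq, span_start, span_pages) and the C11 atomics are illustrative stand-ins, not the kernel's seqlock API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint span_seq;               /* even = stable, odd = writer active */
static unsigned long span_start = 4096;    /* zone_start_pfn stand-in */
static unsigned long span_pages = 1024;    /* spanned_pages stand-in */

static unsigned read_begin(void)
{
        unsigned s;

        do {
                s = atomic_load(&span_seq);
        } while (s & 1);                   /* writer mid-update: wait for even */
        return s;
}

static int read_retry(unsigned s)
{
        /* retry if the sequence moved while we were reading */
        return atomic_load(&span_seq) != s;
}

static int pfn_outside_span(unsigned long pfn)
{
        int ret;
        unsigned seq;

        do {
                seq = read_begin();
                ret = pfn < span_start || pfn >= span_start + span_pages;
        } while (read_retry(seq));         /* a concurrent resize invalidates the read */

        return ret;
}

int main(void)
{
        printf("pfn 100:  outside=%d\n", pfn_outside_span(100));
        printf("pfn 4200: outside=%d\n", pfn_outside_span(4200));
        return 0;
}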
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
        set_page_count(page, 0);
        reset_page_mapcount(page);
        page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
                struct page *p = page + i;
 
                SetPageCompound(p);
-               p->private = (unsigned long)page;
+               set_page_private(p, (unsigned long)page);
        }
 }
 
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
                if (!PageCompound(p))
                        bad_page(__FUNCTION__, page);
-               if (p->private != (unsigned long)page)
+               if (page_private(p) != (unsigned long)page)
                        bad_page(__FUNCTION__, page);
                ClearPageCompound(p);
        }
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-       return page->private;
+       return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-       page->private = order;
+       set_page_private(page, order);
        __SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
        __ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
 }
 
 /*
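Every direct page->private access in the hunks above and below now goes through page_private()/set_page_private(). A standalone toy model of that accessor pattern follows (the struct here is not the kernel's struct page); hiding the raw field behind one macro pair means the storage can later be moved or overlaid without touching the buddy-allocator callers.

#include <stdio.h>

struct page {
        unsigned long private;             /* buddy order, compound head, etc. */
};

/* accessor pair: callers never touch ->private directly */
#define page_private(page)              ((page)->private)
#define set_page_private(page, v)       ((page)->private = (v))

static unsigned long page_order(struct page *page)
{
        return page_private(page);
}

static void set_page_order(struct page *page, int order)
{
        set_page_private(page, order);
}

int main(void)
{
        struct page p;

        set_page_order(&p, 3);
        printf("order=%lu\n", page_order(&p));
        return 0;
}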
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page) &&
            (page_order(page) == order) &&
-           !PageReserved(page) &&
            page_count(page) == 0)
                return 1;
        return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other. That is, if we allocate a small block, and both were
  * free, the remainder of the region must be split into blocks.
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(function, page);
        if (PageDirty(page))
                __ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(__FUNCTION__, page);
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
                        1 << PG_checked | 1 << PG_mappedtodisk);
-       page->private = 0;
+       set_page_private(page, 0);
        set_page_refs(page, order);
        kernel_map_pages(page, 1 << order, 1);
 }
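bad_page(), free_pages_check() and prep_new_page() now all include PG_reserved in their "must not be set" masks. The check itself is plain bit arithmetic: each PG_* constant is a bit index into page->flags, so OR-ing the shifted bits builds a single mask testable with one AND. A toy sketch with made-up flag indices rather than the kernel's:

#include <stdio.h>

/* toy flag indices, not the kernel's PG_* values */
enum { PG_lru, PG_private, PG_writeback, PG_reserved };

static int flags_bad(unsigned long flags)
{
        unsigned long bad_mask = 1UL << PG_lru |
                                 1UL << PG_private |
                                 1UL << PG_writeback |
                                 1UL << PG_reserved;

        return (flags & bad_mask) != 0;    /* any of these set => illegal page state */
}

int main(void)
{
        printf("%d\n", flags_bad(0));                      /* 0: clean page */
        printf("%d\n", flags_bad(1UL << PG_reserved));     /* 1: reserved page slipped in */
        return 0;
}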
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_page(page);
                else
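With the PageReserved special case dropped, __free_pages() relies on reference counting alone: only the caller that drops the last reference actually frees the page. A standalone model of that put_page_testzero() gate, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct fake_page {
        atomic_int count;                  /* stand-in for the page refcount */
};

/* drop one reference; return 1 only if it was the last one */
static int put_page_testzero(struct fake_page *page)
{
        return atomic_fetch_sub(&page->count, 1) == 1;
}

static void free_pages_model(struct fake_page *page)
{
        if (put_page_testzero(page))
                printf("last reference dropped: page returned to the allocator\n");
        else
                printf("still referenced elsewhere: not freed\n");
}

int main(void)
{
        struct fake_page p = { .count = 2 };

        free_pages_model(&p);              /* count 2 -> 1: not freed */
        free_pages_model(&p);              /* count 1 -> 0: freed */
        return 0;
}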
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
                } else
                        printk("\n");
 
-               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+               for_each_cpu(cpu) {
                        struct per_cpu_pageset *pageset;
 
-                       if (!cpu_possible(cpu))
-                               continue;
-
                        pageset = zone_pcp(zone, cpu);
 
                        for (temperature = 0; temperature < 2; temperature++)
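for_each_cpu() in this kernel generation walks the possible-CPU map, so the open-coded NR_CPUS loop plus cpu_possible() filter collapses into one macro. A userspace sketch of that iterate-and-skip macro shape (the map and macro name below are invented for illustration):

#include <stdio.h>

#define MAX_CPUS 8
static const int cpu_possible_map[MAX_CPUS] = { 1, 1, 1, 0, 1, 0, 0, 0 };

/* loop over possible CPUs only; the skip lives inside the macro */
#define for_each_possible_cpu(cpu)                              \
        for ((cpu) = 0; (cpu) < MAX_CPUS; (cpu)++)              \
                if (!cpu_possible_map[(cpu)]) continue; else

int main(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                printf("cpu %d is possible\n", cpu);
        return 0;
}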
@@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn)
 {
        struct page *page;
@@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        continue;
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
@@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
 
        /*
         * The per-cpu-pages pools are set to around 1000th of the
-        * size of the zone. But no more than 1/4 of a meg - there's
-        * no point in going beyond the size of L2 cache.
+        * size of the zone. But no more than 1/2 of a meg.
         *
         * OK, so we don't know how big the cache is. So guess.
         */
        batch = zone->present_pages / 1024;
-       if (batch * PAGE_SIZE > 256 * 1024)
-               batch = (256 * 1024) / PAGE_SIZE;
+       if (batch * PAGE_SIZE > 512 * 1024)
+               batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;             /* We effectively *= 4 below */
        if (batch < 1)
                batch = 1;
 
        /*
-        * Clamp the batch to a 2^n - 1 value. Having a power
-        * of 2 value was found to be more likely to have
-        * suboptimal cache aliasing properties in some cases.
+        * We will be trying to allocate bigger chunks of contiguous
+        * memory of the order of fls(batch). This should result in
+        * better cache coloring.
         *
-        * For example if 2 tasks are alternately allocating
-        * batches of pages, one task can end up with a lot
-        * of pages of one half of the possible page colors
-        * and the other with pages of the other colors.
+        * A sanity check also to ensure that batch is still in limits.
         */
-       batch = (1 << fls(batch + batch/2)) - 1;
+       batch = (1 << fls(batch + batch/2));
+
+       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
        return batch;
 }
 
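The reworked zone_batchsize() raises the per-cpu pool cap to 1/2 MB, rounds the batch up to a power of two instead of clamping it to 2^n - 1, and adds an upper sanity bound. The arithmetic is easy to check in userspace; the sketch below mirrors the new math with typical x86 constants hard-coded (PAGE_SIZE 4096, MAX_ORDER 11) and a local fls() stand-in:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_SHIFT      12
#define MAX_ORDER       11

/* fls(): index of the most significant set bit, 1-based; 0 for x == 0 */
static int fls(unsigned long x)
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

static unsigned long zone_batchsize(unsigned long present_pages)
{
        unsigned long batch = present_pages / 1024;

        if (batch * PAGE_SIZE > 512 * 1024)        /* cap the pool at ~1/2 MB */
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;                                /* effectively *= 4 below */
        if (batch < 1)
                batch = 1;

        /* round up to a power of two for better cache coloring */
        batch = 1UL << fls(batch + batch / 2);

        if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
                batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);

        return batch;
}

int main(void)
{
        printf("32768-page (128MB) zone -> batch %lu\n", zone_batchsize(32768));
        printf("1M-page (4GB) zone      -> batch %lu\n", zone_batchsize(1048576));
        return 0;
}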
@@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
        pcp = &p->pcp[0];               /* hot */
        pcp->count = 0;
-       pcp->low = 2 * batch;
+       pcp->low = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
        INIT_LIST_HEAD(&pcp->list);
@@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
        pcp->count = 0;
        pcp->low = 0;
        pcp->high = 2 * batch;
-       pcp->batch = max(1UL, 1 * batch);
+       pcp->batch = max(1UL, batch/2);
        INIT_LIST_HEAD(&pcp->list);
 }
 
@@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset()
 
 #endif
 
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+       int i;
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       /*
+        * The per-page waitqueue mechanism uses hashed waitqueues
+        * per zone.
+        */
+       zone->wait_table_size = wait_table_size(zone_size_pages);
+       zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+       zone->wait_table = (wait_queue_head_t *)
+               alloc_bootmem_node(pgdat, zone->wait_table_size
+                                       * sizeof(wait_queue_head_t));
+
+       for(i = 0; i < zone->wait_table_size; ++i)
+               init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+       int cpu;
+       unsigned long batch = zone_batchsize(zone);
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+               /* Early boot. Slab allocator not functional yet */
+               zone->pageset[cpu] = &boot_pageset[cpu];
+               setup_pageset(&boot_pageset[cpu],0);
+#else
+               setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+       }
+       printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+               zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+               unsigned long zone_start_pfn, unsigned long size)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       zone_wait_table_init(zone, size);
+       pgdat->nr_zones = zone_idx(zone) + 1;
+
+       zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+       zone->zone_start_pfn = zone_start_pfn;
+
+       memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+       zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
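zone_wait_table_init() above gives each zone one hashed array of waitqueue heads: wait_table_size() picks a power-of-two size scaled to the zone, wait_table_bits() is its log2, and a page hashes to a bucket with a multiply, shift and mask. The sizing policy and hash in the sketch below are illustrative stand-ins, not the kernel's exact wait_table_size()/page_waitqueue() code:

#include <stdio.h>

/* one waitqueue bucket per ~256 pages, power of two, capped: a stand-in policy */
static unsigned long wait_table_size(unsigned long zone_pages)
{
        unsigned long size = 1;

        while (size < zone_pages / 256 && size < 4096)
                size <<= 1;
        return size;
}

static unsigned wait_table_bits(unsigned long size)
{
        unsigned bits = 0;

        while ((1UL << bits) < size)
                bits++;
        return bits;
}

/* hash a page frame number into a bucket index with multiply + mask */
static unsigned long page_to_bucket(unsigned long pfn, unsigned long size)
{
        return (pfn * 2654435761UL) & (size - 1);   /* Knuth multiplicative hash */
}

int main(void)
{
        unsigned long pages = 1UL << 20;            /* a 4GB zone with 4K pages */
        unsigned long size = wait_table_size(pages);

        printf("table size %lu (%u bits)\n", size, wait_table_bits(size));
        printf("pfn 12345 -> bucket %lu\n", page_to_bucket(12345, size));
        return 0;
}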
@@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset()
 static void __init free_area_init_core(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
 {
-       unsigned long i, j;
-       int cpu, nid = pgdat->node_id;
+       unsigned long j;
+       int nid = pgdat->node_id;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
+       pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
        pgdat->kswapd_max_order = 0;
@@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize;
-               unsigned long batch;
 
                realsize = size = zones_size[j];
                if (zholes_size)
@@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                zone->name = zone_names[j];
                spin_lock_init(&zone->lock);
                spin_lock_init(&zone->lru_lock);
+               zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
 
                zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-               batch = zone_batchsize(zone);
-
-               for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-                       /* Early boot. Slab allocator not functional yet */
-                       zone->pageset[cpu] = &boot_pageset[cpu];
-                       setup_pageset(&boot_pageset[cpu],0);
-#else
-                       setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-               }
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-                               zone_names[j], realsize, batch);
+               zone_pcp_init(zone);
                INIT_LIST_HEAD(&zone->active_list);
                INIT_LIST_HEAD(&zone->inactive_list);
                zone->nr_scan_active = 0;
@@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                if (!size)
                        continue;
 
-               /*
-                * The per-page waitqueue mechanism uses hashed waitqueues
-                * per zone.
-                */
-               zone->wait_table_size = wait_table_size(size);
-               zone->wait_table_bits =
-                       wait_table_bits(zone->wait_table_size);
-               zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, zone->wait_table_size
-                                               * sizeof(wait_queue_head_t));
-
-               for(i = 0; i < zone->wait_table_size; ++i)
-                       init_waitqueue_head(zone->wait_table + i);
-
-               pgdat->nr_zones = j+1;
-
-               zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-               zone->zone_start_pfn = zone_start_pfn;
-
-               memmap_init(size, nid, j, zone_start_pfn);
-
                zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+               init_currently_empty_zone(zone, zone_start_pfn, size);
                zone_start_pfn += size;
-
-               zone_init_free_lists(pgdat, zone, zone->spanned_pages);
        }
 }
 
@@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
  * that the pages_{min,low,high} values for each zone are set correctly
  * with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long lowmem_pages = 0;