Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c | 423 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 356 insertions(+), 67 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-	page->flags &= ~(1 << PG_private |
+	page->flags &= ~(1 << PG_lru |
+			1 << PG_private |
 			1 << PG_locked |
-			1 << PG_lru |
 			1 << PG_active |
 			1 << PG_dirty |
+			1 << PG_reclaim |
+			1 << PG_slab |
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
  */
 static void prep_new_page(struct page *page, int order)
 {
-	if (page->mapping || page_mapcount(page) ||
-			(page->flags & (
+	if (	page_mapcount(page) ||
+		page->mapping != NULL ||
+		page_count(page) != 0 ||
+		(page->flags & (
+			1 << PG_lru |
 			1 << PG_private |
 			1 << PG_locked |
-			1 << PG_lru |
 			1 << PG_active |
 			1 << PG_dirty |
 			1 << PG_reclaim |
+			1 << PG_slab |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	return allocated;
 }
 
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+	struct zone *zone;
+	int i;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+
+		/* Do not drain local pagesets */
+		if (zone->zone_pgdat->node_id == numa_node_id())
+			continue;
+
+		pset = zone->pageset[smp_processor_id()];
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &pset->pcp[i];
+			if (pcp->count)
+				pcp->count -= free_pages_bulk(zone, pcp->count,
+						&pcp->list, 0);
+		}
+	}
+	local_irq_restore(flags);
+}
+#endif
+
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
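drain_remote_pages() runs with interrupts disabled and flushes only pagesets whose pages belong to other nodes; the local node's hot and cold caches are deliberately left intact. It is meant to be driven by the slab reaper's periodic timer. The actual hook lands in mm/slab.c in a companion patch, so the sketch below illustrates that call site rather than reproducing this diff:

static void cache_reap(void *unused)
{
#ifdef CONFIG_NUMA
	/* Return pages cached on this CPU but owned by remote NUMA
	 * nodes to their home node's buddy lists. */
	drain_remote_pages();
#endif
	/* ... existing slab reaping work ... */
}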
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
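The zone_pcp() accessor that replaces the open-coded &zone->pageset[cpu] references comes from a companion include/linux/mmzone.h change. Under CONFIG_NUMA, struct zone holds an array of pointers to node-local, kmalloc'd pagesets; otherwise it keeps the old embedded array. Roughly (from the companion patch, not this file):

#ifdef CONFIG_NUMA
#define zone_pcp(__z, __cpu)	((__z)->pageset[(__cpu)])
#else
#define zone_pcp(__z, __cpu)	(&(__z)->pageset[(__cpu)])
#endif

Hiding the extra indirection behind one macro is what lets the rest of this patch convert call sites mechanically.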
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z,cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+	if (!z->reclaim_pages)
+		return 0;
+	if (gfp_mask & __GFP_NORECLAIM)
+		return 0;
+	return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
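should_reclaim_zone() consults a new per-zone reclaim_pages switch and a new __GFP_NORECLAIM flag, both introduced by companion patches (mmzone.h and gfp.h respectively). A caller that would rather fall back to the next zone than stall on synchronous, zone-local reclaim opts out as in this hypothetical example (not taken from this diff):

static struct page *speculative_alloc(void)
{
	/* Speculative (e.g. readahead-style) allocation: losing node
	 * locality is cheaper than waiting for early zone reclaim. */
	return alloc_pages(GFP_HIGHUSER | __GFP_NORECLAIM, 0);
}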
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0))
-			continue;
+		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
 		if (!cpuset_zone_allowed(z))
 			continue;
 
+		/*
+		 * If the zone is to attempt early page reclaim then this loop
+		 * will try to reclaim pages and check the watermark a second
+		 * time before giving up and falling back to the next zone.
+		 */
+zone_reclaim_retry:
+		if (!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, 0, 0)) {
+			if (!do_reclaim)
+				continue;
+			else {
+				zone_reclaim(z, gfp_mask, order);
+				/* Only try reclaim once */
+				do_reclaim = 0;
+				goto zone_reclaim_retry;
+			}
+		}
+
 		page = buffered_rmqueue(z, order, gfp_mask);
 		if (page)
 			goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
 			" order:%d, mode:0x%x\n",
 			p->comm, order, gfp_mask);
 		dump_stack();
+		show_mem();
 	}
 	return NULL;
 got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 }
 
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
 	void* ptr;
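The widening of the offset parameter in both accessors matches how callers generate it: the wrapper macros in include/linux/page-flags.h pass offsetof() results, and offsetof() has type size_t, i.e. unsigned long on 64-bit. Approximately, for context (these wrappers live outside this diff):

#define read_page_state(member) \
	__read_page_state(offsetof(struct page_state, member))
#define mod_page_state(member, delta) \
	__mod_page_state(offsetof(struct page_state, member), (delta))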
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
 			if (!cpu_possible(cpu))
 				continue;
 
-			pageset = zone->pageset + cpu;
+			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d\n",
+				printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
 					cpu,
 					temperature ? "cold" : "hot",
 					pageset->pcp[temperature].low,
 					pageset->pcp[temperature].high,
-					pageset->pcp[temperature].batch);
+					pageset->pcp[temperature].batch,
+					pageset->pcp[temperature].count);
 		}
 	}
 
 	get_page_state(&ps);
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+	printk("Free pages: %11ukB (%ukB HighMem)\n",
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
 
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone. But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is. So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << fls(batch + batch/2)) - 1;
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->low = 2 * batch;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold*/
+	pcp->count = 0;
+	pcp->low = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ * These will be discarded when bootup is complete.
+ */
+static struct per_cpu_pageset
+	boot_pageset[NR_CPUS] __initdata;
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone->pageset[cpu])
+			goto bad;
+
+		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(dzone->pageset[cpu]);
+		dzone->pageset[cpu] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
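The arithmetic in zone_batchsize() is easiest to follow with concrete numbers. Below is a standalone userspace model for a 1 GiB zone of 4 KiB pages; fls() is reimplemented because it is a kernel helper, and all names and values are illustrative:

#include <stdio.h>

/* 1-based index of the highest set bit; mirrors the kernel's fls(). */
static int fls(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long present_pages = 262144;	/* 1 GiB / 4 KiB */
	long page_size = 4096;
	int batch;

	batch = present_pages / 1024;			/* 256 */
	if (batch * page_size > 256 * 1024)
		batch = (256 * 1024) / page_size;	/* 64: the L2 cap */
	batch /= 4;					/* 16 */
	if (batch < 1)
		batch = 1;

	/* Clamp to 2^n - 1: fls(16 + 16/2) = fls(24) = 5, so 31. */
	batch = (1 << fls(batch + batch / 2)) - 1;

	printf("batch=%d hot: low=%d high=%d\n", batch, 2 * batch, 6 * batch);
	return 0;	/* prints: batch=31 hot: low=62 high=186 */
}

So any reasonably large zone ends up with a batch of 31 pages and a hot-list working range of 62 to 186 pages, which is exactly what setup_pageset() derives from the same batch value.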
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone. But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is. So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+			/* Early boot. Slab allocator not functional yet */
+			zone->pageset[cpu] = &boot_pageset[cpu];
+			setup_pageset(&boot_pageset[cpu],0);
+#else
+			setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
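In the NUMA branch above, every zone's pageset pointer is aimed at the static boot_pageset and initialized with a batch of 0, because kmalloc_node() is not available this early. Feeding 0 into setup_pageset() makes the pageset pass-through, exactly as the boot_pageset comment promises: high becomes 0, so the `pcp->count >= pcp->high` test in free_hot_cold_page() fires on the very first queued page, and max(1UL, ...) keeps the bulk size at one page. A small userspace model of that arithmetic (names are illustrative):

#include <stdio.h>

struct pcp_model {
	int count, low, high, batch;
};

/* Mirrors setup_pageset()'s hot-list watermark arithmetic. */
static void model_setup(struct pcp_model *hot, unsigned long batch)
{
	hot->count = 0;
	hot->low = 2 * batch;
	hot->high = 6 * batch;
	hot->batch = batch > 1 ? (int)batch : 1;	/* max(1UL, batch) */
}

int main(void)
{
	struct pcp_model hot;

	model_setup(&hot, 0);	/* the boot pageset case */
	/* First free: count becomes 1, and 1 >= high (0), so the page
	 * is immediately handed over to the buddy list. */
	printf("boot pageset: low=%d high=%d batch=%d\n",
	       hot.low, hot.high, hot.batch);
	return 0;
}

Once the slab allocator is up, setup_per_cpu_pageset() (added earlier in this patch) replaces these placeholders with node-local pagesets sized by zone_batchsize().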
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, -1);
 		if (!size)
 			continue;
 
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };
 
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+		int i;
+
+		if (!zone->present_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+		seq_printf(m,
+			   "\n  pages free     %lu"
+			   "\n        min      %lu"
+			   "\n        low      %lu"
+			   "\n        high     %lu"
+			   "\n        active   %lu"
+			   "\n        inactive %lu"
+			   "\n        scanned  %lu (a: %lu i: %lu)"
+			   "\n        spanned  %lu"
+			   "\n        present  %lu",
+			   zone->free_pages,
+			   zone->pages_min,
+			   zone->pages_low,
+			   zone->pages_high,
+			   zone->nr_active,
+			   zone->nr_inactive,
+			   zone->pages_scanned,
+			   zone->nr_scan_active, zone->nr_scan_inactive,
+			   zone->spanned_pages,
+			   zone->present_pages);
+		seq_printf(m,
+			   "\n        protection: (%lu",
+			   zone->lowmem_reserve[0]);
+		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m,
+			   ")"
+			   "\n  pagesets");
+		for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+			struct per_cpu_pageset *pageset;
+			int j;
+
+			pageset = zone_pcp(zone, i);
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				if (pageset->pcp[j].count)
+					break;
+			}
+			if (j == ARRAY_SIZE(pageset->pcp))
+				continue;
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				seq_printf(m,
+					   "\n    cpu: %i pcp: %i"
+					   "\n              count: %i"
+					   "\n              low:   %i"
+					   "\n              high:  %i"
+					   "\n              batch: %i",
+					   i, j,
+					   pageset->pcp[j].count,
+					   pageset->pcp[j].low,
+					   pageset->pcp[j].high,
+					   pageset->pcp[j].batch);
+			}
+#ifdef CONFIG_NUMA
+			seq_printf(m,
+				   "\n            numa_hit:       %lu"
+				   "\n            numa_miss:      %lu"
+				   "\n            numa_foreign:   %lu"
+				   "\n            interleave_hit: %lu"
+				   "\n            local_node:     %lu"
+				   "\n            other_node:     %lu",
+				   pageset->numa_hit,
+				   pageset->numa_miss,
+				   pageset->numa_foreign,
+				   pageset->interleave_hit,
+				   pageset->local_node,
+				   pageset->other_node);
+#endif
+		}
+		seq_printf(m,
+			   "\n  all_unreclaimable: %u"
+			   "\n  prev_priority:     %i"
+			   "\n  temp_priority:     %i"
+			   "\n  start_pfn:         %lu",
+			   zone->all_unreclaimable,
+			   zone->prev_priority,
+			   zone->temp_priority,
+			   zone->zone_start_pfn);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+	.start	= frag_start, /* iterate over all zones. The same as in
+			       * fragmentation. */
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= zoneinfo_show,
+};
+
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
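zoneinfo_show() reuses the fragmentation iterator (frag_start/frag_next/frag_stop) and only supplies a new ->show() callback; a companion fs/proc change registers zoneinfo_op, conventionally exposed as /proc/zoneinfo. A minimal userspace reader to eyeball the new per-zone and per-cpu pageset output (assuming the usual proc mount point):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/zoneinfo", "r");

	if (!f) {
		perror("fopen /proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}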
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void)
 			min_pages = 128;
 		zone->pages_min = min_pages;
 	} else {
-		/* if it's a lowmem zone, reserve a number of pages 
+		/* if it's a lowmem zone, reserve a number of pages
 		 * proportionate to the zone's size.
 		 */
-		zone->pages_min = (pages_min * zone->present_pages) / 
+		zone->pages_min = (pages_min * zone->present_pages) /
 				lowmem_pages;
 	}
 