Diffstat (limited to 'mm/page_alloc.c')

 mm/page_alloc.c | 294 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 202 insertions(+), 92 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..327516b7aee9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 
+#include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
 
+/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
+static DEFINE_MUTEX(pcp_batch_high_lock);
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 };
 EXPORT_SYMBOL(node_states);
 
+/* Protect totalram_pages and zone->managed_pages */
+static DEFINE_SPINLOCK(managed_page_count_lock);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 /*
@@ -739,14 +746,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-/*
- * Read access to zone->managed_pages is safe because it's unsigned long,
- * but we still need to serialize writers. Currently all callers of
- * __free_pages_bootmem() except put_page_bootmem() should only be used
- * at boot time. So for shorter boot time, we shift the burden to
- * put_page_bootmem() to serialize writers.
- */
-void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
+void __init __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 	unsigned int loop;
@@ -781,11 +781,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
 	set_page_refcounted(page);
 	set_pageblock_migratetype(page, MIGRATE_CMA);
 	__free_pages(page, pageblock_order);
-	totalram_pages += pageblock_nr_pages;
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages += pageblock_nr_pages;
-#endif
+	adjust_managed_page_count(page, pageblock_nr_pages);
 }
 #endif
 
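Note on the hunk above: the open-coded totalram_pages/totalhigh_pages updates are folded into adjust_managed_page_count(), which this patch introduces further down (see the @@ -5150 hunk) and which also keeps zone->managed_pages in sync under managed_page_count_lock. A hedged caller sketch (everything except adjust_managed_page_count() itself is illustrative, not patch code):

	/* returning a pageblock to the buddy allocator after this patch */
	static void example_return_pageblock(struct page *page)
	{
		set_page_refcounted(page);
		__free_pages(page, pageblock_order);
		/* one call now updates zone->managed_pages, totalram_pages,
		 * and (for highmem pages) totalhigh_pages consistently */
		adjust_managed_page_count(page, pageblock_nr_pages);
	}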
@@ -1179,10 +1175,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	int to_drain;
+	unsigned long batch;
 
 	local_irq_save(flags);
-	if (pcp->count >= pcp->batch)
-		to_drain = pcp->batch;
+	batch = ACCESS_ONCE(pcp->batch);
+	if (pcp->count >= batch)
+		to_drain = batch;
 	else
 		to_drain = pcp->count;
 	if (to_drain > 0) {
@@ -1350,8 +1348,9 @@ void free_hot_cold_page(struct page *page, int cold)
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pcppages_bulk(zone, pcp->batch, pcp);
-		pcp->count -= pcp->batch;
+		unsigned long batch = ACCESS_ONCE(pcp->batch);
+		free_pcppages_bulk(zone, batch, pcp);
+		pcp->count -= batch;
 	}
 
 out:
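Note on the two ACCESS_ONCE() hunks above: pcp->batch can now change asynchronously via pageset_update() (added below), so each path must read it exactly once. An illustrative version of the race being closed, not code from the patch:

	if (pcp->count >= pcp->high) {
		/* without a single snapshot the compiler may reload pcp->batch: */
		free_pcppages_bulk(zone, pcp->batch, pcp);	/* sees value A */
		pcp->count -= pcp->batch;			/* may see value B */
		/* pcp->count now drifts by (A - B) relative to the list */
	}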
@@ -2839,7 +2838,7 @@ EXPORT_SYMBOL(free_pages_exact);
  * nr_free_zone_pages() counts the number of pages which are beyond the
  * high watermark within all zones at or below a given zone index. For each
  * zone, the number of pages is calculated as:
- *     present_pages - high_pages
+ *     managed_pages - high_pages
  */
 static unsigned long nr_free_zone_pages(int offset)
 {
@@ -2906,9 +2905,13 @@ EXPORT_SYMBOL(si_meminfo);
 #ifdef CONFIG_NUMA
 void si_meminfo_node(struct sysinfo *val, int nid)
 {
+	int zone_type;		/* needs to be signed */
+	unsigned long managed_pages = 0;
 	pg_data_t *pgdat = NODE_DATA(nid);
 
-	val->totalram = pgdat->node_present_pages;
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+		managed_pages += pgdat->node_zones[zone_type].managed_pages;
+	val->totalram = managed_pages;
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
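Several hunks in this patch (si_meminfo_node(), nr_free_zone_pages(), default_zonelist_order()) switch their accounting from present_pages to managed_pages. The distinction, paraphrased from the zone counter comment in include/linux/mmzone.h:

	/*
	 * spanned_pages = zone_end_pfn - zone_start_pfn
	 * present_pages = spanned_pages - absent_pages(pages in holes)
	 * managed_pages = present_pages - reserved_pages (memmap, bootmem, ...)
	 */

managed_pages is therefore the right basis for any calculation about pages the buddy allocator can actually hand out.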
@@ -3250,18 +3253,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 	static DEFINE_MUTEX(zl_order_mutex);
 
 	mutex_lock(&zl_order_mutex);
-	if (write)
-		strcpy(saved_string, (char*)table->data);
+	if (write) {
+		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
+			ret = -EINVAL;
+			goto out;
+		}
+		strcpy(saved_string, (char *)table->data);
+	}
 	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
 		goto out;
 	if (write) {
 		int oldval = user_zonelist_order;
-		if (__parse_numa_zonelist_order((char*)table->data)) {
+
+		ret = __parse_numa_zonelist_order((char *)table->data);
+		if (ret) {
 			/*
 			 * bogus value.  restore saved string
 			 */
-			strncpy((char*)table->data, saved_string,
+			strncpy((char *)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
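The new length check above closes a write into a fixed-size buffer: saved_string is a local array declared earlier in this function, and table->data is userspace-controlled via /proc/sys/vm/numa_zonelist_order, so an unchecked strcpy() could overflow it. Overlong input is now rejected with -EINVAL before the copy, and the result of __parse_numa_zonelist_order() is propagated to the caller instead of being dropped. What the guard prevents, schematically (NUMA_ZONELIST_ORDER_LEN is 16 in this era's mmzone.h):

	char saved_string[NUMA_ZONELIST_ORDER_LEN];	/* 16-byte stack buffer */
	strcpy(saved_string, (char *)table->data);	/* data may be longer  */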
@@ -3425,8 +3435,8 @@ static int default_zonelist_order(void)
 			z = &NODE_DATA(nid)->node_zones[zone_type];
 			if (populated_zone(z)) {
 				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
+					low_kmem_size += z->managed_pages;
+				total_size += z->managed_pages;
 			} else if (zone_type == ZONE_NORMAL) {
 				/*
 				 * If any node has only lowmem, then node order
@@ -3705,12 +3715,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 		mminit_verify_zonelist();
 		cpuset_init_current_mems_allowed();
 	} else {
-		/* we have to stop all cpus to guarantee there is no user
-		   of zonelist */
 #ifdef CONFIG_MEMORY_HOTPLUG
 		if (zone)
 			setup_zone_pageset(zone);
 #endif
+		/* we have to stop all cpus to guarantee there is no user
+		   of zonelist */
 		stop_machine(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
@@ -4032,7 +4042,40 @@ static int __meminit zone_batchsize(struct zone *zone)
 #endif
 }
 
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher than ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (according to the above rule).
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+		unsigned long batch)
+{
+	/* start with a fail safe value for batch */
+	pcp->batch = 1;
+	smp_wmb();
+
+	/* Update high, then batch, in order */
+	pcp->high = high;
+	smp_wmb();
+
+	pcp->batch = batch;
+}
+
+/* a companion to pageset_set_high() */
+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+}
+
+static void pageset_init(struct per_cpu_pageset *p)
 {
 	struct per_cpu_pages *pcp;
 	int migratetype;
@@ -4041,45 +4084,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp;
 	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_init(p);
+	pageset_set_batch(p, batch);
+}
+
 /*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
  * to the value high for the pageset p.
  */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+static void pageset_set_high(struct per_cpu_pageset *p,
 				unsigned long high)
 {
-	struct per_cpu_pages *pcp;
+	unsigned long batch = max(1UL, high / 4);
+	if ((high / 4) > (PAGE_SHIFT * 8))
+		batch = PAGE_SHIFT * 8;
 
-	pcp = &p->pcp;
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
+	pageset_update(&p->pcp, high, batch);
 }
 
-static void __meminit setup_zone_pageset(struct zone *zone)
+static void __meminit pageset_set_high_and_batch(struct zone *zone,
+		struct per_cpu_pageset *pcp)
 {
-	int cpu;
-
-	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	if (percpu_pagelist_fraction)
+		pageset_set_high(pcp,
+			(zone->managed_pages /
+				percpu_pagelist_fraction));
+	else
+		pageset_set_batch(pcp, zone_batchsize(zone));
+}
 
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+static void __meminit zone_pageset_init(struct zone *zone, int cpu)
+{
+	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
 
-		setup_pageset(pcp, zone_batchsize(zone));
+	pageset_init(pcp);
+	pageset_set_high_and_batch(zone, pcp);
+}
 
-	if (percpu_pagelist_fraction)
-		setup_pagelist_highmark(pcp,
-			(zone->managed_pages /
-				percpu_pagelist_fraction));
-	}
+static void __meminit setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	for_each_possible_cpu(cpu)
+		zone_pageset_init(zone, cpu);
 }
 
 /*
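Why pageset_update() writes in that exact order (informal reasoning; the lockless readers are the drain and free paths patched earlier):

	/*
	 *	pcp->batch = 1;		(A) fail-safe value
	 *	smp_wmb();
	 *	pcp->high  = high;	(B)
	 *	smp_wmb();
	 *	pcp->batch = batch;	(C)
	 *
	 * A reader that still observes the old ->high can see ->batch only
	 * as the old value or as 1 (A is ordered before B); a reader that
	 * observes the new ->high sees ->batch as 1 or the new value
	 * (B is ordered before C). In neither window does ->batch exceed
	 * the ->high it is paired with, which is the "->batch must never
	 * be higher than ->high" rule the comment states.
	 */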
@@ -5150,35 +5203,101 @@ early_param("movablecore", cmdline_parse_movablecore);
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
-unsigned long free_reserved_area(unsigned long start, unsigned long end,
-				 int poison, char *s)
+void adjust_managed_page_count(struct page *page, long count)
+{
+	spin_lock(&managed_page_count_lock);
+	page_zone(page)->managed_pages += count;
+	totalram_pages += count;
+#ifdef CONFIG_HIGHMEM
+	if (PageHighMem(page))
+		totalhigh_pages += count;
+#endif
+	spin_unlock(&managed_page_count_lock);
+}
+EXPORT_SYMBOL(adjust_managed_page_count);
+
+unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
 {
-	unsigned long pages, pos;
+	void *pos;
+	unsigned long pages = 0;
 
-	pos = start = PAGE_ALIGN(start);
-	end &= PAGE_MASK;
-	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
-		if (poison)
-			memset((void *)pos, poison, PAGE_SIZE);
-		free_reserved_page(virt_to_page((void *)pos));
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+		if ((unsigned int)poison <= 0xFF)
+			memset(pos, poison, PAGE_SIZE);
+		free_reserved_page(virt_to_page(pos));
 	}
 
 	if (pages && s)
-		pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
+		pr_info("Freeing %s memory: %ldK (%p - %p)\n",
 			s, pages << (PAGE_SHIFT - 10), start, end);
 
 	return pages;
 }
+EXPORT_SYMBOL(free_reserved_area);
 
 #ifdef CONFIG_HIGHMEM
 void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
 	totalram_pages++;
+	page_zone(page)->managed_pages++;
 	totalhigh_pages++;
 }
 #endif
 
+
+void __init mem_init_print_info(const char *str)
+{
+	unsigned long physpages, codesize, datasize, rosize, bss_size;
+	unsigned long init_code_size, init_data_size;
+
+	physpages = get_num_physpages();
+	codesize = _etext - _stext;
+	datasize = _edata - _sdata;
+	rosize = __end_rodata - __start_rodata;
+	bss_size = __bss_stop - __bss_start;
+	init_data_size = __init_end - __init_begin;
+	init_code_size = _einittext - _sinittext;
+
+	/*
+	 * Detect special cases and adjust section sizes accordingly:
+	 * 1) .init.* may be embedded into .data sections
+	 * 2) .init.text.* may be out of [__init_begin, __init_end],
+	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
+	 * 3) .rodata.* may be embedded into .text or .data sections.
+	 */
+#define adj_init_size(start, end, size, pos, adj) \
+	if (start <= pos && pos < end && size > adj) \
+		size -= adj;
+
+	adj_init_size(__init_begin, __init_end, init_data_size,
+		     _sinittext, init_code_size);
+	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+	printk("Memory: %luK/%luK available "
+	       "(%luK kernel code, %luK rwdata, %luK rodata, "
+	       "%luK init, %luK bss, %luK reserved"
+#ifdef CONFIG_HIGHMEM
+	       ", %luK highmem"
+#endif
+	       "%s%s)\n",
+	       nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
+	       codesize >> 10, datasize >> 10, rosize >> 10,
+	       (init_data_size + init_code_size) >> 10, bss_size >> 10,
+	       (physpages - totalram_pages) << (PAGE_SHIFT-10),
+#ifdef CONFIG_HIGHMEM
+	       totalhigh_pages << (PAGE_SHIFT-10),
+#endif
+	       str ? ", " : "", str ? str : "");
+}
+
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
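Two behavioural notes on the hunk above. First, free_reserved_area() now takes void * endpoints and treats poison as a fill byte only when it fits in one: the (unsigned int)poison <= 0xFF test makes 0 a valid fill value, while a negative poison (e.g. -1) skips the memset() entirely, something the old if (poison) test could not express. Second, adjust_managed_page_count() and free_reserved_area() are exported so memory hotplug and balloon-style drivers can keep the managed-page counters consistent. A hedged usage sketch (the linker-script markers are the standard ones; -1 means "no poisoning"):

	extern char __init_begin[], __init_end[];

	unsigned long freed = free_reserved_area(__init_begin, __init_end,
						 -1, "unused kernel");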
@@ -5540,7 +5659,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
  * can have before it gets flushed back to buddy allocator.
  */
-
 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -5551,14 +5669,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (!write || (ret < 0))
 		return ret;
+
+	mutex_lock(&pcp_batch_high_lock);
 	for_each_populated_zone(zone) {
-		for_each_possible_cpu(cpu) {
-			unsigned long high;
-			high = zone->managed_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(
-				per_cpu_ptr(zone->pageset, cpu), high);
-		}
+		unsigned long high;
+		high = zone->managed_pages / percpu_pagelist_fraction;
+		for_each_possible_cpu(cpu)
+			pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
+					 high);
 	}
+	mutex_unlock(&pcp_batch_high_lock);
 	return 0;
 }
 
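Note that high depends only on the zone, so its computation is hoisted out of the per-CPU loop, and the whole update now runs under pcp_batch_high_lock, satisfying the single-updater rule pageset_update() documents. Any future runtime tuner of the pcp limits should follow the same pattern (hypothetical sketch; the function name is not from the patch):

	static void example_retune_zone(struct zone *zone, unsigned long high)
	{
		int cpu;

		mutex_lock(&pcp_batch_high_lock);	/* serialize vs. other updaters */
		for_each_possible_cpu(cpu)
			pageset_set_high(per_cpu_ptr(zone->pageset, cpu), high);
		mutex_unlock(&pcp_batch_high_lock);
	}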
@@ -6047,32 +6167,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit __zone_pcp_update(void *data)
-{
-	struct zone *zone = data;
-	int cpu;
-	unsigned long batch = zone_batchsize(zone), flags;
-
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		pset = per_cpu_ptr(zone->pageset, cpu);
-		pcp = &pset->pcp;
-
-		local_irq_save(flags);
-		if (pcp->count > 0)
-			free_pcppages_bulk(zone, pcp->count, pcp);
-		drain_zonestat(zone, pset);
-		setup_pageset(pset, batch);
-		local_irq_restore(flags);
-	}
-	return 0;
-}
-
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	stop_machine(__zone_pcp_update, zone, NULL);
+	unsigned cpu;
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_possible_cpu(cpu)
+		pageset_set_high_and_batch(zone,
+				per_cpu_ptr(zone->pageset, cpu));
+	mutex_unlock(&pcp_batch_high_lock);
 }
 #endif
 
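The stop_machine() call could be dropped because the new zone_pcp_update() no longer rebuilds or drains the pagesets; it only rewrites ->high/->batch through pageset_update(), whose write ordering the lockless readers already tolerate, so excluding other updaters with pcp_batch_high_lock suffices. Informal view of the hotplug path this serves (call names from mm/memory_hotplug.c of this era, simplified):

	/*
	 *	online_pages()
	 *	  -> zone->managed_pages grows as pages come online
	 *	  -> zone_pcp_update(zone)   retunes pcp limits, no stop_machine()
	 */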
@@ -6142,6 +6248,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
+#ifdef CONFIG_HIGHMEM
+		if (PageHighMem(page))
+			totalhigh_pages -= 1 << order;
+#endif
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);