author    Sage Weil <sage@inktank.com>  2013-08-15 14:11:45 -0400
committer Sage Weil <sage@inktank.com>  2013-08-15 14:11:45 -0400
commit    ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree      e74ee766a4764769ef1d3d45d266b4dea64101d3 /mm/page_alloc.c
parent    fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent    f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 384
1 file changed, 260 insertions(+), 124 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..b100255dedda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>

+#include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"

+/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
+static DEFINE_MUTEX(pcp_batch_high_lock);
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 };
 EXPORT_SYMBOL(node_states);

+/* Protect totalram_pages and zone->managed_pages */
+static DEFINE_SPINLOCK(managed_page_count_lock);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 /*
@@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };

 int min_free_kbytes = 1024;
+int user_min_free_kbytes;

 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }

-/*
- * Read access to zone->managed_pages is safe because it's unsigned long,
- * but we still need to serialize writers. Currently all callers of
- * __free_pages_bootmem() except put_page_bootmem() should only be used
- * at boot time. So for shorter boot time, we shift the burden to
- * put_page_bootmem() to serialize writers.
- */
-void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
+void __init __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 	unsigned int loop;
@@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
 	set_page_refcounted(page);
 	set_pageblock_migratetype(page, MIGRATE_CMA);
 	__free_pages(page, pageblock_order);
-	totalram_pages += pageblock_nr_pages;
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages += pageblock_nr_pages;
-#endif
+	adjust_managed_page_count(page, pageblock_nr_pages);
 }
 #endif

@@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 		 * MIGRATE_CMA areas.
 		 */
 		if (!is_migrate_cma(migratetype) &&
-		    (unlikely(current_order >= pageblock_order / 2) ||
+		    (current_order >= pageblock_order / 2 ||
 		     start_migratetype == MIGRATE_RECLAIMABLE ||
 		     page_group_by_mobility_disabled)) {
 			int pages;
@@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	int to_drain;
+	unsigned long batch;

 	local_irq_save(flags);
-	if (pcp->count >= pcp->batch)
-		to_drain = pcp->batch;
+	batch = ACCESS_ONCE(pcp->batch);
+	if (pcp->count >= batch)
+		to_drain = batch;
 	else
 		to_drain = pcp->count;
 	if (to_drain > 0) {
@@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold)
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pcppages_bulk(zone, pcp->batch, pcp);
-		pcp->count -= pcp->batch;
+		unsigned long batch = ACCESS_ONCE(pcp->batch);
+		free_pcppages_bulk(zone, batch, pcp);
+		pcp->count -= batch;
 	}

 out:
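
The two hunks above make every consumer of pcp->batch take a single ACCESS_ONCE() snapshot, because pageset_update() (added further down in this diff) can change ->batch with no lock held on the read side. A minimal user-space sketch of the same pattern, with pcp_like standing in for the kernel's struct per_cpu_pages (an assumed stand-in, not the real type):

struct pcp_like {
	unsigned long batch;	/* may be rewritten concurrently by an updater */
	unsigned long count;
};

/* Read ->batch exactly once so the comparison and the later use agree. */
static unsigned long pick_drain_count(struct pcp_like *pcp)
{
	unsigned long batch = *(volatile unsigned long *)&pcp->batch;	/* ~ ACCESS_ONCE() */

	return pcp->count >= batch ? batch : pcp->count;
}

Reading pcp->batch twice (once for the comparison, once for the subtraction) could mix two different values if an update races in between; the snapshot removes that window.
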
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact);
  * nr_free_zone_pages() counts the number of pages which are beyond the
  * high watermark within all zones at or below a given zone index. For each
  * zone, the number of pages is calculated as:
- *     present_pages - high_pages
+ *     managed_pages - high_pages
  */
 static unsigned long nr_free_zone_pages(int offset)
 {
@@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo);
 #ifdef CONFIG_NUMA
 void si_meminfo_node(struct sysinfo *val, int nid)
 {
+	int zone_type;		/* needs to be signed */
+	unsigned long managed_pages = 0;
 	pg_data_t *pgdat = NODE_DATA(nid);

-	val->totalram = pgdat->node_present_pages;
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+		managed_pages += pgdat->node_zones[zone_type].managed_pages;
+	val->totalram = managed_pages;
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  * Add all populated zones of a node to the zonelist.
  */
 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
-				int nr_zones, enum zone_type zone_type)
+				int nr_zones)
 {
 	struct zone *zone;
-
-	BUG_ON(zone_type >= MAX_NR_ZONES);
-	zone_type++;
+	enum zone_type zone_type = MAX_NR_ZONES;

 	do {
 		zone_type--;
@@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 				&zonelist->_zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
-
 	} while (zone_type);
+
 	return nr_zones;
 }

@@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 	static DEFINE_MUTEX(zl_order_mutex);

 	mutex_lock(&zl_order_mutex);
-	if (write)
-		strcpy(saved_string, (char*)table->data);
+	if (write) {
+		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
+			ret = -EINVAL;
+			goto out;
+		}
+		strcpy(saved_string, (char *)table->data);
+	}
 	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
 		goto out;
 	if (write) {
 		int oldval = user_zonelist_order;
-		if (__parse_numa_zonelist_order((char*)table->data)) {
+
+		ret = __parse_numa_zonelist_order((char *)table->data);
+		if (ret) {
 			/*
 			 * bogus value.  restore saved string
 			 */
-			strncpy((char*)table->data, saved_string,
+			strncpy((char *)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
@@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 	zonelist = &pgdat->node_zonelists[0];
 	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
 		;
-	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+	j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	zonelist->_zonerefs[j].zone = NULL;
 	zonelist->_zonerefs[j].zone_idx = 0;
 }
@@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
 	struct zonelist *zonelist;

 	zonelist = &pgdat->node_zonelists[1];
-	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	j = build_zonelists_node(pgdat, zonelist, 0);
 	zonelist->_zonerefs[j].zone = NULL;
 	zonelist->_zonerefs[j].zone_idx = 0;
 }
@@ -3425,8 +3433,8 @@ static int default_zonelist_order(void)
 			z = &NODE_DATA(nid)->node_zones[zone_type];
 			if (populated_zone(z)) {
 				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
+					low_kmem_size += z->managed_pages;
+				total_size += z->managed_pages;
 			} else if (zone_type == ZONE_NORMAL) {
 				/*
 				 * If any node has only lowmem, then node order
@@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat)
 	local_node = pgdat->node_id;

 	zonelist = &pgdat->node_zonelists[0];
-	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	j = build_zonelists_node(pgdat, zonelist, 0);

 	/*
 	 * Now we build the zonelist so that it contains the zones
@@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat)
 	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	}
 	for (node = 0; node < local_node; node++) {
 		if (!node_online(node))
 			continue;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
-							MAX_NR_ZONES - 1);
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
 	}

 	zonelist->_zonerefs[j].zone = NULL;
@@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 		mminit_verify_zonelist();
 		cpuset_init_current_mems_allowed();
 	} else {
-		/* we have to stop all cpus to guarantee there is no user
-		   of zonelist */
 #ifdef CONFIG_MEMORY_HOTPLUG
 		if (zone)
 			setup_zone_pageset(zone);
 #endif
+		/* we have to stop all cpus to guarantee there is no user
+		   of zonelist */
 		stop_machine(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
@@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone)
 #endif
 }

-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher than ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (according to the above rule).
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+		unsigned long batch)
+{
+	/* start with a fail safe value for batch */
+	pcp->batch = 1;
+	smp_wmb();
+
+	/* Update high, then batch, in order */
+	pcp->high = high;
+	smp_wmb();
+
+	pcp->batch = batch;
+}
+
+/* a companion to pageset_set_high() */
+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+}
+
+static void pageset_init(struct per_cpu_pageset *p)
 {
 	struct per_cpu_pages *pcp;
 	int migratetype;
@@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)

 	pcp = &p->pcp;
 	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }

+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_init(p);
+	pageset_set_batch(p, batch);
+}
+
 /*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
  * to the value high for the pageset p.
  */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+static void pageset_set_high(struct per_cpu_pageset *p,
 				unsigned long high)
 {
-	struct per_cpu_pages *pcp;
+	unsigned long batch = max(1UL, high / 4);
+	if ((high / 4) > (PAGE_SHIFT * 8))
+		batch = PAGE_SHIFT * 8;

-	pcp = &p->pcp;
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
+	pageset_update(&p->pcp, high, batch);
 }

-static void __meminit setup_zone_pageset(struct zone *zone)
+static void __meminit pageset_set_high_and_batch(struct zone *zone,
+		struct per_cpu_pageset *pcp)
 {
-	int cpu;
-
-	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	if (percpu_pagelist_fraction)
+		pageset_set_high(pcp,
+			(zone->managed_pages /
+				percpu_pagelist_fraction));
+	else
+		pageset_set_batch(pcp, zone_batchsize(zone));
+}

-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+static void __meminit zone_pageset_init(struct zone *zone, int cpu)
+{
+	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);

-		setup_pageset(pcp, zone_batchsize(zone));
+	pageset_init(pcp);
+	pageset_set_high_and_batch(zone, pcp);
+}

-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(pcp,
-				(zone->managed_pages /
-					percpu_pagelist_fraction));
-	}
+static void __meminit setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	for_each_possible_cpu(cpu)
+		zone_pageset_init(zone, cpu);
 }

 /*
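
A hedged note on the two hunks above (commentary, not part of the patch): pageset_update() first drops ->batch to the fail-safe value 1, then publishes the new ->high and only then the new ->batch, with smp_wmb() between the steps, so an updater never has a window where a freshly written large ->batch sits next to a not-yet-written smaller ->high; as the new comment says, lock-free readers must still tolerate the pair changing asynchronously. pageset_set_high() then derives batch = max(1, high / 4), capped at PAGE_SHIFT * 8. A small sketch of that arithmetic, assuming 4 KiB pages (PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages */

/* Mirrors the batch choice in pageset_set_high() (sketch, not kernel code). */
static unsigned long high_to_batch(unsigned long high)
{
	unsigned long batch = (high / 4) ? (high / 4) : 1;	/* max(1UL, high / 4) */

	if ((high / 4) > (PAGE_SHIFT * 8))
		batch = PAGE_SHIFT * 8;
	return batch;
}

int main(void)
{
	printf("high=1000 -> batch=%lu\n", high_to_batch(1000));	/* 250 capped to 96 */
	printf("high=200  -> batch=%lu\n", high_to_batch(200));	/* 50, below the cap */
	return 0;
}
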
@@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
  */
 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *ignored)
 {
-	unsigned long node_start_pfn, node_end_pfn;
 	unsigned long zone_start_pfn, zone_end_pfn;

-	/* Get the start and end of the node and zone */
-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+	/* Get the start and end of the zone */
 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
 	adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 /* Return the number of page frames in holes in a zone on a node */
 static unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *ignored)
 {
 	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
 	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
-	unsigned long node_start_pfn, node_end_pfn;
 	unsigned long zone_start_pfn, zone_end_pfn;

-	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
 	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
 	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);

@@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *zones_size)
 {
 	return zones_size[zone_type];
@@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,

 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
 					unsigned long *zholes_size)
 {
 	if (!zholes_size)
@@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
-		unsigned long *zones_size, unsigned long *zholes_size)
+						unsigned long node_start_pfn,
+						unsigned long node_end_pfn,
+						unsigned long *zones_size,
+						unsigned long *zholes_size)
 {
 	unsigned long realtotalpages, totalpages = 0;
 	enum zone_type i;

 	for (i = 0; i < MAX_NR_ZONES; i++)
 		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
-				zones_size);
+							 node_start_pfn,
+							 node_end_pfn,
+							 zones_size);
 	pgdat->node_spanned_pages = totalpages;

 	realtotalpages = totalpages;
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		realtotalpages -=
 			zone_absent_pages_in_node(pgdat->node_id, i,
-					zholes_size);
+						  node_start_pfn, node_end_pfn,
+						  zholes_size);
 	pgdat->node_present_pages = realtotalpages;
 	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
 							realtotalpages);
@@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  * NOTE: pgdat should get zeroed by caller.
  */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
+		unsigned long node_start_pfn, unsigned long node_end_pfn,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	enum zone_type j;
@@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, freesize, memmap_pages;

-		size = zone_spanned_pages_in_node(nid, j, zones_size);
+		size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
+						  node_end_pfn, zones_size);
 		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
+								node_start_pfn,
+								node_end_pfn,
 								zholes_size);

 		/*
@@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = 0;
+	unsigned long end_pfn = 0;

 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	init_zone_allows_reclaim(nid);
-	calculate_node_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+#endif
+	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
+				  zones_size, zholes_size);

 	alloc_node_mem_map(pgdat);
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif

-	free_area_init_core(pgdat, zones_size, zholes_size);
+	free_area_init_core(pgdat, start_pfn, end_pfn,
+			    zones_size, zholes_size);
 }

 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore);

 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

-unsigned long free_reserved_area(unsigned long start, unsigned long end,
-				 int poison, char *s)
+void adjust_managed_page_count(struct page *page, long count)
 {
-	unsigned long pages, pos;
+	spin_lock(&managed_page_count_lock);
+	page_zone(page)->managed_pages += count;
+	totalram_pages += count;
+#ifdef CONFIG_HIGHMEM
+	if (PageHighMem(page))
+		totalhigh_pages += count;
+#endif
+	spin_unlock(&managed_page_count_lock);
+}
+EXPORT_SYMBOL(adjust_managed_page_count);

-	pos = start = PAGE_ALIGN(start);
-	end &= PAGE_MASK;
-	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
-		if (poison)
-			memset((void *)pos, poison, PAGE_SIZE);
-		free_reserved_page(virt_to_page((void *)pos));
+unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+{
+	void *pos;
+	unsigned long pages = 0;
+
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+		if ((unsigned int)poison <= 0xFF)
+			memset(pos, poison, PAGE_SIZE);
+		free_reserved_page(virt_to_page(pos));
 	}

 	if (pages && s)
-		pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
+		pr_info("Freeing %s memory: %ldK (%p - %p)\n",
 			s, pages << (PAGE_SHIFT - 10), start, end);

 	return pages;
 }
+EXPORT_SYMBOL(free_reserved_area);

 #ifdef CONFIG_HIGHMEM
 void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
 	totalram_pages++;
+	page_zone(page)->managed_pages++;
 	totalhigh_pages++;
 }
 #endif

+
+void __init mem_init_print_info(const char *str)
+{
+	unsigned long physpages, codesize, datasize, rosize, bss_size;
+	unsigned long init_code_size, init_data_size;
+
+	physpages = get_num_physpages();
+	codesize = _etext - _stext;
+	datasize = _edata - _sdata;
+	rosize = __end_rodata - __start_rodata;
+	bss_size = __bss_stop - __bss_start;
+	init_data_size = __init_end - __init_begin;
+	init_code_size = _einittext - _sinittext;
+
+	/*
+	 * Detect special cases and adjust section sizes accordingly:
+	 * 1) .init.* may be embedded into .data sections
+	 * 2) .init.text.* may be out of [__init_begin, __init_end],
+	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
+	 * 3) .rodata.* may be embedded into .text or .data sections.
+	 */
+#define adj_init_size(start, end, size, pos, adj) \
+	if (start <= pos && pos < end && size > adj) \
+		size -= adj;
+
+	adj_init_size(__init_begin, __init_end, init_data_size,
+		     _sinittext, init_code_size);
+	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+	printk("Memory: %luK/%luK available "
+	       "(%luK kernel code, %luK rwdata, %luK rodata, "
+	       "%luK init, %luK bss, %luK reserved"
+#ifdef CONFIG_HIGHMEM
+	       ", %luK highmem"
+#endif
+	       "%s%s)\n",
+	       nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
+	       codesize >> 10, datasize >> 10, rosize >> 10,
+	       (init_data_size + init_code_size) >> 10, bss_size >> 10,
+	       (physpages - totalram_pages) << (PAGE_SHIFT-10),
+#ifdef CONFIG_HIGHMEM
+	       totalhigh_pages << (PAGE_SHIFT-10),
+#endif
+	       str ? ", " : "", str ? str : "");
+}
+
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
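
Two API changes land in the hunk above: adjustments of totalram_pages and zone->managed_pages are funnelled through adjust_managed_page_count() under the new managed_page_count_lock, and free_reserved_area() now takes void * bounds and treats any poison value outside 0x00-0xFF as "do not poison". A hedged sketch of how an architecture's free_initmem() might call the new signature; the call site is illustrative only, and __init_begin/__init_end are just the usual linker-script section markers, not something added by this patch:

extern char __init_begin[], __init_end[];	/* linker-provided section bounds */

void free_initmem(void)
{
	/* Poison freed pages with 0xcc; passing e.g. -1 instead skips the memset(). */
	unsigned long freed = free_reserved_area(&__init_begin, &__init_end,
						 0xcc, "unused kernel");
	(void)freed;	/* free_reserved_area() already prints the "Freeing ..." line */
}

Each page released this way goes through free_reserved_page(), so the managed/totalram counters stay consistent without the caller touching them directly.
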
@@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
 int __meminit init_per_zone_wmark_min(void)
 {
 	unsigned long lowmem_kbytes;
+	int new_min_free_kbytes;

 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

-	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
-	if (min_free_kbytes < 128)
-		min_free_kbytes = 128;
-	if (min_free_kbytes > 65536)
-		min_free_kbytes = 65536;
+	if (new_min_free_kbytes > user_min_free_kbytes) {
+		min_free_kbytes = new_min_free_kbytes;
+		if (min_free_kbytes < 128)
+			min_free_kbytes = 128;
+		if (min_free_kbytes > 65536)
+			min_free_kbytes = 65536;
+	} else {
+		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+			new_min_free_kbytes, user_min_free_kbytes);
+	}
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
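
As a worked example of the new logic (illustrative numbers, not taken from the patch): with roughly 1 GiB of lowmem, lowmem_kbytes is about 1048576, so new_min_free_kbytes = int_sqrt(16 * 1048576) = int_sqrt(16777216) = 4096; that value is adopted, clamped to the [128, 65536] range, only when it exceeds user_min_free_kbytes, i.e. only when the administrator has not already set a larger min_free_kbytes through the sysctl. Otherwise the kernel keeps the user's value and emits the new pr_warn() instead of silently overriding it, e.g. after memory hotplug re-runs this path.
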
@@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
-	if (write)
+	if (write) {
+		user_min_free_kbytes = min_free_kbytes;
 		setup_per_zone_wmarks();
+	}
 	return 0;
 }

@@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
  * can have before it gets flushed back to buddy allocator.
  */
-
 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (!write || (ret < 0))
 		return ret;
+
+	mutex_lock(&pcp_batch_high_lock);
 	for_each_populated_zone(zone) {
-		for_each_possible_cpu(cpu) {
-			unsigned long high;
-			high = zone->managed_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(
-				per_cpu_ptr(zone->pageset, cpu), high);
-		}
+		unsigned long high;
+		high = zone->managed_pages / percpu_pagelist_fraction;
+		for_each_possible_cpu(cpu)
+			pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
+					 high);
 	}
+	mutex_unlock(&pcp_batch_high_lock);
 	return 0;
 }

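
To make the recomputed limit concrete (again illustrative numbers only): for a zone with managed_pages = 262144 (1 GiB of 4 KiB pages) and percpu_pagelist_fraction = 8, the handler now computes high = 262144 / 8 = 32768 once per zone and applies it to every CPU's pageset via pageset_set_high(), which in turn caps batch at PAGE_SHIFT * 8 = 96. The whole walk runs under pcp_batch_high_lock, so it serializes against zone_pcp_update() and any other updater of ->high and ->batch.
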
@@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 #endif

 #ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit __zone_pcp_update(void *data)
-{
-	struct zone *zone = data;
-	int cpu;
-	unsigned long batch = zone_batchsize(zone), flags;
-
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		pset = per_cpu_ptr(zone->pageset, cpu);
-		pcp = &pset->pcp;
-
-		local_irq_save(flags);
-		if (pcp->count > 0)
-			free_pcppages_bulk(zone, pcp->count, pcp);
-		drain_zonestat(zone, pset);
-		setup_pageset(pset, batch);
-		local_irq_restore(flags);
-	}
-	return 0;
-}
-
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	stop_machine(__zone_pcp_update, zone, NULL);
+	unsigned cpu;
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_possible_cpu(cpu)
+		pageset_set_high_and_batch(zone,
+				per_cpu_ptr(zone->pageset, cpu));
+	mutex_unlock(&pcp_batch_high_lock);
 }
 #endif

@@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
+#ifdef CONFIG_HIGHMEM
+		if (PageHighMem(page))
+			totalhigh_pages -= 1 << order;
+#endif
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);