about | summary | refs | log | tree | commit | diff | stats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
author: Christoph Lameter <cl@linux-foundation.org> 2010-01-05 01:34:51 -0500
committer: Tejun Heo <tj@kernel.org> 2010-01-05 01:34:51 -0500
commit: 99dcc3e5a94ed491fbef402831d8c0bbb267f995 (patch)
tree: dd4d2b9e10ab0d4502e4b2a22dfc0a02a3300d7e /mm/page_alloc.c
parent: 5917dae83cb02dfe74c9167b79e86e6d65183fa3 (diff)
this_cpu: Page allocator conversion
Use the per cpu allocator functionality to avoid per cpu arrays in struct zone. This drastically reduces the size of struct zone for systems with large numbers of processors and allows placement of critical variables of struct zone in one cacheline even on very large systems. Another effect is that the pagesets of one processor are placed near one another. If multiple pagesets from different zones fit into one cacheline then additional cacheline fetches can be avoided on the hot paths when allocating memory from multiple zones. Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs are reduced and we can drop the zone_pcp macro. Hotplug handling is also simplified since cpu alloc can bring up and shut down cpu areas for a specific cpu as a whole. So there is no need to allocate or free individual pagesets. V7-V8: - Explain chicken-and-egg dilemma with percpu allocator. V4-V5: - Fix up cases where per_cpu_ptr is called before irq disable - Integrate the bootstrap logic that was separate before. tj: Build failure in pageset_cpuup_callback() due to missing ret variable fixed. Reviewed-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c 202
1 files changed, 71 insertions, 131 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb59..6849e870de54 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1008,10 +1008,10 @@ static void drain_pages(unsigned int cpu)
1008 struct per_cpu_pageset *pset; 1008 struct per_cpu_pageset *pset;
1009 struct per_cpu_pages *pcp; 1009 struct per_cpu_pages *pcp;
1010 1010
1011 pset = zone_pcp(zone, cpu); 1011 local_irq_save(flags);
1012 pset = per_cpu_ptr(zone->pageset, cpu);
1012 1013
1013 pcp = &pset->pcp; 1014 pcp = &pset->pcp;
1014 local_irq_save(flags);
1015 free_pcppages_bulk(zone, pcp->count, pcp); 1015 free_pcppages_bulk(zone, pcp->count, pcp);
1016 pcp->count = 0; 1016 pcp->count = 0;
1017 local_irq_restore(flags); 1017 local_irq_restore(flags);
@@ -1095,7 +1095,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1095 arch_free_page(page, 0); 1095 arch_free_page(page, 0);
1096 kernel_map_pages(page, 1, 0); 1096 kernel_map_pages(page, 1, 0);
1097 1097
1098 pcp = &zone_pcp(zone, get_cpu())->pcp;
1099 migratetype = get_pageblock_migratetype(page); 1098 migratetype = get_pageblock_migratetype(page);
1100 set_page_private(page, migratetype); 1099 set_page_private(page, migratetype);
1101 local_irq_save(flags); 1100 local_irq_save(flags);
@@ -1118,6 +1117,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1118 migratetype = MIGRATE_MOVABLE; 1117 migratetype = MIGRATE_MOVABLE;
1119 } 1118 }
1120 1119
1120 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1121 if (cold) 1121 if (cold)
1122 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1122 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1123 else 1123 else
@@ -1130,7 +1130,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1130 1130
1131out: 1131out:
1132 local_irq_restore(flags); 1132 local_irq_restore(flags);
1133 put_cpu();
1134} 1133}
1135 1134
1136void free_hot_page(struct page *page) 1135void free_hot_page(struct page *page)
@@ -1180,17 +1179,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1180 unsigned long flags; 1179 unsigned long flags;
1181 struct page *page; 1180 struct page *page;
1182 int cold = !!(gfp_flags & __GFP_COLD); 1181 int cold = !!(gfp_flags & __GFP_COLD);
1183 int cpu;
1184 1182
1185again: 1183again:
1186 cpu = get_cpu();
1187 if (likely(order == 0)) { 1184 if (likely(order == 0)) {
1188 struct per_cpu_pages *pcp; 1185 struct per_cpu_pages *pcp;
1189 struct list_head *list; 1186 struct list_head *list;
1190 1187
1191 pcp = &zone_pcp(zone, cpu)->pcp;
1192 list = &pcp->lists[migratetype];
1193 local_irq_save(flags); 1188 local_irq_save(flags);
1189 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1190 list = &pcp->lists[migratetype];
1194 if (list_empty(list)) { 1191 if (list_empty(list)) {
1195 pcp->count += rmqueue_bulk(zone, 0, 1192 pcp->count += rmqueue_bulk(zone, 0,
1196 pcp->batch, list, 1193 pcp->batch, list,
@@ -1231,7 +1228,6 @@ again:
1231 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1228 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1232 zone_statistics(preferred_zone, zone); 1229 zone_statistics(preferred_zone, zone);
1233 local_irq_restore(flags); 1230 local_irq_restore(flags);
1234 put_cpu();
1235 1231
1236 VM_BUG_ON(bad_range(zone, page)); 1232 VM_BUG_ON(bad_range(zone, page));
1237 if (prep_new_page(page, order, gfp_flags)) 1233 if (prep_new_page(page, order, gfp_flags))
@@ -1240,7 +1236,6 @@ again:
1240 1236
1241failed: 1237failed:
1242 local_irq_restore(flags); 1238 local_irq_restore(flags);
1243 put_cpu();
1244 return NULL; 1239 return NULL;
1245} 1240}
1246 1241
@@ -2179,7 +2174,7 @@ void show_free_areas(void)
2179 for_each_online_cpu(cpu) { 2174 for_each_online_cpu(cpu) {
2180 struct per_cpu_pageset *pageset; 2175 struct per_cpu_pageset *pageset;
2181 2176
2182 pageset = zone_pcp(zone, cpu); 2177 pageset = per_cpu_ptr(zone->pageset, cpu);
2183 2178
2184 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2179 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2185 cpu, pageset->pcp.high, 2180 cpu, pageset->pcp.high,
@@ -2744,10 +2739,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2744 2739
2745#endif /* CONFIG_NUMA */ 2740#endif /* CONFIG_NUMA */
2746 2741
2742/*
2743 * Boot pageset table. One per cpu which is going to be used for all
2744 * zones and all nodes. The parameters will be set in such a way
2745 * that an item put on a list will immediately be handed over to
2746 * the buddy list. This is safe since pageset manipulation is done
2747 * with interrupts disabled.
2748 *
2749 * The boot_pagesets must be kept even after bootup is complete for
2750 * unused processors and/or zones. They do play a role for bootstrapping
2751 * hotplugged processors.
2752 *
2753 * zoneinfo_show() and maybe other functions do
2754 * not check if the processor is online before following the pageset pointer.
2755 * Other parts of the kernel may not check if the zone is available.
2756 */
2757static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2758static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2759
2747/* return values int ....just for stop_machine() */ 2760/* return values int ....just for stop_machine() */
2748static int __build_all_zonelists(void *dummy) 2761static int __build_all_zonelists(void *dummy)
2749{ 2762{
2750 int nid; 2763 int nid;
2764 int cpu;
2751 2765
2752#ifdef CONFIG_NUMA 2766#ifdef CONFIG_NUMA
2753 memset(node_load, 0, sizeof(node_load)); 2767 memset(node_load, 0, sizeof(node_load));
@@ -2758,6 +2772,23 @@ static int __build_all_zonelists(void *dummy)
2758 build_zonelists(pgdat); 2772 build_zonelists(pgdat);
2759 build_zonelist_cache(pgdat); 2773 build_zonelist_cache(pgdat);
2760 } 2774 }
2775
2776 /*
2777 * Initialize the boot_pagesets that are going to be used
2778 * for bootstrapping processors. The real pagesets for
2779 * each zone will be allocated later when the per cpu
2780 * allocator is available.
2781 *
2782 * boot_pagesets are used also for bootstrapping offline
2783 * cpus if the system is already booted because the pagesets
2784 * are needed to initialize allocators on a specific cpu too.
2785 * F.e. the percpu allocator needs the page allocator which
2786 * needs the percpu allocator in order to allocate its pagesets
2787 * (a chicken-egg dilemma).
2788 */
2789 for_each_possible_cpu(cpu)
2790 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2791
2761 return 0; 2792 return 0;
2762} 2793}
2763 2794
@@ -3095,121 +3126,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3095 pcp->batch = PAGE_SHIFT * 8; 3126 pcp->batch = PAGE_SHIFT * 8;
3096} 3127}
3097 3128
3098
3099#ifdef CONFIG_NUMA
3100/*
3101 * Boot pageset table. One per cpu which is going to be used for all
3102 * zones and all nodes. The parameters will be set in such a way
3103 * that an item put on a list will immediately be handed over to
3104 * the buddy list. This is safe since pageset manipulation is done
3105 * with interrupts disabled.
3106 *
3107 * Some NUMA counter updates may also be caught by the boot pagesets.
3108 *
3109 * The boot_pagesets must be kept even after bootup is complete for
3110 * unused processors and/or zones. They do play a role for bootstrapping
3111 * hotplugged processors.
3112 *
3113 * zoneinfo_show() and maybe other functions do
3114 * not check if the processor is online before following the pageset pointer.
3115 * Other parts of the kernel may not check if the zone is available.
3116 */
3117static struct per_cpu_pageset boot_pageset[NR_CPUS];
3118
3119/* 3129/*
3120 * Dynamically allocate memory for the 3130 * Allocate per cpu pagesets and initialize them.
3121 * per cpu pageset array in struct zone. 3131 * Before this call only boot pagesets were available.
3132 * Boot pagesets will no longer be used by this processor
3133 * after setup_per_cpu_pageset().
3122 */ 3134 */
3123static int __cpuinit process_zones(int cpu) 3135void __init setup_per_cpu_pageset(void)
3124{ 3136{
3125 struct zone *zone, *dzone; 3137 struct zone *zone;
3126 int node = cpu_to_node(cpu); 3138 int cpu;
3127
3128 node_set_state(node, N_CPU); /* this node has a cpu */
3129 3139
3130 for_each_populated_zone(zone) { 3140 for_each_populated_zone(zone) {
3131 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3141 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3132 GFP_KERNEL, node);
3133 if (!zone_pcp(zone, cpu))
3134 goto bad;
3135
3136 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
3137
3138 if (percpu_pagelist_fraction)
3139 setup_pagelist_highmark(zone_pcp(zone, cpu),
3140 (zone->present_pages / percpu_pagelist_fraction));
3141 }
3142
3143 return 0;
3144bad:
3145 for_each_zone(dzone) {
3146 if (!populated_zone(dzone))
3147 continue;
3148 if (dzone == zone)
3149 break;
3150 kfree(zone_pcp(dzone, cpu));
3151 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3152 }
3153 return -ENOMEM;
3154}
3155 3142
3156static inline void free_zone_pagesets(int cpu) 3143 for_each_possible_cpu(cpu) {
3157{ 3144 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3158 struct zone *zone;
3159
3160 for_each_zone(zone) {
3161 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3162 3145
3163 /* Free per_cpu_pageset if it is slab allocated */ 3146 setup_pageset(pcp, zone_batchsize(zone));
3164 if (pset != &boot_pageset[cpu])
3165 kfree(pset);
3166 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3167 }
3168}
3169 3147
3170static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 3148 if (percpu_pagelist_fraction)
3171 unsigned long action, 3149 setup_pagelist_highmark(pcp,
3172 void *hcpu) 3150 (zone->present_pages /
3173{ 3151 percpu_pagelist_fraction));
3174 int cpu = (long)hcpu; 3152 }
3175 int ret = NOTIFY_OK;
3176
3177 switch (action) {
3178 case CPU_UP_PREPARE:
3179 case CPU_UP_PREPARE_FROZEN:
3180 if (process_zones(cpu))
3181 ret = NOTIFY_BAD;
3182 break;
3183 case CPU_UP_CANCELED:
3184 case CPU_UP_CANCELED_FROZEN:
3185 case CPU_DEAD:
3186 case CPU_DEAD_FROZEN:
3187 free_zone_pagesets(cpu);
3188 break;
3189 default:
3190 break;
3191 } 3153 }
3192 return ret;
3193} 3154}
3194 3155
3195static struct notifier_block __cpuinitdata pageset_notifier =
3196 { &pageset_cpuup_callback, NULL, 0 };
3197
3198void __init setup_per_cpu_pageset(void)
3199{
3200 int err;
3201
3202 /* Initialize per_cpu_pageset for cpu 0.
3203 * A cpuup callback will do this for every cpu
3204 * as it comes online
3205 */
3206 err = process_zones(smp_processor_id());
3207 BUG_ON(err);
3208 register_cpu_notifier(&pageset_notifier);
3209}
3210
3211#endif
3212
3213static noinline __init_refok 3156static noinline __init_refok
3214int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3157int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3215{ 3158{
@@ -3263,7 +3206,7 @@ static int __zone_pcp_update(void *data)
3263 struct per_cpu_pageset *pset; 3206 struct per_cpu_pageset *pset;
3264 struct per_cpu_pages *pcp; 3207 struct per_cpu_pages *pcp;
3265 3208
3266 pset = zone_pcp(zone, cpu); 3209 pset = per_cpu_ptr(zone->pageset, cpu);
3267 pcp = &pset->pcp; 3210 pcp = &pset->pcp;
3268 3211
3269 local_irq_save(flags); 3212 local_irq_save(flags);
@@ -3281,21 +3224,17 @@ void zone_pcp_update(struct zone *zone)
3281 3224
3282static __meminit void zone_pcp_init(struct zone *zone) 3225static __meminit void zone_pcp_init(struct zone *zone)
3283{ 3226{
3284 int cpu; 3227 /*
3285 unsigned long batch = zone_batchsize(zone); 3228 * per cpu subsystem is not up at this point. The following code
3229 * relies on the ability of the linker to provide the
3230 * offset of a (static) per cpu variable into the per cpu area.
3231 */
3232 zone->pageset = &boot_pageset;
3286 3233
3287 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3288#ifdef CONFIG_NUMA
3289 /* Early boot. Slab allocator not functional yet */
3290 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3291 setup_pageset(&boot_pageset[cpu],0);
3292#else
3293 setup_pageset(zone_pcp(zone,cpu), batch);
3294#endif
3295 }
3296 if (zone->present_pages) 3234 if (zone->present_pages)
3297 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3235 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3298 zone->name, zone->present_pages, batch); 3236 zone->name, zone->present_pages,
3237 zone_batchsize(zone));
3299} 3238}
3300 3239
3301__meminit int init_currently_empty_zone(struct zone *zone, 3240__meminit int init_currently_empty_zone(struct zone *zone,
@@ -4809,10 +4748,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4809 if (!write || (ret == -EINVAL)) 4748 if (!write || (ret == -EINVAL))
4810 return ret; 4749 return ret;
4811 for_each_populated_zone(zone) { 4750 for_each_populated_zone(zone) {
4812 for_each_online_cpu(cpu) { 4751 for_each_possible_cpu(cpu) {
4813 unsigned long high; 4752 unsigned long high;
4814 high = zone->present_pages / percpu_pagelist_fraction; 4753 high = zone->present_pages / percpu_pagelist_fraction;
4815 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4754 setup_pagelist_highmark(
4755 per_cpu_ptr(zone->pageset, cpu), high);
4816 } 4756 }
4817 } 4757 }
4818 return 0; 4758 return 0;