diff options
author | Christoph Lameter <cl@linux-foundation.org> | 2010-01-05 01:34:51 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-01-05 01:34:51 -0500 |
commit | 99dcc3e5a94ed491fbef402831d8c0bbb267f995 (patch) | |
tree | dd4d2b9e10ab0d4502e4b2a22dfc0a02a3300d7e /mm/page_alloc.c | |
parent | 5917dae83cb02dfe74c9167b79e86e6d65183fa3 (diff) |
this_cpu: Page allocator conversion
Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
This drastically reduces the size of struct zone for systems with large
numbers of processors and allows placement of critical variables of struct
zone in one cacheline even on very large systems.
Another effect is that the pagesets of one processor are placed near one
another. If multiple pagesets from different zones fit into one cacheline
then additional cacheline fetches can be avoided on the hot paths when
allocating memory from multiple zones.
Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
are reduced and we can drop the zone_pcp macro.
Hotplug handling is also simplified since cpu alloc can bring up and
shut down cpu areas for a specific cpu as a whole. So there is no need to
allocate or free individual pagesets.
V7-V8:
- Explain the chicken-and-egg dilemma with the percpu allocator.
V4-V5:
- Fix up cases where per_cpu_ptr is called before irq disable
- Integrate the bootstrap logic that was separate before.
tj: Fixed a build failure in pageset_cpuup_callback() caused by a missing
ret variable.
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 202 |
1 files changed, 71 insertions, 131 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e9f5cc5fb5..6849e870de5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1008,10 +1008,10 @@ static void drain_pages(unsigned int cpu) | |||
1008 | struct per_cpu_pageset *pset; | 1008 | struct per_cpu_pageset *pset; |
1009 | struct per_cpu_pages *pcp; | 1009 | struct per_cpu_pages *pcp; |
1010 | 1010 | ||
1011 | pset = zone_pcp(zone, cpu); | 1011 | local_irq_save(flags); |
1012 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
1012 | 1013 | ||
1013 | pcp = &pset->pcp; | 1014 | pcp = &pset->pcp; |
1014 | local_irq_save(flags); | ||
1015 | free_pcppages_bulk(zone, pcp->count, pcp); | 1015 | free_pcppages_bulk(zone, pcp->count, pcp); |
1016 | pcp->count = 0; | 1016 | pcp->count = 0; |
1017 | local_irq_restore(flags); | 1017 | local_irq_restore(flags); |
@@ -1095,7 +1095,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1095 | arch_free_page(page, 0); | 1095 | arch_free_page(page, 0); |
1096 | kernel_map_pages(page, 1, 0); | 1096 | kernel_map_pages(page, 1, 0); |
1097 | 1097 | ||
1098 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
1099 | migratetype = get_pageblock_migratetype(page); | 1098 | migratetype = get_pageblock_migratetype(page); |
1100 | set_page_private(page, migratetype); | 1099 | set_page_private(page, migratetype); |
1101 | local_irq_save(flags); | 1100 | local_irq_save(flags); |
@@ -1118,6 +1117,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1118 | migratetype = MIGRATE_MOVABLE; | 1117 | migratetype = MIGRATE_MOVABLE; |
1119 | } | 1118 | } |
1120 | 1119 | ||
1120 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1121 | if (cold) | 1121 | if (cold) |
1122 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1122 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1123 | else | 1123 | else |
@@ -1130,7 +1130,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1130 | 1130 | ||
1131 | out: | 1131 | out: |
1132 | local_irq_restore(flags); | 1132 | local_irq_restore(flags); |
1133 | put_cpu(); | ||
1134 | } | 1133 | } |
1135 | 1134 | ||
1136 | void free_hot_page(struct page *page) | 1135 | void free_hot_page(struct page *page) |
@@ -1180,17 +1179,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1180 | unsigned long flags; | 1179 | unsigned long flags; |
1181 | struct page *page; | 1180 | struct page *page; |
1182 | int cold = !!(gfp_flags & __GFP_COLD); | 1181 | int cold = !!(gfp_flags & __GFP_COLD); |
1183 | int cpu; | ||
1184 | 1182 | ||
1185 | again: | 1183 | again: |
1186 | cpu = get_cpu(); | ||
1187 | if (likely(order == 0)) { | 1184 | if (likely(order == 0)) { |
1188 | struct per_cpu_pages *pcp; | 1185 | struct per_cpu_pages *pcp; |
1189 | struct list_head *list; | 1186 | struct list_head *list; |
1190 | 1187 | ||
1191 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
1192 | list = &pcp->lists[migratetype]; | ||
1193 | local_irq_save(flags); | 1188 | local_irq_save(flags); |
1189 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1190 | list = &pcp->lists[migratetype]; | ||
1194 | if (list_empty(list)) { | 1191 | if (list_empty(list)) { |
1195 | pcp->count += rmqueue_bulk(zone, 0, | 1192 | pcp->count += rmqueue_bulk(zone, 0, |
1196 | pcp->batch, list, | 1193 | pcp->batch, list, |
@@ -1231,7 +1228,6 @@ again: | |||
1231 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1228 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1232 | zone_statistics(preferred_zone, zone); | 1229 | zone_statistics(preferred_zone, zone); |
1233 | local_irq_restore(flags); | 1230 | local_irq_restore(flags); |
1234 | put_cpu(); | ||
1235 | 1231 | ||
1236 | VM_BUG_ON(bad_range(zone, page)); | 1232 | VM_BUG_ON(bad_range(zone, page)); |
1237 | if (prep_new_page(page, order, gfp_flags)) | 1233 | if (prep_new_page(page, order, gfp_flags)) |
@@ -1240,7 +1236,6 @@ again: | |||
1240 | 1236 | ||
1241 | failed: | 1237 | failed: |
1242 | local_irq_restore(flags); | 1238 | local_irq_restore(flags); |
1243 | put_cpu(); | ||
1244 | return NULL; | 1239 | return NULL; |
1245 | } | 1240 | } |
1246 | 1241 | ||
@@ -2179,7 +2174,7 @@ void show_free_areas(void) | |||
2179 | for_each_online_cpu(cpu) { | 2174 | for_each_online_cpu(cpu) { |
2180 | struct per_cpu_pageset *pageset; | 2175 | struct per_cpu_pageset *pageset; |
2181 | 2176 | ||
2182 | pageset = zone_pcp(zone, cpu); | 2177 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2183 | 2178 | ||
2184 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2179 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2185 | cpu, pageset->pcp.high, | 2180 | cpu, pageset->pcp.high, |
@@ -2744,10 +2739,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2744 | 2739 | ||
2745 | #endif /* CONFIG_NUMA */ | 2740 | #endif /* CONFIG_NUMA */ |
2746 | 2741 | ||
2742 | /* | ||
2743 | * Boot pageset table. One per cpu which is going to be used for all | ||
2744 | * zones and all nodes. The parameters will be set in such a way | ||
2745 | * that an item put on a list will immediately be handed over to | ||
2746 | * the buddy list. This is safe since pageset manipulation is done | ||
2747 | * with interrupts disabled. | ||
2748 | * | ||
2749 | * The boot_pagesets must be kept even after bootup is complete for | ||
2750 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2751 | * hotplugged processors. | ||
2752 | * | ||
2753 | * zoneinfo_show() and maybe other functions do | ||
2754 | * not check if the processor is online before following the pageset pointer. | ||
2755 | * Other parts of the kernel may not check if the zone is available. | ||
2756 | */ | ||
2757 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
2758 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
2759 | |||
2747 | /* return values int ....just for stop_machine() */ | 2760 | /* return values int ....just for stop_machine() */ |
2748 | static int __build_all_zonelists(void *dummy) | 2761 | static int __build_all_zonelists(void *dummy) |
2749 | { | 2762 | { |
2750 | int nid; | 2763 | int nid; |
2764 | int cpu; | ||
2751 | 2765 | ||
2752 | #ifdef CONFIG_NUMA | 2766 | #ifdef CONFIG_NUMA |
2753 | memset(node_load, 0, sizeof(node_load)); | 2767 | memset(node_load, 0, sizeof(node_load)); |
@@ -2758,6 +2772,23 @@ static int __build_all_zonelists(void *dummy) | |||
2758 | build_zonelists(pgdat); | 2772 | build_zonelists(pgdat); |
2759 | build_zonelist_cache(pgdat); | 2773 | build_zonelist_cache(pgdat); |
2760 | } | 2774 | } |
2775 | |||
2776 | /* | ||
2777 | * Initialize the boot_pagesets that are going to be used | ||
2778 | * for bootstrapping processors. The real pagesets for | ||
2779 | * each zone will be allocated later when the per cpu | ||
2780 | * allocator is available. | ||
2781 | * | ||
2782 | * boot_pagesets are used also for bootstrapping offline | ||
2783 | * cpus if the system is already booted because the pagesets | ||
2784 | * are needed to initialize allocators on a specific cpu too. | ||
2785 | * F.e. the percpu allocator needs the page allocator which | ||
2786 | * needs the percpu allocator in order to allocate its pagesets | ||
2787 | * (a chicken-egg dilemma). | ||
2788 | */ | ||
2789 | for_each_possible_cpu(cpu) | ||
2790 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
2791 | |||
2761 | return 0; | 2792 | return 0; |
2762 | } | 2793 | } |
2763 | 2794 | ||
@@ -3095,121 +3126,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3095 | pcp->batch = PAGE_SHIFT * 8; | 3126 | pcp->batch = PAGE_SHIFT * 8; |
3096 | } | 3127 | } |
3097 | 3128 | ||
3098 | |||
3099 | #ifdef CONFIG_NUMA | ||
3100 | /* | ||
3101 | * Boot pageset table. One per cpu which is going to be used for all | ||
3102 | * zones and all nodes. The parameters will be set in such a way | ||
3103 | * that an item put on a list will immediately be handed over to | ||
3104 | * the buddy list. This is safe since pageset manipulation is done | ||
3105 | * with interrupts disabled. | ||
3106 | * | ||
3107 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
3108 | * | ||
3109 | * The boot_pagesets must be kept even after bootup is complete for | ||
3110 | * unused processors and/or zones. They do play a role for bootstrapping | ||
3111 | * hotplugged processors. | ||
3112 | * | ||
3113 | * zoneinfo_show() and maybe other functions do | ||
3114 | * not check if the processor is online before following the pageset pointer. | ||
3115 | * Other parts of the kernel may not check if the zone is available. | ||
3116 | */ | ||
3117 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
3118 | |||
3119 | /* | 3129 | /* |
3120 | * Dynamically allocate memory for the | 3130 | * Allocate per cpu pagesets and initialize them. |
3121 | * per cpu pageset array in struct zone. | 3131 | * Before this call only boot pagesets were available. |
3132 | * Boot pagesets will no longer be used by this processor | ||
3133 | * after setup_per_cpu_pageset(). | ||
3122 | */ | 3134 | */ |
3123 | static int __cpuinit process_zones(int cpu) | 3135 | void __init setup_per_cpu_pageset(void) |
3124 | { | 3136 | { |
3125 | struct zone *zone, *dzone; | 3137 | struct zone *zone; |
3126 | int node = cpu_to_node(cpu); | 3138 | int cpu; |
3127 | |||
3128 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
3129 | 3139 | ||
3130 | for_each_populated_zone(zone) { | 3140 | for_each_populated_zone(zone) { |
3131 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3141 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3132 | GFP_KERNEL, node); | ||
3133 | if (!zone_pcp(zone, cpu)) | ||
3134 | goto bad; | ||
3135 | |||
3136 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | ||
3137 | |||
3138 | if (percpu_pagelist_fraction) | ||
3139 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
3140 | (zone->present_pages / percpu_pagelist_fraction)); | ||
3141 | } | ||
3142 | |||
3143 | return 0; | ||
3144 | bad: | ||
3145 | for_each_zone(dzone) { | ||
3146 | if (!populated_zone(dzone)) | ||
3147 | continue; | ||
3148 | if (dzone == zone) | ||
3149 | break; | ||
3150 | kfree(zone_pcp(dzone, cpu)); | ||
3151 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
3152 | } | ||
3153 | return -ENOMEM; | ||
3154 | } | ||
3155 | 3142 | ||
3156 | static inline void free_zone_pagesets(int cpu) | 3143 | for_each_possible_cpu(cpu) { |
3157 | { | 3144 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); |
3158 | struct zone *zone; | ||
3159 | |||
3160 | for_each_zone(zone) { | ||
3161 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
3162 | 3145 | ||
3163 | /* Free per_cpu_pageset if it is slab allocated */ | 3146 | setup_pageset(pcp, zone_batchsize(zone)); |
3164 | if (pset != &boot_pageset[cpu]) | ||
3165 | kfree(pset); | ||
3166 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3167 | } | ||
3168 | } | ||
3169 | 3147 | ||
3170 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | 3148 | if (percpu_pagelist_fraction) |
3171 | unsigned long action, | 3149 | setup_pagelist_highmark(pcp, |
3172 | void *hcpu) | 3150 | (zone->present_pages / |
3173 | { | 3151 | percpu_pagelist_fraction)); |
3174 | int cpu = (long)hcpu; | 3152 | } |
3175 | int ret = NOTIFY_OK; | ||
3176 | |||
3177 | switch (action) { | ||
3178 | case CPU_UP_PREPARE: | ||
3179 | case CPU_UP_PREPARE_FROZEN: | ||
3180 | if (process_zones(cpu)) | ||
3181 | ret = NOTIFY_BAD; | ||
3182 | break; | ||
3183 | case CPU_UP_CANCELED: | ||
3184 | case CPU_UP_CANCELED_FROZEN: | ||
3185 | case CPU_DEAD: | ||
3186 | case CPU_DEAD_FROZEN: | ||
3187 | free_zone_pagesets(cpu); | ||
3188 | break; | ||
3189 | default: | ||
3190 | break; | ||
3191 | } | 3153 | } |
3192 | return ret; | ||
3193 | } | 3154 | } |
3194 | 3155 | ||
3195 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
3196 | { &pageset_cpuup_callback, NULL, 0 }; | ||
3197 | |||
3198 | void __init setup_per_cpu_pageset(void) | ||
3199 | { | ||
3200 | int err; | ||
3201 | |||
3202 | /* Initialize per_cpu_pageset for cpu 0. | ||
3203 | * A cpuup callback will do this for every cpu | ||
3204 | * as it comes online | ||
3205 | */ | ||
3206 | err = process_zones(smp_processor_id()); | ||
3207 | BUG_ON(err); | ||
3208 | register_cpu_notifier(&pageset_notifier); | ||
3209 | } | ||
3210 | |||
3211 | #endif | ||
3212 | |||
3213 | static noinline __init_refok | 3156 | static noinline __init_refok |
3214 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3157 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3215 | { | 3158 | { |
@@ -3263,7 +3206,7 @@ static int __zone_pcp_update(void *data) | |||
3263 | struct per_cpu_pageset *pset; | 3206 | struct per_cpu_pageset *pset; |
3264 | struct per_cpu_pages *pcp; | 3207 | struct per_cpu_pages *pcp; |
3265 | 3208 | ||
3266 | pset = zone_pcp(zone, cpu); | 3209 | pset = per_cpu_ptr(zone->pageset, cpu); |
3267 | pcp = &pset->pcp; | 3210 | pcp = &pset->pcp; |
3268 | 3211 | ||
3269 | local_irq_save(flags); | 3212 | local_irq_save(flags); |
@@ -3281,21 +3224,17 @@ void zone_pcp_update(struct zone *zone) | |||
3281 | 3224 | ||
3282 | static __meminit void zone_pcp_init(struct zone *zone) | 3225 | static __meminit void zone_pcp_init(struct zone *zone) |
3283 | { | 3226 | { |
3284 | int cpu; | 3227 | /* |
3285 | unsigned long batch = zone_batchsize(zone); | 3228 | * per cpu subsystem is not up at this point. The following code |
3229 | * relies on the ability of the linker to provide the | ||
3230 | * offset of a (static) per cpu variable into the per cpu area. | ||
3231 | */ | ||
3232 | zone->pageset = &boot_pageset; | ||
3286 | 3233 | ||
3287 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
3288 | #ifdef CONFIG_NUMA | ||
3289 | /* Early boot. Slab allocator not functional yet */ | ||
3290 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3291 | setup_pageset(&boot_pageset[cpu],0); | ||
3292 | #else | ||
3293 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
3294 | #endif | ||
3295 | } | ||
3296 | if (zone->present_pages) | 3234 | if (zone->present_pages) |
3297 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3235 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3298 | zone->name, zone->present_pages, batch); | 3236 | zone->name, zone->present_pages, |
3237 | zone_batchsize(zone)); | ||
3299 | } | 3238 | } |
3300 | 3239 | ||
3301 | __meminit int init_currently_empty_zone(struct zone *zone, | 3240 | __meminit int init_currently_empty_zone(struct zone *zone, |
@@ -4809,10 +4748,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
4809 | if (!write || (ret == -EINVAL)) | 4748 | if (!write || (ret == -EINVAL)) |
4810 | return ret; | 4749 | return ret; |
4811 | for_each_populated_zone(zone) { | 4750 | for_each_populated_zone(zone) { |
4812 | for_each_online_cpu(cpu) { | 4751 | for_each_possible_cpu(cpu) { |
4813 | unsigned long high; | 4752 | unsigned long high; |
4814 | high = zone->present_pages / percpu_pagelist_fraction; | 4753 | high = zone->present_pages / percpu_pagelist_fraction; |
4815 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4754 | setup_pagelist_highmark( |
4755 | per_cpu_ptr(zone->pageset, cpu), high); | ||
4816 | } | 4756 | } |
4817 | } | 4757 | } |
4818 | return 0; | 4758 | return 0; |