diff options
author | Haicheng Li <haicheng.li@linux.intel.com> | 2010-05-24 17:32:51 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-25 11:07:01 -0400 |
commit | 1f522509c77a5dea8dc384b735314f03908a6415 (patch) | |
tree | 4b848527b90877a8a64c46e8e2d76723405c319d | |
parent | 319774e25fa4b7641bdc3b0a464dd84e62103347 (diff) |
mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset
For each newly populated zone of a hot-added node, we need to update its
pagesets with dynamically allocated per_cpu_pageset structs for all possible CPUs:
1) Detach zone->pageset from the shared boot_pageset
at end of __build_all_zonelists().
2) Use a mutex to protect zone->pageset while it is still
shared in online_pages()
Otherwise, multiple zones of different nodes would share the same bootstrapping
boot_pageset for the same CPU, which will eventually cause the kernel panic below:
------------[ cut here ]------------
kernel BUG at mm/page_alloc.c:1239!
invalid opcode: 0000 [#1] SMP
...
Call Trace:
[<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0
[<ffffffff81162e67>] alloc_pages_current+0x87/0xd0
[<ffffffff81128407>] __page_cache_alloc+0x67/0x70
[<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260
[<ffffffff81132751>] ra_submit+0x21/0x30
[<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0
[<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0
[<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670
[<ffffffff81266cfa>] nfs_file_read+0xca/0x130
[<ffffffff8117b20a>] do_sync_read+0xfa/0x140
[<ffffffff8117bf75>] vfs_read+0xb5/0x1a0
[<ffffffff8117c151>] sys_read+0x51/0x80
[<ffffffff8103c032>] system_call_fastpath+0x16/0x1b
RIP [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900
RSP <ffff88000d1e78a8>
---[ end trace 4bda28328b9990db ]
[akpm@linux-foundation.org: merge fix]
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Andi Kleen <andi.kleen@intel.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/mmzone.h | 2 | ||||
-rw-r--r-- | init/main.c | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 18 | ||||
-rw-r--r-- | mm/page_alloc.c | 17 |
5 files changed, 29 insertions, 12 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f6f2c505fa7e..a367ed5bb3fe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -652,7 +652,7 @@ typedef struct pglist_data { | |||
652 | 652 | ||
653 | void get_zone_counts(unsigned long *active, unsigned long *inactive, | 653 | void get_zone_counts(unsigned long *active, unsigned long *inactive, |
654 | unsigned long *free); | 654 | unsigned long *free); |
655 | void build_all_zonelists(void); | 655 | void build_all_zonelists(void *data); |
656 | void wakeup_kswapd(struct zone *zone, int order); | 656 | void wakeup_kswapd(struct zone *zone, int order); |
657 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 657 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
658 | int classzone_idx, int alloc_flags); | 658 | int classzone_idx, int alloc_flags); |
diff --git a/init/main.c b/init/main.c index 22881b5e95e3..3bdb152f412f 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void) | |||
567 | setup_per_cpu_areas(); | 567 | setup_per_cpu_areas(); |
568 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 568 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
569 | 569 | ||
570 | build_all_zonelists(); | 570 | build_all_zonelists(NULL); |
571 | page_alloc_init(); | 571 | page_alloc_init(); |
572 | 572 | ||
573 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); | 573 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index a3fbcc0a0abc..3e8b3ba27175 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -358,7 +358,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
358 | } | 358 | } |
359 | 359 | ||
360 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) | 360 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) |
361 | build_all_zonelists(); | 361 | build_all_zonelists(NULL); |
362 | #endif | 362 | #endif |
363 | 363 | ||
364 | cpu_maps_update_begin(); | 364 | cpu_maps_update_begin(); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85eb4d342ac5..089cc97aed3c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
389 | int nid; | 389 | int nid; |
390 | int ret; | 390 | int ret; |
391 | struct memory_notify arg; | 391 | struct memory_notify arg; |
392 | /* | ||
393 | * mutex to protect zone->pageset when it's still shared | ||
394 | * in onlined_pages() | ||
395 | */ | ||
396 | static DEFINE_MUTEX(zone_pageset_mutex); | ||
392 | 397 | ||
393 | arg.start_pfn = pfn; | 398 | arg.start_pfn = pfn; |
394 | arg.nr_pages = nr_pages; | 399 | arg.nr_pages = nr_pages; |
@@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
415 | * This means the page allocator ignores this zone. | 420 | * This means the page allocator ignores this zone. |
416 | * So, zonelist must be updated after online. | 421 | * So, zonelist must be updated after online. |
417 | */ | 422 | */ |
423 | mutex_lock(&zone_pageset_mutex); | ||
418 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
419 | need_zonelists_rebuild = 1; | 425 | need_zonelists_rebuild = 1; |
420 | 426 | ||
421 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 427 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
422 | online_pages_range); | 428 | online_pages_range); |
423 | if (ret) { | 429 | if (ret) { |
430 | mutex_unlock(&zone_pageset_mutex); | ||
424 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 431 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
425 | nr_pages, pfn); | 432 | nr_pages, pfn); |
426 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 433 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
@@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
429 | 436 | ||
430 | zone->present_pages += onlined_pages; | 437 | zone->present_pages += onlined_pages; |
431 | zone->zone_pgdat->node_present_pages += onlined_pages; | 438 | zone->zone_pgdat->node_present_pages += onlined_pages; |
439 | if (need_zonelists_rebuild) | ||
440 | build_all_zonelists(zone); | ||
441 | else | ||
442 | zone_pcp_update(zone); | ||
432 | 443 | ||
433 | zone_pcp_update(zone); | 444 | mutex_unlock(&zone_pageset_mutex); |
434 | setup_per_zone_wmarks(); | 445 | setup_per_zone_wmarks(); |
435 | calculate_zone_inactive_ratio(zone); | 446 | calculate_zone_inactive_ratio(zone); |
436 | if (onlined_pages) { | 447 | if (onlined_pages) { |
@@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
438 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 449 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
439 | } | 450 | } |
440 | 451 | ||
441 | if (need_zonelists_rebuild) | 452 | vm_total_pages = nr_free_pagecache_pages(); |
442 | build_all_zonelists(); | ||
443 | else | ||
444 | vm_total_pages = nr_free_pagecache_pages(); | ||
445 | 453 | ||
446 | writeback_set_ratelimit(); | 454 | writeback_set_ratelimit(); |
447 | 455 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 595d0ac211e2..21c52d2d8624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2572 | NUMA_ZONELIST_ORDER_LEN); | 2572 | NUMA_ZONELIST_ORDER_LEN); |
2573 | user_zonelist_order = oldval; | 2573 | user_zonelist_order = oldval; |
2574 | } else if (oldval != user_zonelist_order) | 2574 | } else if (oldval != user_zonelist_order) |
2575 | build_all_zonelists(); | 2575 | build_all_zonelists(NULL); |
2576 | } | 2576 | } |
2577 | out: | 2577 | out: |
2578 | mutex_unlock(&zl_order_mutex); | 2578 | mutex_unlock(&zl_order_mutex); |
@@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2922 | */ | 2922 | */ |
2923 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 2923 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
2924 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 2924 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
2925 | static void setup_zone_pageset(struct zone *zone); | ||
2925 | 2926 | ||
2926 | /* return values int ....just for stop_machine() */ | 2927 | /* return values int ....just for stop_machine() */ |
2927 | static int __build_all_zonelists(void *dummy) | 2928 | static __init_refok int __build_all_zonelists(void *data) |
2928 | { | 2929 | { |
2929 | int nid; | 2930 | int nid; |
2930 | int cpu; | 2931 | int cpu; |
@@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy) | |||
2939 | build_zonelist_cache(pgdat); | 2940 | build_zonelist_cache(pgdat); |
2940 | } | 2941 | } |
2941 | 2942 | ||
2943 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
2944 | /* Setup real pagesets for the new zone */ | ||
2945 | if (data) { | ||
2946 | struct zone *zone = data; | ||
2947 | setup_zone_pageset(zone); | ||
2948 | } | ||
2949 | #endif | ||
2950 | |||
2942 | /* | 2951 | /* |
2943 | * Initialize the boot_pagesets that are going to be used | 2952 | * Initialize the boot_pagesets that are going to be used |
2944 | * for bootstrapping processors. The real pagesets for | 2953 | * for bootstrapping processors. The real pagesets for |
@@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy) | |||
2958 | return 0; | 2967 | return 0; |
2959 | } | 2968 | } |
2960 | 2969 | ||
2961 | void build_all_zonelists(void) | 2970 | void build_all_zonelists(void *data) |
2962 | { | 2971 | { |
2963 | set_zonelist_order(); | 2972 | set_zonelist_order(); |
2964 | 2973 | ||
@@ -2969,7 +2978,7 @@ void build_all_zonelists(void) | |||
2969 | } else { | 2978 | } else { |
2970 | /* we have to stop all cpus to guarantee there is no user | 2979 | /* we have to stop all cpus to guarantee there is no user |
2971 | of zonelist */ | 2980 | of zonelist */ |
2972 | stop_machine(__build_all_zonelists, NULL, NULL); | 2981 | stop_machine(__build_all_zonelists, data, NULL); |
2973 | /* cpuset refresh routine should be here */ | 2982 | /* cpuset refresh routine should be here */ |
2974 | } | 2983 | } |
2975 | vm_total_pages = nr_free_pagecache_pages(); | 2984 | vm_total_pages = nr_free_pagecache_pages(); |