aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Haicheng Li <haicheng.li@linux.intel.com> 2010-05-24 17:32:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-05-25 11:07:01 -0400
commit 1f522509c77a5dea8dc384b735314f03908a6415 (patch)
tree 4b848527b90877a8a64c46e8e2d76723405c319d
parent 319774e25fa4b7641bdc3b0a464dd84e62103347 (diff)
mem-hotplug: avoid multiple zones sharing the same bootstrapping boot_pageset
For each newly populated zone of a hot-added node, we need to update its pagesets with dynamically allocated per_cpu_pageset structs for all possible CPUs: 1) Detach zone->pageset from the shared boot_pageset at the end of __build_all_zonelists(). 2) Use a mutex to protect zone->pageset while it is still shared in online_pages(). Otherwise, multiple zones of different nodes would share the same bootstrapping boot_pageset for the same CPU, which will eventually cause the kernel panic below: ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:1239! invalid opcode: 0000 [#1] SMP ... Call Trace: [<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0 [<ffffffff81162e67>] alloc_pages_current+0x87/0xd0 [<ffffffff81128407>] __page_cache_alloc+0x67/0x70 [<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260 [<ffffffff81132751>] ra_submit+0x21/0x30 [<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0 [<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0 [<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670 [<ffffffff81266cfa>] nfs_file_read+0xca/0x130 [<ffffffff8117b20a>] do_sync_read+0xfa/0x140 [<ffffffff8117bf75>] vfs_read+0xb5/0x1a0 [<ffffffff8117c151>] sys_read+0x51/0x80 [<ffffffff8103c032>] system_call_fastpath+0x16/0x1b RIP [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900 RSP <ffff88000d1e78a8> ---[ end trace 4bda28328b9990db ] [akpm@linux-foundation.org: merge fix] Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com> Reviewed-by: Andi Kleen <andi.kleen@intel.com> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/mmzone.h 2
-rw-r--r-- init/main.c 2
-rw-r--r-- kernel/cpu.c 2
-rw-r--r-- mm/memory_hotplug.c 18
-rw-r--r-- mm/page_alloc.c 17
5 files changed, 29 insertions, 12 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f6f2c505fa7e..a367ed5bb3fe 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -652,7 +652,7 @@ typedef struct pglist_data {
652 652
653void get_zone_counts(unsigned long *active, unsigned long *inactive, 653void get_zone_counts(unsigned long *active, unsigned long *inactive,
654 unsigned long *free); 654 unsigned long *free);
655void build_all_zonelists(void); 655void build_all_zonelists(void *data);
656void wakeup_kswapd(struct zone *zone, int order); 656void wakeup_kswapd(struct zone *zone, int order);
657int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 657int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
658 int classzone_idx, int alloc_flags); 658 int classzone_idx, int alloc_flags);
diff --git a/init/main.c b/init/main.c
index 22881b5e95e3..3bdb152f412f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void)
567 setup_per_cpu_areas(); 567 setup_per_cpu_areas();
568 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 568 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
569 569
570 build_all_zonelists(); 570 build_all_zonelists(NULL);
571 page_alloc_init(); 571 page_alloc_init();
572 572
573 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 573 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a3fbcc0a0abc..3e8b3ba27175 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -358,7 +358,7 @@ int __cpuinit cpu_up(unsigned int cpu)
358 } 358 }
359 359
360 if (pgdat->node_zonelists->_zonerefs->zone == NULL) 360 if (pgdat->node_zonelists->_zonerefs->zone == NULL)
361 build_all_zonelists(); 361 build_all_zonelists(NULL);
362#endif 362#endif
363 363
364 cpu_maps_update_begin(); 364 cpu_maps_update_begin();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 85eb4d342ac5..089cc97aed3c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
389 int nid; 389 int nid;
390 int ret; 390 int ret;
391 struct memory_notify arg; 391 struct memory_notify arg;
392 /*
393 * mutex to protect zone->pageset when it's still shared
394 * in onlined_pages()
395 */
396 static DEFINE_MUTEX(zone_pageset_mutex);
392 397
393 arg.start_pfn = pfn; 398 arg.start_pfn = pfn;
394 arg.nr_pages = nr_pages; 399 arg.nr_pages = nr_pages;
@@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
415 * This means the page allocator ignores this zone. 420 * This means the page allocator ignores this zone.
416 * So, zonelist must be updated after online. 421 * So, zonelist must be updated after online.
417 */ 422 */
423 mutex_lock(&zone_pageset_mutex);
418 if (!populated_zone(zone)) 424 if (!populated_zone(zone))
419 need_zonelists_rebuild = 1; 425 need_zonelists_rebuild = 1;
420 426
421 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 427 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
422 online_pages_range); 428 online_pages_range);
423 if (ret) { 429 if (ret) {
430 mutex_unlock(&zone_pageset_mutex);
424 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 431 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
425 nr_pages, pfn); 432 nr_pages, pfn);
426 memory_notify(MEM_CANCEL_ONLINE, &arg); 433 memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 436
430 zone->present_pages += onlined_pages; 437 zone->present_pages += onlined_pages;
431 zone->zone_pgdat->node_present_pages += onlined_pages; 438 zone->zone_pgdat->node_present_pages += onlined_pages;
439 if (need_zonelists_rebuild)
440 build_all_zonelists(zone);
441 else
442 zone_pcp_update(zone);
432 443
433 zone_pcp_update(zone); 444 mutex_unlock(&zone_pageset_mutex);
434 setup_per_zone_wmarks(); 445 setup_per_zone_wmarks();
435 calculate_zone_inactive_ratio(zone); 446 calculate_zone_inactive_ratio(zone);
436 if (onlined_pages) { 447 if (onlined_pages) {
@@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
438 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 449 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
439 } 450 }
440 451
441 if (need_zonelists_rebuild) 452 vm_total_pages = nr_free_pagecache_pages();
442 build_all_zonelists();
443 else
444 vm_total_pages = nr_free_pagecache_pages();
445 453
446 writeback_set_ratelimit(); 454 writeback_set_ratelimit();
447 455
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 595d0ac211e2..21c52d2d8624 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2572 NUMA_ZONELIST_ORDER_LEN); 2572 NUMA_ZONELIST_ORDER_LEN);
2573 user_zonelist_order = oldval; 2573 user_zonelist_order = oldval;
2574 } else if (oldval != user_zonelist_order) 2574 } else if (oldval != user_zonelist_order)
2575 build_all_zonelists(); 2575 build_all_zonelists(NULL);
2576 } 2576 }
2577out: 2577out:
2578 mutex_unlock(&zl_order_mutex); 2578 mutex_unlock(&zl_order_mutex);
@@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2922 */ 2922 */
2923static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 2923static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2924static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 2924static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2925static void setup_zone_pageset(struct zone *zone);
2925 2926
2926/* return values int ....just for stop_machine() */ 2927/* return values int ....just for stop_machine() */
2927static int __build_all_zonelists(void *dummy) 2928static __init_refok int __build_all_zonelists(void *data)
2928{ 2929{
2929 int nid; 2930 int nid;
2930 int cpu; 2931 int cpu;
@@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy)
2939 build_zonelist_cache(pgdat); 2940 build_zonelist_cache(pgdat);
2940 } 2941 }
2941 2942
2943#ifdef CONFIG_MEMORY_HOTPLUG
2944 /* Setup real pagesets for the new zone */
2945 if (data) {
2946 struct zone *zone = data;
2947 setup_zone_pageset(zone);
2948 }
2949#endif
2950
2942 /* 2951 /*
2943 * Initialize the boot_pagesets that are going to be used 2952 * Initialize the boot_pagesets that are going to be used
2944 * for bootstrapping processors. The real pagesets for 2953 * for bootstrapping processors. The real pagesets for
@@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy)
2958 return 0; 2967 return 0;
2959} 2968}
2960 2969
2961void build_all_zonelists(void) 2970void build_all_zonelists(void *data)
2962{ 2971{
2963 set_zonelist_order(); 2972 set_zonelist_order();
2964 2973
@@ -2969,7 +2978,7 @@ void build_all_zonelists(void)
2969 } else { 2978 } else {
2970 /* we have to stop all cpus to guarantee there is no user 2979 /* we have to stop all cpus to guarantee there is no user
2971 of zonelist */ 2980 of zonelist */
2972 stop_machine(__build_all_zonelists, NULL, NULL); 2981 stop_machine(__build_all_zonelists, data, NULL);
2973 /* cpuset refresh routine should be here */ 2982 /* cpuset refresh routine should be here */
2974 } 2983 }
2975 vm_total_pages = nr_free_pagecache_pages(); 2984 vm_total_pages = nr_free_pagecache_pages();