diff options
| author | Haicheng Li <haicheng.li@linux.intel.com> | 2010-05-24 17:32:51 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-25 11:07:01 -0400 |
| commit | 1f522509c77a5dea8dc384b735314f03908a6415 (patch) | |
| tree | 4b848527b90877a8a64c46e8e2d76723405c319d /mm | |
| parent | 319774e25fa4b7641bdc3b0a464dd84e62103347 (diff) | |
mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset
For each newly populated zone of a hot-added node, we need to update its pagesets
with dynamically allocated per_cpu_pageset structs for all possible CPUs:
1) Detach zone->pageset from the shared boot_pageset
at end of __build_all_zonelists().
2) Use a mutex to protect zone->pageset while it is still
shared in online_pages()
Otherwise, multiple zones of different nodes would share the same bootstrapping
boot_pageset for the same CPU, which eventually causes the kernel panic below:
------------[ cut here ]------------
kernel BUG at mm/page_alloc.c:1239!
invalid opcode: 0000 [#1] SMP
...
Call Trace:
[<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0
[<ffffffff81162e67>] alloc_pages_current+0x87/0xd0
[<ffffffff81128407>] __page_cache_alloc+0x67/0x70
[<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260
[<ffffffff81132751>] ra_submit+0x21/0x30
[<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0
[<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0
[<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670
[<ffffffff81266cfa>] nfs_file_read+0xca/0x130
[<ffffffff8117b20a>] do_sync_read+0xfa/0x140
[<ffffffff8117bf75>] vfs_read+0xb5/0x1a0
[<ffffffff8117c151>] sys_read+0x51/0x80
[<ffffffff8103c032>] system_call_fastpath+0x16/0x1b
RIP [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900
RSP <ffff88000d1e78a8>
---[ end trace 4bda28328b9990db ]
[akpm@linux-foundation.org: merge fix]
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Andi Kleen <andi.kleen@intel.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/memory_hotplug.c | 18 | ||||
| -rw-r--r-- | mm/page_alloc.c | 17 |
2 files changed, 26 insertions, 9 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85eb4d342ac5..089cc97aed3c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 389 | int nid; | 389 | int nid; |
| 390 | int ret; | 390 | int ret; |
| 391 | struct memory_notify arg; | 391 | struct memory_notify arg; |
| 392 | /* | ||
| 393 | * mutex to protect zone->pageset when it's still shared | ||
| 394 | * in onlined_pages() | ||
| 395 | */ | ||
| 396 | static DEFINE_MUTEX(zone_pageset_mutex); | ||
| 392 | 397 | ||
| 393 | arg.start_pfn = pfn; | 398 | arg.start_pfn = pfn; |
| 394 | arg.nr_pages = nr_pages; | 399 | arg.nr_pages = nr_pages; |
| @@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 415 | * This means the page allocator ignores this zone. | 420 | * This means the page allocator ignores this zone. |
| 416 | * So, zonelist must be updated after online. | 421 | * So, zonelist must be updated after online. |
| 417 | */ | 422 | */ |
| 423 | mutex_lock(&zone_pageset_mutex); | ||
| 418 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
| 419 | need_zonelists_rebuild = 1; | 425 | need_zonelists_rebuild = 1; |
| 420 | 426 | ||
| 421 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 427 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
| 422 | online_pages_range); | 428 | online_pages_range); |
| 423 | if (ret) { | 429 | if (ret) { |
| 430 | mutex_unlock(&zone_pageset_mutex); | ||
| 424 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 431 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
| 425 | nr_pages, pfn); | 432 | nr_pages, pfn); |
| 426 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 433 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| @@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 429 | 436 | ||
| 430 | zone->present_pages += onlined_pages; | 437 | zone->present_pages += onlined_pages; |
| 431 | zone->zone_pgdat->node_present_pages += onlined_pages; | 438 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 439 | if (need_zonelists_rebuild) | ||
| 440 | build_all_zonelists(zone); | ||
| 441 | else | ||
| 442 | zone_pcp_update(zone); | ||
| 432 | 443 | ||
| 433 | zone_pcp_update(zone); | 444 | mutex_unlock(&zone_pageset_mutex); |
| 434 | setup_per_zone_wmarks(); | 445 | setup_per_zone_wmarks(); |
| 435 | calculate_zone_inactive_ratio(zone); | 446 | calculate_zone_inactive_ratio(zone); |
| 436 | if (onlined_pages) { | 447 | if (onlined_pages) { |
| @@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 438 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 449 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
| 439 | } | 450 | } |
| 440 | 451 | ||
| 441 | if (need_zonelists_rebuild) | 452 | vm_total_pages = nr_free_pagecache_pages(); |
| 442 | build_all_zonelists(); | ||
| 443 | else | ||
| 444 | vm_total_pages = nr_free_pagecache_pages(); | ||
| 445 | 453 | ||
| 446 | writeback_set_ratelimit(); | 454 | writeback_set_ratelimit(); |
| 447 | 455 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 595d0ac211e2..21c52d2d8624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2572 | NUMA_ZONELIST_ORDER_LEN); | 2572 | NUMA_ZONELIST_ORDER_LEN); |
| 2573 | user_zonelist_order = oldval; | 2573 | user_zonelist_order = oldval; |
| 2574 | } else if (oldval != user_zonelist_order) | 2574 | } else if (oldval != user_zonelist_order) |
| 2575 | build_all_zonelists(); | 2575 | build_all_zonelists(NULL); |
| 2576 | } | 2576 | } |
| 2577 | out: | 2577 | out: |
| 2578 | mutex_unlock(&zl_order_mutex); | 2578 | mutex_unlock(&zl_order_mutex); |
| @@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
| 2922 | */ | 2922 | */ |
| 2923 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 2923 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
| 2924 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 2924 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
| 2925 | static void setup_zone_pageset(struct zone *zone); | ||
| 2925 | 2926 | ||
| 2926 | /* return values int ....just for stop_machine() */ | 2927 | /* return values int ....just for stop_machine() */ |
| 2927 | static int __build_all_zonelists(void *dummy) | 2928 | static __init_refok int __build_all_zonelists(void *data) |
| 2928 | { | 2929 | { |
| 2929 | int nid; | 2930 | int nid; |
| 2930 | int cpu; | 2931 | int cpu; |
| @@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy) | |||
| 2939 | build_zonelist_cache(pgdat); | 2940 | build_zonelist_cache(pgdat); |
| 2940 | } | 2941 | } |
| 2941 | 2942 | ||
| 2943 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 2944 | /* Setup real pagesets for the new zone */ | ||
| 2945 | if (data) { | ||
| 2946 | struct zone *zone = data; | ||
| 2947 | setup_zone_pageset(zone); | ||
| 2948 | } | ||
| 2949 | #endif | ||
| 2950 | |||
| 2942 | /* | 2951 | /* |
| 2943 | * Initialize the boot_pagesets that are going to be used | 2952 | * Initialize the boot_pagesets that are going to be used |
| 2944 | * for bootstrapping processors. The real pagesets for | 2953 | * for bootstrapping processors. The real pagesets for |
| @@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy) | |||
| 2958 | return 0; | 2967 | return 0; |
| 2959 | } | 2968 | } |
| 2960 | 2969 | ||
| 2961 | void build_all_zonelists(void) | 2970 | void build_all_zonelists(void *data) |
| 2962 | { | 2971 | { |
| 2963 | set_zonelist_order(); | 2972 | set_zonelist_order(); |
| 2964 | 2973 | ||
| @@ -2969,7 +2978,7 @@ void build_all_zonelists(void) | |||
| 2969 | } else { | 2978 | } else { |
| 2970 | /* we have to stop all cpus to guarantee there is no user | 2979 | /* we have to stop all cpus to guarantee there is no user |
| 2971 | of zonelist */ | 2980 | of zonelist */ |
| 2972 | stop_machine(__build_all_zonelists, NULL, NULL); | 2981 | stop_machine(__build_all_zonelists, data, NULL); |
| 2973 | /* cpuset refresh routine should be here */ | 2982 | /* cpuset refresh routine should be here */ |
| 2974 | } | 2983 | } |
| 2975 | vm_total_pages = nr_free_pagecache_pages(); | 2984 | vm_total_pages = nr_free_pagecache_pages(); |
