diff options
author | Lai Jiangshan <laijs@cn.fujitsu.com> | 2012-12-11 19:01:03 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-11 20:22:23 -0500 |
commit | d9713679dbd2a6ecb840cd5b65a3ec555c1ec3d4 (patch) | |
tree | b998ae43dc18e7459cab5a2cb5ba12900c96c7e1 | |
parent | 6dcd73d7011ba9046f9b98e7f7c9d958f5810e6b (diff) |
memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]
Currently memory_hotplug only manages node_states[N_HIGH_MEMORY]; it
forgets to manage node_states[N_NORMAL_MEMORY]. This may cause
node_states[N_NORMAL_MEMORY] to become incorrect.
For example, if a node is empty before online and we online memory which is
in ZONE_NORMAL, then after online node_states[N_HIGH_MEMORY] is correct
but node_states[N_NORMAL_MEMORY] is incorrect: the online code doesn't add
the newly onlined node to node_states[N_NORMAL_MEMORY].
The same thing will happen when offlining (the offline code doesn't clear
the node from node_states[N_NORMAL_MEMORY] when needed). Some memory
management code depends on node_states[N_NORMAL_MEMORY], so we have to fix
up node_states[N_NORMAL_MEMORY].
We add node_states_check_changes_online() and
node_states_check_changes_offline() to detect whether
node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed
while hotplugging.
Also add @status_change_nid_normal to struct memory_notify, so that the
memory hotplug callbacks know whether node_states[N_NORMAL_MEMORY] is
changed. (We could add a @flags field and reuse @status_change_nid instead
of introducing @status_change_nid_normal, but that would add much more
complexity to the memory hotplug callbacks in every subsystem. So introducing
@status_change_nid_normal is better, and it doesn't change the semantics of
@status_change_nid.)
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Rob Landley <rob@landley.net>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/memory-hotplug.txt | 5 | ||||
-rw-r--r-- | include/linux/memory.h | 1 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 136 |
3 files changed, 125 insertions, 17 deletions
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 6d0c2519cf47..6e6cbc78f329 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt | |||
@@ -377,15 +377,18 @@ The third argument is passed by pointer of struct memory_notify. | |||
377 | struct memory_notify { | 377 | struct memory_notify { |
378 | unsigned long start_pfn; | 378 | unsigned long start_pfn; |
379 | unsigned long nr_pages; | 379 | unsigned long nr_pages; |
380 | int status_change_nid_normal; | ||
380 | int status_change_nid; | 381 | int status_change_nid; |
381 | } | 382 | } |
382 | 383 | ||
383 | start_pfn is start_pfn of online/offline memory. | 384 | start_pfn is start_pfn of online/offline memory. |
384 | nr_pages is # of pages of online/offline memory. | 385 | nr_pages is # of pages of online/offline memory. |
386 | status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask | ||
387 | is (will be) set/clear, if this is -1, then nodemask status is not changed. | ||
385 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) | 388 | status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) |
386 | set/clear. It means a new(memoryless) node gets new memory by online and a | 389 | set/clear. It means a new(memoryless) node gets new memory by online and a |
387 | node loses all memory. If this is -1, then nodemask status is not changed. | 390 | node loses all memory. If this is -1, then nodemask status is not changed. |
388 | If status_changed_nid >= 0, callback should create/discard structures for the | 391 | If status_changed_nid* >= 0, callback should create/discard structures for the |
389 | node if necessary. | 392 | node if necessary. |
390 | 393 | ||
391 | -------------- | 394 | -------------- |
diff --git a/include/linux/memory.h b/include/linux/memory.h index ff9a9f8e0ed9..a09216d0dcc7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h | |||
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn); | |||
53 | struct memory_notify { | 53 | struct memory_notify { |
54 | unsigned long start_pfn; | 54 | unsigned long start_pfn; |
55 | unsigned long nr_pages; | 55 | unsigned long nr_pages; |
56 | int status_change_nid_normal; | ||
56 | int status_change_nid; | 57 | int status_change_nid; |
57 | }; | 58 | }; |
58 | 59 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ec2f199cc5f7..72195602ded5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -460,6 +460,53 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
460 | return 0; | 460 | return 0; |
461 | } | 461 | } |
462 | 462 | ||
463 | /* check which state of node_states will be changed when online memory */ | ||
464 | static void node_states_check_changes_online(unsigned long nr_pages, | ||
465 | struct zone *zone, struct memory_notify *arg) | ||
466 | { | ||
467 | int nid = zone_to_nid(zone); | ||
468 | enum zone_type zone_last = ZONE_NORMAL; | ||
469 | |||
470 | /* | ||
471 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
472 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | ||
473 | * | ||
474 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
475 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
476 | */ | ||
477 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | ||
478 | zone_last = ZONE_MOVABLE; | ||
479 | |||
480 | /* | ||
481 | * if the memory to be online is in a zone of 0...zone_last, and | ||
482 | * the zones of 0...zone_last don't have memory before online, we will | ||
483 | * need to set the node to node_states[N_NORMAL_MEMORY] after | ||
484 | * the memory is online. | ||
485 | */ | ||
486 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) | ||
487 | arg->status_change_nid_normal = nid; | ||
488 | else | ||
489 | arg->status_change_nid_normal = -1; | ||
490 | |||
491 | /* | ||
492 | * if the node don't have memory befor online, we will need to | ||
493 | * set the node to node_states[N_HIGH_MEMORY] after the memory | ||
494 | * is online. | ||
495 | */ | ||
496 | if (!node_state(nid, N_HIGH_MEMORY)) | ||
497 | arg->status_change_nid = nid; | ||
498 | else | ||
499 | arg->status_change_nid = -1; | ||
500 | } | ||
501 | |||
502 | static void node_states_set_node(int node, struct memory_notify *arg) | ||
503 | { | ||
504 | if (arg->status_change_nid_normal >= 0) | ||
505 | node_set_state(node, N_NORMAL_MEMORY); | ||
506 | |||
507 | node_set_state(node, N_HIGH_MEMORY); | ||
508 | } | ||
509 | |||
463 | 510 | ||
464 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | 511 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) |
465 | { | 512 | { |
@@ -471,13 +518,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
471 | struct memory_notify arg; | 518 | struct memory_notify arg; |
472 | 519 | ||
473 | lock_memory_hotplug(); | 520 | lock_memory_hotplug(); |
521 | /* | ||
522 | * This doesn't need a lock to do pfn_to_page(). | ||
523 | * The section can't be removed here because of the | ||
524 | * memory_block->state_mutex. | ||
525 | */ | ||
526 | zone = page_zone(pfn_to_page(pfn)); | ||
527 | |||
474 | arg.start_pfn = pfn; | 528 | arg.start_pfn = pfn; |
475 | arg.nr_pages = nr_pages; | 529 | arg.nr_pages = nr_pages; |
476 | arg.status_change_nid = -1; | 530 | node_states_check_changes_online(nr_pages, zone, &arg); |
477 | 531 | ||
478 | nid = page_to_nid(pfn_to_page(pfn)); | 532 | nid = page_to_nid(pfn_to_page(pfn)); |
479 | if (node_present_pages(nid) == 0) | ||
480 | arg.status_change_nid = nid; | ||
481 | 533 | ||
482 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 534 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
483 | ret = notifier_to_errno(ret); | 535 | ret = notifier_to_errno(ret); |
@@ -487,12 +539,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
487 | return ret; | 539 | return ret; |
488 | } | 540 | } |
489 | /* | 541 | /* |
490 | * This doesn't need a lock to do pfn_to_page(). | ||
491 | * The section can't be removed here because of the | ||
492 | * memory_block->state_mutex. | ||
493 | */ | ||
494 | zone = page_zone(pfn_to_page(pfn)); | ||
495 | /* | ||
496 | * If this zone is not populated, then it is not in zonelist. | 542 | * If this zone is not populated, then it is not in zonelist. |
497 | * This means the page allocator ignores this zone. | 543 | * This means the page allocator ignores this zone. |
498 | * So, zonelist must be updated after online. | 544 | * So, zonelist must be updated after online. |
@@ -521,7 +567,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
521 | zone->present_pages += onlined_pages; | 567 | zone->present_pages += onlined_pages; |
522 | zone->zone_pgdat->node_present_pages += onlined_pages; | 568 | zone->zone_pgdat->node_present_pages += onlined_pages; |
523 | if (onlined_pages) { | 569 | if (onlined_pages) { |
524 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 570 | node_states_set_node(zone_to_nid(zone), &arg); |
525 | if (need_zonelists_rebuild) | 571 | if (need_zonelists_rebuild) |
526 | build_all_zonelists(NULL, NULL); | 572 | build_all_zonelists(NULL, NULL); |
527 | else | 573 | else |
@@ -871,6 +917,67 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
871 | return offlined; | 917 | return offlined; |
872 | } | 918 | } |
873 | 919 | ||
920 | /* check which state of node_states will be changed when offline memory */ | ||
921 | static void node_states_check_changes_offline(unsigned long nr_pages, | ||
922 | struct zone *zone, struct memory_notify *arg) | ||
923 | { | ||
924 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
925 | unsigned long present_pages = 0; | ||
926 | enum zone_type zt, zone_last = ZONE_NORMAL; | ||
927 | |||
928 | /* | ||
929 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
930 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | ||
931 | * | ||
932 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | ||
933 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
934 | */ | ||
935 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | ||
936 | zone_last = ZONE_MOVABLE; | ||
937 | |||
938 | /* | ||
939 | * check whether node_states[N_NORMAL_MEMORY] will be changed. | ||
940 | * If the memory to be offline is in a zone of 0...zone_last, | ||
941 | * and it is the last present memory, 0...zone_last will | ||
942 | * become empty after offline , thus we can determind we will | ||
943 | * need to clear the node from node_states[N_NORMAL_MEMORY]. | ||
944 | */ | ||
945 | for (zt = 0; zt <= zone_last; zt++) | ||
946 | present_pages += pgdat->node_zones[zt].present_pages; | ||
947 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
948 | arg->status_change_nid_normal = zone_to_nid(zone); | ||
949 | else | ||
950 | arg->status_change_nid_normal = -1; | ||
951 | |||
952 | /* | ||
953 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | ||
954 | */ | ||
955 | zone_last = ZONE_MOVABLE; | ||
956 | |||
957 | /* | ||
958 | * check whether node_states[N_HIGH_MEMORY] will be changed | ||
959 | * If we try to offline the last present @nr_pages from the node, | ||
960 | * we can determind we will need to clear the node from | ||
961 | * node_states[N_HIGH_MEMORY]. | ||
962 | */ | ||
963 | for (; zt <= zone_last; zt++) | ||
964 | present_pages += pgdat->node_zones[zt].present_pages; | ||
965 | if (nr_pages >= present_pages) | ||
966 | arg->status_change_nid = zone_to_nid(zone); | ||
967 | else | ||
968 | arg->status_change_nid = -1; | ||
969 | } | ||
970 | |||
971 | static void node_states_clear_node(int node, struct memory_notify *arg) | ||
972 | { | ||
973 | if (arg->status_change_nid_normal >= 0) | ||
974 | node_clear_state(node, N_NORMAL_MEMORY); | ||
975 | |||
976 | if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && | ||
977 | (arg->status_change_nid >= 0)) | ||
978 | node_clear_state(node, N_HIGH_MEMORY); | ||
979 | } | ||
980 | |||
874 | static int __ref __offline_pages(unsigned long start_pfn, | 981 | static int __ref __offline_pages(unsigned long start_pfn, |
875 | unsigned long end_pfn, unsigned long timeout) | 982 | unsigned long end_pfn, unsigned long timeout) |
876 | { | 983 | { |
@@ -905,9 +1012,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
905 | 1012 | ||
906 | arg.start_pfn = start_pfn; | 1013 | arg.start_pfn = start_pfn; |
907 | arg.nr_pages = nr_pages; | 1014 | arg.nr_pages = nr_pages; |
908 | arg.status_change_nid = -1; | 1015 | node_states_check_changes_offline(nr_pages, zone, &arg); |
909 | if (nr_pages >= node_present_pages(node)) | ||
910 | arg.status_change_nid = node; | ||
911 | 1016 | ||
912 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); | 1017 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); |
913 | ret = notifier_to_errno(ret); | 1018 | ret = notifier_to_errno(ret); |
@@ -980,10 +1085,9 @@ repeat: | |||
980 | } else | 1085 | } else |
981 | zone_pcp_update(zone); | 1086 | zone_pcp_update(zone); |
982 | 1087 | ||
983 | if (!node_present_pages(node)) { | 1088 | node_states_clear_node(node, &arg); |
984 | node_clear_state(node, N_HIGH_MEMORY); | 1089 | if (arg.status_change_nid >= 0) |
985 | kswapd_stop(node); | 1090 | kswapd_stop(node); |
986 | } | ||
987 | 1091 | ||
988 | vm_total_pages = nr_free_pagecache_pages(); | 1092 | vm_total_pages = nr_free_pagecache_pages(); |
989 | writeback_set_ratelimit(); | 1093 | writeback_set_ratelimit(); |