aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLai Jiangshan <laijs@cn.fujitsu.com>2012-12-11 19:01:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-11 20:22:23 -0500
commitd9713679dbd2a6ecb840cd5b65a3ec555c1ec3d4 (patch)
treeb998ae43dc18e7459cab5a2cb5ba12900c96c7e1
parent6dcd73d7011ba9046f9b98e7f7c9d958f5810e6b (diff)
memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]
Currently memory_hotplug only manages node_states[N_HIGH_MEMORY]; it forgets to manage node_states[N_NORMAL_MEMORY]. This may cause node_states[N_NORMAL_MEMORY] to become incorrect. For example, suppose a node is empty before online, and we online memory which is in ZONE_NORMAL. After online, node_states[N_HIGH_MEMORY] is correct, but node_states[N_NORMAL_MEMORY] is incorrect, because the online code doesn't set the newly onlined node in node_states[N_NORMAL_MEMORY]. The same thing will happen when offlining (the offline code doesn't clear the node from node_states[N_NORMAL_MEMORY] when needed). Some memory management code depends on node_states[N_NORMAL_MEMORY], so we have to fix up node_states[N_NORMAL_MEMORY]. We add node_states_check_changes_online() and node_states_check_changes_offline() to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed while hotplugging. Also add @status_change_nid_normal to struct memory_notify, so that the memory hotplug callbacks know whether node_states[N_NORMAL_MEMORY] is changed. (We could add a @flags and reuse @status_change_nid instead of introducing @status_change_nid_normal, but it would add much more complexity to the memory hotplug callback in every subsystem. So introducing @status_change_nid_normal is better, and it doesn't change the semantics of @status_change_nid.) Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Rob Landley <rob@landley.net> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Greg Kroah-Hartman <gregkh@suse.de> Cc: Mel Gorman <mgorman@suse.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/memory-hotplug.txt5
-rw-r--r--include/linux/memory.h1
-rw-r--r--mm/memory_hotplug.c136
3 files changed, 125 insertions, 17 deletions
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6d0c2519cf47..6e6cbc78f329 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -377,15 +377,18 @@ The third argument is passed by pointer of struct memory_notify.
377struct memory_notify { 377struct memory_notify {
378 unsigned long start_pfn; 378 unsigned long start_pfn;
379 unsigned long nr_pages; 379 unsigned long nr_pages;
380 int status_change_nid_normal;
380 int status_change_nid; 381 int status_change_nid;
381} 382}
382 383
383start_pfn is start_pfn of online/offline memory. 384start_pfn is start_pfn of online/offline memory.
384nr_pages is # of pages of online/offline memory. 385nr_pages is # of pages of online/offline memory.
386status_change_nid_normal is set to the node id when N_NORMAL_MEMORY of the nodemask
387is (will be) set/cleared; if this is -1, then the nodemask status is not changed.
385status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) 388status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
386set/clear. It means a new(memoryless) node gets new memory by online and a 389set/clear. It means a new(memoryless) node gets new memory by online and a
387node loses all memory. If this is -1, then nodemask status is not changed. 390node loses all memory. If this is -1, then nodemask status is not changed.
388If status_changed_nid >= 0, callback should create/discard structures for the 391If status_changed_nid* >= 0, callback should create/discard structures for the
389node if necessary. 392node if necessary.
390 393
391-------------- 394--------------
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ff9a9f8e0ed9..a09216d0dcc7 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
53struct memory_notify { 53struct memory_notify {
54 unsigned long start_pfn; 54 unsigned long start_pfn;
55 unsigned long nr_pages; 55 unsigned long nr_pages;
56 int status_change_nid_normal;
56 int status_change_nid; 57 int status_change_nid;
57}; 58};
58 59
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ec2f199cc5f7..72195602ded5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -460,6 +460,53 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 460 return 0;
461} 461}
462 462
463/* Check which node_states entries will change when this memory is onlined; the result is stored in @arg (-1 means no change). @nr_pages is not used here. */
464static void node_states_check_changes_online(unsigned long nr_pages,
465 struct zone *zone, struct memory_notify *arg)
466{
467 int nid = zone_to_nid(zone);
468 enum zone_type zone_last = ZONE_NORMAL;
469
470 /*
471 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
472 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
473 *
474 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
475 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
476 */
477 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
478 zone_last = ZONE_MOVABLE;
479
480 /*
481 * If the memory to be onlined is in a zone of 0...zone_last, and
482 * the zones of 0...zone_last don't have memory before online (the node
483 * is not in node_states[N_NORMAL_MEMORY]), we will need to set the node
484 * in node_states[N_NORMAL_MEMORY] after the memory is online.
485 */
486 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
487 arg->status_change_nid_normal = nid;
488 else
489 arg->status_change_nid_normal = -1;
490
491 /*
492 * If the node doesn't have memory before online (it is not in
493 * node_states[N_HIGH_MEMORY]), we will need to set the node in
494 * node_states[N_HIGH_MEMORY] after the memory is online.
495 */
496 if (!node_state(nid, N_HIGH_MEMORY))
497 arg->status_change_nid = nid;
498 else
499 arg->status_change_nid = -1;
500}
501
/* Set @node in node_states after memory was onlined, using the change flags recorded in @arg. */
502static void node_states_set_node(int node, struct memory_notify *arg)
503{
504 if (arg->status_change_nid_normal >= 0) /* only when a N_NORMAL_MEMORY change was recorded in @arg */
505 node_set_state(node, N_NORMAL_MEMORY);
506
507 node_set_state(node, N_HIGH_MEMORY); /* set unconditionally; arg->status_change_nid is not consulted */
508}
509
463 510
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 511int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
465{ 512{
@@ -471,13 +518,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 518 struct memory_notify arg;
472 519
473 lock_memory_hotplug(); 520 lock_memory_hotplug();
521 /*
522 * This doesn't need a lock to do pfn_to_page().
523 * The section can't be removed here because of the
524 * memory_block->state_mutex.
525 */
526 zone = page_zone(pfn_to_page(pfn));
527
474 arg.start_pfn = pfn; 528 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 529 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 530 node_states_check_changes_online(nr_pages, zone, &arg);
477 531
478 nid = page_to_nid(pfn_to_page(pfn)); 532 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 533
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 534 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 535 ret = notifier_to_errno(ret);
@@ -487,12 +539,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 539 return ret;
488 } 540 }
489 /* 541 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 542 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 543 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 544 * So, zonelist must be updated after online.
@@ -521,7 +567,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
521 zone->present_pages += onlined_pages; 567 zone->present_pages += onlined_pages;
522 zone->zone_pgdat->node_present_pages += onlined_pages; 568 zone->zone_pgdat->node_present_pages += onlined_pages;
523 if (onlined_pages) { 569 if (onlined_pages) {
524 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 570 node_states_set_node(zone_to_nid(zone), &arg);
525 if (need_zonelists_rebuild) 571 if (need_zonelists_rebuild)
526 build_all_zonelists(NULL, NULL); 572 build_all_zonelists(NULL, NULL);
527 else 573 else
@@ -871,6 +917,67 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
871 return offlined; 917 return offlined;
872} 918}
873 919
920/* Check which node_states entries will change when @nr_pages of @zone are offlined; the result is stored in @arg (-1 means no change). */
921static void node_states_check_changes_offline(unsigned long nr_pages,
922 struct zone *zone, struct memory_notify *arg)
923{
924 struct pglist_data *pgdat = zone->zone_pgdat;
925 unsigned long present_pages = 0;
926 enum zone_type zt, zone_last = ZONE_NORMAL;
927
928 /*
929 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
930 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
931 *
932 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
933 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
934 */
935 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
936 zone_last = ZONE_MOVABLE;
937
938 /*
939 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
940 * If the memory to be offlined is in a zone of 0...zone_last,
941 * and it is the last present memory, 0...zone_last will
942 * become empty after offline, thus we can determine we will
943 * need to clear the node from node_states[N_NORMAL_MEMORY].
944 */
945 for (zt = 0; zt <= zone_last; zt++)
946 present_pages += pgdat->node_zones[zt].present_pages;
947 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
948 arg->status_change_nid_normal = zone_to_nid(zone);
949 else
950 arg->status_change_nid_normal = -1;
951
952 /*
953 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
954 */
955 zone_last = ZONE_MOVABLE;
956
957 /*
958 * Check whether node_states[N_HIGH_MEMORY] will be changed.
959 * zt and present_pages carry over from the loop above, so only the
960 * remaining zones up to ZONE_MOVABLE are added here. If we try to offline
961 * the last present @nr_pages from the node, we can determine we will need to clear the node from node_states[N_HIGH_MEMORY].
962 */
963 for (; zt <= zone_last; zt++)
964 present_pages += pgdat->node_zones[zt].present_pages;
965 if (nr_pages >= present_pages)
966 arg->status_change_nid = zone_to_nid(zone);
967 else
968 arg->status_change_nid = -1;
969}
970
/* Clear @node from node_states after memory was offlined, using the change flags recorded in @arg. */
971static void node_states_clear_node(int node, struct memory_notify *arg)
972{
973 if (arg->status_change_nid_normal >= 0)
974 node_clear_state(node, N_NORMAL_MEMORY);
975
976 if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && /* when both names are the same state, only the clear above applies */
977 (arg->status_change_nid >= 0))
978 node_clear_state(node, N_HIGH_MEMORY);
979}
980
874static int __ref __offline_pages(unsigned long start_pfn, 981static int __ref __offline_pages(unsigned long start_pfn,
875 unsigned long end_pfn, unsigned long timeout) 982 unsigned long end_pfn, unsigned long timeout)
876{ 983{
@@ -905,9 +1012,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
905 1012
906 arg.start_pfn = start_pfn; 1013 arg.start_pfn = start_pfn;
907 arg.nr_pages = nr_pages; 1014 arg.nr_pages = nr_pages;
908 arg.status_change_nid = -1; 1015 node_states_check_changes_offline(nr_pages, zone, &arg);
909 if (nr_pages >= node_present_pages(node))
910 arg.status_change_nid = node;
911 1016
912 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1017 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
913 ret = notifier_to_errno(ret); 1018 ret = notifier_to_errno(ret);
@@ -980,10 +1085,9 @@ repeat:
980 } else 1085 } else
981 zone_pcp_update(zone); 1086 zone_pcp_update(zone);
982 1087
983 if (!node_present_pages(node)) { 1088 node_states_clear_node(node, &arg);
984 node_clear_state(node, N_HIGH_MEMORY); 1089 if (arg.status_change_nid >= 0)
985 kswapd_stop(node); 1090 kswapd_stop(node);
986 }
987 1091
988 vm_total_pages = nr_free_pagecache_pages(); 1092 vm_total_pages = nr_free_pagecache_pages();
989 writeback_set_ratelimit(); 1093 writeback_set_ratelimit();