diff options
author | Gary Hade <garyhade@us.ibm.com> | 2009-01-06 17:39:14 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-06 18:59:00 -0500 |
commit | c04fc586c1a480ba198f03ae7b6cbd7b57380b91 (patch) | |
tree | 9d6544a3b62cc01dbcbb1e315b84378b45ba86d2 | |
parent | ee53a891f47444c53318b98dac947ede963db400 (diff) |
mm: show node to memory section relationship with symlinks in sysfs
Show node to memory section relationship with symlinks in sysfs
Add /sys/devices/system/node/nodeX/memoryY symlinks for all
the memory sections located on nodeX. For example:
/sys/devices/system/node/node1/memory135 -> ../../memory/memory135
indicates that memory section 135 resides on node1.
Also revises documentation to cover this change as well as updating
Documentation/ABI/testing/sysfs-devices-memory to include descriptions
of memory hotremove files 'phys_device', 'phys_index', and 'state'
that were previously not described there.
In addition to it always being a good policy to provide users with
the maximum possible amount of physical location information for
resources that can be hot-added and/or hot-removed, the following
are some (but likely not all) of the user benefits provided by
this change.
Immediate:
- Provides information needed to determine the specific node
on which a defective DIMM is located. This will reduce system
downtime when the node or defective DIMM is swapped out.
- Prevents unintended onlining of a memory section that was
previously offlined due to a defective DIMM. This could happen
during node hot-add when the user or node hot-add assist script
onlines _all_ offlined sections due to user or script inability
to identify the specific memory sections located on the hot-added
node. The consequences of reintroducing the defective memory
could be ugly.
- Provides information needed to vary the amount and distribution
of memory on specific nodes for testing or debugging purposes.
Future:
- Will provide information needed to identify the memory
sections that need to be offlined prior to physical removal
of a specific node.
Symlink creation during boot was tested on 2-node x86_64, 2-node
ppc64, and 2-node ia64 systems. Symlink creation during physical
memory hot-add tested on a 2-node x86_64 system.
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/ABI/testing/sysfs-devices-memory | 51 | ||||
-rw-r--r-- | Documentation/memory-hotplug.txt | 16 | ||||
-rw-r--r-- | arch/ia64/mm/init.c | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/mem.c | 2 | ||||
-rw-r--r-- | arch/s390/mm/init.c | 2 | ||||
-rw-r--r-- | arch/sh/mm/init.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 2 | ||||
-rw-r--r-- | drivers/base/memory.c | 19 | ||||
-rw-r--r-- | drivers/base/node.c | 103 | ||||
-rw-r--r-- | include/linux/memory.h | 6 | ||||
-rw-r--r-- | include/linux/memory_hotplug.h | 2 | ||||
-rw-r--r-- | include/linux/node.h | 13 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 11 |
14 files changed, 209 insertions, 25 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 7a16fe1e2270..9fe91c02ee40 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory | |||
@@ -6,7 +6,6 @@ Description: | |||
6 | internal state of the kernel memory blocks. Files could be | 6 | internal state of the kernel memory blocks. Files could be |
7 | added or removed dynamically to represent hot-add/remove | 7 | added or removed dynamically to represent hot-add/remove |
8 | operations. | 8 | operations. |
9 | |||
10 | Users: hotplug memory add/remove tools | 9 | Users: hotplug memory add/remove tools |
11 | https://w3.opensource.ibm.com/projects/powerpc-utils/ | 10 | https://w3.opensource.ibm.com/projects/powerpc-utils/ |
12 | 11 | ||
@@ -19,6 +18,56 @@ Description: | |||
19 | This is useful for a user-level agent to determine | 18 | This is useful for a user-level agent to determine |
20 | identify removable sections of the memory before attempting | 19 | identify removable sections of the memory before attempting |
21 | potentially expensive hot-remove memory operation | 20 | potentially expensive hot-remove memory operation |
21 | Users: hotplug memory remove tools | ||
22 | https://w3.opensource.ibm.com/projects/powerpc-utils/ | ||
23 | |||
24 | What: /sys/devices/system/memory/memoryX/phys_device | ||
25 | Date: September 2008 | ||
26 | Contact: Badari Pulavarty <pbadari@us.ibm.com> | ||
27 | Description: | ||
28 | The file /sys/devices/system/memory/memoryX/phys_device | ||
29 | is read-only and is designed to show the name of physical | ||
30 | memory device. Implementation is currently incomplete. | ||
22 | 31 | ||
32 | What: /sys/devices/system/memory/memoryX/phys_index | ||
33 | Date: September 2008 | ||
34 | Contact: Badari Pulavarty <pbadari@us.ibm.com> | ||
35 | Description: | ||
36 | The file /sys/devices/system/memory/memoryX/phys_index | ||
37 | is read-only and contains the section ID in hexadecimal | ||
38 | which is equivalent to decimal X contained in the | ||
39 | memory section directory name. | ||
40 | |||
41 | What: /sys/devices/system/memory/memoryX/state | ||
42 | Date: September 2008 | ||
43 | Contact: Badari Pulavarty <pbadari@us.ibm.com> | ||
44 | Description: | ||
45 | The file /sys/devices/system/memory/memoryX/state | ||
46 | is read-write. When read, it's contents show the | ||
47 | online/offline state of the memory section. When written, | ||
48 | root can toggle the the online/offline state of a removable | ||
49 | memory section (see removable file description above) | ||
50 | using the following commands. | ||
51 | # echo online > /sys/devices/system/memory/memoryX/state | ||
52 | # echo offline > /sys/devices/system/memory/memoryX/state | ||
53 | |||
54 | For example, if /sys/devices/system/memory/memory22/removable | ||
55 | contains a value of 1 and | ||
56 | /sys/devices/system/memory/memory22/state contains the | ||
57 | string "online" the following command can be executed by | ||
58 | by root to offline that section. | ||
59 | # echo offline > /sys/devices/system/memory/memory22/state | ||
23 | Users: hotplug memory remove tools | 60 | Users: hotplug memory remove tools |
24 | https://w3.opensource.ibm.com/projects/powerpc-utils/ | 61 | https://w3.opensource.ibm.com/projects/powerpc-utils/ |
62 | |||
63 | What: /sys/devices/system/node/nodeX/memoryY | ||
64 | Date: September 2008 | ||
65 | Contact: Gary Hade <garyhade@us.ibm.com> | ||
66 | Description: | ||
67 | When CONFIG_NUMA is enabled | ||
68 | /sys/devices/system/node/nodeX/memoryY is a symbolic link that | ||
69 | points to the corresponding /sys/devices/system/memory/memoryY | ||
70 | memory section directory. For example, the following symbolic | ||
71 | link is created for memory section 9 on node0. | ||
72 | /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 | ||
73 | |||
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 168117bd6ee8..4c2ecf537a4a 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt | |||
@@ -124,7 +124,7 @@ config options. | |||
124 | This option can be kernel module too. | 124 | This option can be kernel module too. |
125 | 125 | ||
126 | -------------------------------- | 126 | -------------------------------- |
127 | 3 sysfs files for memory hotplug | 127 | 4 sysfs files for memory hotplug |
128 | -------------------------------- | 128 | -------------------------------- |
129 | All sections have their device information under /sys/devices/system/memory as | 129 | All sections have their device information under /sys/devices/system/memory as |
130 | 130 | ||
@@ -138,11 +138,12 @@ For example, assume 1GiB section size. A device for a memory starting at | |||
138 | (0x100000000 / 1Gib = 4) | 138 | (0x100000000 / 1Gib = 4) |
139 | This device covers address range [0x100000000 ... 0x140000000) | 139 | This device covers address range [0x100000000 ... 0x140000000) |
140 | 140 | ||
141 | Under each section, you can see 3 files. | 141 | Under each section, you can see 4 files. |
142 | 142 | ||
143 | /sys/devices/system/memory/memoryXXX/phys_index | 143 | /sys/devices/system/memory/memoryXXX/phys_index |
144 | /sys/devices/system/memory/memoryXXX/phys_device | 144 | /sys/devices/system/memory/memoryXXX/phys_device |
145 | /sys/devices/system/memory/memoryXXX/state | 145 | /sys/devices/system/memory/memoryXXX/state |
146 | /sys/devices/system/memory/memoryXXX/removable | ||
146 | 147 | ||
147 | 'phys_index' : read-only and contains section id, same as XXX. | 148 | 'phys_index' : read-only and contains section id, same as XXX. |
148 | 'state' : read-write | 149 | 'state' : read-write |
@@ -150,10 +151,20 @@ Under each section, you can see 3 files. | |||
150 | at write: user can specify "online", "offline" command | 151 | at write: user can specify "online", "offline" command |
151 | 'phys_device': read-only: designed to show the name of physical memory device. | 152 | 'phys_device': read-only: designed to show the name of physical memory device. |
152 | This is not well implemented now. | 153 | This is not well implemented now. |
154 | 'removable' : read-only: contains an integer value indicating | ||
155 | whether the memory section is removable or not | ||
156 | removable. A value of 1 indicates that the memory | ||
157 | section is removable and a value of 0 indicates that | ||
158 | it is not removable. | ||
153 | 159 | ||
154 | NOTE: | 160 | NOTE: |
155 | These directories/files appear after physical memory hotplug phase. | 161 | These directories/files appear after physical memory hotplug phase. |
156 | 162 | ||
163 | If CONFIG_NUMA is enabled the | ||
164 | /sys/devices/system/memory/memoryXXX memory section | ||
165 | directories can also be accessed via symbolic links located in | ||
166 | the /sys/devices/system/node/node* directories. For example: | ||
167 | /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 | ||
157 | 168 | ||
158 | -------------------------------- | 169 | -------------------------------- |
159 | 4. Physical memory hot-add phase | 170 | 4. Physical memory hot-add phase |
@@ -365,7 +376,6 @@ node if necessary. | |||
365 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like | 376 | - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like |
366 | sysctl or new control file. | 377 | sysctl or new control file. |
367 | - showing memory section and physical device relationship. | 378 | - showing memory section and physical device relationship. |
368 | - showing memory section and node relationship (maybe good for NUMA) | ||
369 | - showing memory section is under ZONE_MOVABLE or not | 379 | - showing memory section is under ZONE_MOVABLE or not |
370 | - test and make it better memory offlining. | 380 | - test and make it better memory offlining. |
371 | - support HugeTLB page migration and offlining. | 381 | - support HugeTLB page migration and offlining. |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 054bcd9439aa..56e12903973c 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c | |||
@@ -692,7 +692,7 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
692 | pgdat = NODE_DATA(nid); | 692 | pgdat = NODE_DATA(nid); |
693 | 693 | ||
694 | zone = pgdat->node_zones + ZONE_NORMAL; | 694 | zone = pgdat->node_zones + ZONE_NORMAL; |
695 | ret = __add_pages(zone, start_pfn, nr_pages); | 695 | ret = __add_pages(nid, zone, start_pfn, nr_pages); |
696 | 696 | ||
697 | if (ret) | 697 | if (ret) |
698 | printk("%s: Problem encountered in __add_pages() as ret=%d\n", | 698 | printk("%s: Problem encountered in __add_pages() as ret=%d\n", |
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 53b06ebb3f2f..f00f09a77f12 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c | |||
@@ -132,7 +132,7 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
132 | /* this should work for most non-highmem platforms */ | 132 | /* this should work for most non-highmem platforms */ |
133 | zone = pgdata->node_zones; | 133 | zone = pgdata->node_zones; |
134 | 134 | ||
135 | return __add_pages(zone, start_pfn, nr_pages); | 135 | return __add_pages(nid, zone, start_pfn, nr_pages); |
136 | } | 136 | } |
137 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 137 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
138 | 138 | ||
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 158b0d6d7046..f0258ca3b17e 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c | |||
@@ -183,7 +183,7 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
183 | rc = vmem_add_mapping(start, size); | 183 | rc = vmem_add_mapping(start, size); |
184 | if (rc) | 184 | if (rc) |
185 | return rc; | 185 | return rc; |
186 | rc = __add_pages(zone, PFN_DOWN(start), PFN_DOWN(size)); | 186 | rc = __add_pages(nid, zone, PFN_DOWN(start), PFN_DOWN(size)); |
187 | if (rc) | 187 | if (rc) |
188 | vmem_remove_mapping(start, size); | 188 | vmem_remove_mapping(start, size); |
189 | return rc; | 189 | return rc; |
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 6cbef8caeb56..3edf297c829b 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c | |||
@@ -311,7 +311,8 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
311 | pgdat = NODE_DATA(nid); | 311 | pgdat = NODE_DATA(nid); |
312 | 312 | ||
313 | /* We only have ZONE_NORMAL, so this is easy.. */ | 313 | /* We only have ZONE_NORMAL, so this is easy.. */ |
314 | ret = __add_pages(pgdat->node_zones + ZONE_NORMAL, start_pfn, nr_pages); | 314 | ret = __add_pages(nid, pgdat->node_zones + ZONE_NORMAL, |
315 | start_pfn, nr_pages); | ||
315 | if (unlikely(ret)) | 316 | if (unlikely(ret)) |
316 | printk("%s: Failed, __add_pages() == %d\n", __func__, ret); | 317 | printk("%s: Failed, __add_pages() == %d\n", __func__, ret); |
317 | 318 | ||
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f99a6c6c432e..544d724caeee 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -1079,7 +1079,7 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
1079 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1079 | unsigned long start_pfn = start >> PAGE_SHIFT; |
1080 | unsigned long nr_pages = size >> PAGE_SHIFT; | 1080 | unsigned long nr_pages = size >> PAGE_SHIFT; |
1081 | 1081 | ||
1082 | return __add_pages(zone, start_pfn, nr_pages); | 1082 | return __add_pages(nid, zone, start_pfn, nr_pages); |
1083 | } | 1083 | } |
1084 | #endif | 1084 | #endif |
1085 | 1085 | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9f7a0d24d42a..54c437e96541 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -857,7 +857,7 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
857 | if (last_mapped_pfn > max_pfn_mapped) | 857 | if (last_mapped_pfn > max_pfn_mapped) |
858 | max_pfn_mapped = last_mapped_pfn; | 858 | max_pfn_mapped = last_mapped_pfn; |
859 | 859 | ||
860 | ret = __add_pages(zone, start_pfn, nr_pages); | 860 | ret = __add_pages(nid, zone, start_pfn, nr_pages); |
861 | WARN_ON_ONCE(ret); | 861 | WARN_ON_ONCE(ret); |
862 | 862 | ||
863 | return ret; | 863 | return ret; |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5260e9e0df48..989429cfed88 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -347,8 +347,9 @@ static inline int memory_probe_init(void) | |||
347 | * section belongs to... | 347 | * section belongs to... |
348 | */ | 348 | */ |
349 | 349 | ||
350 | static int add_memory_block(unsigned long node_id, struct mem_section *section, | 350 | static int add_memory_block(int nid, struct mem_section *section, |
351 | unsigned long state, int phys_device) | 351 | unsigned long state, int phys_device, |
352 | enum mem_add_context context) | ||
352 | { | 353 | { |
353 | struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); | 354 | struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); |
354 | int ret = 0; | 355 | int ret = 0; |
@@ -370,6 +371,10 @@ static int add_memory_block(unsigned long node_id, struct mem_section *section, | |||
370 | ret = mem_create_simple_file(mem, phys_device); | 371 | ret = mem_create_simple_file(mem, phys_device); |
371 | if (!ret) | 372 | if (!ret) |
372 | ret = mem_create_simple_file(mem, removable); | 373 | ret = mem_create_simple_file(mem, removable); |
374 | if (!ret) { | ||
375 | if (context == HOTPLUG) | ||
376 | ret = register_mem_sect_under_node(mem, nid); | ||
377 | } | ||
373 | 378 | ||
374 | return ret; | 379 | return ret; |
375 | } | 380 | } |
@@ -382,7 +387,7 @@ static int add_memory_block(unsigned long node_id, struct mem_section *section, | |||
382 | * | 387 | * |
383 | * This could be made generic for all sysdev classes. | 388 | * This could be made generic for all sysdev classes. |
384 | */ | 389 | */ |
385 | static struct memory_block *find_memory_block(struct mem_section *section) | 390 | struct memory_block *find_memory_block(struct mem_section *section) |
386 | { | 391 | { |
387 | struct kobject *kobj; | 392 | struct kobject *kobj; |
388 | struct sys_device *sysdev; | 393 | struct sys_device *sysdev; |
@@ -411,6 +416,7 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section, | |||
411 | struct memory_block *mem; | 416 | struct memory_block *mem; |
412 | 417 | ||
413 | mem = find_memory_block(section); | 418 | mem = find_memory_block(section); |
419 | unregister_mem_sect_under_nodes(mem); | ||
414 | mem_remove_simple_file(mem, phys_index); | 420 | mem_remove_simple_file(mem, phys_index); |
415 | mem_remove_simple_file(mem, state); | 421 | mem_remove_simple_file(mem, state); |
416 | mem_remove_simple_file(mem, phys_device); | 422 | mem_remove_simple_file(mem, phys_device); |
@@ -424,9 +430,9 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section, | |||
424 | * need an interface for the VM to add new memory regions, | 430 | * need an interface for the VM to add new memory regions, |
425 | * but without onlining it. | 431 | * but without onlining it. |
426 | */ | 432 | */ |
427 | int register_new_memory(struct mem_section *section) | 433 | int register_new_memory(int nid, struct mem_section *section) |
428 | { | 434 | { |
429 | return add_memory_block(0, section, MEM_OFFLINE, 0); | 435 | return add_memory_block(nid, section, MEM_OFFLINE, 0, HOTPLUG); |
430 | } | 436 | } |
431 | 437 | ||
432 | int unregister_memory_section(struct mem_section *section) | 438 | int unregister_memory_section(struct mem_section *section) |
@@ -458,7 +464,8 @@ int __init memory_dev_init(void) | |||
458 | for (i = 0; i < NR_MEM_SECTIONS; i++) { | 464 | for (i = 0; i < NR_MEM_SECTIONS; i++) { |
459 | if (!present_section_nr(i)) | 465 | if (!present_section_nr(i)) |
460 | continue; | 466 | continue; |
461 | err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); | 467 | err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, |
468 | 0, BOOT); | ||
462 | if (!ret) | 469 | if (!ret) |
463 | ret = err; | 470 | ret = err; |
464 | } | 471 | } |
diff --git a/drivers/base/node.c b/drivers/base/node.c index 91636cd8b6c9..43fa90b837ee 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/module.h> | 6 | #include <linux/module.h> |
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/memory.h> | ||
9 | #include <linux/node.h> | 10 | #include <linux/node.h> |
10 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
11 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
@@ -248,6 +249,105 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) | |||
248 | return 0; | 249 | return 0; |
249 | } | 250 | } |
250 | 251 | ||
252 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | ||
253 | #define page_initialized(page) (page->lru.next) | ||
254 | |||
255 | static int get_nid_for_pfn(unsigned long pfn) | ||
256 | { | ||
257 | struct page *page; | ||
258 | |||
259 | if (!pfn_valid_within(pfn)) | ||
260 | return -1; | ||
261 | page = pfn_to_page(pfn); | ||
262 | if (!page_initialized(page)) | ||
263 | return -1; | ||
264 | return pfn_to_nid(pfn); | ||
265 | } | ||
266 | |||
267 | /* register memory section under specified node if it spans that node */ | ||
268 | int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) | ||
269 | { | ||
270 | unsigned long pfn, sect_start_pfn, sect_end_pfn; | ||
271 | |||
272 | if (!mem_blk) | ||
273 | return -EFAULT; | ||
274 | if (!node_online(nid)) | ||
275 | return 0; | ||
276 | sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index); | ||
277 | sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; | ||
278 | for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { | ||
279 | int page_nid; | ||
280 | |||
281 | page_nid = get_nid_for_pfn(pfn); | ||
282 | if (page_nid < 0) | ||
283 | continue; | ||
284 | if (page_nid != nid) | ||
285 | continue; | ||
286 | return sysfs_create_link_nowarn(&node_devices[nid].sysdev.kobj, | ||
287 | &mem_blk->sysdev.kobj, | ||
288 | kobject_name(&mem_blk->sysdev.kobj)); | ||
289 | } | ||
290 | /* mem section does not span the specified node */ | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | /* unregister memory section under all nodes that it spans */ | ||
295 | int unregister_mem_sect_under_nodes(struct memory_block *mem_blk) | ||
296 | { | ||
297 | nodemask_t unlinked_nodes; | ||
298 | unsigned long pfn, sect_start_pfn, sect_end_pfn; | ||
299 | |||
300 | if (!mem_blk) | ||
301 | return -EFAULT; | ||
302 | nodes_clear(unlinked_nodes); | ||
303 | sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index); | ||
304 | sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; | ||
305 | for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { | ||
306 | unsigned int nid; | ||
307 | |||
308 | nid = get_nid_for_pfn(pfn); | ||
309 | if (nid < 0) | ||
310 | continue; | ||
311 | if (!node_online(nid)) | ||
312 | continue; | ||
313 | if (node_test_and_set(nid, unlinked_nodes)) | ||
314 | continue; | ||
315 | sysfs_remove_link(&node_devices[nid].sysdev.kobj, | ||
316 | kobject_name(&mem_blk->sysdev.kobj)); | ||
317 | } | ||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | static int link_mem_sections(int nid) | ||
322 | { | ||
323 | unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; | ||
324 | unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages; | ||
325 | unsigned long pfn; | ||
326 | int err = 0; | ||
327 | |||
328 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
329 | unsigned long section_nr = pfn_to_section_nr(pfn); | ||
330 | struct mem_section *mem_sect; | ||
331 | struct memory_block *mem_blk; | ||
332 | int ret; | ||
333 | |||
334 | if (!present_section_nr(section_nr)) | ||
335 | continue; | ||
336 | mem_sect = __nr_to_section(section_nr); | ||
337 | mem_blk = find_memory_block(mem_sect); | ||
338 | ret = register_mem_sect_under_node(mem_blk, nid); | ||
339 | if (!err) | ||
340 | err = ret; | ||
341 | |||
342 | /* discard ref obtained in find_memory_block() */ | ||
343 | kobject_put(&mem_blk->sysdev.kobj); | ||
344 | } | ||
345 | return err; | ||
346 | } | ||
347 | #else | ||
348 | static int link_mem_sections(int nid) { return 0; } | ||
349 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | ||
350 | |||
251 | int register_one_node(int nid) | 351 | int register_one_node(int nid) |
252 | { | 352 | { |
253 | int error = 0; | 353 | int error = 0; |
@@ -267,6 +367,9 @@ int register_one_node(int nid) | |||
267 | if (cpu_to_node(cpu) == nid) | 367 | if (cpu_to_node(cpu) == nid) |
268 | register_cpu_under_node(cpu, nid); | 368 | register_cpu_under_node(cpu, nid); |
269 | } | 369 | } |
370 | |||
371 | /* link memory sections under this node */ | ||
372 | error = link_mem_sections(nid); | ||
270 | } | 373 | } |
271 | 374 | ||
272 | return error; | 375 | return error; |
diff --git a/include/linux/memory.h b/include/linux/memory.h index 36c82c9e6ea7..3fdc10806d31 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h | |||
@@ -79,14 +79,14 @@ static inline int memory_notify(unsigned long val, void *v) | |||
79 | #else | 79 | #else |
80 | extern int register_memory_notifier(struct notifier_block *nb); | 80 | extern int register_memory_notifier(struct notifier_block *nb); |
81 | extern void unregister_memory_notifier(struct notifier_block *nb); | 81 | extern void unregister_memory_notifier(struct notifier_block *nb); |
82 | extern int register_new_memory(struct mem_section *); | 82 | extern int register_new_memory(int, struct mem_section *); |
83 | extern int unregister_memory_section(struct mem_section *); | 83 | extern int unregister_memory_section(struct mem_section *); |
84 | extern int memory_dev_init(void); | 84 | extern int memory_dev_init(void); |
85 | extern int remove_memory_block(unsigned long, struct mem_section *, int); | 85 | extern int remove_memory_block(unsigned long, struct mem_section *, int); |
86 | extern int memory_notify(unsigned long val, void *v); | 86 | extern int memory_notify(unsigned long val, void *v); |
87 | extern struct memory_block *find_memory_block(struct mem_section *); | ||
87 | #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) | 88 | #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) |
88 | 89 | enum mem_add_context { BOOT, HOTPLUG }; | |
89 | |||
90 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 90 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
91 | 91 | ||
92 | #ifdef CONFIG_MEMORY_HOTPLUG | 92 | #ifdef CONFIG_MEMORY_HOTPLUG |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 763ba81fc0f0..d95f72e79b82 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -72,7 +72,7 @@ extern void __offline_isolated_pages(unsigned long, unsigned long); | |||
72 | extern int offline_pages(unsigned long, unsigned long, unsigned long); | 72 | extern int offline_pages(unsigned long, unsigned long, unsigned long); |
73 | 73 | ||
74 | /* reasonably generic interface to expand the physical pages in a zone */ | 74 | /* reasonably generic interface to expand the physical pages in a zone */ |
75 | extern int __add_pages(struct zone *zone, unsigned long start_pfn, | 75 | extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, |
76 | unsigned long nr_pages); | 76 | unsigned long nr_pages); |
77 | extern int __remove_pages(struct zone *zone, unsigned long start_pfn, | 77 | extern int __remove_pages(struct zone *zone, unsigned long start_pfn, |
78 | unsigned long nr_pages); | 78 | unsigned long nr_pages); |
diff --git a/include/linux/node.h b/include/linux/node.h index bc001bc225c3..681a697b9a86 100644 --- a/include/linux/node.h +++ b/include/linux/node.h | |||
@@ -26,6 +26,7 @@ struct node { | |||
26 | struct sys_device sysdev; | 26 | struct sys_device sysdev; |
27 | }; | 27 | }; |
28 | 28 | ||
29 | struct memory_block; | ||
29 | extern struct node node_devices[]; | 30 | extern struct node node_devices[]; |
30 | 31 | ||
31 | extern int register_node(struct node *, int, struct node *); | 32 | extern int register_node(struct node *, int, struct node *); |
@@ -35,6 +36,9 @@ extern int register_one_node(int nid); | |||
35 | extern void unregister_one_node(int nid); | 36 | extern void unregister_one_node(int nid); |
36 | extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); | 37 | extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); |
37 | extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); | 38 | extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); |
39 | extern int register_mem_sect_under_node(struct memory_block *mem_blk, | ||
40 | int nid); | ||
41 | extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk); | ||
38 | #else | 42 | #else |
39 | static inline int register_one_node(int nid) | 43 | static inline int register_one_node(int nid) |
40 | { | 44 | { |
@@ -52,6 +56,15 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) | |||
52 | { | 56 | { |
53 | return 0; | 57 | return 0; |
54 | } | 58 | } |
59 | static inline int register_mem_sect_under_node(struct memory_block *mem_blk, | ||
60 | int nid) | ||
61 | { | ||
62 | return 0; | ||
63 | } | ||
64 | static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk) | ||
65 | { | ||
66 | return 0; | ||
67 | } | ||
55 | #endif | 68 | #endif |
56 | 69 | ||
57 | #define to_node(sys_device) container_of(sys_device, struct node, sysdev) | 70 | #define to_node(sys_device) container_of(sys_device, struct node, sysdev) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b17371185468..2ba38bc07b47 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
216 | return 0; | 216 | return 0; |
217 | } | 217 | } |
218 | 218 | ||
219 | static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) | 219 | static int __meminit __add_section(int nid, struct zone *zone, |
220 | unsigned long phys_start_pfn) | ||
220 | { | 221 | { |
221 | int nr_pages = PAGES_PER_SECTION; | 222 | int nr_pages = PAGES_PER_SECTION; |
222 | int ret; | 223 | int ret; |
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p | |||
234 | if (ret < 0) | 235 | if (ret < 0) |
235 | return ret; | 236 | return ret; |
236 | 237 | ||
237 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 238 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
238 | } | 239 | } |
239 | 240 | ||
240 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 241 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
273 | * call this function after deciding the zone to which to | 274 | * call this function after deciding the zone to which to |
274 | * add the new pages. | 275 | * add the new pages. |
275 | */ | 276 | */ |
276 | int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | 277 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, |
277 | unsigned long nr_pages) | 278 | unsigned long nr_pages) |
278 | { | 279 | { |
279 | unsigned long i; | 280 | unsigned long i; |
280 | int err = 0; | 281 | int err = 0; |
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
284 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 285 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
285 | 286 | ||
286 | for (i = start_sec; i <= end_sec; i++) { | 287 | for (i = start_sec; i <= end_sec; i++) { |
287 | err = __add_section(zone, i << PFN_SECTION_SHIFT); | 288 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); |
288 | 289 | ||
289 | /* | 290 | /* |
290 | * EEXIST is finally dealt with by ioresource collision | 291 | * EEXIST is finally dealt with by ioresource collision |