aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/memory-hotplug.txt47
-rw-r--r--drivers/base/memory.c155
2 files changed, 139 insertions, 63 deletions
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 57e7e9cc1870..8f485d72cf25 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -126,36 +126,51 @@ config options.
126-------------------------------- 126--------------------------------
1274 sysfs files for memory hotplug 1274 sysfs files for memory hotplug
128-------------------------------- 128--------------------------------
129All sections have their device information under /sys/devices/system/memory as 129All sections have their device information in sysfs. Each section is part of
130a memory block under /sys/devices/system/memory as
130 131
131/sys/devices/system/memory/memoryXXX 132/sys/devices/system/memory/memoryXXX
132(XXX is section id.) 133(XXX is the section id.)
133 134
134Now, XXX is defined as start_address_of_section / section_size. 135Now, XXX is defined as (start_address_of_section / section_size) of the first
136section contained in the memory block. The files 'phys_index' and
137'end_phys_index' under each directory report the beginning and end section id's
138for the memory block covered by the sysfs directory. It is expected that all
139memory sections in this range are present and no memory holes exist in the
140range. Currently there is no way to determine if there is a memory hole, but
141the existence of one should not affect the hotplug capabilities of the memory
142block.
135 143
136For example, assume 1GiB section size. A device for a memory starting at 144For example, assume 1GiB section size. A device for a memory starting at
1370x100000000 is /sys/devices/system/memory/memory4 1450x100000000 is /sys/devices/system/memory/memory4
138(0x100000000 / 1GiB = 4) 146(0x100000000 / 1GiB = 4)
139This device covers address range [0x100000000 ... 0x140000000) 147This device covers address range [0x100000000 ... 0x140000000)
140 148
141Under each section, you can see 4 files. 149Under each section, you can see 4 or 5 files, the end_phys_index file being
150a recent addition and not present on older kernels.
142 151
143/sys/devices/system/memory/memoryXXX/phys_index 152/sys/devices/system/memory/memoryXXX/phys_index
153/sys/devices/system/memory/memoryXXX/end_phys_index
144/sys/devices/system/memory/memoryXXX/phys_device 154/sys/devices/system/memory/memoryXXX/phys_device
145/sys/devices/system/memory/memoryXXX/state 155/sys/devices/system/memory/memoryXXX/state
146/sys/devices/system/memory/memoryXXX/removable 156/sys/devices/system/memory/memoryXXX/removable
147 157
148'phys_index' : read-only and contains section id, same as XXX. 158'phys_index' : read-only and contains section id of the first section
149'state' : read-write 159 in the memory block, same as XXX.
150 at read: contains online/offline state of memory. 160'end_phys_index' : read-only and contains section id of the last section
151 at write: user can specify "online", "offline" command 161 in the memory block.
152'phys_device': read-only: designed to show the name of physical memory device. 162'state' : read-write
153 This is not well implemented now. 163 at read: contains online/offline state of memory.
154'removable' : read-only: contains an integer value indicating 164 at write: user can specify "online", "offline" command
155 whether the memory section is removable or not 165 which will be performed on all sections in the block.
156 removable. A value of 1 indicates that the memory 166'phys_device' : read-only: designed to show the name of physical memory
157 section is removable and a value of 0 indicates that 167 device. This is not well implemented now.
158 it is not removable. 168'removable' : read-only: contains an integer value indicating
169 whether the memory block is removable or not
170 removable. A value of 1 indicates that the memory
171 block is removable and a value of 0 indicates that
172 it is not removable. A memory block is removable only if
173 every section in the block is removable.
159 174
160NOTE: 175NOTE:
161 These directories/files appear after physical memory hotplug phase. 176 These directories/files appear after physical memory hotplug phase.
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index cafeaaf0428f..0b7040042587 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -30,6 +30,14 @@
30static DEFINE_MUTEX(mem_sysfs_mutex); 30static DEFINE_MUTEX(mem_sysfs_mutex);
31 31
32#define MEMORY_CLASS_NAME "memory" 32#define MEMORY_CLASS_NAME "memory"
33#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
34
35static int sections_per_block;
36
37static inline int base_memory_block_id(int section_nr)
38{
39 return section_nr / sections_per_block;
40}
33 41
34static struct sysdev_class memory_sysdev_class = { 42static struct sysdev_class memory_sysdev_class = {
35 .name = MEMORY_CLASS_NAME, 43 .name = MEMORY_CLASS_NAME,
@@ -84,28 +92,47 @@ EXPORT_SYMBOL(unregister_memory_isolate_notifier);
84 * register_memory - Setup a sysfs device for a memory block 92 * register_memory - Setup a sysfs device for a memory block
85 */ 93 */
86static 94static
87int register_memory(struct memory_block *memory, struct mem_section *section) 95int register_memory(struct memory_block *memory)
88{ 96{
89 int error; 97 int error;
90 98
91 memory->sysdev.cls = &memory_sysdev_class; 99 memory->sysdev.cls = &memory_sysdev_class;
92 memory->sysdev.id = __section_nr(section); 100 memory->sysdev.id = memory->phys_index / sections_per_block;
93 101
94 error = sysdev_register(&memory->sysdev); 102 error = sysdev_register(&memory->sysdev);
95 return error; 103 return error;
96} 104}
97 105
98static void 106static void
99unregister_memory(struct memory_block *memory, struct mem_section *section) 107unregister_memory(struct memory_block *memory)
100{ 108{
101 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 109 BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
102 BUG_ON(memory->sysdev.id != __section_nr(section));
103 110
104 /* drop the ref. we got in remove_memory_block() */ 111 /* drop the ref. we got in remove_memory_block() */
105 kobject_put(&memory->sysdev.kobj); 112 kobject_put(&memory->sysdev.kobj);
106 sysdev_unregister(&memory->sysdev); 113 sysdev_unregister(&memory->sysdev);
107} 114}
108 115
116unsigned long __weak memory_block_size_bytes(void)
117{
118 return MIN_MEMORY_BLOCK_SIZE;
119}
120
121static unsigned long get_memory_block_size(void)
122{
123 unsigned long block_sz;
124
125 block_sz = memory_block_size_bytes();
126
127 /* Validate block_sz is a power of 2 and not less than section size */
128 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
129 WARN_ON(1);
130 block_sz = MIN_MEMORY_BLOCK_SIZE;
131 }
132
133 return block_sz;
134}
135
109/* 136/*
110 * use this as the physical section index that this memsection 137 * use this as the physical section index that this memsection
111 * uses. 138 * uses.
@@ -116,7 +143,7 @@ static ssize_t show_mem_phys_index(struct sys_device *dev,
116{ 143{
117 struct memory_block *mem = 144 struct memory_block *mem =
118 container_of(dev, struct memory_block, sysdev); 145 container_of(dev, struct memory_block, sysdev);
119 return sprintf(buf, "%08lx\n", mem->phys_index); 146 return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block);
120} 147}
121 148
122/* 149/*
@@ -125,13 +152,16 @@ static ssize_t show_mem_phys_index(struct sys_device *dev,
125static ssize_t show_mem_removable(struct sys_device *dev, 152static ssize_t show_mem_removable(struct sys_device *dev,
126 struct sysdev_attribute *attr, char *buf) 153 struct sysdev_attribute *attr, char *buf)
127{ 154{
128 unsigned long start_pfn; 155 unsigned long i, pfn;
129 int ret; 156 int ret = 1;
130 struct memory_block *mem = 157 struct memory_block *mem =
131 container_of(dev, struct memory_block, sysdev); 158 container_of(dev, struct memory_block, sysdev);
132 159
133 start_pfn = section_nr_to_pfn(mem->phys_index); 160 for (i = 0; i < sections_per_block; i++) {
134 ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION); 161 pfn = section_nr_to_pfn(mem->phys_index + i);
162 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
163 }
164
135 return sprintf(buf, "%d\n", ret); 165 return sprintf(buf, "%d\n", ret);
136} 166}
137 167
@@ -184,17 +214,14 @@ int memory_isolate_notify(unsigned long val, void *v)
184 * OK to have direct references to sparsemem variables in here. 214 * OK to have direct references to sparsemem variables in here.
185 */ 215 */
186static int 216static int
187memory_block_action(struct memory_block *mem, unsigned long action) 217memory_section_action(unsigned long phys_index, unsigned long action)
188{ 218{
189 int i; 219 int i;
190 unsigned long psection;
191 unsigned long start_pfn, start_paddr; 220 unsigned long start_pfn, start_paddr;
192 struct page *first_page; 221 struct page *first_page;
193 int ret; 222 int ret;
194 int old_state = mem->state;
195 223
196 psection = mem->phys_index; 224 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
197 first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
198 225
199 /* 226 /*
200 * The probe routines leave the pages reserved, just 227 * The probe routines leave the pages reserved, just
@@ -207,8 +234,8 @@ memory_block_action(struct memory_block *mem, unsigned long action)
207 continue; 234 continue;
208 235
209 printk(KERN_WARNING "section number %ld page number %d " 236 printk(KERN_WARNING "section number %ld page number %d "
210 "not reserved, was it already online? \n", 237 "not reserved, was it already online?\n",
211 psection, i); 238 phys_index, i);
212 return -EBUSY; 239 return -EBUSY;
213 } 240 }
214 } 241 }
@@ -219,18 +246,13 @@ memory_block_action(struct memory_block *mem, unsigned long action)
219 ret = online_pages(start_pfn, PAGES_PER_SECTION); 246 ret = online_pages(start_pfn, PAGES_PER_SECTION);
220 break; 247 break;
221 case MEM_OFFLINE: 248 case MEM_OFFLINE:
222 mem->state = MEM_GOING_OFFLINE;
223 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 249 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
224 ret = remove_memory(start_paddr, 250 ret = remove_memory(start_paddr,
225 PAGES_PER_SECTION << PAGE_SHIFT); 251 PAGES_PER_SECTION << PAGE_SHIFT);
226 if (ret) {
227 mem->state = old_state;
228 break;
229 }
230 break; 252 break;
231 default: 253 default:
232 WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", 254 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
233 __func__, mem, action, action); 255 "%ld\n", __func__, phys_index, action, action);
234 ret = -EINVAL; 256 ret = -EINVAL;
235 } 257 }
236 258
@@ -240,7 +262,8 @@ memory_block_action(struct memory_block *mem, unsigned long action)
240static int memory_block_change_state(struct memory_block *mem, 262static int memory_block_change_state(struct memory_block *mem,
241 unsigned long to_state, unsigned long from_state_req) 263 unsigned long to_state, unsigned long from_state_req)
242{ 264{
243 int ret = 0; 265 int i, ret = 0;
266
244 mutex_lock(&mem->state_mutex); 267 mutex_lock(&mem->state_mutex);
245 268
246 if (mem->state != from_state_req) { 269 if (mem->state != from_state_req) {
@@ -248,8 +271,22 @@ static int memory_block_change_state(struct memory_block *mem,
248 goto out; 271 goto out;
249 } 272 }
250 273
251 ret = memory_block_action(mem, to_state); 274 if (to_state == MEM_OFFLINE)
252 if (!ret) 275 mem->state = MEM_GOING_OFFLINE;
276
277 for (i = 0; i < sections_per_block; i++) {
278 ret = memory_section_action(mem->phys_index + i, to_state);
279 if (ret)
280 break;
281 }
282
283 if (ret) {
284 for (i = 0; i < sections_per_block; i++)
285 memory_section_action(mem->phys_index + i,
286 from_state_req);
287
288 mem->state = from_state_req;
289 } else
253 mem->state = to_state; 290 mem->state = to_state;
254 291
255out: 292out:
@@ -262,20 +299,15 @@ store_mem_state(struct sys_device *dev,
262 struct sysdev_attribute *attr, const char *buf, size_t count) 299 struct sysdev_attribute *attr, const char *buf, size_t count)
263{ 300{
264 struct memory_block *mem; 301 struct memory_block *mem;
265 unsigned int phys_section_nr;
266 int ret = -EINVAL; 302 int ret = -EINVAL;
267 303
268 mem = container_of(dev, struct memory_block, sysdev); 304 mem = container_of(dev, struct memory_block, sysdev);
269 phys_section_nr = mem->phys_index;
270
271 if (!present_section_nr(phys_section_nr))
272 goto out;
273 305
274 if (!strncmp(buf, "online", min((int)count, 6))) 306 if (!strncmp(buf, "online", min((int)count, 6)))
275 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 307 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
276 else if(!strncmp(buf, "offline", min((int)count, 7))) 308 else if(!strncmp(buf, "offline", min((int)count, 7)))
277 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 309 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
278out: 310
279 if (ret) 311 if (ret)
280 return ret; 312 return ret;
281 return count; 313 return count;
@@ -315,7 +347,7 @@ static ssize_t
315print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 347print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
316 char *buf) 348 char *buf)
317{ 349{
318 return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); 350 return sprintf(buf, "%lx\n", get_memory_block_size());
319} 351}
320 352
321static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 353static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
@@ -444,6 +476,7 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
444 struct sys_device *sysdev; 476 struct sys_device *sysdev;
445 struct memory_block *mem; 477 struct memory_block *mem;
446 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 478 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
479 int block_id = base_memory_block_id(__section_nr(section));
447 480
448 kobj = hint ? &hint->sysdev.kobj : NULL; 481 kobj = hint ? &hint->sysdev.kobj : NULL;
449 482
@@ -451,7 +484,7 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section,
451 * This only works because we know that section == sysdev->id 484 * This only works because we know that section == sysdev->id
452 * slightly redundant with sysdev_register() 485 * slightly redundant with sysdev_register()
453 */ 486 */
454 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); 487 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);
455 488
456 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 489 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
457 if (!kobj) 490 if (!kobj)
@@ -476,26 +509,27 @@ struct memory_block *find_memory_block(struct mem_section *section)
476 return find_memory_block_hinted(section, NULL); 509 return find_memory_block_hinted(section, NULL);
477} 510}
478 511
479static int add_memory_block(int nid, struct mem_section *section, 512static int init_memory_block(struct memory_block **memory,
480 unsigned long state, enum mem_add_context context) 513 struct mem_section *section, unsigned long state)
481{ 514{
482 struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); 515 struct memory_block *mem;
483 unsigned long start_pfn; 516 unsigned long start_pfn;
517 int scn_nr;
484 int ret = 0; 518 int ret = 0;
485 519
520 mem = kzalloc(sizeof(*mem), GFP_KERNEL);
486 if (!mem) 521 if (!mem)
487 return -ENOMEM; 522 return -ENOMEM;
488 523
489 mutex_lock(&mem_sysfs_mutex); 524 scn_nr = __section_nr(section);
490 525 mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block;
491 mem->phys_index = __section_nr(section);
492 mem->state = state; 526 mem->state = state;
493 mem->section_count++; 527 mem->section_count++;
494 mutex_init(&mem->state_mutex); 528 mutex_init(&mem->state_mutex);
495 start_pfn = section_nr_to_pfn(mem->phys_index); 529 start_pfn = section_nr_to_pfn(mem->phys_index);
496 mem->phys_device = arch_get_memory_phys_device(start_pfn); 530 mem->phys_device = arch_get_memory_phys_device(start_pfn);
497 531
498 ret = register_memory(mem, section); 532 ret = register_memory(mem);
499 if (!ret) 533 if (!ret)
500 ret = mem_create_simple_file(mem, phys_index); 534 ret = mem_create_simple_file(mem, phys_index);
501 if (!ret) 535 if (!ret)
@@ -504,8 +538,29 @@ static int add_memory_block(int nid, struct mem_section *section,
504 ret = mem_create_simple_file(mem, phys_device); 538 ret = mem_create_simple_file(mem, phys_device);
505 if (!ret) 539 if (!ret)
506 ret = mem_create_simple_file(mem, removable); 540 ret = mem_create_simple_file(mem, removable);
541
542 *memory = mem;
543 return ret;
544}
545
546static int add_memory_section(int nid, struct mem_section *section,
547 unsigned long state, enum mem_add_context context)
548{
549 struct memory_block *mem;
550 int ret = 0;
551
552 mutex_lock(&mem_sysfs_mutex);
553
554 mem = find_memory_block(section);
555 if (mem) {
556 mem->section_count++;
557 kobject_put(&mem->sysdev.kobj);
558 } else
559 ret = init_memory_block(&mem, section, state);
560
507 if (!ret) { 561 if (!ret) {
508 if (context == HOTPLUG) 562 if (context == HOTPLUG &&
563 mem->section_count == sections_per_block)
509 ret = register_mem_sect_under_node(mem, nid); 564 ret = register_mem_sect_under_node(mem, nid);
510 } 565 }
511 566
@@ -528,8 +583,10 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section,
528 mem_remove_simple_file(mem, state); 583 mem_remove_simple_file(mem, state);
529 mem_remove_simple_file(mem, phys_device); 584 mem_remove_simple_file(mem, phys_device);
530 mem_remove_simple_file(mem, removable); 585 mem_remove_simple_file(mem, removable);
531 unregister_memory(mem, section); 586 unregister_memory(mem);
532 } 587 kfree(mem);
588 } else
589 kobject_put(&mem->sysdev.kobj);
533 590
534 mutex_unlock(&mem_sysfs_mutex); 591 mutex_unlock(&mem_sysfs_mutex);
535 return 0; 592 return 0;
@@ -541,7 +598,7 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section,
541 */ 598 */
542int register_new_memory(int nid, struct mem_section *section) 599int register_new_memory(int nid, struct mem_section *section)
543{ 600{
544 return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG); 601 return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
545} 602}
546 603
547int unregister_memory_section(struct mem_section *section) 604int unregister_memory_section(struct mem_section *section)
@@ -560,12 +617,16 @@ int __init memory_dev_init(void)
560 unsigned int i; 617 unsigned int i;
561 int ret; 618 int ret;
562 int err; 619 int err;
620 unsigned long block_sz;
563 621
564 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 622 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
565 ret = sysdev_class_register(&memory_sysdev_class); 623 ret = sysdev_class_register(&memory_sysdev_class);
566 if (ret) 624 if (ret)
567 goto out; 625 goto out;
568 626
627 block_sz = get_memory_block_size();
628 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
629
569 /* 630 /*
570 * Create entries for memory sections that were found 631 * Create entries for memory sections that were found
571 * during boot and have been initialized 632 * during boot and have been initialized
@@ -573,8 +634,8 @@ int __init memory_dev_init(void)
573 for (i = 0; i < NR_MEM_SECTIONS; i++) { 634 for (i = 0; i < NR_MEM_SECTIONS; i++) {
574 if (!present_section_nr(i)) 635 if (!present_section_nr(i))
575 continue; 636 continue;
576 err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 637 err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
577 BOOT); 638 BOOT);
578 if (!ret) 639 if (!ret)
579 ret = err; 640 ret = err;
580 } 641 }