Diffstat (limited to 'mm/memory_hotplug.c')

 mm/memory_hotplug.c | 553 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 494 insertions(+), 59 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page,
-                             unsigned long type)
+void get_page_bootmem(unsigned long info, struct page *page,
+                      unsigned long type)
 {
         page->lru.next = (struct list_head *) type;
         SetPagePrivate(page);
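
A side note on the mechanism made public above: get_page_bootmem() tags a page that backs boot-time memmap/usemap data by storing a small integer type in the otherwise-unused page->lru.next pointer (visible in the context lines), plus auxiliary info such as a section number; put_page_bootmem() later recovers the tag from the same field. A minimal userspace sketch of this pointer-field-as-tag idiom, with purely illustrative toy_* names:

    #include <assert.h>
    #include <stdio.h>

    /* Toy stand-ins for the kernel structures involved. */
    struct list_head { struct list_head *next, *prev; };
    struct toy_page { struct list_head lru; unsigned long private; };

    /* Tag a page that is on no list: lru.next carries an integer type,
     * private carries auxiliary info (e.g. a section number). */
    static void toy_get_page_bootmem(unsigned long info, struct toy_page *page,
                                     unsigned long type)
    {
        page->lru.next = (struct list_head *)type;
        page->private = info;
    }

    /* Recover the tag when the page is released. */
    static unsigned long toy_page_type(struct toy_page *page)
    {
        return (unsigned long)page->lru.next;
    }

    int main(void)
    {
        struct toy_page p = {{0, 0}, 0};

        toy_get_page_bootmem(42 /* section nr */, &p, 3 /* type tag */);
        assert(toy_page_type(&p) == 3 && p.private == 42);
        printf("type=%lu info=%lu\n", toy_page_type(&p), p.private);
        return 0;
    }
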
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
                 mutex_lock(&ppb_lock);
                 __free_pages_bootmem(page, 0);
                 mutex_unlock(&ppb_lock);
+                totalram_pages++;
         }
 
 }
 
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
         unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
                 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+        unsigned long *usemap, mapsize, section_nr, i;
+        struct mem_section *ms;
+        struct page *page, *memmap;
+
+        if (!pfn_valid(start_pfn))
+                return;
+
+        section_nr = pfn_to_section_nr(start_pfn);
+        ms = __nr_to_section(section_nr);
+
+        memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+        register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+        usemap = __nr_to_section(section_nr)->pageblock_flags;
+        page = virt_to_page(usemap);
+
+        mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+        for (i = 0; i < mapsize; i++, page++)
+                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
         }
 
         pfn = pgdat->node_start_pfn;
-        end_pfn = pfn + pgdat->node_spanned_pages;
+        end_pfn = pgdat_end_pfn(pgdat);
 
         /* register_section info */
         for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
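
This hunk and several below replace open-coded "start + spanned" arithmetic with interval helpers. Judging purely from the conversions in this diff, the helpers (presumably defined in include/linux/mmzone.h) amount to the following; note that the returned end pfn is exclusive:

    /* Exclusive end pfn of a zone: first pfn past the zone's span. */
    static inline unsigned long zone_end_pfn(const struct zone *zone)
    {
        return zone->zone_start_pfn + zone->spanned_pages;
    }

    /* Exclusive end pfn of a node's spanned range. */
    static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
    {
        return pgdat->node_start_pfn + pgdat->node_spanned_pages;
    }
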
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
                 register_page_bootmem_info_section(pfn);
         }
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
                            unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
                 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+                        unsigned long start_pfn, unsigned long num_pages)
+{
+        if (!zone_is_initialized(zone))
+                return init_currently_empty_zone(zone, start_pfn, num_pages,
+                                                 MEMMAP_HOTPLUG);
+        return 0;
+}
+
 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                 unsigned long start_pfn, unsigned long end_pfn)
 {
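
ensure_zone_is_initialized() folds the test-and-init sequence that used to be open-coded at each call site into one idempotent helper: it returns 0 immediately when the zone is already set up, and otherwise forwards the return value of init_currently_empty_zone(). Given the "if (!zone->wait_table)" checks it replaces in the hunks below, zone_is_initialized() is presumably just:

    /* Assumed definition, mirroring the open-coded wait_table
     * checks that the following hunks delete. */
    static inline bool zone_is_initialized(struct zone *zone)
    {
        return !!zone->wait_table;
    }
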
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
         unsigned long flags;
         unsigned long z1_start_pfn;
 
-        if (!z1->wait_table) {
-                ret = init_currently_empty_zone(z1, start_pfn,
-                        end_pfn - start_pfn, MEMMAP_HOTPLUG);
-                if (ret)
-                        return ret;
-        }
+        ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+        if (ret)
+                return ret;
 
         pgdat_resize_lock(z1->zone_pgdat, &flags);
 
         /* can't move pfns which are higher than @z2 */
-        if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+        if (end_pfn > zone_end_pfn(z2))
                 goto out_fail;
         /* the moved-out part must be at the leftmost of @z2 */
         if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                 z1_start_pfn = start_pfn;
 
         resize_zone(z1, z1_start_pfn, end_pfn);
-        resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+        resize_zone(z2, end_pfn, zone_end_pfn(z2));
 
         pgdat_resize_unlock(z1->zone_pgdat, &flags);
 
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
         unsigned long flags;
         unsigned long z2_end_pfn;
 
-        if (!z2->wait_table) {
-                ret = init_currently_empty_zone(z2, start_pfn,
-                        end_pfn - start_pfn, MEMMAP_HOTPLUG);
-                if (ret)
-                        return ret;
-        }
+        ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+        if (ret)
+                return ret;
 
         pgdat_resize_lock(z1->zone_pgdat, &flags);
 
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
         if (z1->zone_start_pfn > start_pfn)
                 goto out_fail;
         /* the moved-out part must be at the rightmost of @z1 */
-        if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
+        if (zone_end_pfn(z1) > end_pfn)
                 goto out_fail;
         /* must include/overlap */
-        if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+        if (start_pfn >= zone_end_pfn(z1))
                 goto out_fail;
 
         /* use end_pfn for z2's end_pfn if z2 is empty */
         if (z2->spanned_pages)
-                z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+                z2_end_pfn = zone_end_pfn(z2);
         else
                 z2_end_pfn = end_pfn;
 
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
         int nid = pgdat->node_id;
         int zone_type;
         unsigned long flags;
+        int ret;
 
         zone_type = zone - pgdat->node_zones;
-        if (!zone->wait_table) {
-                int ret;
+        ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+        if (ret)
+                return ret;
 
-                ret = init_currently_empty_zone(zone, phys_start_pfn,
-                                                nr_pages, MEMMAP_HOTPLUG);
-                if (ret)
-                        return ret;
-        }
         pgdat_resize_lock(zone->zone_pgdat, &flags);
         grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
         grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
         return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+                                     unsigned long start_pfn,
+                                     unsigned long end_pfn)
+{
+        struct mem_section *ms;
+
+        for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+                ms = __pfn_to_section(start_pfn);
+
+                if (unlikely(!valid_section(ms)))
+                        continue;
+
+                if (unlikely(pfn_to_nid(start_pfn) != nid))
+                        continue;
+
+                if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+                        continue;
+
+                return start_pfn;
+        }
+
+        return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+                                    unsigned long start_pfn,
+                                    unsigned long end_pfn)
+{
+        struct mem_section *ms;
+        unsigned long pfn;
+
+        /* pfn is the end pfn of a memory section. */
+        pfn = end_pfn - 1;
+        for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+                ms = __pfn_to_section(pfn);
+
+                if (unlikely(!valid_section(ms)))
+                        continue;
+
+                if (unlikely(pfn_to_nid(pfn) != nid))
+                        continue;
+
+                if (zone && zone != page_zone(pfn_to_page(pfn)))
+                        continue;
+
+                return pfn;
+        }
+
+        return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+                             unsigned long end_pfn)
 {
+        unsigned long zone_start_pfn = zone->zone_start_pfn;
+        unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+        unsigned long pfn;
+        struct mem_section *ms;
+        int nid = zone_to_nid(zone);
+
+        zone_span_writelock(zone);
+        if (zone_start_pfn == start_pfn) {
+                /*
+                 * If the section is the smallest section in the zone, we
+                 * need to shrink zone->zone_start_pfn and
+                 * zone->spanned_pages. In this case, we find the second
+                 * smallest valid mem_section for shrinking the zone.
+                 */
+                pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+                                                zone_end_pfn);
+                if (pfn) {
+                        zone->zone_start_pfn = pfn;
+                        zone->spanned_pages = zone_end_pfn - pfn;
+                }
+        } else if (zone_end_pfn == end_pfn) {
+                /*
+                 * If the section is the biggest section in the zone, we
+                 * need to shrink zone->spanned_pages. In this case, we
+                 * find the second biggest valid mem_section for shrinking
+                 * the zone.
+                 */
+                pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+                                               start_pfn);
+                if (pfn)
+                        zone->spanned_pages = pfn - zone_start_pfn + 1;
+        }
+
         /*
-         * XXX: Freeing memmap with vmemmap is not implement yet.
-         *      This should be removed later.
+         * The section is neither the biggest nor the smallest mem_section
+         * in the zone; it only creates a hole in the zone. So we need not
+         * change the zone in that case. But perhaps the zone now spans
+         * only holes, so check whether any valid section is left.
          */
-        return -EBUSY;
+        pfn = zone_start_pfn;
+        for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+                ms = __pfn_to_section(pfn);
+
+                if (unlikely(!valid_section(ms)))
+                        continue;
+
+                if (page_zone(pfn_to_page(pfn)) != zone)
+                        continue;
+
+                /* skip the section currently being removed */
+                if (start_pfn == pfn)
+                        continue;
+
+                /* we found a valid section: nothing more to do */
+                zone_span_writeunlock(zone);
+                return;
+        }
+
+        /* the zone has no valid section left */
+        zone->zone_start_pfn = 0;
+        zone->spanned_pages = 0;
+        zone_span_writeunlock(zone);
 }
-#else
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+                              unsigned long start_pfn, unsigned long end_pfn)
+{
+        unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+        unsigned long pgdat_end_pfn =
+                pgdat->node_start_pfn + pgdat->node_spanned_pages;
+        unsigned long pfn;
+        struct mem_section *ms;
+        int nid = pgdat->node_id;
+
+        if (pgdat_start_pfn == start_pfn) {
+                /*
+                 * If the section is the smallest section in the pgdat, we
+                 * need to shrink pgdat->node_start_pfn and
+                 * pgdat->node_spanned_pages. In this case, we find the
+                 * second smallest valid mem_section for shrinking the pgdat.
+                 */
+                pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+                                                pgdat_end_pfn);
+                if (pfn) {
+                        pgdat->node_start_pfn = pfn;
+                        pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+                }
+        } else if (pgdat_end_pfn == end_pfn) {
+                /*
+                 * If the section is the biggest section in the pgdat, we
+                 * need to shrink pgdat->node_spanned_pages. In this case,
+                 * we find the second biggest valid mem_section for
+                 * shrinking the pgdat.
+                 */
+                pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+                                               start_pfn);
+                if (pfn)
+                        pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+        }
+
+        /*
+         * If the section is neither the biggest nor the smallest
+         * mem_section in the pgdat, it only creates a hole in the pgdat.
+         * So we need not change the pgdat in that case. But perhaps the
+         * pgdat now spans only holes, so check whether any valid section
+         * is left.
+         */
+        pfn = pgdat_start_pfn;
+        for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+                ms = __pfn_to_section(pfn);
+
+                if (unlikely(!valid_section(ms)))
+                        continue;
+
+                if (pfn_to_nid(pfn) != nid)
+                        continue;
+
+                /* skip the section currently being removed */
+                if (start_pfn == pfn)
+                        continue;
+
+                /* we found a valid section: nothing more to do */
+                return;
+        }
+
+        /* the pgdat has no valid section left */
+        pgdat->node_start_pfn = 0;
+        pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 {
-        unsigned long flags;
         struct pglist_data *pgdat = zone->zone_pgdat;
+        int nr_pages = PAGES_PER_SECTION;
+        int zone_type;
+        unsigned long flags;
+
+        zone_type = zone - pgdat->node_zones;
+
+        pgdat_resize_lock(zone->zone_pgdat, &flags);
+        shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+        shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+        pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+        unsigned long start_pfn;
+        int scn_nr;
         int ret = -EINVAL;
 
         if (!valid_section(ms))
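
shrink_zone_span() and shrink_pgdat_span() share one case analysis: if the removed section was the lowest in the span, advance the start to the next valid section; if it was the highest, pull the end back to the previous valid one; otherwise the removal only punches a hole, and the span changes only when no valid section remains at all. A compact userspace model of that logic (illustrative names only, sections modeled as a bitmap; it deliberately keeps the kernel helpers' quirk of using 0 as the "not found" sentinel):

    #include <stdbool.h>
    #include <stdio.h>

    #define NSECT 16
    static bool present[NSECT];         /* stand-in for valid_section() */

    struct span { unsigned long start, end; };  /* [start, end) in sections */

    static unsigned long first_present(unsigned long lo, unsigned long hi)
    {
        for (; lo < hi; lo++)
            if (present[lo])
                return lo;
        return 0;   /* 0 doubles as "none", as in the kernel helpers */
    }

    static unsigned long last_present(unsigned long lo, unsigned long hi)
    {
        while (hi-- > lo)
            if (present[hi])
                return hi;
        return 0;
    }

    static void shrink_span(struct span *z, unsigned long sec)
    {
        present[sec] = false;
        if (sec == z->start) {                  /* lowest section removed */
            unsigned long s = first_present(sec + 1, z->end);
            if (s) {
                z->start = s;
                return;
            }
        } else if (sec == z->end - 1) {         /* highest section removed */
            unsigned long e = last_present(z->start, sec);
            if (e) {
                z->end = e + 1;
                return;
            }
        } else if (first_present(z->start, z->end)) {
            return;                             /* interior hole only */
        }
        z->start = z->end = 0;                  /* span is now empty */
    }

    int main(void)
    {
        struct span z = { 2, 6 };
        unsigned long i;

        for (i = z.start; i < z.end; i++)
            present[i] = true;
        shrink_span(&z, 2);     /* removing the lowest section moves start */
        printf("span now [%lu, %lu)\n", z.start, z.end);
        return 0;
    }
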
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
         if (ret)
                 return ret;
 
-        pgdat_resize_lock(pgdat, &flags);
+        scn_nr = __section_nr(ms);
+        start_pfn = section_nr_to_pfn(scn_nr);
+        __remove_zone(zone, start_pfn);
+
         sparse_remove_one_section(zone, ms);
-        pgdat_resize_unlock(pgdat, &flags);
         return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory. It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
         unsigned long zholes_size[MAX_NR_ZONES] = {0};
         unsigned long start_pfn = start >> PAGE_SHIFT;
 
-        pgdat = arch_alloc_nodedata(nid);
-        if (!pgdat)
-                return NULL;
+        pgdat = NODE_DATA(nid);
+        if (!pgdat) {
+                pgdat = arch_alloc_nodedata(nid);
+                if (!pgdat)
+                        return NULL;
 
-        arch_refresh_nodedata(nid, pgdat);
+                arch_refresh_nodedata(nid, pgdat);
+        }
 
         /* we can use NODE_DATA(nid) from here */
 
@@ -854,7 +1080,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
         pg_data_t *pgdat = NULL;
-        int new_pgdat = 0;
+        bool new_pgdat;
+        bool new_node;
         struct resource *res;
         int ret;
 
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
         if (!res)
                 goto out;
 
-        if (!node_online(nid)) {
+        {       /* Stupid hack to suppress address-never-null warning */
+                void *p = NODE_DATA(nid);
+                new_pgdat = !p;
+        }
+        new_node = !node_online(nid);
+        if (new_node) {
                 pgdat = hotadd_new_pgdat(nid, start);
                 ret = -ENOMEM;
                 if (!pgdat)
                         goto error;
-                new_pgdat = 1;
         }
 
         /* call arch's memory hotadd */
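
The block-scope temporary above exists because, on non-NUMA configurations, NODE_DATA() typically expands to the address of a static object, so testing the macro's result directly trips gcc's -Waddress ("the address of ... will never be NULL"); laundering it through a plain pointer keeps the test while silencing the warning. A self-contained sketch, where the shown non-NUMA definition of NODE_DATA() is an assumption:

    #include <stdbool.h>
    #include <stdio.h>

    struct pglist_data { int node_id; };

    /* Assumed shape of the non-NUMA macro: address of a static object,
     * which the compiler knows can never be NULL. */
    static struct pglist_data contig_page_data;
    #define NODE_DATA(nid) (&contig_page_data)

    int main(void)
    {
        bool new_pgdat;

        /* new_pgdat = !NODE_DATA(0); would warn under -Waddress */
        void *p = NODE_DATA(0);   /* launder through a plain pointer */
        new_pgdat = !p;           /* same test, no warning */

        printf("new_pgdat = %d\n", new_pgdat);
        return 0;
    }
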
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
         /* we online node here. we can't roll back from here. */
         node_set_online(nid);
 
-        if (new_pgdat) {
+        if (new_node) {
                 ret = register_one_node(nid);
                 /*
                  * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
         /* rollback pgdat allocation and others */
         if (new_pgdat)
                 rollback_node_hotadd(nid, pgdat);
-        if (res)
-                release_memory_resource(res);
+        release_memory_resource(res);
 
 out:
         unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                  * migrate_pages returns # of failed pages.
                  */
                 ret = migrate_pages(&source, alloc_migrate_target, 0,
-                                        true, MIGRATE_SYNC,
-                                        MR_MEMORY_HOTPLUG);
+                                        MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                 if (ret)
                         putback_lru_pages(&source);
         }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
         return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pfn of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in the range
+ * [start_pfn, end_pfn) and calls func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+                void *arg, int (*func)(struct memory_block *, void *))
 {
         struct memory_block *mem = NULL;
         struct mem_section *section;
-        unsigned long start_pfn, end_pfn;
         unsigned long pfn, section_nr;
         int ret;
 
-        start_pfn = PFN_DOWN(start);
-        end_pfn = start_pfn + PFN_DOWN(size);
-
         for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                 section_nr = pfn_to_section_nr(pfn);
                 if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
                 if (!mem)
                         continue;
 
-                ret = offline_memory_block(mem);
+                ret = func(mem, arg);
                 if (ret) {
                         kobject_put(&mem->dev.kobj);
                         return ret;
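
walk_memory_range() factors the section walk out of the old remove_memory() so the per-block action becomes a callback. The new remove_memory() further down uses it twice, in exactly this shape:

    int err = 0;

    /* pass 1: try to offline every memory block in the range,
     * collecting the first error through the callback's arg */
    walk_memory_range(start_pfn, end_pfn, &err, offline_memory_block_cb);

    /* pass 2: verify that every block in the range is now offline;
     * a non-zero return from the callback aborts the walk */
    err = walk_memory_range(start_pfn, end_pfn, NULL,
                            is_memblock_offlined_cb);
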
@@ -1420,12 +1658,209 @@
 
         return 0;
 }
+
+/**
+ * offline_memory_block_cb - callback function for offlining a memory block
+ * @mem: the memory block to be offlined
+ * @arg: buffer to hold the error code
+ *
+ * Always returns 0; the first error, if any, is stored through @arg.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+        int *ret = arg;
+        int error = offline_memory_block(mem);
+
+        if (error != 0 && *ret == 0)
+                *ret = error;
+
+        return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+        int ret = !is_memblock_offlined(mem);
+
+        if (unlikely(ret))
+                pr_warn("removing memory fails, because memory "
+                        "[%#010llx-%#010llx] is onlined\n",
+                        PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+                        PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1)) - 1);
+
+        return ret;
+}
+
+static int check_cpu_on_node(void *data)
+{
+        struct pglist_data *pgdat = data;
+        int cpu;
+
+        for_each_present_cpu(cpu) {
+                if (cpu_to_node(cpu) == pgdat->node_id)
+                        /*
+                         * a cpu on this node has not been removed, so we
+                         * can't offline this node.
+                         */
+                        return -EBUSY;
+        }
+
+        return 0;
+}
+
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+        struct pglist_data *pgdat = data;
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                if (cpu_to_node(cpu) == pgdat->node_id)
+                        numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(void *data)
+{
+        int ret = check_cpu_on_node(data);
+
+        if (ret)
+                return ret;
+
+        /*
+         * the node will be offlined when we come here, so we can clear
+         * the cpu_to_node() mapping now.
+         */
+
+        unmap_cpu_on_node(data);
+        return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+void try_offline_node(int nid)
+{
+        pg_data_t *pgdat = NODE_DATA(nid);
+        unsigned long start_pfn = pgdat->node_start_pfn;
+        unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+        unsigned long pfn;
+        struct page *pgdat_page = virt_to_page(pgdat);
+        int i;
+
+        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+                unsigned long section_nr = pfn_to_section_nr(pfn);
+
+                if (!present_section_nr(section_nr))
+                        continue;
+
+                if (pfn_to_nid(pfn) != nid)
+                        continue;
+
+                /*
+                 * some memory sections of this node have not been removed,
+                 * so we can't offline the node now.
+                 */
+                return;
+        }
+
+        if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+                return;
+
+        /*
+         * all memory/cpus of this node have been removed; we can offline
+         * the node now.
+         */
+        node_set_offline(nid);
+        unregister_one_node(nid);
+
+        if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+                /* node data is allocated from boot memory */
+                return;
+
+        /* free the wait table in each zone */
+        for (i = 0; i < MAX_NR_ZONES; i++) {
+                struct zone *zone = pgdat->node_zones + i;
+
+                if (zone->wait_table)
+                        vfree(zone->wait_table);
+        }
+
+        /*
+         * Since there is no way to guarantee that the address of pgdat/zone
+         * is not on the stack of any kernel thread or used by other kernel
+         * objects without reference counting or another synchronizing
+         * method, do not reset node_data and free pgdat here. Just reset
+         * it to 0 and reuse the memory when the node is onlined again.
+         */
+        memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
+{
+        unsigned long start_pfn, end_pfn;
+        int ret = 0;
+        int retry = 1;
+
+        start_pfn = PFN_DOWN(start);
+        end_pfn = start_pfn + PFN_DOWN(size);
+
+        /*
+         * When CONFIG_MEMCG is on, one memory block may be used by other
+         * blocks to store page cgroup when onlining pages. But we don't
+         * know in what order pages were onlined. So we iterate twice to
+         * offline memory:
+         * 1st pass: offline every non-primary memory block.
+         * 2nd pass: offline the primary (i.e. first added) memory block.
+         */
+repeat:
+        walk_memory_range(start_pfn, end_pfn, &ret,
+                          offline_memory_block_cb);
+        if (ret) {
+                if (!retry)
+                        return ret;
+
+                retry = 0;
+                ret = 0;
+                goto repeat;
+        }
+
+        lock_memory_hotplug();
+
+        /*
+         * each memory block was offlined like this:
+         *   1. lock memory hotplug
+         *   2. offline a memory block
+         *   3. unlock memory hotplug
+         *
+         * All memory blocks must be offlined before removing memory, but
+         * since we did not hold the lock across the whole operation, we
+         * must re-check here that every block is actually offline.
+         */
+
+        ret = walk_memory_range(start_pfn, end_pfn, NULL,
+                                is_memblock_offlined_cb);
+        if (ret) {
+                unlock_memory_hotplug();
+                return ret;
+        }
+
+        /* remove memmap entry */
+        firmware_map_remove(start, start + size, "System RAM");
+
+        arch_remove_memory(start, size);
+
+        try_offline_node(nid);
+
+        unlock_memory_hotplug();
+
+        return 0;
+}
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
         return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
         return -EINVAL;
 }
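
remove_memory() now takes the node id so that it can call try_offline_node() once the range is gone, and try_offline_node() is exported; callers (e.g. the ACPI memory hotplug driver) must be updated to the new signature. The matching declarations, presumably in include/linux/memory_hotplug.h, would read:

    /* assumed header counterparts of the definitions above */
    extern void try_offline_node(int nid);
    extern int remove_memory(int nid, u64 start, u64 size);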