Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r--	mm/memory_hotplug.c	553
1 file changed, 494 insertions(+), 59 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page,
-			     unsigned long type)
+void get_page_bootmem(unsigned long info, struct page *page,
+		      unsigned long type)
 {
 	page->lru.next = (struct list_head *) type;
 	SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
 		mutex_lock(&ppb_lock);
 		__free_pages_bootmem(page, 0);
 		mutex_unlock(&ppb_lock);
+		totalram_pages++;
 	}
 
 }
 
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
 	unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long *usemap, mapsize, section_nr, i;
+	struct mem_section *ms;
+	struct page *page, *memmap;
+
+	if (!pfn_valid(start_pfn))
+		return;
+
+	section_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(section_nr);
+
+	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+	usemap = __nr_to_section(section_nr)->pageblock_flags;
+	page = virt_to_page(usemap);
+
+	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	}
 
 	pfn = pgdat->node_start_pfn;
-	end_pfn = pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	/* register_section info */
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 			register_page_bootmem_info_section(pfn);
 	}
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
 			   unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
 		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+			unsigned long start_pfn, unsigned long num_pages)
+{
+	if (!zone_is_initialized(zone))
+		return init_currently_empty_zone(zone, start_pfn, num_pages,
+						 MEMMAP_HOTPLUG);
+	return 0;
+}
+
 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 		unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 	unsigned long flags;
 	unsigned long z1_start_pfn;
 
-	if (!z1->wait_table) {
-		ret = init_currently_empty_zone(z1, start_pfn,
-			end_pfn - start_pfn, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
+	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+	if (ret)
+		return ret;
 
 	pgdat_resize_lock(z1->zone_pgdat, &flags);
 
 	/* can't move pfns which are higher than @z2 */
-	if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+	if (end_pfn > zone_end_pfn(z2))
 		goto out_fail;
 	/* the move out part mast at the left most of @z2 */
 	if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 		z1_start_pfn = start_pfn;
 
 	resize_zone(z1, z1_start_pfn, end_pfn);
-	resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+	resize_zone(z2, end_pfn, zone_end_pfn(z2));
 
 	pgdat_resize_unlock(z1->zone_pgdat, &flags);
 
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
 	unsigned long flags;
 	unsigned long z2_end_pfn;
 
-	if (!z2->wait_table) {
-		ret = init_currently_empty_zone(z2, start_pfn,
-			end_pfn - start_pfn, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
+	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+	if (ret)
+		return ret;
 
 	pgdat_resize_lock(z1->zone_pgdat, &flags);
 
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
 	if (z1->zone_start_pfn > start_pfn)
 		goto out_fail;
 	/* the move out part mast at the right most of @z1 */
-	if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
+	if (zone_end_pfn(z1) > end_pfn)
 		goto out_fail;
 	/* must included/overlap */
-	if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+	if (start_pfn >= zone_end_pfn(z1))
 		goto out_fail;
 
 	/* use end_pfn for z2's end_pfn if z2 is empty */
 	if (z2->spanned_pages)
-		z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+		z2_end_pfn = zone_end_pfn(z2);
 	else
 		z2_end_pfn = end_pfn;
 
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	int nid = pgdat->node_id;
 	int zone_type;
 	unsigned long flags;
+	int ret;
 
 	zone_type = zone - pgdat->node_zones;
-	if (!zone->wait_table) {
-		int ret;
+	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+	if (ret)
+		return ret;
 
-		ret = init_currently_empty_zone(zone, phys_start_pfn,
-						nr_pages, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
 	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
 	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+				     unsigned long start_pfn,
+				     unsigned long end_pfn)
+{
+	struct mem_section *ms;
+
+	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(start_pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(start_pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+			continue;
+
+		return start_pfn;
+	}
+
+	return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+				    unsigned long start_pfn,
+				    unsigned long end_pfn)
+{
+	struct mem_section *ms;
+	unsigned long pfn;
+
+	/* pfn is the end pfn of a memory section. */
+	pfn = end_pfn - 1;
+	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(pfn)))
+			continue;
+
+		return pfn;
+	}
+
+	return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+			     unsigned long end_pfn)
 {
+	unsigned long zone_start_pfn = zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = zone_to_nid(zone);
+
+	zone_span_writelock(zone);
+	if (zone_start_pfn == start_pfn) {
+		/*
+		 * If the section is smallest section in the zone, it need
+		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+		 * In this case, we find second smallest valid mem_section
+		 * for shrinking zone.
+		 */
+		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+						zone_end_pfn);
+		if (pfn) {
+			zone->zone_start_pfn = pfn;
+			zone->spanned_pages = zone_end_pfn - pfn;
+		}
+	} else if (zone_end_pfn == end_pfn) {
+		/*
+		 * If the section is biggest section in the zone, it need
+		 * shrink zone->spanned_pages.
+		 * In this case, we find second biggest valid mem_section for
+		 * shrinking zone.
+		 */
+		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+					       start_pfn);
+		if (pfn)
+			zone->spanned_pages = pfn - zone_start_pfn + 1;
+	}
+
 	/*
-	 * XXX: Freeing memmap with vmemmap is not implement yet.
-	 *      This should be removed later.
+	 * The section is not biggest or smallest mem_section in the zone, it
+	 * only creates a hole in the zone. So in this case, we need not
+	 * change the zone. But perhaps, the zone has only hole data. Thus
+	 * it check the zone has only hole or not.
 	 */
-	return -EBUSY;
+	pfn = zone_start_pfn;
+	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (page_zone(pfn_to_page(pfn)) != zone)
+			continue;
+
+		/* If the section is current section, it continues the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* If we find valid section, we have nothing to do */
+		zone_span_writeunlock(zone);
+		return;
+	}
+
+	/* The zone has no valid section */
+	zone->zone_start_pfn = 0;
+	zone->spanned_pages = 0;
+	zone_span_writeunlock(zone);
 }
-#else
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+			      unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+	unsigned long pgdat_end_pfn =
+		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = pgdat->node_id;
+
+	if (pgdat_start_pfn == start_pfn) {
+		/*
+		 * If the section is smallest section in the pgdat, it need
+		 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
+		 * In this case, we find second smallest valid mem_section
+		 * for shrinking zone.
+		 */
+		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+						pgdat_end_pfn);
+		if (pfn) {
+			pgdat->node_start_pfn = pfn;
+			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+		}
+	} else if (pgdat_end_pfn == end_pfn) {
+		/*
+		 * If the section is biggest section in the pgdat, it need
+		 * shrink pgdat->node_spanned_pages.
+		 * In this case, we find second biggest valid mem_section for
+		 * shrinking zone.
+		 */
+		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+					       start_pfn);
+		if (pfn)
+			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+	}
+
+	/*
+	 * If the section is not biggest or smallest mem_section in the pgdat,
+	 * it only creates a hole in the pgdat. So in this case, we need not
+	 * change the pgdat.
+	 * But perhaps, the pgdat has only hole data. Thus it check the pgdat
+	 * has only hole or not.
+	 */
+	pfn = pgdat_start_pfn;
+	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		/* If the section is current section, it continues the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* If we find valid section, we have nothing to do */
+		return;
+	}
+
+	/* The pgdat has no valid section */
+	pgdat->node_start_pfn = 0;
+	pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 {
-	unsigned long flags;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	int zone_type;
+	unsigned long flags;
+
+	zone_type = zone - pgdat->node_zones;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+	unsigned long start_pfn;
+	int scn_nr;
 	int ret = -EINVAL;
 
 	if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 	if (ret)
 		return ret;
 
-	pgdat_resize_lock(pgdat, &flags);
+	scn_nr = __section_nr(ms);
+	start_pfn = section_nr_to_pfn(scn_nr);
+	__remove_zone(zone, start_pfn);
+
 	sparse_remove_one_section(zone, ms);
-	pgdat_resize_unlock(pgdat, &flags);
 	return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory. It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 
-	pgdat = arch_alloc_nodedata(nid);
-	if (!pgdat)
-		return NULL;
+	pgdat = NODE_DATA(nid);
+	if (!pgdat) {
+		pgdat = arch_alloc_nodedata(nid);
+		if (!pgdat)
+			return NULL;
 
-	arch_refresh_nodedata(nid, pgdat);
+		arch_refresh_nodedata(nid, pgdat);
+	}
 
 	/* we can use NODE_DATA(nid) from here */
 
@@ -854,7 +1080,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
-	int new_pgdat = 0;
+	bool new_pgdat;
+	bool new_node;
 	struct resource *res;
 	int ret;
 
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	if (!res)
 		goto out;
 
-	if (!node_online(nid)) {
+	{	/* Stupid hack to suppress address-never-null warning */
+		void *p = NODE_DATA(nid);
+		new_pgdat = !p;
+	}
+	new_node = !node_online(nid);
+	if (new_node) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		ret = -ENOMEM;
 		if (!pgdat)
 			goto error;
-		new_pgdat = 1;
 	}
 
 	/* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	if (new_pgdat) {
+	if (new_node) {
 		ret = register_one_node(nid);
 		/*
 		 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
-	if (res)
-		release_memory_resource(res);
+	release_memory_resource(res);
 
 out:
 	unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * migrate_pages returns # of failed pages.
 		 */
 		ret = migrate_pages(&source, alloc_migrate_target, 0,
-					true, MIGRATE_SYNC,
-					MR_MEMORY_HOTPLUG);
+					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_lru_pages(&source);
 	}
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pft of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in range
+ * [start_pfn, end_pfn) and call func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+		void *arg, int (*func)(struct memory_block *, void *))
 {
 	struct memory_block *mem = NULL;
 	struct mem_section *section;
-	unsigned long start_pfn, end_pfn;
 	unsigned long pfn, section_nr;
 	int ret;
 
-	start_pfn = PFN_DOWN(start);
-	end_pfn = start_pfn + PFN_DOWN(size);
-
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		section_nr = pfn_to_section_nr(pfn);
 		if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
 		if (!mem)
 			continue;
 
-		ret = offline_memory_block(mem);
+		ret = func(mem, arg);
 		if (ret) {
 			kobject_put(&mem->dev.kobj);
 			return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
 
 	return 0;
 }
+
+/**
+ * offline_memory_block_cb - callback function for offlining memory block
+ * @mem: the memory block to be offlined
+ * @arg: buffer to hold error msg
+ *
+ * Always return 0, and put the error msg in arg if any.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	int *ret = arg;
+	int error = offline_memory_block(mem);
+
+	if (error != 0 && *ret == 0)
+		*ret = error;
+
+	return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+	int ret = !is_memblock_offlined(mem);
+
+	if (unlikely(ret))
+		pr_warn("removing memory fails, because memory "
+			"[%#010llx-%#010llx] is onlined\n",
+			PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+			PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+
+	return ret;
+}
+
+static int check_cpu_on_node(void *data)
+{
+	struct pglist_data *pgdat = data;
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		if (cpu_to_node(cpu) == pgdat->node_id)
+			/*
+			 * the cpu on this node isn't removed, and we can't
+			 * offline this node.
+			 */
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+	struct pglist_data *pgdat = data;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (cpu_to_node(cpu) == pgdat->node_id)
+			numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(void *data)
+{
+	int ret = check_cpu_on_node(data);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * the node will be offlined when we come here, so we can clear
+	 * the cpu_to_node() now.
+	 */
+
+	unmap_cpu_on_node(data);
+	return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+void try_offline_node(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = pgdat->node_start_pfn;
+	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+	unsigned long pfn;
+	struct page *pgdat_page = virt_to_page(pgdat);
+	int i;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!present_section_nr(section_nr))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		/*
+		 * some memory sections of this node are not removed, and we
+		 * can't offline node now.
+		 */
+		return;
+	}
+
+	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+		return;
+
+	/*
+	 * all memory/cpu of this node are removed, we can offline this
+	 * node now.
+	 */
+	node_set_offline(nid);
+	unregister_one_node(nid);
+
+	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+		/* node data is allocated from boot memory */
+		return;
+
+	/* free waittable in each zone */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (zone->wait_table)
+			vfree(zone->wait_table);
+	}
+
+	/*
+	 * Since there is no way to guarentee the address of pgdat/zone is not
+	 * on stack of any kernel threads or used by other kernel objects
+	 * without reference counting or other symchronizing method, do not
+	 * reset node_data and free pgdat here. Just reset it to 0 and reuse
+	 * the memory when the node is online again.
+	 */
+	memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
+{
+	unsigned long start_pfn, end_pfn;
+	int ret = 0;
+	int retry = 1;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = start_pfn + PFN_DOWN(size);
+
+	/*
+	 * When CONFIG_MEMCG is on, one memory block may be used by other
+	 * blocks to store page cgroup when onlining pages. But we don't know
+	 * in what order pages are onlined. So we iterate twice to offline
+	 * memory:
+	 * 1st iterate: offline every non primary memory block.
+	 * 2nd iterate: offline primary (i.e. first added) memory block.
+	 */
+repeat:
+	walk_memory_range(start_pfn, end_pfn, &ret,
+			  offline_memory_block_cb);
+	if (ret) {
+		if (!retry)
+			return ret;
+
+		retry = 0;
+		ret = 0;
+		goto repeat;
+	}
+
+	lock_memory_hotplug();
+
+	/*
+	 * we have offlined all memory blocks like this:
+	 *   1. lock memory hotplug
+	 *   2. offline a memory block
+	 *   3. unlock memory hotplug
+	 *
+	 * repeat step1-3 to offline the memory block. All memory blocks
+	 * must be offlined before removing memory. But we don't hold the
+	 * lock in the whole operation. So we should check whether all
+	 * memory blocks are offlined.
+	 */
+
+	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+				is_memblock_offlined_cb);
+	if (ret) {
+		unlock_memory_hotplug();
+		return ret;
+	}
+
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
+	arch_remove_memory(start, size);
+
+	try_offline_node(nid);
+
+	unlock_memory_hotplug();
+
+	return 0;
+}
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
 	return -EINVAL;
 }