 arch/ia64/mm/init.c            |   9
 arch/powerpc/mm/mem.c          |  10
 arch/s390/mm/init.c            |  30
 arch/sh/mm/init.c              |   8
 arch/x86/mm/init_32.c          |   5
 arch/x86/mm/init_64.c          |   9
 drivers/base/memory.c          |  52
 include/linux/memory_hotplug.h |  13
 include/linux/mmzone.h         |  16
 kernel/memremap.c              |   4
 mm/memory_hotplug.c            | 201
 mm/sparse.c                    |   3
 12 files changed, 185 insertions, 175 deletions
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 39e2aeb4669d..80db57d063d0 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -648,18 +648,11 @@ mem_init (void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        pg_data_t *pgdat;
-        struct zone *zone;
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int ret;
 
-        pgdat = NODE_DATA(nid);
-
-        zone = pgdat->node_zones +
-                zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
-        ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
-
+        ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
         if (ret)
                 printk("%s: Problem encountered in __add_pages() as ret=%d\n",
                        __func__, ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index e6b2e6618b6c..72c46eb53215 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
 
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        struct pglist_data *pgdata;
-        struct zone *zone;
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int rc;
 
         resize_hpt_for_hotplug(memblock_phys_mem_size());
 
-        pgdata = NODE_DATA(nid);
-
         start = (unsigned long)__va(start);
         rc = create_section_mapping(start, start + size);
         if (rc) {
@@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
                 return -EFAULT;
         }
 
-        /* this should work for most non-highmem platforms */
-        zone = pgdata->node_zones +
-                zone_for_memory(nid, start, size, 0, for_device);
-
-        return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+        return __add_pages(nid, start_pfn, nr_pages, !for_device);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index a3d549966b6a..bfa918e3592b 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
         unsigned long start_pfn = PFN_DOWN(start);
         unsigned long size_pages = PFN_DOWN(size);
-        pg_data_t *pgdat = NODE_DATA(nid);
-        struct zone *zone;
-        int rc, i;
+        int rc;
 
         rc = vmem_add_mapping(start, size);
         if (rc)
                 return rc;
 
-        for (i = 0; i < MAX_NR_ZONES; i++) {
-                zone = pgdat->node_zones + i;
-                if (zone_idx(zone) != ZONE_MOVABLE) {
-                        /* Add range within existing zone limits, if possible */
-                        zone_start_pfn = zone->zone_start_pfn;
-                        zone_end_pfn = zone->zone_start_pfn +
-                                       zone->spanned_pages;
-                } else {
-                        /* Add remaining range to ZONE_MOVABLE */
-                        zone_start_pfn = start_pfn;
-                        zone_end_pfn = start_pfn + size_pages;
-                }
-                if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn)
-                        continue;
-                nr_pages = (start_pfn + size_pages > zone_end_pfn) ?
-                           zone_end_pfn - start_pfn : size_pages;
-                rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
-                if (rc)
-                        break;
-                start_pfn += nr_pages;
-                size_pages -= nr_pages;
-                if (!size_pages)
-                        break;
-        }
+        rc = __add_pages(nid, start_pfn, size_pages, !for_device);
         if (rc)
                 vmem_remove_mapping(start, size);
         return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index a9d57f75ae8c..3813a610a2bb 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        pg_data_t *pgdat;
         unsigned long start_pfn = PFN_DOWN(start);
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int ret;
 
-        pgdat = NODE_DATA(nid);
-
         /* We only have ZONE_NORMAL, so this is easy.. */
-        ret = __add_pages(nid, pgdat->node_zones +
-                        zone_for_memory(nid, start, size, ZONE_NORMAL,
-                        for_device),
-                        start_pfn, nr_pages, !for_device);
+        ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
         if (unlikely(ret))
                 printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 94594b889144..a424066d0552 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -825,13 +825,10 @@ void __init mem_init(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        struct pglist_data *pgdata = NODE_DATA(nid);
-        struct zone *zone = pgdata->node_zones +
-                zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
 
-        return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+        return __add_pages(nid, start_pfn, nr_pages, !for_device);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9d64291459b6..06afa84ac0a0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size)
         }
 }
 
-/*
- * Memory is added always to NORMAL zone. This means you will never get
- * additional DMA/DMA32 memory.
- */
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-        struct pglist_data *pgdat = NODE_DATA(nid);
-        struct zone *zone = pgdat->node_zones +
-                zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
         unsigned long start_pfn = start >> PAGE_SHIFT;
         unsigned long nr_pages = size >> PAGE_SHIFT;
         int ret;
 
         init_memory_mapping(start, start + size);
 
-        ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+        ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
         WARN_ON_ONCE(ret);
 
         /* update max_pfn, max_low_pfn and high_memory */
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 1e884d82af6f..b86fda30ce62 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev,
                                 struct device_attribute *attr, char *buf)
 {
         struct memory_block *mem = to_memory_block(dev);
-        unsigned long start_pfn, end_pfn;
-        unsigned long valid_start, valid_end, valid_pages;
+        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
         unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-        struct zone *zone;
-        int zone_shift = 0;
+        unsigned long valid_start_pfn, valid_end_pfn;
+        bool append = false;
+        int nid;
 
-        start_pfn = section_nr_to_pfn(mem->start_section_nr);
-        end_pfn = start_pfn + nr_pages;
-
-        /* The block contains more than one zone can not be offlined. */
-        if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
+        /*
+         * The block contains more than one zone can not be offlined.
+         * This can happen e.g. for ZONE_DMA and ZONE_DMA32
+         */
+        if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
                 return sprintf(buf, "none\n");
 
-        zone = page_zone(pfn_to_page(valid_start));
-        valid_pages = valid_end - valid_start;
-
-        /* MMOP_ONLINE_KEEP */
-        sprintf(buf, "%s", zone->name);
+        start_pfn = valid_start_pfn;
+        nr_pages = valid_end_pfn - start_pfn;
 
-        /* MMOP_ONLINE_KERNEL */
-        zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift);
-        if (zone_shift) {
-                strcat(buf, " ");
-                strcat(buf, (zone + zone_shift)->name);
+        /*
+         * Check the existing zone. Make sure that we do that only on the
+         * online nodes otherwise the page_zone is not reliable
+         */
+        if (mem->state == MEM_ONLINE) {
+                strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
+                goto out;
         }
 
-        /* MMOP_ONLINE_MOVABLE */
-        zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift);
-        if (zone_shift) {
-                strcat(buf, " ");
-                strcat(buf, (zone + zone_shift)->name);
+        nid = pfn_to_nid(start_pfn);
+        if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
+                strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name);
+                append = true;
         }
 
+        if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
+                if (append)
+                        strcat(buf, " ");
+                strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
+        }
+out:
         strcat(buf, "\n");
 
         return strlen(buf);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a61aede1b391..8a07a49fd8dc 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
                 unsigned long nr_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-/* reasonably generic interface to expand the physical pages in a zone */
-extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
+/* reasonably generic interface to expand the physical pages */
+extern int __add_pages(int nid, unsigned long start_pfn,
                 unsigned long nr_pages, bool want_memblock);
 
 #ifdef CONFIG_NUMA
@@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online);
 extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
                 bool for_device);
 extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
+extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+                unsigned long nr_pages);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
+extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
                 unsigned long map_offset);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
                 unsigned long pnum);
-extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-                enum zone_type target, int *zone_shift);
-
+extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
+                int online_type);
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf7e08c5a8..abc1641011f2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -533,6 +533,22 @@ static inline bool zone_is_empty(struct zone *zone)
 }
 
 /*
+ * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
+ * intersection with the given zone
+ */
+static inline bool zone_intersects(struct zone *zone,
+                unsigned long start_pfn, unsigned long nr_pages)
+{
+        if (zone_is_empty(zone))
+                return false;
+        if (start_pfn >= zone_end_pfn(zone) ||
+            start_pfn + nr_pages <= zone->zone_start_pfn)
+                return false;
+
+        return true;
+}
+
+/*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
  * queues ("queue_length >> 12") during an aging round.
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..281eb478856a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 
         mem_hotplug_begin();
         error = arch_add_memory(nid, align_start, align_size, true);
+        if (!error)
+                move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+                                        align_start >> PAGE_SHIFT,
+                                        align_size >> PAGE_SHIFT);
         mem_hotplug_done();
         if (error)
                 goto err_add_memory;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b2ebe9ad7f6c..9438ffe24cb2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -433,25 +433,6 @@ out_fail:
         return -1;
 }
 
-static struct zone * __meminit move_pfn_range(int zone_shift,
-                unsigned long start_pfn, unsigned long end_pfn)
-{
-        struct zone *zone = page_zone(pfn_to_page(start_pfn));
-        int ret = 0;
-
-        if (zone_shift < 0)
-                ret = move_pfn_range_left(zone + zone_shift, zone,
-                                          start_pfn, end_pfn);
-        else if (zone_shift)
-                ret = move_pfn_range_right(zone, zone + zone_shift,
-                                           start_pfn, end_pfn);
-
-        if (ret)
-                return NULL;
-
-        return zone + zone_shift;
-}
-
 static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                                       unsigned long end_pfn)
 {
@@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
         return 0;
 }
 
-static int __meminit __add_section(int nid, struct zone *zone,
-                unsigned long phys_start_pfn, bool want_memblock)
+static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
+                bool want_memblock)
 {
         int ret;
+        int i;
 
         if (pfn_valid(phys_start_pfn))
                 return -EEXIST;
 
-        ret = sparse_add_one_section(zone, phys_start_pfn);
-
+        ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
         if (ret < 0)
                 return ret;
 
-        ret = __add_zone(zone, phys_start_pfn);
+        /*
+         * Make all the pages reserved so that nobody will stumble over half
+         * initialized state.
+         * FIXME: We also have to associate it with a node because pfn_to_node
+         * relies on having page with the proper node.
+         */
+        for (i = 0; i < PAGES_PER_SECTION; i++) {
+                unsigned long pfn = phys_start_pfn + i;
+                struct page *page;
+                if (!pfn_valid(pfn))
+                        continue;
 
-        if (ret < 0)
-                return ret;
+                page = pfn_to_page(pfn);
+                set_page_node(page, nid);
+                SetPageReserved(page);
+        }
 
         if (!want_memblock)
                 return 0;
@@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone,
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(int nid, unsigned long phys_start_pfn,
                 unsigned long nr_pages, bool want_memblock)
 {
         unsigned long i;
@@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
         int start_sec, end_sec;
         struct vmem_altmap *altmap;
 
-        clear_zone_contiguous(zone);
-
         /* during initialize mem_map, align hot-added range to section */
         start_sec = pfn_to_section_nr(phys_start_pfn);
         end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
         }
 
         for (i = start_sec; i <= end_sec; i++) {
-                err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock);
+                err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
 
                 /*
                  * EEXIST is finally dealt with by ioresource collision
@@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
         }
         vmemmap_populate_print_last();
 out:
-        set_zone_contiguous(zone);
         return err;
 }
 EXPORT_SYMBOL_GPL(__add_pages);
@@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg)
         node_set_state(node, N_MEMORY);
 }
 
-bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-                enum zone_type target, int *zone_shift)
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
 {
-        struct zone *zone = page_zone(pfn_to_page(pfn));
-        enum zone_type idx = zone_idx(zone);
-        int i;
+        struct pglist_data *pgdat = NODE_DATA(nid);
+        struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+        struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL];
 
-        *zone_shift = 0;
+        /*
+         * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+         * physically before ZONE_MOVABLE. All we need is they do not
+         * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+         * though so let's stick with it for simplicity for now.
+         * TODO make sure we do not overlap with ZONE_DEVICE
+         */
+        if (online_type == MMOP_ONLINE_KERNEL) {
+                if (zone_is_empty(movable_zone))
+                        return true;
+                return movable_zone->zone_start_pfn >= pfn + nr_pages;
+        } else if (online_type == MMOP_ONLINE_MOVABLE) {
+                return zone_end_pfn(normal_zone) <= pfn;
+        }
 
-        if (idx < target) {
-                /* pages must be at end of current zone */
-                if (pfn + nr_pages != zone_end_pfn(zone))
-                        return false;
+        /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+        return online_type == MMOP_ONLINE_KEEP;
+}
 
-                /* no zones in use between current zone and target */
-                for (i = idx + 1; i < target; i++)
-                        if (zone_is_initialized(zone - idx + i))
-                                return false;
-        }
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
-                unsigned long nr_pages)
+{
+        unsigned long old_end_pfn = zone_end_pfn(zone);
 
-        if (target < idx) {
-                /* pages must be at beginning of current zone */
-                if (pfn != zone->zone_start_pfn)
-                        return false;
+        if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+                zone->zone_start_pfn = start_pfn;
+
+        zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
+
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+                unsigned long nr_pages)
+{
+        unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
 
-                /* no zones in use between current zone and target */
-                for (i = target + 1; i < idx; i++)
-                        if (zone_is_initialized(zone - idx + i))
-                                return false;
+        if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+                pgdat->node_start_pfn = start_pfn;
+
+        pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+}
+
+void move_pfn_range_to_zone(struct zone *zone,
+                unsigned long start_pfn, unsigned long nr_pages)
+{
+        struct pglist_data *pgdat = zone->zone_pgdat;
+        int nid = pgdat->node_id;
+        unsigned long flags;
+
+        if (zone_is_empty(zone))
+                init_currently_empty_zone(zone, start_pfn, nr_pages);
+
+        clear_zone_contiguous(zone);
+
+        /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+        pgdat_resize_lock(pgdat, &flags);
+        zone_span_writelock(zone);
+        resize_zone_range(zone, start_pfn, nr_pages);
+        zone_span_writeunlock(zone);
+        resize_pgdat_range(pgdat, start_pfn, nr_pages);
+        pgdat_resize_unlock(pgdat, &flags);
+
+        /*
+         * TODO now we have a visible range of pages which are not associated
+         * with their zone properly. Not nice but set_pfnblock_flags_mask
+         * expects the zone spans the pfn range. All the pages in the range
+         * are reserved so nobody should be touching them so we should be safe
+         */
+        memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+
+        set_zone_contiguous(zone);
+}
+
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+                unsigned long start_pfn, unsigned long nr_pages)
+{
+        struct pglist_data *pgdat = NODE_DATA(nid);
+        struct zone *zone = &pgdat->node_zones[ZONE_NORMAL];
+
+        if (online_type == MMOP_ONLINE_KEEP) {
+                struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+                /*
+                 * MMOP_ONLINE_KEEP inherits the current zone which is
+                 * ZONE_NORMAL by default but we might be within ZONE_MOVABLE
+                 * already.
+                 */
+                if (zone_intersects(movable_zone, start_pfn, nr_pages))
+                        zone = movable_zone;
+        } else if (online_type == MMOP_ONLINE_MOVABLE) {
+                zone = &pgdat->node_zones[ZONE_MOVABLE];
         }
 
-        *zone_shift = target - idx;
-        return true;
+        move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+        return zone;
 }
 
 /* Must be protected by mem_hotplug_begin() */
@@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
         int nid;
         int ret;
         struct memory_notify arg;
-        int zone_shift = 0;
 
-        /*
-         * This doesn't need a lock to do pfn_to_page().
-         * The section can't be removed here because of the
-         * memory_block->state_mutex.
-         */
-        zone = page_zone(pfn_to_page(pfn));
-
-        if ((zone_idx(zone) > ZONE_NORMAL ||
-            online_type == MMOP_ONLINE_MOVABLE) &&
-            !can_online_high_movable(pfn_to_nid(pfn)))
+        nid = pfn_to_nid(pfn);
+        if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
                 return -EINVAL;
 
-        if (online_type == MMOP_ONLINE_KERNEL) {
-                if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
-                        return -EINVAL;
-        } else if (online_type == MMOP_ONLINE_MOVABLE) {
-                if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
-                        return -EINVAL;
-        }
-
-        zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
-        if (!zone)
+        if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid))
                 return -EINVAL;
 
+        /* associate pfn range with the zone */
+        zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+
         arg.start_pfn = pfn;
         arg.nr_pages = nr_pages;
         node_states_check_changes_online(nr_pages, zone, &arg);
 
-        nid = zone_to_nid(zone);
-
         ret = memory_notify(MEM_GOING_ONLINE, &arg);
         ret = notifier_to_errno(ret);
         if (ret)
diff --git a/mm/sparse.c b/mm/sparse.c
index 9d7fd666015e..7b4be3fd5cac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap)
  * set. If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
 {
         unsigned long section_nr = pfn_to_section_nr(start_pfn);
-        struct pglist_data *pgdat = zone->zone_pgdat;
         struct mem_section *ms;
         struct page *memmap;
         unsigned long *usemap;
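
For orientation only (not part of the patch): a minimal sketch of how an architecture hook is expected to look against the reworked interface. __add_pages() no longer takes a zone; hot-added pages are associated with a zone only when the range is onlined, via online_pages() -> move_pfn_range() -> move_pfn_range_to_zone() (ZONE_DEVICE users such as devm_memremap_pages() call move_pfn_range_to_zone() themselves). The function below is hypothetical and only illustrates the new calling convention.

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;

        /*
         * Create sections and the memmap only; no zone is chosen here.
         * The zone association happens later, at online time.
         */
        return __add_pages(nid, start_pfn, nr_pages, !for_device);
}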