author	K. Y. Srinivasan <kys@microsoft.com>	2013-03-15 15:25:43 -0400
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2013-03-15 15:11:51 -0400
commit	1cac8cd4d146b60a7c70d778b5be928281b3b551 (patch)
tree	1f4d6a20704074ab3c67a0e9435ca9331d8a06b2
parent	0cf40a3e661b09c0dda795a77ccc0402c3153859 (diff)
Drivers: hv: balloon: Implement hot-add functionality
Implement the memory hot-add functionality. With this, Linux guests can fully participate in the Dynamic Memory protocol implemented on Windows hosts.

In this version of the patch, based on Olaf Hering's feedback, I have gotten rid of the module-level dependency on MEMORY_HOTPLUG. Instead, the code within the driver that depends on MEMORY_HOTPLUG has the appropriate compilation switches. This allows the driver to support pure ballooning in cases where the kernel does not support memory hotplug.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--	drivers/hv/hv_balloon.c	408
1 file changed, 387 insertions(+), 21 deletions(-)
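
The conditional-compilation approach described in the commit message can be pictured with a small stand-alone sketch (an editor's illustration, not code from the driver; every name suffixed _demo is invented, and CONFIG_MEMORY_HOTPLUG here just stands in for the kernel config option): the hot-add helpers are only compiled when memory hotplug is available, while the request handler is always built and falls back to reporting zero hot-added pages, so pure ballooning keeps working.

/*
 * Build with:  cc -o demo demo.c                           (hot-add compiled out)
 *              cc -DCONFIG_MEMORY_HOTPLUG -o demo demo.c   (hot-add compiled in)
 */
#include <stdio.h>

#ifdef CONFIG_MEMORY_HOTPLUG
/* Only built when the "kernel" supports memory hotplug. */
static unsigned long process_hot_add_demo(unsigned long pfn_cnt)
{
	return pfn_cnt;		/* pretend every requested page was added */
}
#endif

static void hot_add_req_demo(unsigned long pfn_cnt)
{
	unsigned long page_count = 0;

#ifdef CONFIG_MEMORY_HOTPLUG
	page_count = process_hot_add_demo(pfn_cnt);
#endif
	if (page_count == 0)
		printf("Memory hot add failed\n");	/* balloon-only fallback */
	else
		printf("hot-added %lu pages\n", page_count);
}

int main(void)
{
	hot_add_req_demo(512);
	return 0;
}
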
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 4743db9e5f34..2cf7d4e964bd 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -412,6 +412,27 @@ struct dm_info_msg {
  * End protocol definitions.
  */
 
+/*
+ * State to manage hot adding memory into the guest.
+ * The range start_pfn : end_pfn specifies the range
+ * that the host has asked us to hot add. The range
+ * start_pfn : ha_end_pfn specifies the range that we have
+ * currently hot added. We hot add in multiples of 128M
+ * chunks; it is possible that we may not be able to bring
+ * online all the pages in the region. The range
+ * covered_start_pfn : covered_end_pfn defines the pages that can
+ * be brought online.
+ */
+
+struct hv_hotadd_state {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long covered_start_pfn;
+	unsigned long covered_end_pfn;
+	unsigned long ha_end_pfn;
+	unsigned long end_pfn;
+};
+
 struct balloon_state {
 	__u32 num_pages;
 	struct work_struct wrk;
@@ -419,16 +440,17 @@ struct balloon_state {
 
 struct hot_add_wrk {
 	union dm_mem_page_range ha_page_range;
+	union dm_mem_page_range ha_region_range;
 	struct work_struct wrk;
 };
 
-static bool hot_add;
+static bool hot_add = true;
 static bool do_hot_add;
 /*
  * Delay reporting memory pressure by
  * the specified number of seconds.
  */
-static uint pressure_report_delay = 30;
+static uint pressure_report_delay = 45;
 
 module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
@@ -456,6 +478,7 @@ enum hv_dm_state {
 static __u8 recv_buffer[PAGE_SIZE];
 static __u8 *send_buffer;
 #define PAGES_IN_2M 512
+#define HA_CHUNK (32 * 1024)
 
 struct hv_dynmem_device {
 	struct hv_device *dev;
@@ -479,6 +502,17 @@ struct hv_dynmem_device {
 	struct hot_add_wrk ha_wrk;
 
 	/*
+	 * This state tracks if the host has specified a hot-add
+	 * region.
+	 */
+	bool host_specified_ha_region;
+
+	/*
+	 * State to synchronize hot-add.
+	 */
+	struct completion ol_waitevent;
+	bool ha_waiting;
+	/*
 	 * This thread handles hot-add
 	 * requests from the host as well as notifying
 	 * the host with regards to memory pressure in
@@ -487,6 +521,11 @@ struct hv_dynmem_device {
 	struct task_struct *thread;
 
 	/*
+	 * A list of hot-add regions.
+	 */
+	struct list_head ha_region_list;
+
+	/*
 	 * We start with the highest version we can support
 	 * and downgrade based on the host; we save here the
 	 * next version to try.
@@ -496,35 +535,329 @@ struct hv_dynmem_device {
 
 static struct hv_dynmem_device dm_device;
 
-static void hot_add_req(struct work_struct *dummy)
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
 {
+	int i;
 
-	struct dm_hot_add_response resp;
+	for (i = 0; i < size; i++) {
+		struct page *pg;
+		pg = pfn_to_page(start_pfn + i);
+		__online_page_set_limits(pg);
+		__online_page_increment_counters(pg);
+		__online_page_free(pg);
+	}
+}
+
+static void hv_mem_hot_add(unsigned long start, unsigned long size,
+				unsigned long pfn_count,
+				struct hv_hotadd_state *has)
+{
+	int ret = 0;
+	int i, nid, t;
+	unsigned long start_pfn;
+	unsigned long processed_pfn;
+	unsigned long total_pfn = pfn_count;
+
+	for (i = 0; i < (size/HA_CHUNK); i++) {
+		start_pfn = start + (i * HA_CHUNK);
+		has->ha_end_pfn += HA_CHUNK;
+
+		if (total_pfn > HA_CHUNK) {
+			processed_pfn = HA_CHUNK;
+			total_pfn -= HA_CHUNK;
+		} else {
+			processed_pfn = total_pfn;
+			total_pfn = 0;
+		}
+
+		has->covered_end_pfn += processed_pfn;
 
-	if (do_hot_add) {
+		init_completion(&dm_device.ol_waitevent);
+		dm_device.ha_waiting = true;
 
-		pr_info("Memory hot add not supported\n");
+		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
+		ret = add_memory(nid, PFN_PHYS((start_pfn)),
+				(HA_CHUNK << PAGE_SHIFT));
+
+		if (ret) {
+			pr_info("hot_add memory failed error is %d\n", ret);
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -= processed_pfn;
+			break;
+		}
 
 		/*
-		 * Currently we do not support hot add.
-		 * Just fail the request.
+		 * Wait for the memory block to be onlined.
 		 */
+		t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
+		if (t == 0) {
+			pr_info("hot_add memory timedout\n");
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -= processed_pfn;
+			break;
+		}
+
 	}
 
+	return;
+}
+
+static void hv_online_page(struct page *pg)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long cur_start_pgp;
+	unsigned long cur_end_pgp;
+
+	if (dm_device.ha_waiting) {
+		dm_device.ha_waiting = false;
+		complete(&dm_device.ol_waitevent);
+	}
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		cur_start_pgp = (unsigned long)
+			pfn_to_page(has->covered_start_pfn);
+		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
+
+		if (((unsigned long)pg >= cur_start_pgp) &&
+			((unsigned long)pg < cur_end_pgp)) {
+			/*
+			 * This frame is currently backed; online the
+			 * page.
+			 */
+			__online_page_set_limits(pg);
+			__online_page_increment_counters(pg);
+			__online_page_free(pg);
+			has->covered_start_pfn++;
+		}
+	}
+}
+
+static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long residual, new_inc;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return false;
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+
+		/*
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
+		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+		/*
+		 * If the current hot add-request extends beyond
+		 * our current limit; extend it.
+		 */
+		if ((start_pfn + pfn_cnt) > has->end_pfn) {
+			residual = (start_pfn + pfn_cnt - has->end_pfn);
+			/*
+			 * Extend the region by multiples of HA_CHUNK.
+			 */
+			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
+			if (residual % HA_CHUNK)
+				new_inc += HA_CHUNK;
+
+			has->end_pfn += new_inc;
+		}
+
+		/*
+		 * If the current start pfn is not where the covered_end
+		 * is, update it.
+		 */
+
+		if (has->covered_end_pfn != start_pfn) {
+			has->covered_end_pfn = start_pfn;
+			has->covered_start_pfn = start_pfn;
+		}
+		return true;
+
+	}
+
+	return false;
+}
+
+static unsigned long handle_pg_range(unsigned long pg_start,
+					unsigned long pg_count)
+{
+	unsigned long start_pfn = pg_start;
+	unsigned long pfn_cnt = pg_count;
+	unsigned long size;
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long pgs_ol = 0;
+	unsigned long old_covered_state;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return 0;
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+
+		/*
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
+		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+
+		old_covered_state = has->covered_end_pfn;
+
+		if (start_pfn < has->ha_end_pfn) {
+			/*
+			 * This is the case where we are backing pages
+			 * in an already hot added region. Bring
+			 * these pages online first.
+			 */
+			pgs_ol = has->ha_end_pfn - start_pfn;
+			if (pgs_ol > pfn_cnt)
+				pgs_ol = pfn_cnt;
+			hv_bring_pgs_online(start_pfn, pgs_ol);
+			has->covered_end_pfn += pgs_ol;
+			has->covered_start_pfn += pgs_ol;
+			pfn_cnt -= pgs_ol;
+		}
+
+		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
+			/*
+			 * We have some residual hot add range
+			 * that needs to be hot added; hot add
+			 * it now. Hot add a multiple of
+			 * HA_CHUNK that fully covers the pages
+			 * we have.
+			 */
+			size = (has->end_pfn - has->ha_end_pfn);
+			if (pfn_cnt <= size) {
+				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
+				if (pfn_cnt % HA_CHUNK)
+					size += HA_CHUNK;
+			} else {
+				pfn_cnt = size;
+			}
+			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
+		}
+		/*
+		 * If we managed to online any pages that were given to us,
+		 * we declare success.
+		 */
+		return has->covered_end_pfn - old_covered_state;
+
+	}
+
+	return 0;
+}
+
+static unsigned long process_hot_add(unsigned long pg_start,
+					unsigned long pfn_cnt,
+					unsigned long rg_start,
+					unsigned long rg_size)
+{
+	struct hv_hotadd_state *ha_region = NULL;
+
+	if (pfn_cnt == 0)
+		return 0;
+
+	if (!dm_device.host_specified_ha_region)
+		if (pfn_covered(pg_start, pfn_cnt))
+			goto do_pg_range;
+
+	/*
+	 * If the host has specified a hot-add range; deal with it first.
+	 */
+
+	if ((rg_size != 0) && (!dm_device.host_specified_ha_region)) {
+		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
+		if (!ha_region)
+			return 0;
+
+		INIT_LIST_HEAD(&ha_region->list);
+
+		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+		ha_region->start_pfn = rg_start;
+		ha_region->ha_end_pfn = rg_start;
+		ha_region->covered_start_pfn = pg_start;
+		ha_region->covered_end_pfn = pg_start;
+		ha_region->end_pfn = rg_start + rg_size;
+	}
+
+do_pg_range:
+	/*
+	 * Process the page range specified; bringing them
+	 * online if possible.
+	 */
+	return handle_pg_range(pg_start, pfn_cnt);
+}
+
+#endif
+
+static void hot_add_req(struct work_struct *dummy)
+{
+	struct dm_hot_add_response resp;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	unsigned long pg_start, pfn_cnt;
+	unsigned long rg_start, rg_sz;
+#endif
+	struct hv_dynmem_device *dm = &dm_device;
+
 	memset(&resp, 0, sizeof(struct dm_hot_add_response));
 	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
 	resp.hdr.size = sizeof(struct dm_hot_add_response);
 	resp.hdr.trans_id = atomic_inc_return(&trans_id);
 
-	resp.page_count = 0;
-	resp.result = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
+	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
 
-	dm_device.state = DM_INITIALIZED;
-	vmbus_sendpacket(dm_device.dev->channel, &resp,
+	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
+	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
+
+	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
+		unsigned long region_size;
+		unsigned long region_start;
+
+		/*
+		 * The host has not specified the hot-add region.
+		 * Based on the hot-add page range being specified,
+		 * compute a hot-add region that can cover the pages
+		 * that need to be hot-added while ensuring the alignment
+		 * and size requirements of Linux as it relates to hot-add.
+		 */
+		region_start = pg_start;
+		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
+		if (pfn_cnt % HA_CHUNK)
+			region_size += HA_CHUNK;
+
+		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
+
+		rg_start = region_start;
+		rg_sz = region_size;
+	}
+
+	resp.page_count = process_hot_add(pg_start, pfn_cnt,
+					rg_start, rg_sz);
+#endif
+	if (resp.page_count > 0)
+		resp.result = 1;
+	else
+		resp.result = 0;
+
+	if (!do_hot_add || (resp.page_count == 0))
+		pr_info("Memory hot add failed\n");
+
+	dm->state = DM_INITIALIZED;
+	vmbus_sendpacket(dm->dev->channel, &resp,
 			sizeof(struct dm_hot_add_response),
 			(unsigned long)NULL,
 			VM_PKT_DATA_INBAND, 0);
-
 }
 
 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
@@ -867,6 +1200,7 @@ static void balloon_onchannelcallback(void *context)
 	struct dm_balloon *bal_msg;
 	struct dm_hot_add *ha_msg;
 	union dm_mem_page_range *ha_pg_range;
+	union dm_mem_page_range *ha_region;
 
 	memset(recv_buffer, 0, sizeof(recv_buffer));
 	vmbus_recvpacket(dev->channel, recv_buffer,
@@ -907,8 +1241,26 @@ static void balloon_onchannelcallback(void *context)
 			pr_warn("Currently hot-adding\n");
 		dm->state = DM_HOT_ADD;
 		ha_msg = (struct dm_hot_add *)recv_buffer;
-		ha_pg_range = &ha_msg->range;
-		dm_device.ha_wrk.ha_page_range = *ha_pg_range;
+		if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
+			/*
+			 * This is a normal hot-add request specifying
+			 * hot-add memory.
+			 */
+			ha_pg_range = &ha_msg->range;
+			dm->ha_wrk.ha_page_range = *ha_pg_range;
+			dm->ha_wrk.ha_region_range.page_range = 0;
+		} else {
+			/*
+			 * Host is specifying that we first hot-add
+			 * a region and then partially populate this
+			 * region.
+			 */
+			dm->host_specified_ha_region = true;
+			ha_pg_range = &ha_msg->range;
+			ha_region = &ha_pg_range[1];
+			dm->ha_wrk.ha_page_range = *ha_pg_range;
+			dm->ha_wrk.ha_region_range = *ha_region;
+		}
 		schedule_work(&dm_device.ha_wrk.wrk);
 		break;
 
@@ -952,8 +1304,10 @@ static int balloon_probe(struct hv_device *dev,
 	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
 	init_completion(&dm_device.host_event);
 	init_completion(&dm_device.config_event);
+	INIT_LIST_HEAD(&dm_device.ha_region_list);
 	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
 	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
+	dm_device.host_specified_ha_region = false;
 
 	dm_device.thread =
 		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
@@ -962,6 +1316,10 @@ static int balloon_probe(struct hv_device *dev,
 		goto probe_error1;
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	set_online_page_callback(&hv_online_page);
+#endif
+
 	hv_set_drvdata(dev, &dm_device);
 	/*
 	 * Initiate the hand shake with the host and negotiate
@@ -1006,12 +1364,6 @@ static int balloon_probe(struct hv_device *dev,
 	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
 
 	cap_msg.caps.cap_bits.balloon = 1;
-	/*
-	 * While we currently don't support hot-add,
-	 * we still advertise this capability since the
-	 * host requires that guests partcipating in the
-	 * dynamic memory protocol support hot add.
-	 */
 	cap_msg.caps.cap_bits.hot_add = 1;
 
 	/*
@@ -1049,6 +1401,9 @@ static int balloon_probe(struct hv_device *dev,
 	return 0;
 
 probe_error2:
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
 	kthread_stop(dm_device.thread);
 
 probe_error1:
@@ -1061,15 +1416,26 @@ probe_error0:
 static int balloon_remove(struct hv_device *dev)
 {
 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+	struct list_head *cur, *tmp;
+	struct hv_hotadd_state *has;
 
 	if (dm->num_pages_ballooned != 0)
 		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
 
 	cancel_work_sync(&dm->balloon_wrk.wrk);
 	cancel_work_sync(&dm->ha_wrk.wrk);
+
 	vmbus_close(dev->channel);
 	kthread_stop(dm->thread);
 	kfree(send_buffer);
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
+	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		list_del(&has->list);
+		kfree(has);
+	}
 
 	return 0;
 }
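
For reference, the HA_CHUNK arithmetic used in hot_add_req() above rounds every request up to whole 128 MB chunks: HA_CHUNK is 32 * 1024 pfns, i.e. 32768 pages of 4 KiB. A minimal stand-alone sketch of the same rounding, as an editor's illustration only (the 40000-pfn request is an invented example, not taken from the patch):

#include <stdio.h>

#define HA_CHUNK (32 * 1024)	/* pfns per chunk: 32768 * 4 KiB = 128 MB */

int main(void)
{
	unsigned long pfn_cnt = 40000;		/* hypothetical hot-add request */
	unsigned long region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;

	if (pfn_cnt % HA_CHUNK)
		region_size += HA_CHUNK;	/* round up to a full chunk */

	/* 40000 pfns -> 65536 pfns, i.e. two 128 MB chunks */
	printf("request %lu pfns -> region %lu pfns\n", pfn_cnt, region_size);
	return 0;
}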