path: root/arch/powerpc/kvm/book3s_hv.c
author		Paul Mackerras <paulus@samba.org>	2011-06-28 20:25:44 -0400
committer	Avi Kivity <avi@redhat.com>	2011-07-12 06:16:57 -0400
commit		aa04b4cc5be64b4fb9ef4e0fdf2418e2f4737fb2 (patch)
tree		97a3ff14e43424e28a27e0f3be088649818c1b76 /arch/powerpc/kvm/book3s_hv.c
parent		371fefd6f2dc46668e00871930dde613b88d4bde (diff)
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to run on older POWER processors, including PPC970, which don't support the Virtual Real Mode Area (VRMA) facility, but only the Real Mode Offset (RMO) facility. These processors require a physically contiguous, aligned area of memory for each guest. When the guest does an access in real mode (MMU off), the address is compared against a limit value, and if it is lower, the address is ORed with an offset value (from the Real Mode Offset Register (RMOR)) and the result becomes the real address for the access. The size of the RMA has to be one of a set of supported values, which usually includes 64MB, 128MB, 256MB and some larger powers of 2.

Since we are unlikely to be able to allocate 64MB or more of physically contiguous memory after the kernel has been running for a while, we allocate a pool of RMAs at boot time using the bootmem allocator. The size and number of the RMAs can be set using the kvm_rma_size=xx and kvm_rma_count=xx kernel command line options.

KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability of the pool of preallocated RMAs. The capability value is 1 if the processor can use an RMA but doesn't require one (because it supports the VRMA facility), or 2 if the processor requires an RMA for each guest.

This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the pool and returns a file descriptor which can be used to map the RMA. It also returns the size of the RMA in the argument structure.

Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION ioctl calls from userspace. To cope with this, we now preallocate the kvm->arch.ram_pginfo array when the VM is created, with a size sufficient for up to 64GB of guest memory. Subsequently we will get rid of this array and use memory associated with each memslot instead.

This moves most of the code that translates the user addresses into host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level to kvmppc_core_prepare_memory_region. Also, instead of having to look up the VMA for each page in order to check the page size, we now check that the pages we get are compound pages of 16MB. However, if we are adding memory that is mapped to an RMA, we don't bother with calling get_user_pages_fast and instead just offset from the base pfn for the RMA.

Typically the RMA gets added after vcpus are created, which makes it inconvenient to have the LPCR (logical partition control register) value in the vcpu->arch struct, since the LPCR controls whether the processor uses RMA or VRMA for the guest. This moves the LPCR value into the kvm->arch struct and arranges for the MER (mediated external request) bit, which is the only bit that varies between vcpus, to be set in assembly code when going into the guest if there is a pending external interrupt request.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
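[Editor's note] For readers new to the RMO facility, a minimal illustrative model (not kernel code) of the real-mode access check the message describes: compare the effective address against the RMLS-derived limit, then OR in the RMOR value. The names rmls_limit and rmor are placeholders for the values the hardware derives from the RMLS field of the LPCR and from the RMOR register.

#include <stdbool.h>

/* Model of a real-mode (MMU off) guest access on an RMO-only
 * processor such as PPC970.  Placeholder names, not kernel APIs. */
static bool real_mode_access(unsigned long ea, unsigned long rmls_limit,
			     unsigned long rmor, unsigned long *real_addr)
{
	if (ea >= rmls_limit)
		return false;		/* would fault on real hardware */
	*real_addr = ea | rmor;		/* offset into the guest's RMA */
	return true;
}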
Diffstat (limited to 'arch/powerpc/kvm/book3s_hv.c')
-rw-r--r--	arch/powerpc/kvm/book3s_hv.c	259
1 file changed, 250 insertions(+), 9 deletions(-)
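[Editor's note] Before the patch itself, a sketch of how userspace would drive the new interface: probe KVM_CAP_PPC_RMA, take an RMA from the boot-time pool with KVM_ALLOCATE_RMA, mmap the returned file descriptor, and register the mapping as the memslot at guest physical address 0. Error handling is elided, and the sketch assumes a <linux/kvm.h> that exports KVM_CAP_PPC_RMA, KVM_ALLOCATE_RMA and struct kvm_allocate_rma as this series defines them.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);

	/* 1 = RMA usable but optional (VRMA supported), 2 = RMA required */
	int cap = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);
	printf("KVM_CAP_PPC_RMA = %d\n", cap);

	/* Allocate an RMA from the pool; the ioctl returns an fd that
	 * maps the RMA and fills in its size. */
	struct kvm_allocate_rma rma;
	int rma_fd = ioctl(vm, KVM_ALLOCATE_RMA, &rma);
	void *rma_va = mmap(NULL, rma.rma_size, PROT_READ | PROT_WRITE,
			    MAP_SHARED, rma_fd, 0);

	/* Register the RMA as guest memory starting at GPA 0, which is
	 * what kvmppc_core_prepare_memory_region() below looks for. */
	struct kvm_userspace_memory_region mr;
	memset(&mr, 0, sizeof(mr));
	mr.slot = 0;
	mr.guest_phys_addr = 0;
	mr.memory_size = rma.rma_size;
	mr.userspace_addr = (unsigned long)rma_va;
	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &mr);
	return 0;
}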
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36b6d98f1197..04da135cae61 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -27,6 +27,8 @@
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -40,11 +42,22 @@
 #include <asm/lppaca.h>
 #include <asm/processor.h>
 #include <asm/cputhreads.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER		36
+
+#define LARGE_PAGE_ORDER	24	/* 16MB pages */
+
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
@@ -129,7 +142,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 		pr_err("  ESID = %.16llx VSID = %.16llx\n",
 		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
 	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
-	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
 	       vcpu->arch.last_inst);
 }
 
@@ -441,7 +454,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err = -EINVAL;
 	int core;
 	struct kvmppc_vcore *vcore;
-	unsigned long lpcr;
 
 	core = id / threads_per_core;
 	if (core >= KVM_MAX_VCORES)
@@ -464,10 +476,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
-	vcpu->arch.lpcr = lpcr;
-
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	/*
@@ -910,24 +918,216 @@ fail:
 	return ret;
 }
 
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= ri->npages)
+		return VM_FAULT_SIGBUS;
+
+	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+	.fault = kvm_rma_fault,
+};
+
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &kvm_rma_vm_ops;
+	return 0;
+}
+
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_rma_info *ri = filp->private_data;
+
+	kvm_release_rma(ri);
+	return 0;
+}
+
+static struct file_operations kvm_rma_fops = {
+	.mmap		= kvm_rma_mmap,
+	.release	= kvm_rma_release,
+};
+
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+	struct kvmppc_rma_info *ri;
+	long fd;
+
+	ri = kvm_alloc_rma();
+	if (!ri)
+		return -ENOMEM;
+
+	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+	if (fd < 0)
+		kvm_release_rma(ri);
+
+	ret->rma_size = ri->npages << PAGE_SHIFT;
+	return fd;
+}
+
+static struct page *hva_to_page(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page[0];
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
-		return kvmppc_prepare_vrma(kvm, mem);
+	unsigned long psize, porder;
+	unsigned long i, npages, totalpages;
+	unsigned long pg_ix;
+	struct kvmppc_pginfo *pginfo;
+	unsigned long hva;
+	struct kvmppc_rma_info *ri = NULL;
+	struct page *page;
+
+	/* For now, only allow 16MB pages */
+	porder = LARGE_PAGE_ORDER;
+	psize = 1ul << porder;
+	if ((mem->memory_size & (psize - 1)) ||
+	    (mem->guest_phys_addr & (psize - 1))) {
+		pr_err("bad memory_size=%llx @ %llx\n",
+		       mem->memory_size, mem->guest_phys_addr);
+		return -EINVAL;
+	}
+
+	npages = mem->memory_size >> porder;
+	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+	/* More memory than we have space to track? */
+	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+		return -EINVAL;
+
+	/* Do we already have an RMA registered? */
+	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+		return -EINVAL;
+
+	if (totalpages > kvm->arch.ram_npages)
+		kvm->arch.ram_npages = totalpages;
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		struct vm_area_struct *vma;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, mem->userspace_addr);
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (ri) {
+		unsigned long rma_size;
+		unsigned long lpcr;
+		long rmls;
+
+		rma_size = ri->npages << PAGE_SHIFT;
+		if (rma_size > mem->memory_size)
+			rma_size = mem->memory_size;
+		rmls = lpcr_rmls(rma_size);
+		if (rmls < 0) {
+			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+			return -EINVAL;
+		}
+		atomic_inc(&ri->use_count);
+		kvm->arch.rma = ri;
+		kvm->arch.n_rma_pages = rma_size >> porder;
+		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
+		lpcr |= rmls << LPCR_RMLS_SH;
+		kvm->arch.lpcr = lpcr;
+		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+	}
+
+	pg_ix = mem->guest_phys_addr >> porder;
+	pginfo = kvm->arch.ram_pginfo + pg_ix;
+	for (i = 0; i < npages; ++i, ++pg_ix) {
+		if (ri && pg_ix < kvm->arch.n_rma_pages) {
+			pginfo[i].pfn = ri->base_pfn +
+				(pg_ix << (porder - PAGE_SHIFT));
+			continue;
+		}
+		hva = mem->userspace_addr + (i << porder);
+		page = hva_to_page(hva);
+		if (!page) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		/* Check it's a 16MB page */
+		if (!PageHead(page) ||
+		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+			pr_err("page at %lx isn't 16MB (o=%d)\n",
+			       hva, compound_order(page));
+			goto err;
+		}
+		pginfo[i].pfn = page_to_pfn(page);
+	}
+
 	return 0;
+
+ err:
+	return -EINVAL;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 			struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+	    !kvm->arch.rma)
 		kvmppc_map_vrma(kvm, mem);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
+	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+	long err = -ENOMEM;
+	unsigned long lpcr;
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
@@ -935,11 +1135,52 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return r;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+				       GFP_KERNEL);
+	if (!kvm->arch.ram_pginfo) {
+		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		goto out_free;
+	}
+
+	kvm->arch.ram_npages = 0;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+	kvm->arch.rma = NULL;
+	kvm->arch.n_rma_pages = 0;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VRMA_L;
+	kvm->arch.lpcr = lpcr;
+
+
 	return 0;
+
+ out_free:
+	kvmppc_free_hpt(kvm);
+	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
+	struct kvmppc_pginfo *pginfo;
+	unsigned long i;
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+			if (pginfo[i].pfn)
+				put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+	if (kvm->arch.rma) {
+		kvm_release_rma(kvm->arch.rma);
+		kvm->arch.rma = NULL;
+	}
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
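[Editor's note] A closing check of the sizing comment introduced above: with order-24 (16MB) pages and an order-36 (64GB) ceiling, ram_pginfo has 1 << (36 - 24) = 4096 entries, which matches the comment's 64kB figure if each struct kvmppc_pginfo is 16 bytes. That struct is defined outside the file shown in this diff, so the entry size here is an assumption. A standalone sketch of the arithmetic:

#include <stdio.h>

#define MAX_MEM_ORDER		36	/* 64GB of guest memory */
#define LARGE_PAGE_ORDER	24	/* 16MB pages */

int main(void)
{
	unsigned long entries = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
	unsigned long bytes = entries * 16;	/* assumed sizeof(struct kvmppc_pginfo) */

	/* Prints: 4096 entries -> 64 kB */
	printf("%lu entries -> %lu kB\n", entries, bytes >> 10);
	return 0;
}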