about summary refs log tree commit diff stats
path: root/arch/powerpc
diff options
context:
space:
mode:
author: Paul Mackerras <paulus@samba.org> 2012-11-19 17:57:20 -0500
committer: Alexander Graf <agraf@suse.de> 2012-12-05 19:33:57 -0500
commit: a2932923ccf63c419c77aaa18ac09be98f2c94d8 (patch)
tree: 391d9fc64d93bac26b442d7f2211dc99dbd10e67 /arch/powerpc
parent: 6b445ad4f839b06e68dd8e178e1168482ca20310 (diff)
KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the "bolted" entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. Signed-off-by: Paul Mackerras <paulus@samba.org> [agraf: fix documentation] Signed-off-by: Alexander Graf <agraf@suse.de>
Diffstat (limited to 'arch/powerpc')
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h  | 22
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h         |  2
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h        | 25
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c        | 344
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c               | 12
-rw-r--r--  arch/powerpc/kvm/powerpc.c                 | 17
6 files changed, 410 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index b322e5bd6964..38bec1dc9928 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
246 return !(memslot->base_gfn & mask) && !(memslot->npages & mask); 246 return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
247} 247}
248 248
249/*
250 * This works for 4k, 64k and 16M pages on POWER7,
251 * and 4k and 16M pages on PPC970.
252 */
253static inline unsigned long slb_pgsize_encoding(unsigned long psize)
254{
255 unsigned long senc = 0;
256
257 if (psize > 0x1000) {
258 senc = SLB_VSID_L;
259 if (psize == 0x10000)
260 senc |= SLB_VSID_LP_01;
261 }
262 return senc;
263}
264
265static inline int is_vrma_hpte(unsigned long hpte_v)
266{
267 return (hpte_v & ~0xffffffUL) ==
268 (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
269}
270
249#endif /* __ASM_KVM_BOOK3S_64_H__ */ 271#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 609cca3e9426..1ca31e92ee75 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -164,6 +164,8 @@ extern void kvmppc_bookehv_exit(void);
164 164
165extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); 165extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
166 166
167extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
168
167/* 169/*
168 * Cuts out inst bits with ordering according to spec. 170 * Cuts out inst bits with ordering according to spec.
169 * That means the leftmost bit is zero. All given bits are included. 171 * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index b89ae4db45ce..514883dd311e 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -331,6 +331,31 @@ struct kvm_book3e_206_tlb_params {
331 __u32 reserved[8]; 331 __u32 reserved[8];
332}; 332};
333 333
334/* For KVM_PPC_GET_HTAB_FD */
335struct kvm_get_htab_fd {
336 __u64 flags;
337 __u64 start_index;
338 __u64 reserved[2];
339};
340
341/* Values for kvm_get_htab_fd.flags */
342#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
343#define KVM_GET_HTAB_WRITE ((__u64)0x2)
344
345/*
346 * Data read on the file descriptor is formatted as a series of
347 * records, each consisting of a header followed by a series of
348 * `n_valid' HPTEs (16 bytes each), which are all valid. Following
349 * those valid HPTEs there are `n_invalid' invalid HPTEs, which
350 * are not represented explicitly in the stream. The same format
351 * is used for writing.
352 */
353struct kvm_get_htab_header {
354 __u32 index;
355 __u16 n_valid;
356 __u16 n_invalid;
357};
358
334#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) 359#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
335#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) 360#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
336#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) 361#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 6ee6516a0bee..0aa40734c8f6 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -25,6 +25,8 @@
25#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include <linux/srcu.h> 27#include <linux/srcu.h>
28#include <linux/anon_inodes.h>
29#include <linux/file.h>
28 30
29#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
30#include <asm/kvm_ppc.h> 32#include <asm/kvm_ppc.h>
@@ -1145,6 +1147,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
1145 put_page(page); 1147 put_page(page);
1146} 1148}
1147 1149
1150/*
1151 * Functions for reading and writing the hash table via reads and
1152 * writes on a file descriptor.
1153 *
1154 * Reads return the guest view of the hash table, which has to be
1155 * pieced together from the real hash table and the guest_rpte
1156 * values in the revmap array.
1157 *
1158 * On writes, each HPTE written is considered in turn, and if it
1159 * is valid, it is written to the HPT as if an H_ENTER with the
1160 * exact flag set was done. When the invalid count is non-zero
1161 * in the header written to the stream, the kernel will make
1162 * sure that that many HPTEs are invalid, and invalidate them
1163 * if not.
1164 */
1165
1166struct kvm_htab_ctx {
1167 unsigned long index;
1168 unsigned long flags;
1169 struct kvm *kvm;
1170 int first_pass;
1171};
1172
1173#define HPTE_SIZE (2 * sizeof(unsigned long))
1174
1175static long record_hpte(unsigned long flags, unsigned long *hptp,
1176 unsigned long *hpte, struct revmap_entry *revp,
1177 int want_valid, int first_pass)
1178{
1179 unsigned long v, r;
1180 int ok = 1;
1181 int valid, dirty;
1182
1183 /* Unmodified entries are uninteresting except on the first pass */
1184 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1185 if (!first_pass && !dirty)
1186 return 0;
1187
1188 valid = 0;
1189 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1190 valid = 1;
1191 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1192 !(hptp[0] & HPTE_V_BOLTED))
1193 valid = 0;
1194 }
1195 if (valid != want_valid)
1196 return 0;
1197
1198 v = r = 0;
1199 if (valid || dirty) {
1200 /* lock the HPTE so it's stable and read it */
1201 preempt_disable();
1202 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1203 cpu_relax();
1204 v = hptp[0];
1205 if (v & HPTE_V_ABSENT) {
1206 v &= ~HPTE_V_ABSENT;
1207 v |= HPTE_V_VALID;
1208 }
1209 /* re-evaluate valid and dirty from synchronized HPTE value */
1210 valid = !!(v & HPTE_V_VALID);
1211 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1212 valid = 0;
1213 r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
1214 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1215 /* only clear modified if this is the right sort of entry */
1216 if (valid == want_valid && dirty) {
1217 r &= ~HPTE_GR_MODIFIED;
1218 revp->guest_rpte = r;
1219 }
1220 asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
1221 hptp[0] &= ~HPTE_V_HVLOCK;
1222 preempt_enable();
1223 if (!(valid == want_valid && (first_pass || dirty)))
1224 ok = 0;
1225 }
1226 hpte[0] = v;
1227 hpte[1] = r;
1228 return ok;
1229}
1230
1231static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1232 size_t count, loff_t *ppos)
1233{
1234 struct kvm_htab_ctx *ctx = file->private_data;
1235 struct kvm *kvm = ctx->kvm;
1236 struct kvm_get_htab_header hdr;
1237 unsigned long *hptp;
1238 struct revmap_entry *revp;
1239 unsigned long i, nb, nw;
1240 unsigned long __user *lbuf;
1241 struct kvm_get_htab_header __user *hptr;
1242 unsigned long flags;
1243 int first_pass;
1244 unsigned long hpte[2];
1245
1246 if (!access_ok(VERIFY_WRITE, buf, count))
1247 return -EFAULT;
1248
1249 first_pass = ctx->first_pass;
1250 flags = ctx->flags;
1251
1252 i = ctx->index;
1253 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1254 revp = kvm->arch.revmap + i;
1255 lbuf = (unsigned long __user *)buf;
1256
1257 nb = 0;
1258 while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1259 /* Initialize header */
1260 hptr = (struct kvm_get_htab_header __user *)buf;
1261 hdr.index = i;
1262 hdr.n_valid = 0;
1263 hdr.n_invalid = 0;
1264 nw = nb;
1265 nb += sizeof(hdr);
1266 lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1267
1268 /* Skip uninteresting entries, i.e. clean on not-first pass */
1269 if (!first_pass) {
1270 while (i < kvm->arch.hpt_npte &&
1271 !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
1272 ++i;
1273 hptp += 2;
1274 ++revp;
1275 }
1276 }
1277
1278 /* Grab a series of valid entries */
1279 while (i < kvm->arch.hpt_npte &&
1280 hdr.n_valid < 0xffff &&
1281 nb + HPTE_SIZE < count &&
1282 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1283 /* valid entry, write it out */
1284 ++hdr.n_valid;
1285 if (__put_user(hpte[0], lbuf) ||
1286 __put_user(hpte[1], lbuf + 1))
1287 return -EFAULT;
1288 nb += HPTE_SIZE;
1289 lbuf += 2;
1290 ++i;
1291 hptp += 2;
1292 ++revp;
1293 }
1294 /* Now skip invalid entries while we can */
1295 while (i < kvm->arch.hpt_npte &&
1296 hdr.n_invalid < 0xffff &&
1297 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1298 /* found an invalid entry */
1299 ++hdr.n_invalid;
1300 ++i;
1301 hptp += 2;
1302 ++revp;
1303 }
1304
1305 if (hdr.n_valid || hdr.n_invalid) {
1306 /* write back the header */
1307 if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1308 return -EFAULT;
1309 nw = nb;
1310 buf = (char __user *)lbuf;
1311 } else {
1312 nb = nw;
1313 }
1314
1315 /* Check if we've wrapped around the hash table */
1316 if (i >= kvm->arch.hpt_npte) {
1317 i = 0;
1318 ctx->first_pass = 0;
1319 break;
1320 }
1321 }
1322
1323 ctx->index = i;
1324
1325 return nb;
1326}
1327
1328static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1329 size_t count, loff_t *ppos)
1330{
1331 struct kvm_htab_ctx *ctx = file->private_data;
1332 struct kvm *kvm = ctx->kvm;
1333 struct kvm_get_htab_header hdr;
1334 unsigned long i, j;
1335 unsigned long v, r;
1336 unsigned long __user *lbuf;
1337 unsigned long *hptp;
1338 unsigned long tmp[2];
1339 ssize_t nb;
1340 long int err, ret;
1341 int rma_setup;
1342
1343 if (!access_ok(VERIFY_READ, buf, count))
1344 return -EFAULT;
1345
1346 /* lock out vcpus from running while we're doing this */
1347 mutex_lock(&kvm->lock);
1348 rma_setup = kvm->arch.rma_setup_done;
1349 if (rma_setup) {
1350 kvm->arch.rma_setup_done = 0; /* temporarily */
1351 /* order rma_setup_done vs. vcpus_running */
1352 smp_mb();
1353 if (atomic_read(&kvm->arch.vcpus_running)) {
1354 kvm->arch.rma_setup_done = 1;
1355 mutex_unlock(&kvm->lock);
1356 return -EBUSY;
1357 }
1358 }
1359
1360 err = 0;
1361 for (nb = 0; nb + sizeof(hdr) <= count; ) {
1362 err = -EFAULT;
1363 if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1364 break;
1365
1366 err = 0;
1367 if (nb + hdr.n_valid * HPTE_SIZE > count)
1368 break;
1369
1370 nb += sizeof(hdr);
1371 buf += sizeof(hdr);
1372
1373 err = -EINVAL;
1374 i = hdr.index;
1375 if (i >= kvm->arch.hpt_npte ||
1376 i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
1377 break;
1378
1379 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1380 lbuf = (unsigned long __user *)buf;
1381 for (j = 0; j < hdr.n_valid; ++j) {
1382 err = -EFAULT;
1383 if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
1384 goto out;
1385 err = -EINVAL;
1386 if (!(v & HPTE_V_VALID))
1387 goto out;
1388 lbuf += 2;
1389 nb += HPTE_SIZE;
1390
1391 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1392 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1393 err = -EIO;
1394 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1395 tmp);
1396 if (ret != H_SUCCESS) {
1397 pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
1398 "r=%lx\n", ret, i, v, r);
1399 goto out;
1400 }
1401 if (!rma_setup && is_vrma_hpte(v)) {
1402 unsigned long psize = hpte_page_size(v, r);
1403 unsigned long senc = slb_pgsize_encoding(psize);
1404 unsigned long lpcr;
1405
1406 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1407 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1408 lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
1409 lpcr |= senc << (LPCR_VRMASD_SH - 4);
1410 kvm->arch.lpcr = lpcr;
1411 rma_setup = 1;
1412 }
1413 ++i;
1414 hptp += 2;
1415 }
1416
1417 for (j = 0; j < hdr.n_invalid; ++j) {
1418 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1419 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1420 ++i;
1421 hptp += 2;
1422 }
1423 err = 0;
1424 }
1425
1426 out:
1427 /* Order HPTE updates vs. rma_setup_done */
1428 smp_wmb();
1429 kvm->arch.rma_setup_done = rma_setup;
1430 mutex_unlock(&kvm->lock);
1431
1432 if (err)
1433 return err;
1434 return nb;
1435}
1436
1437static int kvm_htab_release(struct inode *inode, struct file *filp)
1438{
1439 struct kvm_htab_ctx *ctx = filp->private_data;
1440
1441 filp->private_data = NULL;
1442 if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1443 atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1444 kvm_put_kvm(ctx->kvm);
1445 kfree(ctx);
1446 return 0;
1447}
1448
1449static struct file_operations kvm_htab_fops = {
1450 .read = kvm_htab_read,
1451 .write = kvm_htab_write,
1452 .llseek = default_llseek,
1453 .release = kvm_htab_release,
1454};
1455
1456int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1457{
1458 int ret;
1459 struct kvm_htab_ctx *ctx;
1460 int rwflag;
1461
1462 /* reject flags we don't recognize */
1463 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1464 return -EINVAL;
1465 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1466 if (!ctx)
1467 return -ENOMEM;
1468 kvm_get_kvm(kvm);
1469 ctx->kvm = kvm;
1470 ctx->index = ghf->start_index;
1471 ctx->flags = ghf->flags;
1472 ctx->first_pass = 1;
1473
1474 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1475 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
1476 if (ret < 0) {
1477 kvm_put_kvm(kvm);
1478 return ret;
1479 }
1480
1481 if (rwflag == O_RDONLY) {
1482 mutex_lock(&kvm->slots_lock);
1483 atomic_inc(&kvm->arch.hpte_mod_interest);
1484 /* make sure kvmppc_do_h_enter etc. see the increment */
1485 synchronize_srcu_expedited(&kvm->srcu);
1486 mutex_unlock(&kvm->slots_lock);
1487 }
1488
1489 return ret;
1490}
1491
1148void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 1492void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
1149{ 1493{
1150 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 1494 struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 843eb754a1d5..a4f59dbcd800 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1563,18 +1563,6 @@ out:
1563 return r; 1563 return r;
1564} 1564}
1565 1565
1566static unsigned long slb_pgsize_encoding(unsigned long psize)
1567{
1568 unsigned long senc = 0;
1569
1570 if (psize > 0x1000) {
1571 senc = SLB_VSID_L;
1572 if (psize == 0x10000)
1573 senc |= SLB_VSID_LP_01;
1574 }
1575 return senc;
1576}
1577
1578static void unpin_slot(struct kvm_memory_slot *memslot) 1566static void unpin_slot(struct kvm_memory_slot *memslot)
1579{ 1567{
1580 unsigned long *physp; 1568 unsigned long *physp;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index d583ea15e151..70739a089560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -354,6 +354,12 @@ int kvm_dev_ioctl_check_extension(long ext)
354 r = 1; 354 r = 1;
355#else 355#else
356 r = 0; 356 r = 0;
357 break;
358#endif
359#ifdef CONFIG_KVM_BOOK3S_64_HV
360 case KVM_CAP_PPC_HTAB_FD:
361 r = 1;
362 break;
357#endif 363#endif
358 break; 364 break;
359 case KVM_CAP_NR_VCPUS: 365 case KVM_CAP_NR_VCPUS:
@@ -954,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
954 r = 0; 960 r = 0;
955 break; 961 break;
956 } 962 }
963
964 case KVM_PPC_GET_HTAB_FD: {
965 struct kvm *kvm = filp->private_data;
966 struct kvm_get_htab_fd ghf;
967
968 r = -EFAULT;
969 if (copy_from_user(&ghf, argp, sizeof(ghf)))
970 break;
971 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
972 break;
973 }
957#endif /* CONFIG_KVM_BOOK3S_64_HV */ 974#endif /* CONFIG_KVM_BOOK3S_64_HV */
958 975
959#ifdef CONFIG_PPC_BOOK3S_64 976#ifdef CONFIG_PPC_BOOK3S_64