aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2015-02-10 23:45:10 -0500
committerRusty Russell <rusty@rustcorp.com.au>2015-02-11 01:17:31 -0500
commitc565650b1028bc551e5d16dd0ec8f7078da7cace (patch)
tree019581b2a4821eba84ebdff179034c5ac57ca1a3
parentc9e433e4b852b70ea267388cf9b5d8096b04c44c (diff)
lguest: send trap 13 through to userspace.
We copy 7 bytes at eip for userspace's instruction decode; we have to carefully handle the case where eip is at the end of a page. We can't leave this to userspace since kernel has all the page table decode logic. The decode logic moves to userspace, basically unchanged. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
-rw-r--r--drivers/lguest/x86/core.c133
-rw-r--r--tools/lguest/lguest.c149
2 files changed, 192 insertions, 90 deletions
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index f7a16b4ea456..42e87bf14113 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
314 * usually attached to a PC. 314 * usually attached to a PC.
315 * 315 *
316 * When the Guest uses one of these instructions, we get a trap (General 316 * When the Guest uses one of these instructions, we get a trap (General
317 * Protection Fault) and come here. We see if it's one of those troublesome 317 * Protection Fault) and come here. We queue this to be sent out to the
318 * instructions and skip over it. We return true if we did. 318 * Launcher to handle.
319 */ 319 */
320static int emulate_insn(struct lg_cpu *cpu)
321{
322 u8 insn;
323 unsigned int insnlen = 0, in = 0, small_operand = 0;
324 /*
325 * The eip contains the *virtual* address of the Guest's instruction:
326 * walk the Guest's page tables to find the "physical" address.
327 */
328 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
329
330 /*
331 * This must be the Guest kernel trying to do something, not userspace!
332 * The bottom two bits of the CS segment register are the privilege
333 * level.
334 */
335 if ((cpu->regs->cs & 3) != GUEST_PL)
336 return 0;
337 320
338 /* Decoding x86 instructions is icky. */ 321/*
339 insn = lgread(cpu, physaddr, u8); 322 * The eip contains the *virtual* address of the Guest's instruction:
340 323 * we copy the instruction here so the Launcher doesn't have to walk
341 /* 324 * the page tables to decode it. We handle the case (eg. in a kernel
342 * Around 2.6.33, the kernel started using an emulation for the 325 * module) where the instruction is over two pages, and the pages are
343 * cmpxchg8b instruction in early boot on many configurations. This 326 * virtually but not physically contiguous.
344 * code isn't paravirtualized, and it tries to disable interrupts. 327 *
345 * Ignore it, which will Mostly Work. 328 * The longest possible x86 instruction is 15 bytes, but we don't handle
346 */ 329 * anything that strange.
347 if (insn == 0xfa) { 330 */
348 /* "cli", or Clear Interrupt Enable instruction. Skip it. */ 331static void copy_from_guest(struct lg_cpu *cpu,
349 cpu->regs->eip++; 332 void *dst, unsigned long vaddr, size_t len)
350 return 1; 333{
334 size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
335 unsigned long paddr;
336
337 BUG_ON(len > PAGE_SIZE);
338
339 /* If it goes over a page, copy in two parts. */
340 if (len > to_page_end) {
341 /* But make sure the next page is mapped! */
342 if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
343 copy_from_guest(cpu, dst + to_page_end,
344 vaddr + to_page_end,
345 len - to_page_end);
346 else
347 /* Otherwise fill with zeroes. */
348 memset(dst + to_page_end, 0, len - to_page_end);
349 len = to_page_end;
351 } 350 }
352 351
353 /* 352 /* This will kill the guest if it isn't mapped, but that
354 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 353 * shouldn't happen. */
355 */ 354 __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
356 if (insn == 0x66) { 355}
357 small_operand = 1;
358 /* The instruction is 1 byte so far, read the next byte. */
359 insnlen = 1;
360 insn = lgread(cpu, physaddr + insnlen, u8);
361 }
362 356
363 /*
364 * We can ignore the lower bit for the moment and decode the 4 opcodes
365 * we need to emulate.
366 */
367 switch (insn & 0xFE) {
368 case 0xE4: /* in <next byte>,%al */
369 insnlen += 2;
370 in = 1;
371 break;
372 case 0xEC: /* in (%dx),%al */
373 insnlen += 1;
374 in = 1;
375 break;
376 case 0xE6: /* out %al,<next byte> */
377 insnlen += 2;
378 break;
379 case 0xEE: /* out %al,(%dx) */
380 insnlen += 1;
381 break;
382 default:
383 /* OK, we don't know what this is, can't emulate. */
384 return 0;
385 }
386 357
387 /* 358static void setup_emulate_insn(struct lg_cpu *cpu)
388 * If it was an "IN" instruction, they expect the result to be read 359{
389 * into %eax, so we change %eax. We always return all-ones, which 360 cpu->pending.trap = 13;
390 * traditionally means "there's nothing there". 361 copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
391 */ 362 sizeof(cpu->pending.insn));
392 if (in) {
393 /* Lower bit tells means it's a 32/16 bit access */
394 if (insn & 0x1) {
395 if (small_operand)
396 cpu->regs->eax |= 0xFFFF;
397 else
398 cpu->regs->eax = 0xFFFFFFFF;
399 } else
400 cpu->regs->eax |= 0xFF;
401 }
402 /* Finally, we've "done" the instruction, so move past it. */
403 cpu->regs->eip += insnlen;
404 /* Success! */
405 return 1;
406} 363}
407 364
408/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 365/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
@@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
410{ 367{
411 switch (cpu->regs->trapnum) { 368 switch (cpu->regs->trapnum) {
412 case 13: /* We've intercepted a General Protection Fault. */ 369 case 13: /* We've intercepted a General Protection Fault. */
413 /* 370 /* Hand to Launcher to emulate those pesky IN and OUT insns */
414 * Check if this was one of those annoying IN or OUT
415 * instructions which we need to emulate. If so, we just go
416 * back into the Guest after we've done it.
417 */
418 if (cpu->regs->errcode == 0) { 371 if (cpu->regs->errcode == 0) {
419 if (emulate_insn(cpu)) 372 setup_emulate_insn(cpu);
420 return; 373 return;
421 } 374 }
422 break; 375 break;
423 case 14: /* We've intercepted a Page Fault. */ 376 case 14: /* We've intercepted a Page Fault. */
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 0e754d04876d..b2217657f62c 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,7 @@
41#include <signal.h> 41#include <signal.h>
42#include <pwd.h> 42#include <pwd.h>
43#include <grp.h> 43#include <grp.h>
44#include <sys/user.h>
44 45
45#ifndef VIRTIO_F_ANY_LAYOUT 46#ifndef VIRTIO_F_ANY_LAYOUT
46#define VIRTIO_F_ANY_LAYOUT 27 47#define VIRTIO_F_ANY_LAYOUT 27
@@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr)
1143 strnlen(from_guest_phys(addr), guest_limit - addr)); 1144 strnlen(from_guest_phys(addr), guest_limit - addr));
1144} 1145}
1145 1146
1147/*L:216
1148 * This is where we emulate a handful of Guest instructions. It's ugly
1149 * and we used to do it in the kernel but it grew over time.
1150 */
1151
/*
 * Registers are named to lguest by their byte offset inside
 * struct user_regs_struct (the register layout used by ptrace).
 * These macros turn a field name into that offset and forward to
 * the getreg_off()/setreg_off() helpers.
 */
#define getreg(name) \
	getreg_off(offsetof(struct user_regs_struct, name))
#define setreg(name, val) \
	setreg_off(offsetof(struct user_regs_struct, name), (val))
1159
1160static u32 getreg_off(size_t offset)
1161{
1162 u32 r;
1163 unsigned long args[] = { LHREQ_GETREG, offset };
1164
1165 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1166 err(1, "Getting register %u", offset);
1167 if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
1168 err(1, "Reading register %u", offset);
1169
1170 return r;
1171}
1172
1173static void setreg_off(size_t offset, u32 val)
1174{
1175 unsigned long args[] = { LHREQ_SETREG, offset, val };
1176
1177 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1178 err(1, "Setting register %u", offset);
1179}
1180
1181static void emulate_insn(const u8 insn[])
1182{
1183 unsigned long args[] = { LHREQ_TRAP, 13 };
1184 unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
1185 unsigned int eax, port, mask;
1186 /*
1187 * We always return all-ones on IO port reads, which traditionally
1188 * means "there's nothing there".
1189 */
1190 u32 val = 0xFFFFFFFF;
1191
1192 /*
1193 * This must be the Guest kernel trying to do something, not userspace!
1194 * The bottom two bits of the CS segment register are the privilege
1195 * level.
1196 */
1197 if ((getreg(xcs) & 3) != 0x1)
1198 goto no_emulate;
1199
1200 /* Decoding x86 instructions is icky. */
1201
1202 /*
1203 * Around 2.6.33, the kernel started using an emulation for the
1204 * cmpxchg8b instruction in early boot on many configurations. This
1205 * code isn't paravirtualized, and it tries to disable interrupts.
1206 * Ignore it, which will Mostly Work.
1207 */
1208 if (insn[insnlen] == 0xfa) {
1209 /* "cli", or Clear Interrupt Enable instruction. Skip it. */
1210 insnlen = 1;
1211 goto skip_insn;
1212 }
1213
1214 /*
1215 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
1216 */
1217 if (insn[insnlen] == 0x66) {
1218 small_operand = 1;
1219 /* The instruction is 1 byte so far, read the next byte. */
1220 insnlen = 1;
1221 }
1222
1223 /* If the lower bit isn't set, it's a single byte access */
1224 byte_access = !(insn[insnlen] & 1);
1225
1226 /*
1227 * Now we can ignore the lower bit and decode the 4 opcodes
1228 * we need to emulate.
1229 */
1230 switch (insn[insnlen] & 0xFE) {
1231 case 0xE4: /* in <next byte>,%al */
1232 port = insn[insnlen+1];
1233 insnlen += 2;
1234 in = 1;
1235 break;
1236 case 0xEC: /* in (%dx),%al */
1237 port = getreg(edx) & 0xFFFF;
1238 insnlen += 1;
1239 in = 1;
1240 break;
1241 case 0xE6: /* out %al,<next byte> */
1242 port = insn[insnlen+1];
1243 insnlen += 2;
1244 break;
1245 case 0xEE: /* out %al,(%dx) */
1246 port = getreg(edx) & 0xFFFF;
1247 insnlen += 1;
1248 break;
1249 default:
1250 /* OK, we don't know what this is, can't emulate. */
1251 goto no_emulate;
1252 }
1253
1254 /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
1255 if (byte_access)
1256 mask = 0xFF;
1257 else if (small_operand)
1258 mask = 0xFFFF;
1259 else
1260 mask = 0xFFFFFFFF;
1261
1262 /*
1263 * If it was an "IN" instruction, they expect the result to be read
1264 * into %eax, so we change %eax.
1265 */
1266 eax = getreg(eax);
1267
1268 if (in) {
1269 /* Clear the bits we're about to read */
1270 eax &= ~mask;
1271 /* Copy bits in from val. */
1272 eax |= val & mask;
1273 /* Now update the register. */
1274 setreg(eax, eax);
1275 }
1276
1277 verbose("IO %s of %x to %u: %#08x\n",
1278 in ? "IN" : "OUT", mask, port, eax);
1279skip_insn:
1280 /* Finally, we've "done" the instruction, so move past it. */
1281 setreg(eip, getreg(eip) + insnlen);
1282 return;
1283
1284no_emulate:
1285 /* Inject trap into Guest. */
1286 if (write(lguest_fd, args, sizeof(args)) < 0)
1287 err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
1288}
1289
1290
1146/*L:190 1291/*L:190
1147 * Device Setup 1292 * Device Setup
1148 * 1293 *
@@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void)
1832 verbose("Notify on address %#08x\n", 1977 verbose("Notify on address %#08x\n",
1833 notify.addr); 1978 notify.addr);
1834 handle_output(notify.addr); 1979 handle_output(notify.addr);
1980 } else if (notify.trap == 13) {
1981 verbose("Emulating instruction at %#x\n",
1982 getreg(eip));
1983 emulate_insn(notify.insn);
1835 } else 1984 } else
1836 errx(1, "Unknown trap %i addr %#08x\n", 1985 errx(1, "Unknown trap %i addr %#08x\n",
1837 notify.trap, notify.addr); 1986 notify.trap, notify.addr);