diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2015-02-10 23:45:10 -0500 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2015-02-11 01:17:31 -0500 |
commit | c565650b1028bc551e5d16dd0ec8f7078da7cace (patch) | |
tree | 019581b2a4821eba84ebdff179034c5ac57ca1a3 | |
parent | c9e433e4b852b70ea267388cf9b5d8096b04c44c (diff) |
lguest: send trap 13 through to userspace.
We copy 7 bytes at eip for userspace's instruction decode; we have to
carefully handle the case where eip is at the end of a page. We can't
leave this to userspace since the kernel has all the page table decode
logic.
The decode logic moves to userspace, basically unchanged.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
-rw-r--r-- | drivers/lguest/x86/core.c | 133 | ||||
-rw-r--r-- | tools/lguest/lguest.c | 149 |
2 files changed, 192 insertions, 90 deletions
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index f7a16b4ea456..42e87bf14113 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
314 | * usually attached to a PC. | 314 | * usually attached to a PC. |
315 | * | 315 | * |
316 | * When the Guest uses one of these instructions, we get a trap (General | 316 | * When the Guest uses one of these instructions, we get a trap (General |
317 | * Protection Fault) and come here. We see if it's one of those troublesome | 317 | * Protection Fault) and come here. We queue this to be sent out to the |
318 | * instructions and skip over it. We return true if we did. | 318 | * Launcher to handle. |
319 | */ | 319 | */ |
320 | static int emulate_insn(struct lg_cpu *cpu) | ||
321 | { | ||
322 | u8 insn; | ||
323 | unsigned int insnlen = 0, in = 0, small_operand = 0; | ||
324 | /* | ||
325 | * The eip contains the *virtual* address of the Guest's instruction: | ||
326 | * walk the Guest's page tables to find the "physical" address. | ||
327 | */ | ||
328 | unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); | ||
329 | |||
330 | /* | ||
331 | * This must be the Guest kernel trying to do something, not userspace! | ||
332 | * The bottom two bits of the CS segment register are the privilege | ||
333 | * level. | ||
334 | */ | ||
335 | if ((cpu->regs->cs & 3) != GUEST_PL) | ||
336 | return 0; | ||
337 | 320 | ||
338 | /* Decoding x86 instructions is icky. */ | 321 | /* |
339 | insn = lgread(cpu, physaddr, u8); | 322 | * The eip contains the *virtual* address of the Guest's instruction: |
340 | 323 | * we copy the instruction here so the Launcher doesn't have to walk | |
341 | /* | 324 | * the page tables to decode it. We handle the case (eg. in a kernel |
342 | * Around 2.6.33, the kernel started using an emulation for the | 325 | * module) where the instruction is over two pages, and the pages are |
343 | * cmpxchg8b instruction in early boot on many configurations. This | 326 | * virtually but not physically contiguous. |
344 | * code isn't paravirtualized, and it tries to disable interrupts. | 327 | * |
345 | * Ignore it, which will Mostly Work. | 328 | * The longest possible x86 instruction is 15 bytes, but we don't handle |
346 | */ | 329 | * anything that strange. |
347 | if (insn == 0xfa) { | 330 | */ |
348 | /* "cli", or Clear Interrupt Enable instruction. Skip it. */ | 331 | static void copy_from_guest(struct lg_cpu *cpu, |
349 | cpu->regs->eip++; | 332 | void *dst, unsigned long vaddr, size_t len) |
350 | return 1; | 333 | { |
334 | size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); | ||
335 | unsigned long paddr; | ||
336 | |||
337 | BUG_ON(len > PAGE_SIZE); | ||
338 | |||
339 | /* If it goes over a page, copy in two parts. */ | ||
340 | if (len > to_page_end) { | ||
341 | /* But make sure the next page is mapped! */ | ||
342 | if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) | ||
343 | copy_from_guest(cpu, dst + to_page_end, | ||
344 | vaddr + to_page_end, | ||
345 | len - to_page_end); | ||
346 | else | ||
347 | /* Otherwise fill with zeroes. */ | ||
348 | memset(dst + to_page_end, 0, len - to_page_end); | ||
349 | len = to_page_end; | ||
351 | } | 350 | } |
352 | 351 | ||
353 | /* | 352 | /* This will kill the guest if it isn't mapped, but that |
354 | * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. | 353 | * shouldn't happen. */ |
355 | */ | 354 | __lgread(cpu, dst, guest_pa(cpu, vaddr), len); |
356 | if (insn == 0x66) { | 355 | } |
357 | small_operand = 1; | ||
358 | /* The instruction is 1 byte so far, read the next byte. */ | ||
359 | insnlen = 1; | ||
360 | insn = lgread(cpu, physaddr + insnlen, u8); | ||
361 | } | ||
362 | 356 | ||
363 | /* | ||
364 | * We can ignore the lower bit for the moment and decode the 4 opcodes | ||
365 | * we need to emulate. | ||
366 | */ | ||
367 | switch (insn & 0xFE) { | ||
368 | case 0xE4: /* in <next byte>,%al */ | ||
369 | insnlen += 2; | ||
370 | in = 1; | ||
371 | break; | ||
372 | case 0xEC: /* in (%dx),%al */ | ||
373 | insnlen += 1; | ||
374 | in = 1; | ||
375 | break; | ||
376 | case 0xE6: /* out %al,<next byte> */ | ||
377 | insnlen += 2; | ||
378 | break; | ||
379 | case 0xEE: /* out %al,(%dx) */ | ||
380 | insnlen += 1; | ||
381 | break; | ||
382 | default: | ||
383 | /* OK, we don't know what this is, can't emulate. */ | ||
384 | return 0; | ||
385 | } | ||
386 | 357 | ||
387 | /* | 358 | static void setup_emulate_insn(struct lg_cpu *cpu) |
388 | * If it was an "IN" instruction, they expect the result to be read | 359 | { |
389 | * into %eax, so we change %eax. We always return all-ones, which | 360 | cpu->pending.trap = 13; |
390 | * traditionally means "there's nothing there". | 361 | copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, |
391 | */ | 362 | sizeof(cpu->pending.insn)); |
392 | if (in) { | ||
393 | /* Lower bit tells means it's a 32/16 bit access */ | ||
394 | if (insn & 0x1) { | ||
395 | if (small_operand) | ||
396 | cpu->regs->eax |= 0xFFFF; | ||
397 | else | ||
398 | cpu->regs->eax = 0xFFFFFFFF; | ||
399 | } else | ||
400 | cpu->regs->eax |= 0xFF; | ||
401 | } | ||
402 | /* Finally, we've "done" the instruction, so move past it. */ | ||
403 | cpu->regs->eip += insnlen; | ||
404 | /* Success! */ | ||
405 | return 1; | ||
406 | } | 363 | } |
407 | 364 | ||
408 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ | 365 | /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ |
@@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
410 | { | 367 | { |
411 | switch (cpu->regs->trapnum) { | 368 | switch (cpu->regs->trapnum) { |
412 | case 13: /* We've intercepted a General Protection Fault. */ | 369 | case 13: /* We've intercepted a General Protection Fault. */ |
413 | /* | 370 | /* Hand to Launcher to emulate those pesky IN and OUT insns */ |
414 | * Check if this was one of those annoying IN or OUT | ||
415 | * instructions which we need to emulate. If so, we just go | ||
416 | * back into the Guest after we've done it. | ||
417 | */ | ||
418 | if (cpu->regs->errcode == 0) { | 371 | if (cpu->regs->errcode == 0) { |
419 | if (emulate_insn(cpu)) | 372 | setup_emulate_insn(cpu); |
420 | return; | 373 | return; |
421 | } | 374 | } |
422 | break; | 375 | break; |
423 | case 14: /* We've intercepted a Page Fault. */ | 376 | case 14: /* We've intercepted a Page Fault. */ |
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 0e754d04876d..b2217657f62c 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <signal.h> | 41 | #include <signal.h> |
42 | #include <pwd.h> | 42 | #include <pwd.h> |
43 | #include <grp.h> | 43 | #include <grp.h> |
44 | #include <sys/user.h> | ||
44 | 45 | ||
45 | #ifndef VIRTIO_F_ANY_LAYOUT | 46 | #ifndef VIRTIO_F_ANY_LAYOUT |
46 | #define VIRTIO_F_ANY_LAYOUT 27 | 47 | #define VIRTIO_F_ANY_LAYOUT 27 |
@@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr) | |||
1143 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 1144 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
1144 | } | 1145 | } |
1145 | 1146 | ||
1147 | /*L:216 | ||
1148 | * This is where we emulate a handful of Guest instructions. It's ugly | ||
1149 | * and we used to do it in the kernel but it grew over time. | ||
1150 | */ | ||
1151 | |||
1152 | /* | ||
1153 | * We use the ptrace syscall's pt_regs struct to talk about registers | ||
1154 | * to lguest: these macros convert the names to the offsets. | ||
1155 | */ | ||
1156 | #define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) | ||
1157 | #define setreg(name, val) \ | ||
1158 | setreg_off(offsetof(struct user_regs_struct, name), (val)) | ||
1159 | |||
1160 | static u32 getreg_off(size_t offset) | ||
1161 | { | ||
1162 | u32 r; | ||
1163 | unsigned long args[] = { LHREQ_GETREG, offset }; | ||
1164 | |||
1165 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) | ||
1166 | err(1, "Getting register %u", offset); | ||
1167 | if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) | ||
1168 | err(1, "Reading register %u", offset); | ||
1169 | |||
1170 | return r; | ||
1171 | } | ||
1172 | |||
1173 | static void setreg_off(size_t offset, u32 val) | ||
1174 | { | ||
1175 | unsigned long args[] = { LHREQ_SETREG, offset, val }; | ||
1176 | |||
1177 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) | ||
1178 | err(1, "Setting register %u", offset); | ||
1179 | } | ||
1180 | |||
1181 | static void emulate_insn(const u8 insn[]) | ||
1182 | { | ||
1183 | unsigned long args[] = { LHREQ_TRAP, 13 }; | ||
1184 | unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; | ||
1185 | unsigned int eax, port, mask; | ||
1186 | /* | ||
1187 | * We always return all-ones on IO port reads, which traditionally | ||
1188 | * means "there's nothing there". | ||
1189 | */ | ||
1190 | u32 val = 0xFFFFFFFF; | ||
1191 | |||
1192 | /* | ||
1193 | * This must be the Guest kernel trying to do something, not userspace! | ||
1194 | * The bottom two bits of the CS segment register are the privilege | ||
1195 | * level. | ||
1196 | */ | ||
1197 | if ((getreg(xcs) & 3) != 0x1) | ||
1198 | goto no_emulate; | ||
1199 | |||
1200 | /* Decoding x86 instructions is icky. */ | ||
1201 | |||
1202 | /* | ||
1203 | * Around 2.6.33, the kernel started using an emulation for the | ||
1204 | * cmpxchg8b instruction in early boot on many configurations. This | ||
1205 | * code isn't paravirtualized, and it tries to disable interrupts. | ||
1206 | * Ignore it, which will Mostly Work. | ||
1207 | */ | ||
1208 | if (insn[insnlen] == 0xfa) { | ||
1209 | /* "cli", or Clear Interrupt Enable instruction. Skip it. */ | ||
1210 | insnlen = 1; | ||
1211 | goto skip_insn; | ||
1212 | } | ||
1213 | |||
1214 | /* | ||
1215 | * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. | ||
1216 | */ | ||
1217 | if (insn[insnlen] == 0x66) { | ||
1218 | small_operand = 1; | ||
1219 | /* The instruction is 1 byte so far, read the next byte. */ | ||
1220 | insnlen = 1; | ||
1221 | } | ||
1222 | |||
1223 | /* If the lower bit isn't set, it's a single byte access */ | ||
1224 | byte_access = !(insn[insnlen] & 1); | ||
1225 | |||
1226 | /* | ||
1227 | * Now we can ignore the lower bit and decode the 4 opcodes | ||
1228 | * we need to emulate. | ||
1229 | */ | ||
1230 | switch (insn[insnlen] & 0xFE) { | ||
1231 | case 0xE4: /* in <next byte>,%al */ | ||
1232 | port = insn[insnlen+1]; | ||
1233 | insnlen += 2; | ||
1234 | in = 1; | ||
1235 | break; | ||
1236 | case 0xEC: /* in (%dx),%al */ | ||
1237 | port = getreg(edx) & 0xFFFF; | ||
1238 | insnlen += 1; | ||
1239 | in = 1; | ||
1240 | break; | ||
1241 | case 0xE6: /* out %al,<next byte> */ | ||
1242 | port = insn[insnlen+1]; | ||
1243 | insnlen += 2; | ||
1244 | break; | ||
1245 | case 0xEE: /* out %al,(%dx) */ | ||
1246 | port = getreg(edx) & 0xFFFF; | ||
1247 | insnlen += 1; | ||
1248 | break; | ||
1249 | default: | ||
1250 | /* OK, we don't know what this is, can't emulate. */ | ||
1251 | goto no_emulate; | ||
1252 | } | ||
1253 | |||
1254 | /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */ | ||
1255 | if (byte_access) | ||
1256 | mask = 0xFF; | ||
1257 | else if (small_operand) | ||
1258 | mask = 0xFFFF; | ||
1259 | else | ||
1260 | mask = 0xFFFFFFFF; | ||
1261 | |||
1262 | /* | ||
1263 | * If it was an "IN" instruction, they expect the result to be read | ||
1264 | * into %eax, so we change %eax. | ||
1265 | */ | ||
1266 | eax = getreg(eax); | ||
1267 | |||
1268 | if (in) { | ||
1269 | /* Clear the bits we're about to read */ | ||
1270 | eax &= ~mask; | ||
1271 | /* Copy bits in from val. */ | ||
1272 | eax |= val & mask; | ||
1273 | /* Now update the register. */ | ||
1274 | setreg(eax, eax); | ||
1275 | } | ||
1276 | |||
1277 | verbose("IO %s of %x to %u: %#08x\n", | ||
1278 | in ? "IN" : "OUT", mask, port, eax); | ||
1279 | skip_insn: | ||
1280 | /* Finally, we've "done" the instruction, so move past it. */ | ||
1281 | setreg(eip, getreg(eip) + insnlen); | ||
1282 | return; | ||
1283 | |||
1284 | no_emulate: | ||
1285 | /* Inject trap into Guest. */ | ||
1286 | if (write(lguest_fd, args, sizeof(args)) < 0) | ||
1287 | err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip)); | ||
1288 | } | ||
1289 | |||
1290 | |||
1146 | /*L:190 | 1291 | /*L:190 |
1147 | * Device Setup | 1292 | * Device Setup |
1148 | * | 1293 | * |
@@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void) | |||
1832 | verbose("Notify on address %#08x\n", | 1977 | verbose("Notify on address %#08x\n", |
1833 | notify.addr); | 1978 | notify.addr); |
1834 | handle_output(notify.addr); | 1979 | handle_output(notify.addr); |
1980 | } else if (notify.trap == 13) { | ||
1981 | verbose("Emulating instruction at %#x\n", | ||
1982 | getreg(eip)); | ||
1983 | emulate_insn(notify.insn); | ||
1835 | } else | 1984 | } else |
1836 | errx(1, "Unknown trap %i addr %#08x\n", | 1985 | errx(1, "Unknown trap %i addr %#08x\n", |
1837 | notify.trap, notify.addr); | 1986 | notify.trap, notify.addr); |