diff options
Diffstat (limited to 'drivers/lguest/interrupts_and_traps.c')
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 288 |
1 files changed, 191 insertions, 97 deletions
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d507..18648180db02 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -1,4 +1,5 @@ | |||
1 | /*P:800 Interrupts (traps) are complicated enough to earn their own file. | 1 | /*P:800 |
2 | * Interrupts (traps) are complicated enough to earn their own file. | ||
2 | * There are three classes of interrupts: | 3 | * There are three classes of interrupts: |
3 | * | 4 | * |
4 | * 1) Real hardware interrupts which occur while we're running the Guest, | 5 | * 1) Real hardware interrupts which occur while we're running the Guest, |
@@ -10,7 +11,8 @@ | |||
10 | * just like real hardware would deliver them. Traps from the Guest can be set | 11 | * just like real hardware would deliver them. Traps from the Guest can be set |
11 | * up to go directly back into the Guest, but sometimes the Host wants to see | 12 | * up to go directly back into the Guest, but sometimes the Host wants to see |
12 | * them first, so we also have a way of "reflecting" them into the Guest as if | 13 | * them first, so we also have a way of "reflecting" them into the Guest as if |
13 | * they had been delivered to it directly. :*/ | 14 | * they had been delivered to it directly. |
15 | :*/ | ||
14 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
15 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
16 | #include <linux/module.h> | 18 | #include <linux/module.h> |
@@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) | |||
26 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); | 28 | return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); |
27 | } | 29 | } |
28 | 30 | ||
29 | /* The "type" of the interrupt handler is a 4 bit field: we only support a | 31 | /* |
30 | * couple of types. */ | 32 | * The "type" of the interrupt handler is a 4 bit field: we only support a |
33 | * couple of types. | ||
34 | */ | ||
31 | static int idt_type(u32 lo, u32 hi) | 35 | static int idt_type(u32 lo, u32 hi) |
32 | { | 36 | { |
33 | return (hi >> 8) & 0xF; | 37 | return (hi >> 8) & 0xF; |
@@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) | |||
39 | return (hi & 0x8000); | 43 | return (hi & 0x8000); |
40 | } | 44 | } |
41 | 45 | ||
42 | /* We need a helper to "push" a value onto the Guest's stack, since that's a | 46 | /* |
43 | * big part of what delivering an interrupt does. */ | 47 | * We need a helper to "push" a value onto the Guest's stack, since that's a |
48 | * big part of what delivering an interrupt does. | ||
49 | */ | ||
44 | static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) | 50 | static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) |
45 | { | 51 | { |
46 | /* Stack grows upwards: move stack then write value. */ | 52 | /* Stack grows upwards: move stack then write value. */ |
@@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) | |||
48 | lgwrite(cpu, *gstack, u32, val); | 54 | lgwrite(cpu, *gstack, u32, val); |
49 | } | 55 | } |
50 | 56 | ||
51 | /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or | 57 | /*H:210 |
58 | * The set_guest_interrupt() routine actually delivers the interrupt or | ||
52 | * trap. The mechanics of delivering traps and interrupts to the Guest are the | 59 | * trap. The mechanics of delivering traps and interrupts to the Guest are the |
53 | * same, except some traps have an "error code" which gets pushed onto the | 60 | * same, except some traps have an "error code" which gets pushed onto the |
54 | * stack as well: the caller tells us if this is one. | 61 | * stack as well: the caller tells us if this is one. |
@@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) | |||
59 | * | 66 | * |
60 | * We set up the stack just like the CPU does for a real interrupt, so it's | 67 | * We set up the stack just like the CPU does for a real interrupt, so it's |
61 | * identical for the Guest (and the standard "iret" instruction will undo | 68 | * identical for the Guest (and the standard "iret" instruction will undo |
62 | * it). */ | 69 | * it). |
70 | */ | ||
63 | static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | 71 | static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, |
64 | bool has_err) | 72 | bool has_err) |
65 | { | 73 | { |
@@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
67 | u32 eflags, ss, irq_enable; | 75 | u32 eflags, ss, irq_enable; |
68 | unsigned long virtstack; | 76 | unsigned long virtstack; |
69 | 77 | ||
70 | /* There are two cases for interrupts: one where the Guest is already | 78 | /* |
79 | * There are two cases for interrupts: one where the Guest is already | ||
71 | * in the kernel, and a more complex one where the Guest is in | 80 | * in the kernel, and a more complex one where the Guest is in |
72 | * userspace. We check the privilege level to find out. */ | 81 | * userspace. We check the privilege level to find out. |
82 | */ | ||
73 | if ((cpu->regs->ss&0x3) != GUEST_PL) { | 83 | if ((cpu->regs->ss&0x3) != GUEST_PL) { |
74 | /* The Guest told us their kernel stack with the SET_STACK | 84 | /* |
75 | * hypercall: both the virtual address and the segment */ | 85 | * The Guest told us their kernel stack with the SET_STACK |
86 | * hypercall: both the virtual address and the segment. | ||
87 | */ | ||
76 | virtstack = cpu->esp1; | 88 | virtstack = cpu->esp1; |
77 | ss = cpu->ss1; | 89 | ss = cpu->ss1; |
78 | 90 | ||
79 | origstack = gstack = guest_pa(cpu, virtstack); | 91 | origstack = gstack = guest_pa(cpu, virtstack); |
80 | /* We push the old stack segment and pointer onto the new | 92 | /* |
93 | * We push the old stack segment and pointer onto the new | ||
81 | * stack: when the Guest does an "iret" back from the interrupt | 94 | * stack: when the Guest does an "iret" back from the interrupt |
82 | * handler the CPU will notice they're dropping privilege | 95 | * handler the CPU will notice they're dropping privilege |
83 | * levels and expect these here. */ | 96 | * levels and expect these here. |
97 | */ | ||
84 | push_guest_stack(cpu, &gstack, cpu->regs->ss); | 98 | push_guest_stack(cpu, &gstack, cpu->regs->ss); |
85 | push_guest_stack(cpu, &gstack, cpu->regs->esp); | 99 | push_guest_stack(cpu, &gstack, cpu->regs->esp); |
86 | } else { | 100 | } else { |
@@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
91 | origstack = gstack = guest_pa(cpu, virtstack); | 105 | origstack = gstack = guest_pa(cpu, virtstack); |
92 | } | 106 | } |
93 | 107 | ||
94 | /* Remember that we never let the Guest actually disable interrupts, so | 108 | /* |
109 | * Remember that we never let the Guest actually disable interrupts, so | ||
95 | * the "Interrupt Flag" bit is always set. We copy that bit from the | 110 | * the "Interrupt Flag" bit is always set. We copy that bit from the |
96 | * Guest's "irq_enabled" field into the eflags word: we saw the Guest | 111 | * Guest's "irq_enabled" field into the eflags word: we saw the Guest |
97 | * copy it back in "lguest_iret". */ | 112 | * copy it back in "lguest_iret". |
113 | */ | ||
98 | eflags = cpu->regs->eflags; | 114 | eflags = cpu->regs->eflags; |
99 | if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 | 115 | if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 |
100 | && !(irq_enable & X86_EFLAGS_IF)) | 116 | && !(irq_enable & X86_EFLAGS_IF)) |
101 | eflags &= ~X86_EFLAGS_IF; | 117 | eflags &= ~X86_EFLAGS_IF; |
102 | 118 | ||
103 | /* An interrupt is expected to push three things on the stack: the old | 119 | /* |
120 | * An interrupt is expected to push three things on the stack: the old | ||
104 | * "eflags" word, the old code segment, and the old instruction | 121 | * "eflags" word, the old code segment, and the old instruction |
105 | * pointer. */ | 122 | * pointer. |
123 | */ | ||
106 | push_guest_stack(cpu, &gstack, eflags); | 124 | push_guest_stack(cpu, &gstack, eflags); |
107 | push_guest_stack(cpu, &gstack, cpu->regs->cs); | 125 | push_guest_stack(cpu, &gstack, cpu->regs->cs); |
108 | push_guest_stack(cpu, &gstack, cpu->regs->eip); | 126 | push_guest_stack(cpu, &gstack, cpu->regs->eip); |
@@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
111 | if (has_err) | 129 | if (has_err) |
112 | push_guest_stack(cpu, &gstack, cpu->regs->errcode); | 130 | push_guest_stack(cpu, &gstack, cpu->regs->errcode); |
113 | 131 | ||
114 | /* Now we've pushed all the old state, we change the stack, the code | 132 | /* |
115 | * segment and the address to execute. */ | 133 | * Now we've pushed all the old state, we change the stack, the code |
134 | * segment and the address to execute. | ||
135 | */ | ||
116 | cpu->regs->ss = ss; | 136 | cpu->regs->ss = ss; |
117 | cpu->regs->esp = virtstack + (gstack - origstack); | 137 | cpu->regs->esp = virtstack + (gstack - origstack); |
118 | cpu->regs->cs = (__KERNEL_CS|GUEST_PL); | 138 | cpu->regs->cs = (__KERNEL_CS|GUEST_PL); |
119 | cpu->regs->eip = idt_address(lo, hi); | 139 | cpu->regs->eip = idt_address(lo, hi); |
120 | 140 | ||
121 | /* There are two kinds of interrupt handlers: 0xE is an "interrupt | 141 | /* |
122 | * gate" which expects interrupts to be disabled on entry. */ | 142 | * There are two kinds of interrupt handlers: 0xE is an "interrupt |
143 | * gate" which expects interrupts to be disabled on entry. | ||
144 | */ | ||
123 | if (idt_type(lo, hi) == 0xE) | 145 | if (idt_type(lo, hi) == 0xE) |
124 | if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) | 146 | if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) |
125 | kill_guest(cpu, "Disabling interrupts"); | 147 | kill_guest(cpu, "Disabling interrupts"); |
@@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
130 | * | 152 | * |
131 | * interrupt_pending() returns the first pending interrupt which isn't blocked | 153 | * interrupt_pending() returns the first pending interrupt which isn't blocked |
132 | * by the Guest. It is called before every entry to the Guest, and just before | 154 | * by the Guest. It is called before every entry to the Guest, and just before |
133 | * we go to sleep when the Guest has halted itself. */ | 155 | * we go to sleep when the Guest has halted itself. |
156 | */ | ||
134 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) | 157 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) |
135 | { | 158 | { |
136 | unsigned int irq; | 159 | unsigned int irq; |
@@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) | |||
140 | if (!cpu->lg->lguest_data) | 163 | if (!cpu->lg->lguest_data) |
141 | return LGUEST_IRQS; | 164 | return LGUEST_IRQS; |
142 | 165 | ||
143 | /* Take our "irqs_pending" array and remove any interrupts the Guest | 166 | /* |
144 | * wants blocked: the result ends up in "blk". */ | 167 | * Take our "irqs_pending" array and remove any interrupts the Guest |
168 | * wants blocked: the result ends up in "blk". | ||
169 | */ | ||
145 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, | 170 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, |
146 | sizeof(blk))) | 171 | sizeof(blk))) |
147 | return LGUEST_IRQS; | 172 | return LGUEST_IRQS; |
@@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) | |||
154 | return irq; | 179 | return irq; |
155 | } | 180 | } |
156 | 181 | ||
157 | /* This actually diverts the Guest to running an interrupt handler, once an | 182 | /* |
158 | * interrupt has been identified by interrupt_pending(). */ | 183 | * This actually diverts the Guest to running an interrupt handler, once an |
184 | * interrupt has been identified by interrupt_pending(). | ||
185 | */ | ||
159 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) | 186 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) |
160 | { | 187 | { |
161 | struct desc_struct *idt; | 188 | struct desc_struct *idt; |
162 | 189 | ||
163 | BUG_ON(irq >= LGUEST_IRQS); | 190 | BUG_ON(irq >= LGUEST_IRQS); |
164 | 191 | ||
165 | /* They may be in the middle of an iret, where they asked us never to | 192 | /* |
166 | * deliver interrupts. */ | 193 | * They may be in the middle of an iret, where they asked us never to |
194 | * deliver interrupts. | ||
195 | */ | ||
167 | if (cpu->regs->eip >= cpu->lg->noirq_start && | 196 | if (cpu->regs->eip >= cpu->lg->noirq_start && |
168 | (cpu->regs->eip < cpu->lg->noirq_end)) | 197 | (cpu->regs->eip < cpu->lg->noirq_end)) |
169 | return; | 198 | return; |
@@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) | |||
187 | } | 216 | } |
188 | } | 217 | } |
189 | 218 | ||
190 | /* Look at the IDT entry the Guest gave us for this interrupt. The | 219 | /* |
220 | * Look at the IDT entry the Guest gave us for this interrupt. The | ||
191 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip | 221 | * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip |
192 | * over them. */ | 222 | * over them. |
223 | */ | ||
193 | idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; | 224 | idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; |
194 | /* If they don't have a handler (yet?), we just ignore it */ | 225 | /* If they don't have a handler (yet?), we just ignore it */ |
195 | if (idt_present(idt->a, idt->b)) { | 226 | if (idt_present(idt->a, idt->b)) { |
196 | /* OK, mark it no longer pending and deliver it. */ | 227 | /* OK, mark it no longer pending and deliver it. */ |
197 | clear_bit(irq, cpu->irqs_pending); | 228 | clear_bit(irq, cpu->irqs_pending); |
198 | /* set_guest_interrupt() takes the interrupt descriptor and a | 229 | /* |
230 | * set_guest_interrupt() takes the interrupt descriptor and a | ||
199 | * flag to say whether this interrupt pushes an error code onto | 231 | * flag to say whether this interrupt pushes an error code onto |
200 | * the stack as well: virtual interrupts never do. */ | 232 | * the stack as well: virtual interrupts never do. |
233 | */ | ||
201 | set_guest_interrupt(cpu, idt->a, idt->b, false); | 234 | set_guest_interrupt(cpu, idt->a, idt->b, false); |
202 | } | 235 | } |
203 | 236 | ||
204 | /* Every time we deliver an interrupt, we update the timestamp in the | 237 | /* |
238 | * Every time we deliver an interrupt, we update the timestamp in the | ||
205 | * Guest's lguest_data struct. It would be better for the Guest if we | 239 | * Guest's lguest_data struct. It would be better for the Guest if we |
206 | * did this more often, but it can actually be quite slow: doing it | 240 | * did this more often, but it can actually be quite slow: doing it |
207 | * here is a compromise which means at least it gets updated every | 241 | * here is a compromise which means at least it gets updated every |
208 | * timer interrupt. */ | 242 | * timer interrupt. |
243 | */ | ||
209 | write_timestamp(cpu); | 244 | write_timestamp(cpu); |
210 | 245 | ||
211 | /* If there are no other interrupts we want to deliver, clear | 246 | /* |
212 | * the pending flag. */ | 247 | * If there are no other interrupts we want to deliver, clear |
248 | * the pending flag. | ||
249 | */ | ||
213 | if (!more) | 250 | if (!more) |
214 | put_user(0, &cpu->lg->lguest_data->irq_pending); | 251 | put_user(0, &cpu->lg->lguest_data->irq_pending); |
215 | } | 252 | } |
@@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) | |||
217 | /* And this is the routine when we want to set an interrupt for the Guest. */ | 254 | /* And this is the routine when we want to set an interrupt for the Guest. */ |
218 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq) | 255 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq) |
219 | { | 256 | { |
220 | /* Next time the Guest runs, the core code will see if it can deliver | 257 | /* |
221 | * this interrupt. */ | 258 | * Next time the Guest runs, the core code will see if it can deliver |
259 | * this interrupt. | ||
260 | */ | ||
222 | set_bit(irq, cpu->irqs_pending); | 261 | set_bit(irq, cpu->irqs_pending); |
223 | 262 | ||
224 | /* Make sure it sees it; it might be asleep (eg. halted), or | 263 | /* |
225 | * running the Guest right now, in which case kick_process() | 264 | * Make sure it sees it; it might be asleep (eg. halted), or running |
226 | * will knock it out. */ | 265 | * the Guest right now, in which case kick_process() will knock it out. |
266 | */ | ||
227 | if (!wake_up_process(cpu->tsk)) | 267 | if (!wake_up_process(cpu->tsk)) |
228 | kick_process(cpu->tsk); | 268 | kick_process(cpu->tsk); |
229 | } | 269 | } |
230 | /*:*/ | 270 | /*:*/ |
231 | 271 | ||
232 | /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent | 272 | /* |
273 | * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent | ||
233 | * me a patch, so we support that too. It'd be a big step for lguest if half | 274 | * me a patch, so we support that too. It'd be a big step for lguest if half |
234 | * the Plan 9 user base were to start using it. | 275 | * the Plan 9 user base were to start using it. |
235 | * | 276 | * |
236 | * Actually now I think of it, it's possible that Ron *is* half the Plan 9 | 277 | * Actually now I think of it, it's possible that Ron *is* half the Plan 9 |
237 | * userbase. Oh well. */ | 278 | * userbase. Oh well. |
279 | */ | ||
238 | static bool could_be_syscall(unsigned int num) | 280 | static bool could_be_syscall(unsigned int num) |
239 | { | 281 | { |
240 | /* Normal Linux SYSCALL_VECTOR or reserved vector? */ | 282 | /* Normal Linux SYSCALL_VECTOR or reserved vector? */ |
@@ -274,9 +316,11 @@ void free_interrupts(void) | |||
274 | clear_bit(syscall_vector, used_vectors); | 316 | clear_bit(syscall_vector, used_vectors); |
275 | } | 317 | } |
276 | 318 | ||
277 | /*H:220 Now we've got the routines to deliver interrupts, delivering traps like | 319 | /*H:220 |
320 | * Now we've got the routines to deliver interrupts, delivering traps like | ||
278 | * page fault is easy. The only trick is that Intel decided that some traps | 321 | * page fault is easy. The only trick is that Intel decided that some traps |
279 | * should have error codes: */ | 322 | * should have error codes: |
323 | */ | ||
280 | static bool has_err(unsigned int trap) | 324 | static bool has_err(unsigned int trap) |
281 | { | 325 | { |
282 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); | 326 | return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); |
@@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) | |||
285 | /* deliver_trap() returns true if it could deliver the trap. */ | 329 | /* deliver_trap() returns true if it could deliver the trap. */ |
286 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num) | 330 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num) |
287 | { | 331 | { |
288 | /* Trap numbers are always 8 bit, but we set an impossible trap number | 332 | /* |
289 | * for traps inside the Switcher, so check that here. */ | 333 | * Trap numbers are always 8 bit, but we set an impossible trap number |
334 | * for traps inside the Switcher, so check that here. | ||
335 | */ | ||
290 | if (num >= ARRAY_SIZE(cpu->arch.idt)) | 336 | if (num >= ARRAY_SIZE(cpu->arch.idt)) |
291 | return false; | 337 | return false; |
292 | 338 | ||
293 | /* Early on the Guest hasn't set the IDT entries (or maybe it put a | 339 | /* |
294 | * bogus one in): if we fail here, the Guest will be killed. */ | 340 | * Early on the Guest hasn't set the IDT entries (or maybe it put a |
341 | * bogus one in): if we fail here, the Guest will be killed. | ||
342 | */ | ||
295 | if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) | 343 | if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) |
296 | return false; | 344 | return false; |
297 | set_guest_interrupt(cpu, cpu->arch.idt[num].a, | 345 | set_guest_interrupt(cpu, cpu->arch.idt[num].a, |
@@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) | |||
299 | return true; | 347 | return true; |
300 | } | 348 | } |
301 | 349 | ||
302 | /*H:250 Here's the hard part: returning to the Host every time a trap happens | 350 | /*H:250 |
351 | * Here's the hard part: returning to the Host every time a trap happens | ||
303 | * and then calling deliver_trap() and re-entering the Guest is slow. | 352 | * and then calling deliver_trap() and re-entering the Guest is slow. |
304 | * Particularly because Guest userspace system calls are traps (usually trap | 353 | * Particularly because Guest userspace system calls are traps (usually trap |
305 | * 128). | 354 | * 128). |
@@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) | |||
311 | * the other hypervisors would beat it up at lunchtime. | 360 | * the other hypervisors would beat it up at lunchtime. |
312 | * | 361 | * |
313 | * This routine indicates if a particular trap number could be delivered | 362 | * This routine indicates if a particular trap number could be delivered |
314 | * directly. */ | 363 | * directly. |
364 | */ | ||
315 | static bool direct_trap(unsigned int num) | 365 | static bool direct_trap(unsigned int num) |
316 | { | 366 | { |
317 | /* Hardware interrupts don't go to the Guest at all (except system | 367 | /* |
318 | * call). */ | 368 | * Hardware interrupts don't go to the Guest at all (except system |
369 | * call). | ||
370 | */ | ||
319 | if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) | 371 | if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) |
320 | return false; | 372 | return false; |
321 | 373 | ||
322 | /* The Host needs to see page faults (for shadow paging and to save the | 374 | /* |
375 | * The Host needs to see page faults (for shadow paging and to save the | ||
323 | * fault address), general protection faults (in/out emulation) and | 376 | * fault address), general protection faults (in/out emulation) and |
324 | * device not available (TS handling), invalid opcode fault (kvm hcall), | 377 | * device not available (TS handling), invalid opcode fault (kvm hcall), |
325 | * and of course, the hypercall trap. */ | 378 | * and of course, the hypercall trap. |
379 | */ | ||
326 | return num != 14 && num != 13 && num != 7 && | 380 | return num != 14 && num != 13 && num != 7 && |
327 | num != 6 && num != LGUEST_TRAP_ENTRY; | 381 | num != 6 && num != LGUEST_TRAP_ENTRY; |
328 | } | 382 | } |
329 | /*:*/ | 383 | /*:*/ |
330 | 384 | ||
331 | /*M:005 The Guest has the ability to turn its interrupt gates into trap gates, | 385 | /*M:005 |
386 | * The Guest has the ability to turn its interrupt gates into trap gates, | ||
332 | * if it is careful. The Host will let trap gates go directly to the | 387 | * if it is careful. The Host will let trap gates go directly to the |
333 | * Guest, but the Guest needs the interrupts atomically disabled for an | 388 | * Guest, but the Guest needs the interrupts atomically disabled for an |
334 | * interrupt gate. It can do this by pointing the trap gate at instructions | 389 | * interrupt gate. It can do this by pointing the trap gate at instructions |
335 | * within noirq_start and noirq_end, where it can safely disable interrupts. */ | 390 | * within noirq_start and noirq_end, where it can safely disable interrupts. |
391 | */ | ||
336 | 392 | ||
337 | /*M:006 The Guests do not use the sysenter (fast system call) instruction, | 393 | /*M:006 |
394 | * The Guests do not use the sysenter (fast system call) instruction, | ||
338 | * because it's hardcoded to enter privilege level 0 and so can't go direct. | 395 | * because it's hardcoded to enter privilege level 0 and so can't go direct. |
339 | * It's about twice as fast as the older "int 0x80" system call, so it might | 396 | * It's about twice as fast as the older "int 0x80" system call, so it might |
340 | * still be worthwhile to handle it in the Switcher and lcall down to the | 397 | * still be worthwhile to handle it in the Switcher and lcall down to the |
341 | * Guest. The sysenter semantics are hairy tho: search for that keyword in | 398 | * Guest. The sysenter semantics are hairy tho: search for that keyword in |
342 | * entry.S :*/ | 399 | * entry.S |
400 | :*/ | ||
343 | 401 | ||
344 | /*H:260 When we make traps go directly into the Guest, we need to make sure | 402 | /*H:260 |
403 | * When we make traps go directly into the Guest, we need to make sure | ||
345 | * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the | 404 | * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the |
346 | * CPU trying to deliver the trap will fault while trying to push the interrupt | 405 | * CPU trying to deliver the trap will fault while trying to push the interrupt |
347 | * words on the stack: this is called a double fault, and it forces us to kill | 406 | * words on the stack: this is called a double fault, and it forces us to kill |
348 | * the Guest. | 407 | * the Guest. |
349 | * | 408 | * |
350 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */ | 409 | * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. |
410 | */ | ||
351 | void pin_stack_pages(struct lg_cpu *cpu) | 411 | void pin_stack_pages(struct lg_cpu *cpu) |
352 | { | 412 | { |
353 | unsigned int i; | 413 | unsigned int i; |
354 | 414 | ||
355 | /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or | 415 | /* |
356 | * two pages of stack space. */ | 416 | * Depending on the CONFIG_4KSTACKS option, the Guest can have one or |
417 | * two pages of stack space. | ||
418 | */ | ||
357 | for (i = 0; i < cpu->lg->stack_pages; i++) | 419 | for (i = 0; i < cpu->lg->stack_pages; i++) |
358 | /* The stack grows *upwards*, so the address we're given is the | 420 | /* |
421 | * The stack grows *upwards*, so the address we're given is the | ||
359 | * start of the page after the kernel stack. Subtract one to | 422 | * start of the page after the kernel stack. Subtract one to |
360 | * get back onto the first stack page, and keep subtracting to | 423 | * get back onto the first stack page, and keep subtracting to |
361 | * get to the rest of the stack pages. */ | 424 | * get to the rest of the stack pages. |
425 | */ | ||
362 | pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); | 426 | pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); |
363 | } | 427 | } |
364 | 428 | ||
365 | /* Direct traps also mean that we need to know whenever the Guest wants to use | 429 | /* |
430 | * Direct traps also mean that we need to know whenever the Guest wants to use | ||
366 | * a different kernel stack, so we can change the IDT entries to use that | 431 | * a different kernel stack, so we can change the IDT entries to use that |
367 | * stack. The IDT entries expect a virtual address, so unlike most addresses | 432 | * stack. The IDT entries expect a virtual address, so unlike most addresses |
368 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not | 433 | * the Guest gives us, the "esp" (stack pointer) value here is virtual, not |
369 | * physical. | 434 | * physical. |
370 | * | 435 | * |
371 | * In Linux each process has its own kernel stack, so this happens a lot: we | 436 | * In Linux each process has its own kernel stack, so this happens a lot: we |
372 | * change stacks on each context switch. */ | 437 | * change stacks on each context switch. |
438 | */ | ||
373 | void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) | 439 | void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) |
374 | { | 440 | { |
375 | /* You are not allowed have a stack segment with privilege level 0: bad | 441 | /* |
376 | * Guest! */ | 442 | * You're not allowed a stack segment with privilege level 0: bad Guest! |
443 | */ | ||
377 | if ((seg & 0x3) != GUEST_PL) | 444 | if ((seg & 0x3) != GUEST_PL) |
378 | kill_guest(cpu, "bad stack segment %i", seg); | 445 | kill_guest(cpu, "bad stack segment %i", seg); |
379 | /* We only expect one or two stack pages. */ | 446 | /* We only expect one or two stack pages. */ |
@@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) | |||
387 | pin_stack_pages(cpu); | 454 | pin_stack_pages(cpu); |
388 | } | 455 | } |
389 | 456 | ||
390 | /* All this reference to mapping stacks leads us neatly into the other complex | 457 | /* |
391 | * part of the Host: page table handling. */ | 458 | * All this reference to mapping stacks leads us neatly into the other complex |
459 | * part of the Host: page table handling. | ||
460 | */ | ||
392 | 461 | ||
393 | /*H:235 This is the routine which actually checks the Guest's IDT entry and | 462 | /*H:235 |
394 | * transfers it into the entry in "struct lguest": */ | 463 | * This is the routine which actually checks the Guest's IDT entry and |
464 | * transfers it into the entry in "struct lguest": | ||
465 | */ | ||
395 | static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, | 466 | static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, |
396 | unsigned int num, u32 lo, u32 hi) | 467 | unsigned int num, u32 lo, u32 hi) |
397 | { | 468 | { |
@@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, | |||
407 | if (type != 0xE && type != 0xF) | 478 | if (type != 0xE && type != 0xF) |
408 | kill_guest(cpu, "bad IDT type %i", type); | 479 | kill_guest(cpu, "bad IDT type %i", type); |
409 | 480 | ||
410 | /* We only copy the handler address, present bit, privilege level and | 481 | /* |
482 | * We only copy the handler address, present bit, privilege level and | ||
411 | * type. The privilege level controls where the trap can be triggered | 483 | * type. The privilege level controls where the trap can be triggered |
412 | * manually with an "int" instruction. This is usually GUEST_PL, | 484 | * manually with an "int" instruction. This is usually GUEST_PL, |
413 | * except for system calls which userspace can use. */ | 485 | * except for system calls which userspace can use. |
486 | */ | ||
414 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); | 487 | trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); |
415 | trap->b = (hi&0xFFFFEF00); | 488 | trap->b = (hi&0xFFFFEF00); |
416 | } | 489 | } |
417 | 490 | ||
418 | /*H:230 While we're here, dealing with delivering traps and interrupts to the | 491 | /*H:230 |
492 | * While we're here, dealing with delivering traps and interrupts to the | ||
419 | * Guest, we might as well complete the picture: how the Guest tells us where | 493 | * Guest, we might as well complete the picture: how the Guest tells us where |
420 | * it wants them to go. This would be simple, except making traps fast | 494 | * it wants them to go. This would be simple, except making traps fast |
421 | * requires some tricks. | 495 | * requires some tricks. |
422 | * | 496 | * |
423 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the | 497 | * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the |
424 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ | 498 | * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. |
499 | */ | ||
425 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) | 500 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) |
426 | { | 501 | { |
427 | /* Guest never handles: NMI, doublefault, spurious interrupt or | 502 | /* |
428 | * hypercall. We ignore when it tries to set them. */ | 503 | * Guest never handles: NMI, doublefault, spurious interrupt or |
504 | * hypercall. We ignore when it tries to set them. | ||
505 | */ | ||
429 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) | 506 | if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) |
430 | return; | 507 | return; |
431 | 508 | ||
432 | /* Mark the IDT as changed: next time the Guest runs we'll know we have | 509 | /* |
433 | * to copy this again. */ | 510 | * Mark the IDT as changed: next time the Guest runs we'll know we have |
511 | * to copy this again. | ||
512 | */ | ||
434 | cpu->changed |= CHANGED_IDT; | 513 | cpu->changed |= CHANGED_IDT; |
435 | 514 | ||
436 | /* Check that the Guest doesn't try to step outside the bounds. */ | 515 | /* Check that the Guest doesn't try to step outside the bounds. */ |
@@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) | |||
440 | set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); | 519 | set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); |
441 | } | 520 | } |
442 | 521 | ||
443 | /* The default entry for each interrupt points into the Switcher routines which | 522 | /* |
523 | * The default entry for each interrupt points into the Switcher routines which | ||
444 | * simply return to the Host. The run_guest() loop will then call | 524 | * simply return to the Host. The run_guest() loop will then call |
445 | * deliver_trap() to bounce it back into the Guest. */ | 525 | * deliver_trap() to bounce it back into the Guest. |
526 | */ | ||
446 | static void default_idt_entry(struct desc_struct *idt, | 527 | static void default_idt_entry(struct desc_struct *idt, |
447 | int trap, | 528 | int trap, |
448 | const unsigned long handler, | 529 | const unsigned long handler, |
@@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, | |||
451 | /* A present interrupt gate. */ | 532 | /* A present interrupt gate. */ |
452 | u32 flags = 0x8e00; | 533 | u32 flags = 0x8e00; |
453 | 534 | ||
454 | /* Set the privilege level on the entry for the hypercall: this allows | 535 | /* |
455 | * the Guest to use the "int" instruction to trigger it. */ | 536 | * Set the privilege level on the entry for the hypercall: this allows |
537 | * the Guest to use the "int" instruction to trigger it. | ||
538 | */ | ||
456 | if (trap == LGUEST_TRAP_ENTRY) | 539 | if (trap == LGUEST_TRAP_ENTRY) |
457 | flags |= (GUEST_PL << 13); | 540 | flags |= (GUEST_PL << 13); |
458 | else if (base) | 541 | else if (base) |
459 | /* Copy priv. level from what Guest asked for. This allows | 542 | /* |
460 | * debug (int 3) traps from Guest userspace, for example. */ | 543 | * Copy privilege level from what Guest asked for. This allows |
544 | * debug (int 3) traps from Guest userspace, for example. | ||
545 | */ | ||
461 | flags |= (base->b & 0x6000); | 546 | flags |= (base->b & 0x6000); |
462 | 547 | ||
463 | /* Now pack it into the IDT entry in its weird format. */ | 548 | /* Now pack it into the IDT entry in its weird format. */ |
@@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
475 | default_idt_entry(&state->guest_idt[i], i, def[i], NULL); | 560 | default_idt_entry(&state->guest_idt[i], i, def[i], NULL); |
476 | } | 561 | } |
477 | 562 | ||
478 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead | 563 | /*H:240 |
564 | * We don't use the IDT entries in the "struct lguest" directly, instead | ||
479 | * we copy them into the IDT which we've set up for Guests on this CPU, just | 565 | * we copy them into the IDT which we've set up for Guests on this CPU, just |
480 | * before we run the Guest. This routine does that copy. */ | 566 | * before we run the Guest. This routine does that copy. |
567 | */ | ||
481 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | 568 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, |
482 | const unsigned long *def) | 569 | const unsigned long *def) |
483 | { | 570 | { |
484 | unsigned int i; | 571 | unsigned int i; |
485 | 572 | ||
486 | /* We can simply copy the direct traps, otherwise we use the default | 573 | /* |
487 | * ones in the Switcher: they will return to the Host. */ | 574 | * We can simply copy the direct traps, otherwise we use the default |
575 | * ones in the Switcher: they will return to the Host. | ||
576 | */ | ||
488 | for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { | 577 | for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { |
489 | const struct desc_struct *gidt = &cpu->arch.idt[i]; | 578 | const struct desc_struct *gidt = &cpu->arch.idt[i]; |
490 | 579 | ||
@@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | |||
492 | if (!direct_trap(i)) | 581 | if (!direct_trap(i)) |
493 | continue; | 582 | continue; |
494 | 583 | ||
495 | /* Only trap gates (type 15) can go direct to the Guest. | 584 | /* |
585 | * Only trap gates (type 15) can go direct to the Guest. | ||
496 | * Interrupt gates (type 14) disable interrupts as they are | 586 | * Interrupt gates (type 14) disable interrupts as they are |
497 | * entered, which we never let the Guest do. Not present | 587 | * entered, which we never let the Guest do. Not present |
498 | * entries (type 0x0) also can't go direct, of course. | 588 | * entries (type 0x0) also can't go direct, of course. |
499 | * | 589 | * |
500 | * If it can't go direct, we still need to copy the priv. level: | 590 | * If it can't go direct, we still need to copy the priv. level: |
501 | * they might want to give userspace access to a software | 591 | * they might want to give userspace access to a software |
502 | * interrupt. */ | 592 | * interrupt. |
593 | */ | ||
503 | if (idt_type(gidt->a, gidt->b) == 0xF) | 594 | if (idt_type(gidt->a, gidt->b) == 0xF) |
504 | idt[i] = *gidt; | 595 | idt[i] = *gidt; |
505 | else | 596 | else |
@@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | |||
518 | * the next timer interrupt (in nanoseconds). We use the high-resolution timer | 609 | * the next timer interrupt (in nanoseconds). We use the high-resolution timer |
519 | * infrastructure to set a callback at that time. | 610 | * infrastructure to set a callback at that time. |
520 | * | 611 | * |
521 | * 0 means "turn off the clock". */ | 612 | * 0 means "turn off the clock". |
613 | */ | ||
522 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) | 614 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) |
523 | { | 615 | { |
524 | ktime_t expires; | 616 | ktime_t expires; |
@@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) | |||
529 | return; | 621 | return; |
530 | } | 622 | } |
531 | 623 | ||
532 | /* We use wallclock time here, so the Guest might not be running for | 624 | /* |
625 | * We use wallclock time here, so the Guest might not be running for | ||
533 | * all the time between now and the timer interrupt it asked for. This | 626 | * all the time between now and the timer interrupt it asked for. This |
534 | * is almost always the right thing to do. */ | 627 | * is almost always the right thing to do. |
628 | */ | ||
535 | expires = ktime_add_ns(ktime_get_real(), delta); | 629 | expires = ktime_add_ns(ktime_get_real(), delta); |
536 | hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); | 630 | hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); |
537 | } | 631 | } |