author		Linus Torvalds <torvalds@linux-foundation.org>	2009-06-12 12:32:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-06-12 12:32:26 -0400
commit		7f3591cfacf2d79c4f42238e46c7d053da8e020d (patch)
tree		f2e9ed7b6b0bc176facaa49846734790023a6b16 /drivers
parent		16ffc3eeaa00d513b0076b7b2b96419f28acc912 (diff)
parent		d1f0132e76a11b05167313c606a853953f416081 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (31 commits)
lguest: add support for indirect ring entries
lguest: suppress notifications in example Launcher
lguest: try to batch interrupts on network receive
lguest: avoid sending interrupts to Guest when no activity occurs.
lguest: implement deferred interrupts in example Launcher
lguest: remove obsolete LHREQ_BREAK call
lguest: have example Launcher service all devices in separate threads
lguest: use eventfds for device notification
eventfd: export eventfd_signal and eventfd_fget for lguest
lguest: allow any process to send interrupts
lguest: PAE fixes
lguest: PAE support
lguest: Add support for kvm_hypercall4()
lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD
lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated
lguest: map switcher with executable page table entries
lguest: fix writev returning short on console output
lguest: clean up length-used value in example launcher
lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition.
lguest: beyond ARRAY_SIZE of cpu->arch.gdt
...
Diffstat (limited to 'drivers')
-rw-r--r--	drivers/lguest/Kconfig			2
-rw-r--r--	drivers/lguest/core.c			30
-rw-r--r--	drivers/lguest/hypercalls.c		14
-rw-r--r--	drivers/lguest/interrupts_and_traps.c	57
-rw-r--r--	drivers/lguest/lg.h			28
-rw-r--r--	drivers/lguest/lguest_user.c		127
-rw-r--r--	drivers/lguest/page_tables.c		396
-rw-r--r--	drivers/lguest/segments.c		2
8 files changed, 531 insertions(+), 125 deletions(-)
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359a..0aaa0597a622 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && EVENTFD
 	select HVC_DRIVER
 	---help---
 	This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3cf74b..a6974e9b8ebf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
 	 * array of struct pages. It increments that pointer, but we don't
 	 * care. */
 	pagep = switcher_page;
-	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
@@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
+		unsigned int irq;
+		bool more;
+
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
 			do_hypercalls(cpu);
@@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* It's possible the Guest did a NOTIFY hypercall to the
 		 * Launcher, in which case we return from the read() now. */
 		if (cpu->pending_notify) {
-			if (put_user(cpu->pending_notify, user))
-				return -EFAULT;
-			return sizeof(cpu->pending_notify);
+			if (!send_notify_to_eventfd(cpu)) {
+				if (put_user(cpu->pending_notify, user))
+					return -EFAULT;
+				return sizeof(cpu->pending_notify);
+			}
 		}
 
 		/* Check for signals */
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
-		/* If Waker set break_out, return to Launcher. */
-		if (cpu->break_out)
-			return -EAGAIN;
-
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		maybe_do_interrupt(cpu);
+		irq = interrupt_pending(cpu, &more);
+		if (irq < LGUEST_IRQS)
+			try_deliver_interrupt(cpu, irq, more);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer. If the Host is trying to suspend,
@@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			break;
 
 		/* If the Guest asked to be stopped, we sleep. The Guest's
-		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
+		 * clock timer will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			/* Just before we sleep, make sure no interrupt snuck in
+			 * which we should be doing. */
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
+				set_current_state(TASK_RUNNING);
+			else
+				schedule();
 			continue;
 		}
 
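The halted-Guest hunk above is the classic guard against a lost wakeup: the task marks itself TASK_INTERRUPTIBLE first, re-checks interrupt_pending(), and only calls schedule() if nothing snuck in between. Below is a minimal userspace analogue of that ordering using a pthread condition variable; it is an illustration of the pattern under that analogy, not code from the patch:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool irq_pending;	/* stands in for cpu->irqs_pending */

/* "Guest" side: re-check the condition under the lock before sleeping,
 * mirroring the interrupt_pending() check between set_current_state()
 * and schedule() in run_guest(). */
static void *halted_guest(void *unused)
{
	pthread_mutex_lock(&lock);
	while (!irq_pending)		/* did an interrupt sneak in? */
		pthread_cond_wait(&cond, &lock);
	irq_pending = false;
	pthread_mutex_unlock(&lock);
	printf("guest woken by interrupt\n");
	return NULL;
}

/* "Host" side: the analogue of set_interrupt() + wake_up_process(). */
static void raise_irq(void)
{
	pthread_mutex_lock(&lock);
	irq_pending = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, halted_guest, NULL);
	raise_irq();
	pthread_join(tid, NULL);
	return 0;
}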
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 54d66f05fefa..c29ffa19cb74 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* This call does nothing, except by breaking out of the Guest
 		 * it makes us process all the asynchronous hypercalls. */
 		break;
+	case LHCALL_SEND_INTERRUPTS:
+		/* This call does nothing too, but by breaking out of the Guest
+		 * it makes us process any pending interrupts. */
+		break;
 	case LHCALL_LGUEST_INIT:
 		/* You can't get here unless you're already initialized. Don't
 		 * do that. */
@@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+			      __pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
+		break;
+	case LHCALL_SET_PGD:
+		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
 	case LHCALL_SET_PMD:
 		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
 		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;
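The PAE branch of LHCALL_SET_PTE exists because a PAE pte is 64 bits wide while each hypercall argument is only 32 bits, so the Guest passes the low word in arg3 and the high word in arg4, and the Host reassembles them with arg3 | (u64)arg4 << 32. A standalone sketch of that split and reassembly (plain C, nothing lguest-specific assumed):

#include <assert.h>
#include <stdint.h>

/* Guest side: split a 64-bit PAE pte into the two 32-bit words that
 * travel as hypercall arguments (arg3 = low word, arg4 = high word). */
static void split_pte(uint64_t pte, uint32_t *arg3, uint32_t *arg4)
{
	*arg3 = (uint32_t)pte;
	*arg4 = (uint32_t)(pte >> 32);
}

/* Host side: the reassembly done in the LHCALL_SET_PTE case above. */
static uint64_t join_pte(uint32_t arg3, uint32_t arg4)
{
	return (uint64_t)arg3 | ((uint64_t)arg4 << 32);
}

int main(void)
{
	/* A value with the NX bit (bit 63) set, which only exists with
	 * PAE and is exactly why 32 bits are not enough. */
	uint64_t pte = 0x8000000012345067ULL;
	uint32_t lo, hi;

	split_pte(pte, &lo, &hi);
	assert(join_pte(lo, hi) == pte);
	return 0;
}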
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 6e99adbe1946..0e9067b0d507 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
 /*H:205
  * Virtual Interrupts.
  *
- * maybe_do_interrupt() gets called before every entry to the Guest, to see if
- * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lg_cpu *cpu)
+ * interrupt_pending() returns the first pending interrupt which isn't blocked
+ * by the Guest. It is called before every entry to the Guest, and just before
+ * we go to sleep when the Guest has halted itself. */
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
-	struct desc_struct *idt;
 
 	/* If the Guest hasn't even initialized yet, we can do nothing. */
 	if (!cpu->lg->lguest_data)
-		return;
+		return LGUEST_IRQS;
 
 	/* Take our "irqs_pending" array and remove any interrupts the Guest
 	 * wants blocked: the result ends up in "blk". */
 	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
 			   sizeof(blk)))
-		return;
+		return LGUEST_IRQS;
 	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
-	/* None? Nothing to do */
-	if (irq >= LGUEST_IRQS)
-		return;
+	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
+
+	return irq;
+}
+
+/* This actually diverts the Guest to running an interrupt handler, once an
+ * interrupt has been identified by interrupt_pending(). */
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
+{
+	struct desc_struct *idt;
+
+	BUG_ON(irq >= LGUEST_IRQS);
 
 	/* They may be in the middle of an iret, where they asked us never to
 	 * deliver interrupts. */
@@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 		u32 irq_enabled;
 		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
 			irq_enabled = 0;
-		if (!irq_enabled)
+		if (!irq_enabled) {
+			/* Make sure they know an IRQ is pending. */
+			put_user(X86_EFLAGS_IF,
+				 &cpu->lg->lguest_data->irq_pending);
 			return;
+		}
 	}
 
 	/* Look at the IDT entry the Guest gave us for this interrupt. The
@@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 	 * here is a compromise which means at least it gets updated every
 	 * timer interrupt. */
 	write_timestamp(cpu);
+
+	/* If there are no other interrupts we want to deliver, clear
+	 * the pending flag. */
+	if (!more)
+		put_user(0, &cpu->lg->lguest_data->irq_pending);
+}
+
+/* And this is the routine when we want to set an interrupt for the Guest. */
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	/* Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt. */
+	set_bit(irq, cpu->irqs_pending);
+
+	/* Make sure it sees it; it might be asleep (eg. halted), or
+	 * running the Guest right now, in which case kick_process()
+	 * will knock it out. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
 }
 /*:*/
 
@@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
 
 	/* Remember the first interrupt is the timer interrupt. */
-	set_bit(0, cpu->irqs_pending);
-	/* If the Guest is actually stopped, we need to wake it up. */
-	if (cpu->halted)
-		wake_up_process(cpu->tsk);
+	set_interrupt(cpu, 0);
 	return HRTIMER_NORESTART;
 }
 
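interrupt_pending() reduces to two bitmap operations: mask the pending set with the Guest's blocked set (bitmap_andnot), then take the lowest set bit (find_first_bit), reporting via *more whether another deliverable interrupt remains. A single-word sketch of the same logic (the kernel uses multi-word bitmaps sized by LGUEST_IRQS; the 32-bit word here is a simplifying assumption):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_IRQS 32	/* one 32-bit word; the kernel uses LGUEST_IRQS */

/* Single-word version of interrupt_pending(): pending & ~blocked, then
 * the lowest set bit, plus a flag saying whether more bits remain. */
static unsigned int first_pending(uint32_t pending, uint32_t blocked,
				  bool *more)
{
	uint32_t blk = pending & ~blocked;	/* bitmap_andnot() */
	unsigned int irq;

	if (!blk)
		return NR_IRQS;			/* nothing deliverable */
	irq = (unsigned int)__builtin_ctz(blk);	/* find_first_bit() */
	*more = (blk & ~(1u << irq)) != 0;	/* any later bit set? */
	return irq;
}

int main(void)
{
	bool more;
	/* irqs 1 and 4 pending, irq 1 blocked by the "Guest". */
	unsigned int irq = first_pending(0x12, 0x02, &more);

	if (irq < NR_IRQS)
		printf("deliver irq %u, more=%d\n", irq, more);	/* 4, 0 */
	return 0;
}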
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index af92a176697f..d4e8979735cb 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@ struct lg_cpu {
 	u32 cr2;
 	int ts;
 	u32 esp1;
-	u8 ss1;
+	u16 ss1;
 
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
@@ -71,9 +71,7 @@ struct lg_cpu {
 	/* Virtual clock device */
 	struct hrtimer hrt;
 
-	/* Do we need to stop what we're doing and return to userspace? */
-	int break_out;
-	wait_queue_head_t break_wq;
+	/* Did the Guest tell us to halt? */
 	int halted;
 
 	/* Pending virtual interrupts */
@@ -82,6 +80,16 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
 
+struct lg_eventfd {
+	unsigned long addr;
+	struct file *event;
+};
+
+struct lg_eventfd_map {
+	unsigned int num;
+	struct lg_eventfd map[];
+};
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
@@ -102,6 +110,8 @@ struct lguest
 	unsigned int stack_pages;
 	u32 tsc_khz;
 
+	struct lg_eventfd_map *eventfds;
+
 	/* Dead? */
 	const char *dead;
 };
@@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)	(pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lg_cpu *cpu);
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
@@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
 		const unsigned long *def);
 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
 void init_clockdev(struct lg_cpu *cpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
@@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
 void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
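struct lg_eventfd_map ends in a C99 flexible array member, so each resize in add_eventfd() (see the lguest_user.c diff below) allocates one block of sizeof(header) + n * sizeof(element) and swaps the whole array. A userspace sketch of just the allocate-and-copy step, with hypothetical names; the RCU publish/free is the kernel-only half and is omitted here:

#include <stdlib.h>
#include <string.h>

/* Userspace stand-in for struct lg_eventfd_map: a counted, flexible
 * array sized at allocation time. */
struct eventfd_map {
	unsigned int num;
	struct { unsigned long addr; int fd; } map[];
};

/* Allocate a copy with room for one more slot, as add_eventfd() does
 * with kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1)).
 * The caller fills map[num] and then bumps num. */
static struct eventfd_map *map_grow(const struct eventfd_map *old)
{
	size_t n = old ? old->num : 0;
	struct eventfd_map *new;

	new = malloc(sizeof(*new) + (n + 1) * sizeof(new->map[0]));
	if (!new)
		return NULL;
	if (old)
		memcpy(new->map, old->map, n * sizeof(old->map[0]));
	new->num = n;
	return new;
}

int main(void)
{
	struct eventfd_map *m = map_grow(NULL);

	if (m) {
		m->map[m->num].addr = 0x1000;	/* hypothetical values */
		m->map[m->num].fd = 3;
		m->num++;
		free(m);
	}
	return 0;
}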
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b8ee103eed5f..32e297121058 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,32 +7,83 @@
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include "lg.h"
 
-/*L:055 When something happens, the Waker process needs a way to stop the
- * kernel running the Guest and return to the Launcher. So the Waker writes
- * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
- * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
- * the Waker. */
-static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
-	unsigned long on;
+	unsigned int i;
+	struct lg_eventfd_map *map;
+
+	/* lg->eventfds is RCU-protected */
+	rcu_read_lock();
+	map = rcu_dereference(cpu->lg->eventfds);
+	for (i = 0; i < map->num; i++) {
+		if (map->map[i].addr == cpu->pending_notify) {
+			eventfd_signal(map->map[i].event, 1);
+			cpu->pending_notify = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return cpu->pending_notify == 0;
+}
 
-	/* Fetch whether they're turning break on or off. */
-	if (get_user(on, input) != 0)
-		return -EFAULT;
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+	struct lg_eventfd_map *new, *old = lg->eventfds;
 
-	if (on) {
-		cpu->break_out = 1;
-		/* Pop it out of the Guest (may be running on different CPU) */
-		wake_up_process(cpu->tsk);
-		/* Wait for them to reset it */
-		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
-	} else {
-		cpu->break_out = 0;
-		wake_up(&cpu->break_wq);
-		return 0;
+	if (!addr)
+		return -EINVAL;
+
+	/* Replace the old array with the new one, carefully: others can
+	 * be accessing it at the same time */
+	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* First make identical copy. */
+	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+	new->num = old->num;
+
+	/* Now append new entry. */
+	new->map[new->num].addr = addr;
+	new->map[new->num].event = eventfd_fget(fd);
+	if (IS_ERR(new->map[new->num].event)) {
+		kfree(new);
+		return PTR_ERR(new->map[new->num].event);
 	}
+	new->num++;
+
+	/* Now put new one in place. */
+	rcu_assign_pointer(lg->eventfds, new);
+
+	/* We're not in a big hurry. Wait until noone's looking at old
+	 * version, then delete it. */
+	synchronize_rcu();
+	kfree(old);
+
+	return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+	unsigned long addr, fd;
+	int err;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	input++;
+	if (get_user(fd, input) != 0)
+		return -EFAULT;
+
+	mutex_lock(&lguest_lock);
+	err = add_eventfd(lg, addr, fd);
+	mutex_unlock(&lguest_lock);
+
+	return 0;
 }
 
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
@@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 		return -EFAULT;
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
-	/* Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt. */
-	set_bit(irq, cpu->irqs_pending);
+
+	set_interrupt(cpu, irq);
 	return 0;
 }
 
@@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
 
-	/* Initialize the queue for the Waker to wait on */
-	init_waitqueue_head(&cpu->break_wq);
-
 	/* We keep a pointer to the Launcher task (ie. current task) for when
 	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
@@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
 
+	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+	if (!lg->eventfds) {
+		err = -ENOMEM;
+		goto free_lg;
+	}
+	lg->eventfds->num = 0;
+
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
@@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto release_guest;
+		goto free_eventfds;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
 	 * address the Launcher gave us. This allocates memory, so can fail. */
@@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
 	/* FIXME: This should be in free_vcpu */
 	free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+	kfree(lg->eventfds);
+free_lg:
 	kfree(lg);
 unlock:
 	mutex_unlock(&lguest_lock);
@@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		/* Once the Guest is dead, you can only read() why it died. */
 		if (lg->dead)
 			return -ENOENT;
-
-		/* If you're not the task which owns the Guest, all you can do
-		 * is break the Launcher out of running the Guest. */
-		if (current != cpu->tsk && req != LHREQ_BREAK)
-			return -EPERM;
 	}
 
 	switch (req) {
@@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in,
 		return initialize(file, input);
 	case LHREQ_IRQ:
 		return user_send_irq(cpu, input);
-	case LHREQ_BREAK:
-		return break_guest_out(cpu, input);
+	case LHREQ_EVENTFD:
+		return attach_eventfd(lg, input);
 	default:
 		return -EINVAL;
 	}
@@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file)
 		 * the Launcher's memory management structure. */
 		mmput(lg->cpus[i].mm);
 	}
+
+	/* Release any eventfds they registered. */
+	for (i = 0; i < lg->eventfds->num; i++)
+		fput(lg->eventfds->map[i].event);
+	kfree(lg->eventfds);
+
 	/* If lg->dead doesn't contain an error code it will be NULL or a
 	 * kmalloc()ed string, either of which is ok to hand to kfree(). */
 	if (!IS_ERR(lg->dead))
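send_notify_to_eventfd() lets a Launcher thread block on an eventfd and get kicked by eventfd_signal() when the Guest NOTIFYs a registered address, replacing the old LHREQ_BREAK dance. A userspace sketch of the same handshake with the eventfd(2) syscall (illustrative only; the in-kernel side uses eventfd_signal()/eventfd_fget() as shown in the diff above):

#include <sys/eventfd.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int efd;	/* the fd a Launcher would pass via LHREQ_EVENTFD */

/* Device-servicing thread: blocks until the counter is non-zero, the
 * userspace mirror of waiting for eventfd_signal() from the kernel. */
static void *device_thread(void *unused)
{
	uint64_t count;

	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("serviced %llu notification(s)\n",
		       (unsigned long long)count);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	uint64_t one = 1;

	efd = eventfd(0, 0);
	if (efd < 0)
		return 1;
	pthread_create(&tid, NULL, device_thread, NULL);
	/* This write is what eventfd_signal(map->map[i].event, 1) does on
	 * the kernel side when the Guest NOTIFYs a registered address. */
	write(efd, &one, sizeof(one));
	pthread_join(tid, NULL);
	close(efd);
	return 0;
}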
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a059cf9980f7..a6fe1abda240 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
| @@ -53,6 +53,17 @@ | |||
| 53 | * page. */ | 53 | * page. */ |
| 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
| 55 | 55 | ||
| 56 | /* For PAE we need the PMD index as well. We use the last 2MB, so we | ||
| 57 | * will need the last pmd entry of the last pmd page. */ | ||
| 58 | #ifdef CONFIG_X86_PAE | ||
| 59 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
| 60 | #define RESERVE_MEM 2U | ||
| 61 | #define CHECK_GPGD_MASK _PAGE_PRESENT | ||
| 62 | #else | ||
| 63 | #define RESERVE_MEM 4U | ||
| 64 | #define CHECK_GPGD_MASK _PAGE_TABLE | ||
| 65 | #endif | ||
| 66 | |||
| 56 | /* We actually need a separate PTE page for each CPU. Remember that after the | 67 | /* We actually need a separate PTE page for each CPU. Remember that after the |
| 57 | * Switcher code itself comes two pages for each CPU, and we don't want this | 68 | * Switcher code itself comes two pages for each CPU, and we don't want this |
| 58 | * CPU's guest to see the pages of any other CPU. */ | 69 | * CPU's guest to see the pages of any other CPU. */ |
| @@ -73,24 +84,59 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
| 73 | { | 84 | { |
| 74 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
| 75 | 86 | ||
| 87 | #ifndef CONFIG_X86_PAE | ||
| 76 | /* We kill any Guest trying to touch the Switcher addresses. */ | 88 | /* We kill any Guest trying to touch the Switcher addresses. */ |
| 77 | if (index >= SWITCHER_PGD_INDEX) { | 89 | if (index >= SWITCHER_PGD_INDEX) { |
| 78 | kill_guest(cpu, "attempt to access switcher pages"); | 90 | kill_guest(cpu, "attempt to access switcher pages"); |
| 79 | index = 0; | 91 | index = 0; |
| 80 | } | 92 | } |
| 93 | #endif | ||
| 81 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 94 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
| 82 | return &cpu->lg->pgdirs[i].pgdir[index]; | 95 | return &cpu->lg->pgdirs[i].pgdir[index]; |
| 83 | } | 96 | } |
| 84 | 97 | ||
| 98 | #ifdef CONFIG_X86_PAE | ||
| 99 | /* This routine then takes the PGD entry given above, which contains the | ||
| 100 | * address of the PMD page. It then returns a pointer to the PMD entry for the | ||
| 101 | * given address. */ | ||
| 102 | static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | ||
| 103 | { | ||
| 104 | unsigned int index = pmd_index(vaddr); | ||
| 105 | pmd_t *page; | ||
| 106 | |||
| 107 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
| 108 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
| 109 | index >= SWITCHER_PMD_INDEX) { | ||
| 110 | kill_guest(cpu, "attempt to access switcher pages"); | ||
| 111 | index = 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | /* You should never call this if the PGD entry wasn't valid */ | ||
| 115 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | ||
| 116 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | ||
| 117 | |||
| 118 | return &page[index]; | ||
| 119 | } | ||
| 120 | #endif | ||
| 121 | |||
| 85 | /* This routine then takes the page directory entry returned above, which | 122 | /* This routine then takes the page directory entry returned above, which |
| 86 | * contains the address of the page table entry (PTE) page. It then returns a | 123 | * contains the address of the page table entry (PTE) page. It then returns a |
| 87 | * pointer to the PTE entry for the given address. */ | 124 | * pointer to the PTE entry for the given address. */ |
| 88 | static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) | 125 | static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) |
| 89 | { | 126 | { |
| 127 | #ifdef CONFIG_X86_PAE | ||
| 128 | pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); | ||
| 129 | pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); | ||
| 130 | |||
| 131 | /* You should never call this if the PMD entry wasn't valid */ | ||
| 132 | BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); | ||
| 133 | #else | ||
| 90 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 134 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
| 91 | /* You should never call this if the PGD entry wasn't valid */ | 135 | /* You should never call this if the PGD entry wasn't valid */ |
| 92 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 136 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
| 93 | return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; | 137 | #endif |
| 138 | |||
| 139 | return &page[pte_index(vaddr)]; | ||
| 94 | } | 140 | } |
| 95 | 141 | ||
| 96 | /* These two functions just like the above two, except they access the Guest | 142 | /* These two functions just like the above two, except they access the Guest |
| @@ -101,12 +147,32 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 101 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); | 147 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
| 102 | } | 148 | } |
| 103 | 149 | ||
| 104 | static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | 150 | #ifdef CONFIG_X86_PAE |
| 151 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | ||
| 152 | { | ||
| 153 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | ||
| 154 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | ||
| 155 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | ||
| 156 | } | ||
| 157 | |||
| 158 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
| 159 | pmd_t gpmd, unsigned long vaddr) | ||
| 160 | { | ||
| 161 | unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; | ||
| 162 | |||
| 163 | BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); | ||
| 164 | return gpage + pte_index(vaddr) * sizeof(pte_t); | ||
| 165 | } | ||
| 166 | #else | ||
| 167 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
| 168 | pgd_t gpgd, unsigned long vaddr) | ||
| 105 | { | 169 | { |
| 106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
| 171 | |||
| 107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 172 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
| 108 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); | 173 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
| 109 | } | 174 | } |
| 175 | #endif | ||
| 110 | /*:*/ | 176 | /*:*/ |
| 111 | 177 | ||
| 112 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as | 178 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as |
| @@ -171,7 +237,7 @@ static void release_pte(pte_t pte) | |||
| 171 | /* Remember that get_user_pages_fast() took a reference to the page, in | 237 | /* Remember that get_user_pages_fast() took a reference to the page, in |
| 172 | * get_pfn()? We have to put it back now. */ | 238 | * get_pfn()? We have to put it back now. */ |
| 173 | if (pte_flags(pte) & _PAGE_PRESENT) | 239 | if (pte_flags(pte) & _PAGE_PRESENT) |
| 174 | put_page(pfn_to_page(pte_pfn(pte))); | 240 | put_page(pte_page(pte)); |
| 175 | } | 241 | } |
| 176 | /*:*/ | 242 | /*:*/ |
| 177 | 243 | ||
| @@ -184,11 +250,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | |||
| 184 | 250 | ||
| 185 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 251 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
| 186 | { | 252 | { |
| 187 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || | 253 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
| 188 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 254 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) |
| 189 | kill_guest(cpu, "bad page directory entry"); | 255 | kill_guest(cpu, "bad page directory entry"); |
| 190 | } | 256 | } |
| 191 | 257 | ||
| 258 | #ifdef CONFIG_X86_PAE | ||
| 259 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | ||
| 260 | { | ||
| 261 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | ||
| 262 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | ||
| 263 | kill_guest(cpu, "bad page middle directory entry"); | ||
| 264 | } | ||
| 265 | #endif | ||
| 266 | |||
| 192 | /*H:330 | 267 | /*H:330 |
| 193 | * (i) Looking up a page table entry when the Guest faults. | 268 | * (i) Looking up a page table entry when the Guest faults. |
| 194 | * | 269 | * |
| @@ -207,6 +282,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 207 | pte_t gpte; | 282 | pte_t gpte; |
| 208 | pte_t *spte; | 283 | pte_t *spte; |
| 209 | 284 | ||
| 285 | #ifdef CONFIG_X86_PAE | ||
| 286 | pmd_t *spmd; | ||
| 287 | pmd_t gpmd; | ||
| 288 | #endif | ||
| 289 | |||
| 210 | /* First step: get the top-level Guest page table entry. */ | 290 | /* First step: get the top-level Guest page table entry. */ |
| 211 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 291 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 212 | /* Toplevel not present? We can't map it in. */ | 292 | /* Toplevel not present? We can't map it in. */ |
| @@ -228,12 +308,45 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 228 | check_gpgd(cpu, gpgd); | 308 | check_gpgd(cpu, gpgd); |
| 229 | /* And we copy the flags to the shadow PGD entry. The page | 309 | /* And we copy the flags to the shadow PGD entry. The page |
| 230 | * number in the shadow PGD is the page we just allocated. */ | 310 | * number in the shadow PGD is the page we just allocated. */ |
| 231 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); | 311 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); |
| 232 | } | 312 | } |
| 233 | 313 | ||
| 314 | #ifdef CONFIG_X86_PAE | ||
| 315 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
| 316 | /* middle level not present? We can't map it in. */ | ||
| 317 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
| 318 | return false; | ||
| 319 | |||
| 320 | /* Now look at the matching shadow entry. */ | ||
| 321 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
| 322 | |||
| 323 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | ||
| 324 | /* No shadow entry: allocate a new shadow PTE page. */ | ||
| 325 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | ||
| 326 | |||
| 327 | /* This is not really the Guest's fault, but killing it is | ||
| 328 | * simple for this corner case. */ | ||
| 329 | if (!ptepage) { | ||
| 330 | kill_guest(cpu, "out of memory allocating pte page"); | ||
| 331 | return false; | ||
| 332 | } | ||
| 333 | |||
| 334 | /* We check that the Guest pmd is OK. */ | ||
| 335 | check_gpmd(cpu, gpmd); | ||
| 336 | |||
| 337 | /* And we copy the flags to the shadow PMD entry. The page | ||
| 338 | * number in the shadow PMD is the page we just allocated. */ | ||
| 339 | native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | ||
| 340 | } | ||
| 341 | |||
| 342 | /* OK, now we look at the lower level in the Guest page table: keep its | ||
| 343 | * address, because we might update it later. */ | ||
| 344 | gpte_ptr = gpte_addr(cpu, gpmd, vaddr); | ||
| 345 | #else | ||
| 234 | /* OK, now we look at the lower level in the Guest page table: keep its | 346 | /* OK, now we look at the lower level in the Guest page table: keep its |
| 235 | * address, because we might update it later. */ | 347 | * address, because we might update it later. */ |
| 236 | gpte_ptr = gpte_addr(gpgd, vaddr); | 348 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
| 349 | #endif | ||
| 237 | gpte = lgread(cpu, gpte_ptr, pte_t); | 350 | gpte = lgread(cpu, gpte_ptr, pte_t); |
| 238 | 351 | ||
| 239 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 352 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
| @@ -259,7 +372,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 259 | gpte = pte_mkdirty(gpte); | 372 | gpte = pte_mkdirty(gpte); |
| 260 | 373 | ||
| 261 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 374 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
| 262 | spte = spte_addr(*spgd, vaddr); | 375 | spte = spte_addr(cpu, *spgd, vaddr); |
| 263 | /* If there was a valid shadow PTE entry here before, we release it. | 376 | /* If there was a valid shadow PTE entry here before, we release it. |
| 264 | * This can happen with a write to a previously read-only entry. */ | 377 | * This can happen with a write to a previously read-only entry. */ |
| 265 | release_pte(*spte); | 378 | release_pte(*spte); |
| @@ -273,7 +386,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 273 | * table entry, even if the Guest says it's writable. That way | 386 | * table entry, even if the Guest says it's writable. That way |
| 274 | * we will come back here when a write does actually occur, so | 387 | * we will come back here when a write does actually occur, so |
| 275 | * we can update the Guest's _PAGE_DIRTY flag. */ | 388 | * we can update the Guest's _PAGE_DIRTY flag. */ |
| 276 | *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); | 389 | native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); |
| 277 | 390 | ||
| 278 | /* Finally, we write the Guest PTE entry back: we've set the | 391 | /* Finally, we write the Guest PTE entry back: we've set the |
| 279 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 392 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
| @@ -301,14 +414,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 301 | pgd_t *spgd; | 414 | pgd_t *spgd; |
| 302 | unsigned long flags; | 415 | unsigned long flags; |
| 303 | 416 | ||
| 417 | #ifdef CONFIG_X86_PAE | ||
| 418 | pmd_t *spmd; | ||
| 419 | #endif | ||
| 304 | /* Look at the current top level entry: is it present? */ | 420 | /* Look at the current top level entry: is it present? */ |
| 305 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 421 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
| 306 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 422 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
| 307 | return false; | 423 | return false; |
| 308 | 424 | ||
| 425 | #ifdef CONFIG_X86_PAE | ||
| 426 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
| 427 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | ||
| 428 | return false; | ||
| 429 | #endif | ||
| 430 | |||
| 309 | /* Check the flags on the pte entry itself: it must be present and | 431 | /* Check the flags on the pte entry itself: it must be present and |
| 310 | * writable. */ | 432 | * writable. */ |
| 311 | flags = pte_flags(*(spte_addr(*spgd, vaddr))); | 433 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); |
| 312 | 434 | ||
| 313 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 435 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
| 314 | } | 436 | } |
| @@ -322,8 +444,43 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 322 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 444 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
| 323 | } | 445 | } |
| 324 | 446 | ||
| 447 | #ifdef CONFIG_X86_PAE | ||
| 448 | static void release_pmd(pmd_t *spmd) | ||
| 449 | { | ||
| 450 | /* If the entry's not present, there's nothing to release. */ | ||
| 451 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { | ||
| 452 | unsigned int i; | ||
| 453 | pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); | ||
| 454 | /* For each entry in the page, we might need to release it. */ | ||
| 455 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
| 456 | release_pte(ptepage[i]); | ||
| 457 | /* Now we can free the page of PTEs */ | ||
| 458 | free_page((long)ptepage); | ||
| 459 | /* And zero out the PMD entry so we never release it twice. */ | ||
| 460 | native_set_pmd(spmd, __pmd(0)); | ||
| 461 | } | ||
| 462 | } | ||
| 463 | |||
| 464 | static void release_pgd(pgd_t *spgd) | ||
| 465 | { | ||
| 466 | /* If the entry's not present, there's nothing to release. */ | ||
| 467 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | ||
| 468 | unsigned int i; | ||
| 469 | pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
| 470 | |||
| 471 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
| 472 | release_pmd(&pmdpage[i]); | ||
| 473 | |||
| 474 | /* Now we can free the page of PMDs */ | ||
| 475 | free_page((long)pmdpage); | ||
| 476 | /* And zero out the PGD entry so we never release it twice. */ | ||
| 477 | set_pgd(spgd, __pgd(0)); | ||
| 478 | } | ||
| 479 | } | ||
| 480 | |||
| 481 | #else /* !CONFIG_X86_PAE */ | ||
| 325 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 482 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
| 326 | static void release_pgd(struct lguest *lg, pgd_t *spgd) | 483 | static void release_pgd(pgd_t *spgd) |
| 327 | { | 484 | { |
| 328 | /* If the entry's not present, there's nothing to release. */ | 485 | /* If the entry's not present, there's nothing to release. */ |
| 329 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 486 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
| @@ -341,7 +498,7 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) | |||
| 341 | *spgd = __pgd(0); | 498 | *spgd = __pgd(0); |
| 342 | } | 499 | } |
| 343 | } | 500 | } |
| 344 | 501 | #endif | |
| 345 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() | 502 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() |
| 346 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. | 503 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
| 347 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ | 504 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
| @@ -350,7 +507,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
| 350 | unsigned int i; | 507 | unsigned int i; |
| 351 | /* Release every pgd entry up to the kernel's address. */ | 508 | /* Release every pgd entry up to the kernel's address. */ |
| 352 | for (i = 0; i < pgd_index(lg->kernel_address); i++) | 509 | for (i = 0; i < pgd_index(lg->kernel_address); i++) |
| 353 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 510 | release_pgd(lg->pgdirs[idx].pgdir + i); |
| 354 | } | 511 | } |
| 355 | 512 | ||
| 356 | /*H:440 (v) Flushing (throwing away) page tables, | 513 | /*H:440 (v) Flushing (throwing away) page tables, |
| @@ -369,7 +526,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 369 | { | 526 | { |
| 370 | pgd_t gpgd; | 527 | pgd_t gpgd; |
| 371 | pte_t gpte; | 528 | pte_t gpte; |
| 372 | 529 | #ifdef CONFIG_X86_PAE | |
| 530 | pmd_t gpmd; | ||
| 531 | #endif | ||
| 373 | /* First step: get the top-level Guest page table entry. */ | 532 | /* First step: get the top-level Guest page table entry. */ |
| 374 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 533 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 375 | /* Toplevel not present? We can't map it in. */ | 534 | /* Toplevel not present? We can't map it in. */ |
| @@ -378,7 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 378 | return -1UL; | 537 | return -1UL; |
| 379 | } | 538 | } |
| 380 | 539 | ||
| 381 | gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); | 540 | #ifdef CONFIG_X86_PAE |
| 541 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
| 542 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
| 543 | kill_guest(cpu, "Bad address %#lx", vaddr); | ||
| 544 | gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); | ||
| 545 | #else | ||
| 546 | gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); | ||
| 547 | #endif | ||
| 382 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 548 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
| 383 | kill_guest(cpu, "Bad address %#lx", vaddr); | 549 | kill_guest(cpu, "Bad address %#lx", vaddr); |
| 384 | 550 | ||
| @@ -405,6 +571,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 405 | int *blank_pgdir) | 571 | int *blank_pgdir) |
| 406 | { | 572 | { |
| 407 | unsigned int next; | 573 | unsigned int next; |
| 574 | #ifdef CONFIG_X86_PAE | ||
| 575 | pmd_t *pmd_table; | ||
| 576 | #endif | ||
| 408 | 577 | ||
| 409 | /* We pick one entry at random to throw out. Choosing the Least | 578 | /* We pick one entry at random to throw out. Choosing the Least |
| 410 | * Recently Used might be better, but this is easy. */ | 579 | * Recently Used might be better, but this is easy. */ |
| @@ -416,10 +585,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 416 | /* If the allocation fails, just keep using the one we have */ | 585 | /* If the allocation fails, just keep using the one we have */ |
| 417 | if (!cpu->lg->pgdirs[next].pgdir) | 586 | if (!cpu->lg->pgdirs[next].pgdir) |
| 418 | next = cpu->cpu_pgd; | 587 | next = cpu->cpu_pgd; |
| 419 | else | 588 | else { |
| 420 | /* This is a blank page, so there are no kernel | 589 | #ifdef CONFIG_X86_PAE |
| 421 | * mappings: caller must map the stack! */ | 590 | /* In PAE mode, allocate a pmd page and populate the |
| 591 | * last pgd entry. */ | ||
| 592 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
| 593 | if (!pmd_table) { | ||
| 594 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
| 595 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
| 596 | next = cpu->cpu_pgd; | ||
| 597 | } else { | ||
| 598 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
| 599 | SWITCHER_PGD_INDEX, | ||
| 600 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
| 601 | /* This is a blank page, so there are no kernel | ||
| 602 | * mappings: caller must map the stack! */ | ||
| 603 | *blank_pgdir = 1; | ||
| 604 | } | ||
| 605 | #else | ||
| 422 | *blank_pgdir = 1; | 606 | *blank_pgdir = 1; |
| 607 | #endif | ||
| 608 | } | ||
| 423 | } | 609 | } |
| 424 | /* Record which Guest toplevel this shadows. */ | 610 | /* Record which Guest toplevel this shadows. */ |
| 425 | cpu->lg->pgdirs[next].gpgdir = gpgdir; | 611 | cpu->lg->pgdirs[next].gpgdir = gpgdir; |
| @@ -431,7 +617,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 431 | 617 | ||
| 432 | /*H:430 (iv) Switching page tables | 618 | /*H:430 (iv) Switching page tables |
| 433 | * | 619 | * |
| 434 | * Now we've seen all the page table setting and manipulation, let's see what | 620 | * Now we've seen all the page table setting and manipulation, let's see |
| 435 | * what happens when the Guest changes page tables (ie. changes the top-level | 621 | * what happens when the Guest changes page tables (ie. changes the top-level |
| 436 | * pgdir). This occurs on almost every context switch. */ | 622 | * pgdir). This occurs on almost every context switch. */ |
| 437 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | 623 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) |
| @@ -460,10 +646,25 @@ static void release_all_pagetables(struct lguest *lg) | |||
| 460 | 646 | ||
| 461 | /* Every shadow pagetable this Guest has */ | 647 | /* Every shadow pagetable this Guest has */ |
| 462 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 648 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
| 463 | if (lg->pgdirs[i].pgdir) | 649 | if (lg->pgdirs[i].pgdir) { |
| 650 | #ifdef CONFIG_X86_PAE | ||
| 651 | pgd_t *spgd; | ||
| 652 | pmd_t *pmdpage; | ||
| 653 | unsigned int k; | ||
| 654 | |||
| 655 | /* Get the last pmd page. */ | ||
| 656 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | ||
| 657 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
| 658 | |||
| 659 | /* And release the pmd entries of that pmd page, | ||
| 660 | * except for the switcher pmd. */ | ||
| 661 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
| 662 | release_pmd(&pmdpage[k]); | ||
| 663 | #endif | ||
| 464 | /* Every PGD entry except the Switcher at the top */ | 664 | /* Every PGD entry except the Switcher at the top */ |
| 465 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 665 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
| 466 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | 666 | release_pgd(lg->pgdirs[i].pgdir + j); |
| 667 | } | ||
| 467 | } | 668 | } |
| 468 | 669 | ||
| 469 | /* We also throw away everything when a Guest tells us it's changed a kernel | 670 | /* We also throw away everything when a Guest tells us it's changed a kernel |
| @@ -504,24 +705,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
| 504 | { | 705 | { |
| 505 | /* Look up the matching shadow page directory entry. */ | 706 | /* Look up the matching shadow page directory entry. */ |
| 506 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); | 707 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); |
| 708 | #ifdef CONFIG_X86_PAE | ||
| 709 | pmd_t *spmd; | ||
| 710 | #endif | ||
| 507 | 711 | ||
| 508 | /* If the top level isn't present, there's no entry to update. */ | 712 | /* If the top level isn't present, there's no entry to update. */ |
| 509 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 713 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
| 510 | /* Otherwise, we start by releasing the existing entry. */ | 714 | #ifdef CONFIG_X86_PAE |
| 511 | pte_t *spte = spte_addr(*spgd, vaddr); | 715 | spmd = spmd_addr(cpu, *spgd, vaddr); |
| 512 | release_pte(*spte); | 716 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { |
| 513 | 717 | #endif | |
| 514 | /* If they're setting this entry as dirty or accessed, we might | 718 | /* Otherwise, we start by releasing |
| 515 | * as well put that entry they've given us in now. This shaves | 719 | * the existing entry. */ |
| 516 | * 10% off a copy-on-write micro-benchmark. */ | 720 | pte_t *spte = spte_addr(cpu, *spgd, vaddr); |
| 517 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 721 | release_pte(*spte); |
| 518 | check_gpte(cpu, gpte); | 722 | |
| 519 | *spte = gpte_to_spte(cpu, gpte, | 723 | /* If they're setting this entry as dirty or accessed, |
| 520 | pte_flags(gpte) & _PAGE_DIRTY); | 724 | * we might as well put that entry they've given us |
| 521 | } else | 725 | * in now. This shaves 10% off a |
| 522 | /* Otherwise kill it and we can demand_page() it in | 726 | * copy-on-write micro-benchmark. */ |
| 523 | * later. */ | 727 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
| 524 | *spte = __pte(0); | 728 | check_gpte(cpu, gpte); |
| 729 | native_set_pte(spte, | ||
| 730 | gpte_to_spte(cpu, gpte, | ||
| 731 | pte_flags(gpte) & _PAGE_DIRTY)); | ||
| 732 | } else | ||
| 733 | /* Otherwise kill it and we can demand_page() | ||
| 734 | * it in later. */ | ||
| 735 | native_set_pte(spte, __pte(0)); | ||
| 736 | #ifdef CONFIG_X86_PAE | ||
| 737 | } | ||
| 738 | #endif | ||
| 525 | } | 739 | } |
| 526 | } | 740 | } |
| 527 | 741 | ||
| @@ -568,12 +782,10 @@ void guest_set_pte(struct lg_cpu *cpu, | |||
| 568 | * | 782 | * |
| 569 | * So with that in mind here's our code to to update a (top-level) PGD entry: | 783 | * So with that in mind here's our code to to update a (top-level) PGD entry: |
| 570 | */ | 784 | */ |
| 571 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | 785 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) |
| 572 | { | 786 | { |
| 573 | int pgdir; | 787 | int pgdir; |
| 574 | 788 | ||
| 575 | /* The kernel seems to try to initialize this early on: we ignore its | ||
| 576 | * attempts to map over the Switcher. */ | ||
| 577 | if (idx >= SWITCHER_PGD_INDEX) | 789 | if (idx >= SWITCHER_PGD_INDEX) |
| 578 | return; | 790 | return; |
| 579 | 791 | ||
| @@ -581,8 +793,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
| 581 | pgdir = find_pgdir(lg, gpgdir); | 793 | pgdir = find_pgdir(lg, gpgdir); |
| 582 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 794 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
| 583 | /* ... throw it away. */ | 795 | /* ... throw it away. */ |
| 584 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 796 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
| 585 | } | 797 | } |
| 798 | #ifdef CONFIG_X86_PAE | ||
| 799 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | ||
| 800 | { | ||
| 801 | guest_pagetable_clear_all(&lg->cpus[0]); | ||
| 802 | } | ||
| 803 | #endif | ||
| 586 | 804 | ||
| 587 | /* Once we know how much memory we have we can construct simple identity | 805 | /* Once we know how much memory we have we can construct simple identity |
| 588 | * (which set virtual == physical) and linear mappings | 806 | * (which set virtual == physical) and linear mappings |
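[Editor's note] Note the asymmetry this hunk introduces: a guest top-level (PGD) write invalidates exactly one shadow slot, while a PAE guest PMD write flushes every shadow page table; PMD writes are rare after boot, so lguest trades precision for simplicity. A toy sketch of the two policies, with printouts standing in for the real release/flush primitives:

#include <stdio.h>

/* Stand-ins for the real shadow-invalidation primitives. */
static void release_one_shadow_pgd(int idx)
{
	printf("dropped shadow pgd slot %d only\n", idx);
}
static void clear_all_shadow_pagetables(void)
{
	printf("dropped every shadow page table\n");
}

/* Non-PAE top-level write: surgical. */
static void guest_set_pgd(int idx)
{
	release_one_shadow_pgd(idx);
}

/* PAE pmd write: rare after boot, so just start over. */
static void guest_set_pmd(void)
{
	clear_all_shadow_pagetables();
}

int main(void)
{
	guest_set_pgd(7);
	guest_set_pmd();
	return 0;
}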
| @@ -596,8 +814,16 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 596 | { | 814 | { |
| 597 | pgd_t __user *pgdir; | 815 | pgd_t __user *pgdir; |
| 598 | pte_t __user *linear; | 816 | pte_t __user *linear; |
| 599 | unsigned int mapped_pages, i, linear_pages, phys_linear; | ||
| 600 | unsigned long mem_base = (unsigned long)lg->mem_base; | 817 | unsigned long mem_base = (unsigned long)lg->mem_base; |
| 818 | unsigned int mapped_pages, i, linear_pages; | ||
| 819 | #ifdef CONFIG_X86_PAE | ||
| 820 | pmd_t __user *pmds; | ||
| 821 | unsigned int j; | ||
| 822 | pgd_t pgd; | ||
| 823 | pmd_t pmd; | ||
| 824 | #else | ||
| 825 | unsigned int phys_linear; | ||
| 826 | #endif | ||
| 601 | 827 | ||
| 602 | /* We have mapped_pages frames to map, so we need | 828 | /* We have mapped_pages frames to map, so we need |
| 603 | * linear_pages page tables to map them. */ | 829 | * linear_pages page tables to map them. */ |
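[Editor's note] The mapped_pages/linear_pages arithmetic is easy to check by hand. A standalone sketch, assuming a 64 MB guest and the usual PTRS_PER_PTE values (1024 without PAE, 512 with it):

#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
	unsigned int mem = 64u * 1024 * 1024;		/* 64 MB guest  */
	unsigned int mapped_pages = mem / PAGE_SIZE;	/* 16384 frames */
	unsigned int ptrs_nonpae = 1024, ptrs_pae = 512;

	/* One pte page maps PTRS_PER_PTE frames, so round up. */
	printf("non-PAE: %u pte pages\n",
	       (mapped_pages + ptrs_nonpae - 1) / ptrs_nonpae);
	printf("PAE:     %u pte pages, plus one pmd page\n",
	       (mapped_pages + ptrs_pae - 1) / ptrs_pae);
	return 0;
}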
| @@ -610,6 +836,9 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 610 | /* Now we use the next linear_pages pages as pte pages */ | 836 | /* Now we use the next linear_pages pages as pte pages */ |
| 611 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 837 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
| 612 | 838 | ||
| 839 | #ifdef CONFIG_X86_PAE | ||
| 840 | pmds = (void *)linear - PAGE_SIZE; | ||
| 841 | #endif | ||
| 613 | /* Linear mapping is easy: put every page's address into the | 842 | /* Linear mapping is easy: put every page's address into the |
| 614 | * mapping in order. */ | 843 | * mapping in order. */ |
| 615 | for (i = 0; i < mapped_pages; i++) { | 844 | for (i = 0; i < mapped_pages; i++) { |
| @@ -621,6 +850,22 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 621 | 850 | ||
| 622 | /* The top level points to the linear page table pages above. | 851 | /* The top level points to the linear page table pages above. |
| 623 | * We setup the identity and linear mappings here. */ | 852 | * We setup the identity and linear mappings here. */ |
| 853 | #ifdef CONFIG_X86_PAE | ||
| 854 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | ||
| 855 | i += PTRS_PER_PTE, j++) { | ||
| 856 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | ||
| 857 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
| 858 | |||
| 859 | if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) | ||
| 860 | return -EFAULT; | ||
| 861 | } | ||
| 862 | |||
| 863 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | ||
| 864 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | ||
| 865 | return -EFAULT; | ||
| 866 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | ||
| 867 | return -EFAULT; | ||
| 868 | #else | ||
| 624 | phys_linear = (unsigned long)linear - mem_base; | 869 | phys_linear = (unsigned long)linear - mem_base; |
| 625 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 870 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
| 626 | pgd_t pgd; | 871 | pgd_t pgd; |
| @@ -633,6 +878,7 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 633 | &pgd, sizeof(pgd))) | 878 | &pgd, sizeof(pgd))) |
| 634 | return -EFAULT; | 879 | return -EFAULT; |
| 635 | } | 880 | } |
| 881 | #endif | ||
| 636 | 882 | ||
| 637 | /* We return the top level (guest-physical) address: remember where | 883 | /* We return the top level (guest-physical) address: remember where |
| 638 | * this is. */ | 884 | * this is. */ |
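[Editor's note] Why pgdir[0] and pgdir[3]? Each PAE top-level entry covers 1 GB, so slot 0 holds the identity view of low memory while slot 3 covers 0xc0000000 and up, where a default 32-bit kernel's linear mapping lives; pointing both at the same pmd page makes the two views coincide during early boot. A two-line check of the index arithmetic (the 0xc0000000 PAGE_OFFSET is the common default, not something this diff states):

#include <stdio.h>

/* PAE top level: 4 entries, each mapping 1 GB (address bits 31:30). */
static unsigned int pae_pgd_index(unsigned long va)
{
	return va >> 30;
}

int main(void)
{
	printf("va 0x00000000 -> pgd slot %u\n", pae_pgd_index(0x00000000ul));
	printf("va 0xc0000000 -> pgd slot %u\n", pae_pgd_index(0xc0000000ul));
	return 0;
}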
| @@ -648,7 +894,10 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 648 | u64 mem; | 894 | u64 mem; |
| 649 | u32 initrd_size; | 895 | u32 initrd_size; |
| 650 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; | 896 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; |
| 651 | 897 | #ifdef CONFIG_X86_PAE | |
| 898 | pgd_t *pgd; | ||
| 899 | pmd_t *pmd_table; | ||
| 900 | #endif | ||
| 652 | /* Get the Guest memory size and the ramdisk size from the boot header | 901 | /* Get the Guest memory size and the ramdisk size from the boot header |
| 653 | * located at lg->mem_base (Guest address 0). */ | 902 | * located at lg->mem_base (Guest address 0). */ |
| 654 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) | 903 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) |
| @@ -663,6 +912,15 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 663 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 912 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 664 | if (!lg->pgdirs[0].pgdir) | 913 | if (!lg->pgdirs[0].pgdir) |
| 665 | return -ENOMEM; | 914 | return -ENOMEM; |
| 915 | #ifdef CONFIG_X86_PAE | ||
| 916 | pgd = lg->pgdirs[0].pgdir; | ||
| 917 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | ||
| 918 | if (!pmd_table) | ||
| 919 | return -ENOMEM; | ||
| 920 | |||
| 921 | set_pgd(pgd + SWITCHER_PGD_INDEX, | ||
| 922 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
| 923 | #endif | ||
| 666 | lg->cpus[0].cpu_pgd = 0; | 924 | lg->cpus[0].cpu_pgd = 0; |
| 667 | return 0; | 925 | return 0; |
| 668 | } | 926 | } |
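[Editor's note] Under PAE the Switcher's top-level slot must point at a real pmd page before map_switcher_in_guest() ever runs, which is why this hunk allocates one up front. A userspace sketch of the shape of that setup; calloc() stands in for get_zeroed_page(), and the stored pointer stands in for the physical address __pa() would produce:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define SWITCHER_PGD_INDEX	3
#define _PAGE_PRESENT		0x001ull

int main(void)
{
	uint64_t pgd[4] = { 0 };
	/* calloc() stands in for get_zeroed_page(): one zeroed pmd page. */
	void *pmd_table = calloc(1, 4096);

	if (!pmd_table)
		return 1;	/* the real code returns -ENOMEM */

	/* Hang the pmd page off the Switcher's top-level slot; the real
	 * code stores __pa(pmd_table), a physical address, not a pointer. */
	pgd[SWITCHER_PGD_INDEX] = (uintptr_t)pmd_table | _PAGE_PRESENT;
	printf("pgd[%d] = %#llx\n", SWITCHER_PGD_INDEX,
	       (unsigned long long)pgd[SWITCHER_PGD_INDEX]);
	free(pmd_table);
	return 0;
}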
| @@ -672,17 +930,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
| 672 | { | 930 | { |
| 673 | /* We get the kernel address: above this is all kernel memory. */ | 931 | /* We get the kernel address: above this is all kernel memory. */ |
| 674 | if (get_user(cpu->lg->kernel_address, | 932 | if (get_user(cpu->lg->kernel_address, |
| 675 | &cpu->lg->lguest_data->kernel_address) | 933 | &cpu->lg->lguest_data->kernel_address) |
| 676 | /* We tell the Guest that it can't use the top 4MB of virtual | 934 | /* We tell the Guest that it can't use the top 2 or 4 MB |
| 677 | * addresses used by the Switcher. */ | 935 | * of virtual addresses used by the Switcher. */ |
| 678 | || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) | 936 | || put_user(RESERVE_MEM * 1024 * 1024, |
| 679 | || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) | 937 | &cpu->lg->lguest_data->reserve_mem) |
| 938 | || put_user(cpu->lg->pgdirs[0].gpgdir, | ||
| 939 | &cpu->lg->lguest_data->pgdir)) | ||
| 680 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 940 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
| 681 | 941 | ||
| 682 | /* In flush_user_mappings() we loop from 0 to | 942 | /* In flush_user_mappings() we loop from 0 to |
| 683 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 943 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
| 684 | * Switcher mappings, so check that now. */ | 944 | * Switcher mappings, so check that now. */ |
| 945 | #ifdef CONFIG_X86_PAE | ||
| 946 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
| 947 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
| 948 | #else | ||
| 685 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | 949 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) |
| 950 | #endif | ||
| 686 | kill_guest(cpu, "bad kernel address %#lx", | 951 | kill_guest(cpu, "bad kernel address %#lx", |
| 687 | cpu->lg->kernel_address); | 952 | cpu->lg->kernel_address); |
| 688 | } | 953 | } |
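[Editor's note] The reserve_mem change and the split sanity check both follow from the index arithmetic: without PAE the Switcher owns the top 4 MB (one of 1024 PGD slots), with PAE the top 2 MB (one of 512 PMD slots inside the last of 4 PGD slots). A standalone check of those numbers:

#include <stdio.h>

int main(void)
{
	unsigned long nonpae_base = 0xffc00000ul;	/* 4 GB - 4 MB */
	unsigned long pae_base    = 0xffe00000ul;	/* 4 GB - 2 MB */

	/* Non-PAE: pgd_index(va) = va >> 22, 1024 slots of 4 MB. */
	printf("non-PAE Switcher pgd slot: %lu\n", nonpae_base >> 22);
	/* PAE: pgd_index(va) = va >> 30 (4 slots of 1 GB), and
	 * pmd_index(va) = (va >> 21) & 511 (512 slots of 2 MB). */
	printf("PAE Switcher pgd slot %lu, pmd slot %lu\n",
	       pae_base >> 30, (pae_base >> 21) & 511);
	return 0;
}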
| @@ -708,16 +973,30 @@ void free_guest_pagetable(struct lguest *lg) | |||
| 708 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 973 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
| 709 | { | 974 | { |
| 710 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 975 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
| 711 | pgd_t switcher_pgd; | ||
| 712 | pte_t regs_pte; | 976 | pte_t regs_pte; |
| 713 | unsigned long pfn; | 977 | unsigned long pfn; |
| 714 | 978 | ||
| 979 | #ifdef CONFIG_X86_PAE | ||
| 980 | pmd_t switcher_pmd; | ||
| 981 | pmd_t *pmd_table; | ||
| 982 | |||
| 983 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | ||
| 984 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | ||
| 985 | |||
| 986 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
| 987 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
| 988 | << PAGE_SHIFT); | ||
| 989 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
| 990 | #else | ||
| 991 | pgd_t switcher_pgd; | ||
| 992 | |||
| 715 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 993 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
| 716 | * page for this CPU (with appropriate flags). */ | 994 | * page for this CPU (with appropriate flags). */ |
| 717 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); | 995 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); |
| 718 | 996 | ||
| 719 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 997 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
| 720 | 998 | ||
| 999 | #endif | ||
| 721 | /* We also change the Switcher PTE page. When we're running the Guest, | 1000 | /* We also change the Switcher PTE page. When we're running the Guest, |
| 722 | * we want the Guest's "regs" page to appear where the first Switcher | 1001 | * we want the Guest's "regs" page to appear where the first Switcher |
| 723 | * page for this CPU is. This is an optimization: when the Switcher | 1002 | * page for this CPU is. This is an optimization: when the Switcher |
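[Editor's note] The switch from __PAGE_KERNEL to __PAGE_KERNEL_EXEC in this hunk matters because PAE entries are 64-bit and, with EFER.NXE enabled, bit 63 is a no-execute flag; the Switcher's pages must stay executable from the Guest. A sketch of just the NX-bit test, where the 0x1063 frame/flags value is illustrative, not from the diff:

#include <stdio.h>
#include <stdint.h>

#define NX_BIT (1ull << 63)	/* PAE no-execute bit */

int main(void)
{
	uint64_t data_pte = 0x1063ull | NX_BIT;	/* PAGE_KERNEL-like      */
	uint64_t code_pte = 0x1063ull;		/* PAGE_KERNEL_EXEC-like */

	printf("data page executable?     %s\n",
	       (data_pte & NX_BIT) ? "no" : "yes");
	printf("Switcher page executable? %s\n",
	       (code_pte & NX_BIT) ? "no" : "yes");
	return 0;
}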
| @@ -726,8 +1005,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
| 726 | * page is already mapped there, we don't have to copy them out | 1005 | * page is already mapped there, we don't have to copy them out |
| 727 | * again. */ | 1006 | * again. */ |
| 728 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; | 1007 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; |
| 729 | regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); | 1008 | native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); |
| 730 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; | 1009 | native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], |
| 1010 | regs_pte); | ||
| 731 | } | 1011 | } |
| 732 | /*:*/ | 1012 | /*:*/ |
| 733 | 1013 | ||
| @@ -752,21 +1032,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
| 752 | 1032 | ||
| 753 | /* The first entries are easy: they map the Switcher code. */ | 1033 | /* The first entries are easy: they map the Switcher code. */ |
| 754 | for (i = 0; i < pages; i++) { | 1034 | for (i = 0; i < pages; i++) { |
| 755 | pte[i] = mk_pte(switcher_page[i], | 1035 | native_set_pte(&pte[i], mk_pte(switcher_page[i], |
| 756 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1036 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
| 757 | } | 1037 | } |
| 758 | 1038 | ||
| 759 | /* The only other thing we map is this CPU's pair of pages. */ | 1039 | /* The only other thing we map is this CPU's pair of pages. */ |
| 760 | i = pages + cpu*2; | 1040 | i = pages + cpu*2; |
| 761 | 1041 | ||
| 762 | /* First page (Guest registers) is writable from the Guest */ | 1042 | /* First page (Guest registers) is writable from the Guest */ |
| 763 | pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), | 1043 | native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), |
| 764 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); | 1044 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); |
| 765 | 1045 | ||
| 766 | /* The second page contains the "struct lguest_ro_state", and is | 1046 | /* The second page contains the "struct lguest_ro_state", and is |
| 767 | * read-only. */ | 1047 | * read-only. */ |
| 768 | pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), | 1048 | native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), |
| 769 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1049 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
| 770 | } | 1050 | } |
| 771 | 1051 | ||
| 772 | /* We've made it through the page table code. Perhaps our tired brains are | 1052 | /* We've made it through the page table code. Perhaps our tired brains are |
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 7ede64ffeef9..482ed5a18750 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
| @@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) | |||
| 150 | { | 150 | { |
| 151 | /* We assume the Guest has the same number of GDT entries as the | 151 | /* We assume the Guest has the same number of GDT entries as the |
| 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ |
| 153 | if (num > ARRAY_SIZE(cpu->arch.gdt)) | 153 | if (num >= ARRAY_SIZE(cpu->arch.gdt)) |
| 154 | kill_guest(cpu, "too many gdt entries %i", num); | 154 | kill_guest(cpu, "too many gdt entries %i", num); |
| 155 | 155 | ||
| 156 | /* Set it up, then fix it. */ | 156 | /* Set it up, then fix it. */ |
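[Editor's note] The segments.c change fixes a classic off-by-one: valid GDT slots run 0..ARRAY_SIZE-1, so "num > ARRAY_SIZE" still accepted num == ARRAY_SIZE, one entry past the end of the array. A minimal demonstration with an illustrative table size:

#include <stdio.h>

#define GDT_ENTRIES 32	/* illustrative; stands in for ARRAY_SIZE(gdt) */

static int accepted_old(unsigned int num) { return !(num >  GDT_ENTRIES); }
static int accepted_new(unsigned int num) { return !(num >= GDT_ENTRIES); }

int main(void)
{
	/* Valid slots are 0..GDT_ENTRIES-1; num == GDT_ENTRIES is one
	 * past the end, which the old "num > size" check let through. */
	unsigned int num = GDT_ENTRIES;

	printf("num=%u old check accepts: %d, new check accepts: %d\n",
	       num, accepted_old(num), accepted_new(num));
	return 0;
}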
