diff options
author | David S. Miller <davem@davemloft.net> | 2009-06-15 06:02:23 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-06-15 06:02:23 -0400 |
commit | 9cbc1cb8cd46ce1f7645b9de249b2ce8460129bb (patch) | |
tree | 8d104ec2a459346b99413b0b77421ca7b9936c1a /drivers/lguest | |
parent | ca44d6e60f9de26281fda203f58b570e1748c015 (diff) | |
parent | 45e3e1935e2857c54783291107d33323b3ef33c8 (diff) |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
Documentation/feature-removal-schedule.txt
drivers/scsi/fcoe/fcoe.c
net/core/drop_monitor.c
net/core/net-traces.c
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- | drivers/lguest/Kconfig | 2 | ||||
-rw-r--r-- | drivers/lguest/core.c | 30 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 14 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 57 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 28 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 41 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 127 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 396 | ||||
-rw-r--r-- | drivers/lguest/segments.c | 2 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 19 |
10 files changed, 578 insertions, 138 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index a3d3cbab359a..0aaa0597a622 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config LGUEST | 1 | config LGUEST |
2 | tristate "Linux hypervisor example code" | 2 | tristate "Linux hypervisor example code" |
3 | depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX | 3 | depends on X86_32 && EXPERIMENTAL && EVENTFD |
4 | select HVC_DRIVER | 4 | select HVC_DRIVER |
5 | ---help--- | 5 | ---help--- |
6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 4845fb3cf74b..a6974e9b8ebf 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -95,7 +95,7 @@ static __init int map_switcher(void) | |||
95 | * array of struct pages. It increments that pointer, but we don't | 95 | * array of struct pages. It increments that pointer, but we don't |
96 | * care. */ | 96 | * care. */ |
97 | pagep = switcher_page; | 97 | pagep = switcher_page; |
98 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); | 98 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); |
99 | if (err) { | 99 | if (err) { |
100 | printk("lguest: map_vm_area failed: %i\n", err); | 100 | printk("lguest: map_vm_area failed: %i\n", err); |
101 | goto free_vma; | 101 | goto free_vma; |
@@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
188 | { | 188 | { |
189 | /* We stop running once the Guest is dead. */ | 189 | /* We stop running once the Guest is dead. */ |
190 | while (!cpu->lg->dead) { | 190 | while (!cpu->lg->dead) { |
191 | unsigned int irq; | ||
192 | bool more; | ||
193 | |||
191 | /* First we run any hypercalls the Guest wants done. */ | 194 | /* First we run any hypercalls the Guest wants done. */ |
192 | if (cpu->hcall) | 195 | if (cpu->hcall) |
193 | do_hypercalls(cpu); | 196 | do_hypercalls(cpu); |
@@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
195 | /* It's possible the Guest did a NOTIFY hypercall to the | 198 | /* It's possible the Guest did a NOTIFY hypercall to the |
196 | * Launcher, in which case we return from the read() now. */ | 199 | * Launcher, in which case we return from the read() now. */ |
197 | if (cpu->pending_notify) { | 200 | if (cpu->pending_notify) { |
198 | if (put_user(cpu->pending_notify, user)) | 201 | if (!send_notify_to_eventfd(cpu)) { |
199 | return -EFAULT; | 202 | if (put_user(cpu->pending_notify, user)) |
200 | return sizeof(cpu->pending_notify); | 203 | return -EFAULT; |
204 | return sizeof(cpu->pending_notify); | ||
205 | } | ||
201 | } | 206 | } |
202 | 207 | ||
203 | /* Check for signals */ | 208 | /* Check for signals */ |
204 | if (signal_pending(current)) | 209 | if (signal_pending(current)) |
205 | return -ERESTARTSYS; | 210 | return -ERESTARTSYS; |
206 | 211 | ||
207 | /* If Waker set break_out, return to Launcher. */ | ||
208 | if (cpu->break_out) | ||
209 | return -EAGAIN; | ||
210 | |||
211 | /* Check if there are any interrupts which can be delivered now: | 212 | /* Check if there are any interrupts which can be delivered now: |
212 | * if so, this sets up the handler to be executed when we next | 213 | * if so, this sets up the handler to be executed when we next |
213 | * run the Guest. */ | 214 | * run the Guest. */ |
214 | maybe_do_interrupt(cpu); | 215 | irq = interrupt_pending(cpu, &more); |
216 | if (irq < LGUEST_IRQS) | ||
217 | try_deliver_interrupt(cpu, irq, more); | ||
215 | 218 | ||
216 | /* All long-lived kernel loops need to check with this horrible | 219 | /* All long-lived kernel loops need to check with this horrible |
217 | * thing called the freezer. If the Host is trying to suspend, | 220 | * thing called the freezer. If the Host is trying to suspend, |
@@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
224 | break; | 227 | break; |
225 | 228 | ||
226 | /* If the Guest asked to be stopped, we sleep. The Guest's | 229 | /* If the Guest asked to be stopped, we sleep. The Guest's |
227 | * clock timer or LHREQ_BREAK from the Waker will wake us. */ | 230 | * clock timer will wake us. */ |
228 | if (cpu->halted) { | 231 | if (cpu->halted) { |
229 | set_current_state(TASK_INTERRUPTIBLE); | 232 | set_current_state(TASK_INTERRUPTIBLE); |
230 | schedule(); | 233 | /* Just before we sleep, make sure no interrupt snuck in |
234 | * which we should be doing. */ | ||
235 | if (interrupt_pending(cpu, &more) < LGUEST_IRQS) | ||
236 | set_current_state(TASK_RUNNING); | ||
237 | else | ||
238 | schedule(); | ||
231 | continue; | 239 | continue; |
232 | } | 240 | } |
233 | 241 | ||
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fefa..c29ffa19cb74 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
37 | /* This call does nothing, except by breaking out of the Guest | 37 | /* This call does nothing, except by breaking out of the Guest |
38 | * it makes us process all the asynchronous hypercalls. */ | 38 | * it makes us process all the asynchronous hypercalls. */ |
39 | break; | 39 | break; |
40 | case LHCALL_SEND_INTERRUPTS: | ||
41 | /* This call does nothing too, but by breaking out of the Guest | ||
42 | * it makes us process any pending interrupts. */ | ||
43 | break; | ||
40 | case LHCALL_LGUEST_INIT: | 44 | case LHCALL_LGUEST_INIT: |
41 | /* You can't get here unless you're already initialized. Don't | 45 | /* You can't get here unless you're already initialized. Don't |
42 | * do that. */ | 46 | * do that. */ |
@@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
73 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); | 77 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); |
74 | break; | 78 | break; |
75 | case LHCALL_SET_PTE: | 79 | case LHCALL_SET_PTE: |
80 | #ifdef CONFIG_X86_PAE | ||
81 | guest_set_pte(cpu, args->arg1, args->arg2, | ||
82 | __pte(args->arg3 | (u64)args->arg4 << 32)); | ||
83 | #else | ||
76 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); | 84 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); |
85 | #endif | ||
86 | break; | ||
87 | case LHCALL_SET_PGD: | ||
88 | guest_set_pgd(cpu->lg, args->arg1, args->arg2); | ||
77 | break; | 89 | break; |
90 | #ifdef CONFIG_X86_PAE | ||
78 | case LHCALL_SET_PMD: | 91 | case LHCALL_SET_PMD: |
79 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); | 92 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); |
80 | break; | 93 | break; |
94 | #endif | ||
81 | case LHCALL_SET_CLOCKEVENT: | 95 | case LHCALL_SET_CLOCKEVENT: |
82 | guest_set_clockevent(cpu, args->arg1); | 96 | guest_set_clockevent(cpu, args->arg1); |
83 | break; | 97 | break; |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 6e99adbe1946..0e9067b0d507 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
128 | /*H:205 | 128 | /*H:205 |
129 | * Virtual Interrupts. | 129 | * Virtual Interrupts. |
130 | * | 130 | * |
131 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if | 131 | * interrupt_pending() returns the first pending interrupt which isn't blocked |
132 | * we should divert the Guest to running an interrupt handler. */ | 132 | * by the Guest. It is called before every entry to the Guest, and just before |
133 | void maybe_do_interrupt(struct lg_cpu *cpu) | 133 | * we go to sleep when the Guest has halted itself. */ |
134 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) | ||
134 | { | 135 | { |
135 | unsigned int irq; | 136 | unsigned int irq; |
136 | DECLARE_BITMAP(blk, LGUEST_IRQS); | 137 | DECLARE_BITMAP(blk, LGUEST_IRQS); |
137 | struct desc_struct *idt; | ||
138 | 138 | ||
139 | /* If the Guest hasn't even initialized yet, we can do nothing. */ | 139 | /* If the Guest hasn't even initialized yet, we can do nothing. */ |
140 | if (!cpu->lg->lguest_data) | 140 | if (!cpu->lg->lguest_data) |
141 | return; | 141 | return LGUEST_IRQS; |
142 | 142 | ||
143 | /* Take our "irqs_pending" array and remove any interrupts the Guest | 143 | /* Take our "irqs_pending" array and remove any interrupts the Guest |
144 | * wants blocked: the result ends up in "blk". */ | 144 | * wants blocked: the result ends up in "blk". */ |
145 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, | 145 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, |
146 | sizeof(blk))) | 146 | sizeof(blk))) |
147 | return; | 147 | return LGUEST_IRQS; |
148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); | 148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); |
149 | 149 | ||
150 | /* Find the first interrupt. */ | 150 | /* Find the first interrupt. */ |
151 | irq = find_first_bit(blk, LGUEST_IRQS); | 151 | irq = find_first_bit(blk, LGUEST_IRQS); |
152 | /* None? Nothing to do */ | 152 | *more = find_next_bit(blk, LGUEST_IRQS, irq+1); |
153 | if (irq >= LGUEST_IRQS) | 153 | |
154 | return; | 154 | return irq; |
155 | } | ||
156 | |||
157 | /* This actually diverts the Guest to running an interrupt handler, once an | ||
158 | * interrupt has been identified by interrupt_pending(). */ | ||
159 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) | ||
160 | { | ||
161 | struct desc_struct *idt; | ||
162 | |||
163 | BUG_ON(irq >= LGUEST_IRQS); | ||
155 | 164 | ||
156 | /* They may be in the middle of an iret, where they asked us never to | 165 | /* They may be in the middle of an iret, where they asked us never to |
157 | * deliver interrupts. */ | 166 | * deliver interrupts. */ |
@@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu) | |||
170 | u32 irq_enabled; | 179 | u32 irq_enabled; |
171 | if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) | 180 | if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) |
172 | irq_enabled = 0; | 181 | irq_enabled = 0; |
173 | if (!irq_enabled) | 182 | if (!irq_enabled) { |
183 | /* Make sure they know an IRQ is pending. */ | ||
184 | put_user(X86_EFLAGS_IF, | ||
185 | &cpu->lg->lguest_data->irq_pending); | ||
174 | return; | 186 | return; |
187 | } | ||
175 | } | 188 | } |
176 | 189 | ||
177 | /* Look at the IDT entry the Guest gave us for this interrupt. The | 190 | /* Look at the IDT entry the Guest gave us for this interrupt. The |
@@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu) | |||
194 | * here is a compromise which means at least it gets updated every | 207 | * here is a compromise which means at least it gets updated every |
195 | * timer interrupt. */ | 208 | * timer interrupt. */ |
196 | write_timestamp(cpu); | 209 | write_timestamp(cpu); |
210 | |||
211 | /* If there are no other interrupts we want to deliver, clear | ||
212 | * the pending flag. */ | ||
213 | if (!more) | ||
214 | put_user(0, &cpu->lg->lguest_data->irq_pending); | ||
215 | } | ||
216 | |||
217 | /* And this is the routine when we want to set an interrupt for the Guest. */ | ||
218 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq) | ||
219 | { | ||
220 | /* Next time the Guest runs, the core code will see if it can deliver | ||
221 | * this interrupt. */ | ||
222 | set_bit(irq, cpu->irqs_pending); | ||
223 | |||
224 | /* Make sure it sees it; it might be asleep (eg. halted), or | ||
225 | * running the Guest right now, in which case kick_process() | ||
226 | * will knock it out. */ | ||
227 | if (!wake_up_process(cpu->tsk)) | ||
228 | kick_process(cpu->tsk); | ||
197 | } | 229 | } |
198 | /*:*/ | 230 | /*:*/ |
199 | 231 | ||
@@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) | |||
510 | struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); | 542 | struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); |
511 | 543 | ||
512 | /* Remember the first interrupt is the timer interrupt. */ | 544 | /* Remember the first interrupt is the timer interrupt. */ |
513 | set_bit(0, cpu->irqs_pending); | 545 | set_interrupt(cpu, 0); |
514 | /* If the Guest is actually stopped, we need to wake it up. */ | ||
515 | if (cpu->halted) | ||
516 | wake_up_process(cpu->tsk); | ||
517 | return HRTIMER_NORESTART; | 546 | return HRTIMER_NORESTART; |
518 | } | 547 | } |
519 | 548 | ||
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index af92a176697f..d4e8979735cb 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -49,7 +49,7 @@ struct lg_cpu { | |||
49 | u32 cr2; | 49 | u32 cr2; |
50 | int ts; | 50 | int ts; |
51 | u32 esp1; | 51 | u32 esp1; |
52 | u8 ss1; | 52 | u16 ss1; |
53 | 53 | ||
54 | /* Bitmap of what has changed: see CHANGED_* above. */ | 54 | /* Bitmap of what has changed: see CHANGED_* above. */ |
55 | int changed; | 55 | int changed; |
@@ -71,9 +71,7 @@ struct lg_cpu { | |||
71 | /* Virtual clock device */ | 71 | /* Virtual clock device */ |
72 | struct hrtimer hrt; | 72 | struct hrtimer hrt; |
73 | 73 | ||
74 | /* Do we need to stop what we're doing and return to userspace? */ | 74 | /* Did the Guest tell us to halt? */ |
75 | int break_out; | ||
76 | wait_queue_head_t break_wq; | ||
77 | int halted; | 75 | int halted; |
78 | 76 | ||
79 | /* Pending virtual interrupts */ | 77 | /* Pending virtual interrupts */ |
@@ -82,6 +80,16 @@ struct lg_cpu { | |||
82 | struct lg_cpu_arch arch; | 80 | struct lg_cpu_arch arch; |
83 | }; | 81 | }; |
84 | 82 | ||
83 | struct lg_eventfd { | ||
84 | unsigned long addr; | ||
85 | struct file *event; | ||
86 | }; | ||
87 | |||
88 | struct lg_eventfd_map { | ||
89 | unsigned int num; | ||
90 | struct lg_eventfd map[]; | ||
91 | }; | ||
92 | |||
85 | /* The private info the thread maintains about the guest. */ | 93 | /* The private info the thread maintains about the guest. */ |
86 | struct lguest | 94 | struct lguest |
87 | { | 95 | { |
@@ -102,6 +110,8 @@ struct lguest | |||
102 | unsigned int stack_pages; | 110 | unsigned int stack_pages; |
103 | u32 tsc_khz; | 111 | u32 tsc_khz; |
104 | 112 | ||
113 | struct lg_eventfd_map *eventfds; | ||
114 | |||
105 | /* Dead? */ | 115 | /* Dead? */ |
106 | const char *dead; | 116 | const char *dead; |
107 | }; | 117 | }; |
@@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); | |||
137 | * in the kernel. */ | 147 | * in the kernel. */ |
138 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) | 148 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) |
139 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) | 149 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) |
150 | #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) | ||
151 | #define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) | ||
140 | 152 | ||
141 | /* interrupts_and_traps.c: */ | 153 | /* interrupts_and_traps.c: */ |
142 | void maybe_do_interrupt(struct lg_cpu *cpu); | 154 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); |
155 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); | ||
156 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq); | ||
143 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num); | 157 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num); |
144 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, | 158 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, |
145 | u32 low, u32 hi); | 159 | u32 low, u32 hi); |
@@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
150 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | 164 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, |
151 | const unsigned long *def); | 165 | const unsigned long *def); |
152 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); | 166 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); |
167 | bool send_notify_to_eventfd(struct lg_cpu *cpu); | ||
153 | void init_clockdev(struct lg_cpu *cpu); | 168 | void init_clockdev(struct lg_cpu *cpu); |
154 | bool check_syscall_vector(struct lguest *lg); | 169 | bool check_syscall_vector(struct lguest *lg); |
155 | int init_interrupts(void); | 170 | int init_interrupts(void); |
@@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); | |||
168 | int init_guest_pagetable(struct lguest *lg); | 183 | int init_guest_pagetable(struct lguest *lg); |
169 | void free_guest_pagetable(struct lguest *lg); | 184 | void free_guest_pagetable(struct lguest *lg); |
170 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); | 185 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); |
186 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); | ||
187 | #ifdef CONFIG_X86_PAE | ||
171 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); | 188 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); |
189 | #endif | ||
172 | void guest_pagetable_clear_all(struct lg_cpu *cpu); | 190 | void guest_pagetable_clear_all(struct lg_cpu *cpu); |
173 | void guest_pagetable_flush_user(struct lg_cpu *cpu); | 191 | void guest_pagetable_flush_user(struct lg_cpu *cpu); |
174 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, | 192 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, |
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index df44d962626d..e082cdac88b4 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c | |||
@@ -228,7 +228,8 @@ extern void lguest_setup_irq(unsigned int irq); | |||
228 | * function. */ | 228 | * function. */ |
229 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, | 229 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, |
230 | unsigned index, | 230 | unsigned index, |
231 | void (*callback)(struct virtqueue *vq)) | 231 | void (*callback)(struct virtqueue *vq), |
232 | const char *name) | ||
232 | { | 233 | { |
233 | struct lguest_device *ldev = to_lgdev(vdev); | 234 | struct lguest_device *ldev = to_lgdev(vdev); |
234 | struct lguest_vq_info *lvq; | 235 | struct lguest_vq_info *lvq; |
@@ -263,7 +264,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, | |||
263 | /* OK, tell virtio_ring.c to set up a virtqueue now we know its size | 264 | /* OK, tell virtio_ring.c to set up a virtqueue now we know its size |
264 | * and we've got a pointer to its pages. */ | 265 | * and we've got a pointer to its pages. */ |
265 | vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, | 266 | vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, |
266 | vdev, lvq->pages, lg_notify, callback); | 267 | vdev, lvq->pages, lg_notify, callback, name); |
267 | if (!vq) { | 268 | if (!vq) { |
268 | err = -ENOMEM; | 269 | err = -ENOMEM; |
269 | goto unmap; | 270 | goto unmap; |
@@ -312,6 +313,38 @@ static void lg_del_vq(struct virtqueue *vq) | |||
312 | kfree(lvq); | 313 | kfree(lvq); |
313 | } | 314 | } |
314 | 315 | ||
316 | static void lg_del_vqs(struct virtio_device *vdev) | ||
317 | { | ||
318 | struct virtqueue *vq, *n; | ||
319 | |||
320 | list_for_each_entry_safe(vq, n, &vdev->vqs, list) | ||
321 | lg_del_vq(vq); | ||
322 | } | ||
323 | |||
324 | static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs, | ||
325 | struct virtqueue *vqs[], | ||
326 | vq_callback_t *callbacks[], | ||
327 | const char *names[]) | ||
328 | { | ||
329 | struct lguest_device *ldev = to_lgdev(vdev); | ||
330 | int i; | ||
331 | |||
332 | /* We must have this many virtqueues. */ | ||
333 | if (nvqs > ldev->desc->num_vq) | ||
334 | return -ENOENT; | ||
335 | |||
336 | for (i = 0; i < nvqs; ++i) { | ||
337 | vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]); | ||
338 | if (IS_ERR(vqs[i])) | ||
339 | goto error; | ||
340 | } | ||
341 | return 0; | ||
342 | |||
343 | error: | ||
344 | lg_del_vqs(vdev); | ||
345 | return PTR_ERR(vqs[i]); | ||
346 | } | ||
347 | |||
315 | /* The ops structure which hooks everything together. */ | 348 | /* The ops structure which hooks everything together. */ |
316 | static struct virtio_config_ops lguest_config_ops = { | 349 | static struct virtio_config_ops lguest_config_ops = { |
317 | .get_features = lg_get_features, | 350 | .get_features = lg_get_features, |
@@ -321,8 +354,8 @@ static struct virtio_config_ops lguest_config_ops = { | |||
321 | .get_status = lg_get_status, | 354 | .get_status = lg_get_status, |
322 | .set_status = lg_set_status, | 355 | .set_status = lg_set_status, |
323 | .reset = lg_reset, | 356 | .reset = lg_reset, |
324 | .find_vq = lg_find_vq, | 357 | .find_vqs = lg_find_vqs, |
325 | .del_vq = lg_del_vq, | 358 | .del_vqs = lg_del_vqs, |
326 | }; | 359 | }; |
327 | 360 | ||
328 | /* The root device for the lguest virtio devices. This makes them appear as | 361 | /* The root device for the lguest virtio devices. This makes them appear as |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b8ee103eed5f..32e297121058 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -7,32 +7,83 @@ | |||
7 | #include <linux/miscdevice.h> | 7 | #include <linux/miscdevice.h> |
8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/eventfd.h> | ||
11 | #include <linux/file.h> | ||
10 | #include "lg.h" | 12 | #include "lg.h" |
11 | 13 | ||
12 | /*L:055 When something happens, the Waker process needs a way to stop the | 14 | bool send_notify_to_eventfd(struct lg_cpu *cpu) |
13 | * kernel running the Guest and return to the Launcher. So the Waker writes | ||
14 | * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher | ||
15 | * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release | ||
16 | * the Waker. */ | ||
17 | static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) | ||
18 | { | 15 | { |
19 | unsigned long on; | 16 | unsigned int i; |
17 | struct lg_eventfd_map *map; | ||
18 | |||
19 | /* lg->eventfds is RCU-protected */ | ||
20 | rcu_read_lock(); | ||
21 | map = rcu_dereference(cpu->lg->eventfds); | ||
22 | for (i = 0; i < map->num; i++) { | ||
23 | if (map->map[i].addr == cpu->pending_notify) { | ||
24 | eventfd_signal(map->map[i].event, 1); | ||
25 | cpu->pending_notify = 0; | ||
26 | break; | ||
27 | } | ||
28 | } | ||
29 | rcu_read_unlock(); | ||
30 | return cpu->pending_notify == 0; | ||
31 | } | ||
20 | 32 | ||
21 | /* Fetch whether they're turning break on or off. */ | 33 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) |
22 | if (get_user(on, input) != 0) | 34 | { |
23 | return -EFAULT; | 35 | struct lg_eventfd_map *new, *old = lg->eventfds; |
24 | 36 | ||
25 | if (on) { | 37 | if (!addr) |
26 | cpu->break_out = 1; | 38 | return -EINVAL; |
27 | /* Pop it out of the Guest (may be running on different CPU) */ | 39 | |
28 | wake_up_process(cpu->tsk); | 40 | /* Replace the old array with the new one, carefully: others can |
29 | /* Wait for them to reset it */ | 41 | * be accessing it at the same time */ |
30 | return wait_event_interruptible(cpu->break_wq, !cpu->break_out); | 42 | new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), |
31 | } else { | 43 | GFP_KERNEL); |
32 | cpu->break_out = 0; | 44 | if (!new) |
33 | wake_up(&cpu->break_wq); | 45 | return -ENOMEM; |
34 | return 0; | 46 | |
47 | /* First make identical copy. */ | ||
48 | memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); | ||
49 | new->num = old->num; | ||
50 | |||
51 | /* Now append new entry. */ | ||
52 | new->map[new->num].addr = addr; | ||
53 | new->map[new->num].event = eventfd_fget(fd); | ||
54 | if (IS_ERR(new->map[new->num].event)) { | ||
55 | kfree(new); | ||
56 | return PTR_ERR(new->map[new->num].event); | ||
35 | } | 57 | } |
58 | new->num++; | ||
59 | |||
60 | /* Now put new one in place. */ | ||
61 | rcu_assign_pointer(lg->eventfds, new); | ||
62 | |||
63 | /* We're not in a big hurry. Wait until no one's looking at old | ||
64 | * version, then delete it. */ | ||
65 | synchronize_rcu(); | ||
66 | kfree(old); | ||
67 | |||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | ||
72 | { | ||
73 | unsigned long addr, fd; | ||
74 | int err; | ||
75 | |||
76 | if (get_user(addr, input) != 0) | ||
77 | return -EFAULT; | ||
78 | input++; | ||
79 | if (get_user(fd, input) != 0) | ||
80 | return -EFAULT; | ||
81 | |||
82 | mutex_lock(&lguest_lock); | ||
83 | err = add_eventfd(lg, addr, fd); | ||
84 | mutex_unlock(&lguest_lock); | ||
85 | |||
86 | return 0; | ||
36 | } | 87 | } |
37 | 88 | ||
38 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt | 89 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt |
@@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) | |||
45 | return -EFAULT; | 96 | return -EFAULT; |
46 | if (irq >= LGUEST_IRQS) | 97 | if (irq >= LGUEST_IRQS) |
47 | return -EINVAL; | 98 | return -EINVAL; |
48 | /* Next time the Guest runs, the core code will see if it can deliver | 99 | |
49 | * this interrupt. */ | 100 | set_interrupt(cpu, irq); |
50 | set_bit(irq, cpu->irqs_pending); | ||
51 | return 0; | 101 | return 0; |
52 | } | 102 | } |
53 | 103 | ||
@@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
126 | * address. */ | 176 | * address. */ |
127 | lguest_arch_setup_regs(cpu, start_ip); | 177 | lguest_arch_setup_regs(cpu, start_ip); |
128 | 178 | ||
129 | /* Initialize the queue for the Waker to wait on */ | ||
130 | init_waitqueue_head(&cpu->break_wq); | ||
131 | |||
132 | /* We keep a pointer to the Launcher task (ie. current task) for when | 179 | /* We keep a pointer to the Launcher task (ie. current task) for when |
133 | * other Guests want to wake this one (eg. console input). */ | 180 | * other Guests want to wake this one (eg. console input). */ |
134 | cpu->tsk = current; | 181 | cpu->tsk = current; |
@@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
185 | goto unlock; | 232 | goto unlock; |
186 | } | 233 | } |
187 | 234 | ||
235 | lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); | ||
236 | if (!lg->eventfds) { | ||
237 | err = -ENOMEM; | ||
238 | goto free_lg; | ||
239 | } | ||
240 | lg->eventfds->num = 0; | ||
241 | |||
188 | /* Populate the easy fields of our "struct lguest" */ | 242 | /* Populate the easy fields of our "struct lguest" */ |
189 | lg->mem_base = (void __user *)args[0]; | 243 | lg->mem_base = (void __user *)args[0]; |
190 | lg->pfn_limit = args[1]; | 244 | lg->pfn_limit = args[1]; |
@@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
192 | /* This is the first cpu (cpu 0) and it will start booting at args[2] */ | 246 | /* This is the first cpu (cpu 0) and it will start booting at args[2] */ |
193 | err = lg_cpu_start(&lg->cpus[0], 0, args[2]); | 247 | err = lg_cpu_start(&lg->cpus[0], 0, args[2]); |
194 | if (err) | 248 | if (err) |
195 | goto release_guest; | 249 | goto free_eventfds; |
196 | 250 | ||
197 | /* Initialize the Guest's shadow page tables, using the toplevel | 251 | /* Initialize the Guest's shadow page tables, using the toplevel |
198 | * address the Launcher gave us. This allocates memory, so can fail. */ | 252 | * address the Launcher gave us. This allocates memory, so can fail. */ |
@@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
211 | free_regs: | 265 | free_regs: |
212 | /* FIXME: This should be in free_vcpu */ | 266 | /* FIXME: This should be in free_vcpu */ |
213 | free_page(lg->cpus[0].regs_page); | 267 | free_page(lg->cpus[0].regs_page); |
214 | release_guest: | 268 | free_eventfds: |
269 | kfree(lg->eventfds); | ||
270 | free_lg: | ||
215 | kfree(lg); | 271 | kfree(lg); |
216 | unlock: | 272 | unlock: |
217 | mutex_unlock(&lguest_lock); | 273 | mutex_unlock(&lguest_lock); |
@@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in, | |||
252 | /* Once the Guest is dead, you can only read() why it died. */ | 308 | /* Once the Guest is dead, you can only read() why it died. */ |
253 | if (lg->dead) | 309 | if (lg->dead) |
254 | return -ENOENT; | 310 | return -ENOENT; |
255 | |||
256 | /* If you're not the task which owns the Guest, all you can do | ||
257 | * is break the Launcher out of running the Guest. */ | ||
258 | if (current != cpu->tsk && req != LHREQ_BREAK) | ||
259 | return -EPERM; | ||
260 | } | 311 | } |
261 | 312 | ||
262 | switch (req) { | 313 | switch (req) { |
@@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in, | |||
264 | return initialize(file, input); | 315 | return initialize(file, input); |
265 | case LHREQ_IRQ: | 316 | case LHREQ_IRQ: |
266 | return user_send_irq(cpu, input); | 317 | return user_send_irq(cpu, input); |
267 | case LHREQ_BREAK: | 318 | case LHREQ_EVENTFD: |
268 | return break_guest_out(cpu, input); | 319 | return attach_eventfd(lg, input); |
269 | default: | 320 | default: |
270 | return -EINVAL; | 321 | return -EINVAL; |
271 | } | 322 | } |
@@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file) | |||
303 | * the Launcher's memory management structure. */ | 354 | * the Launcher's memory management structure. */ |
304 | mmput(lg->cpus[i].mm); | 355 | mmput(lg->cpus[i].mm); |
305 | } | 356 | } |
357 | |||
358 | /* Release any eventfds they registered. */ | ||
359 | for (i = 0; i < lg->eventfds->num; i++) | ||
360 | fput(lg->eventfds->map[i].event); | ||
361 | kfree(lg->eventfds); | ||
362 | |||
306 | /* If lg->dead doesn't contain an error code it will be NULL or a | 363 | /* If lg->dead doesn't contain an error code it will be NULL or a |
307 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ | 364 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ |
308 | if (!IS_ERR(lg->dead)) | 365 | if (!IS_ERR(lg->dead)) |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a059cf9980f7..a6fe1abda240 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -53,6 +53,17 @@ | |||
53 | * page. */ | 53 | * page. */ |
54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
55 | 55 | ||
56 | /* For PAE we need the PMD index as well. We use the last 2MB, so we | ||
57 | * will need the last pmd entry of the last pmd page. */ | ||
58 | #ifdef CONFIG_X86_PAE | ||
59 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
60 | #define RESERVE_MEM 2U | ||
61 | #define CHECK_GPGD_MASK _PAGE_PRESENT | ||
62 | #else | ||
63 | #define RESERVE_MEM 4U | ||
64 | #define CHECK_GPGD_MASK _PAGE_TABLE | ||
65 | #endif | ||
66 | |||
56 | /* We actually need a separate PTE page for each CPU. Remember that after the | 67 | /* We actually need a separate PTE page for each CPU. Remember that after the |
57 | * Switcher code itself comes two pages for each CPU, and we don't want this | 68 | * Switcher code itself comes two pages for each CPU, and we don't want this |
58 | * CPU's guest to see the pages of any other CPU. */ | 69 | * CPU's guest to see the pages of any other CPU. */ |
@@ -73,24 +84,59 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
73 | { | 84 | { |
74 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
75 | 86 | ||
87 | #ifndef CONFIG_X86_PAE | ||
76 | /* We kill any Guest trying to touch the Switcher addresses. */ | 88 | /* We kill any Guest trying to touch the Switcher addresses. */ |
77 | if (index >= SWITCHER_PGD_INDEX) { | 89 | if (index >= SWITCHER_PGD_INDEX) { |
78 | kill_guest(cpu, "attempt to access switcher pages"); | 90 | kill_guest(cpu, "attempt to access switcher pages"); |
79 | index = 0; | 91 | index = 0; |
80 | } | 92 | } |
93 | #endif | ||
81 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ | 94 | /* Return a pointer to the index'th pgd entry for the i'th page table. */ |
82 | return &cpu->lg->pgdirs[i].pgdir[index]; | 95 | return &cpu->lg->pgdirs[i].pgdir[index]; |
83 | } | 96 | } |
84 | 97 | ||
98 | #ifdef CONFIG_X86_PAE | ||
99 | /* This routine then takes the PGD entry given above, which contains the | ||
100 | * address of the PMD page. It then returns a pointer to the PMD entry for the | ||
101 | * given address. */ | ||
102 | static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | ||
103 | { | ||
104 | unsigned int index = pmd_index(vaddr); | ||
105 | pmd_t *page; | ||
106 | |||
107 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
108 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
109 | index >= SWITCHER_PMD_INDEX) { | ||
110 | kill_guest(cpu, "attempt to access switcher pages"); | ||
111 | index = 0; | ||
112 | } | ||
113 | |||
114 | /* You should never call this if the PGD entry wasn't valid */ | ||
115 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | ||
116 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | ||
117 | |||
118 | return &page[index]; | ||
119 | } | ||
120 | #endif | ||
121 | |||
85 | /* This routine then takes the page directory entry returned above, which | 122 | /* This routine then takes the page directory entry returned above, which |
86 | * contains the address of the page table entry (PTE) page. It then returns a | 123 | * contains the address of the page table entry (PTE) page. It then returns a |
87 | * pointer to the PTE entry for the given address. */ | 124 | * pointer to the PTE entry for the given address. */ |
88 | static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) | 125 | static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) |
89 | { | 126 | { |
127 | #ifdef CONFIG_X86_PAE | ||
128 | pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); | ||
129 | pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); | ||
130 | |||
131 | /* You should never call this if the PMD entry wasn't valid */ | ||
132 | BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); | ||
133 | #else | ||
90 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 134 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
91 | /* You should never call this if the PGD entry wasn't valid */ | 135 | /* You should never call this if the PGD entry wasn't valid */ |
92 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 136 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
93 | return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; | 137 | #endif |
138 | |||
139 | return &page[pte_index(vaddr)]; | ||
94 | } | 140 | } |
95 | 141 | ||
96 | /* These two functions are just like the above two, except they access the Guest | 142 | /* These two functions are just like the above two, except they access the Guest |
@@ -101,12 +147,32 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
101 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); | 147 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
102 | } | 148 | } |
103 | 149 | ||
104 | static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | 150 | #ifdef CONFIG_X86_PAE |
151 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | ||
152 | { | ||
153 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | ||
154 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | ||
155 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | ||
156 | } | ||
157 | |||
158 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
159 | pmd_t gpmd, unsigned long vaddr) | ||
160 | { | ||
161 | unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; | ||
162 | |||
163 | BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); | ||
164 | return gpage + pte_index(vaddr) * sizeof(pte_t); | ||
165 | } | ||
166 | #else | ||
167 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
168 | pgd_t gpgd, unsigned long vaddr) | ||
105 | { | 169 | { |
106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
171 | |||
107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 172 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
108 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); | 173 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
109 | } | 174 | } |
175 | #endif | ||
110 | /*:*/ | 176 | /*:*/ |
111 | 177 | ||
112 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as | 178 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as |
@@ -171,7 +237,7 @@ static void release_pte(pte_t pte) | |||
171 | /* Remember that get_user_pages_fast() took a reference to the page, in | 237 | /* Remember that get_user_pages_fast() took a reference to the page, in |
172 | * get_pfn()? We have to put it back now. */ | 238 | * get_pfn()? We have to put it back now. */ |
173 | if (pte_flags(pte) & _PAGE_PRESENT) | 239 | if (pte_flags(pte) & _PAGE_PRESENT) |
174 | put_page(pfn_to_page(pte_pfn(pte))); | 240 | put_page(pte_page(pte)); |
175 | } | 241 | } |
176 | /*:*/ | 242 | /*:*/ |
177 | 243 | ||
@@ -184,11 +250,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | |||
184 | 250 | ||
185 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 251 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
186 | { | 252 | { |
187 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || | 253 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
188 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 254 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) |
189 | kill_guest(cpu, "bad page directory entry"); | 255 | kill_guest(cpu, "bad page directory entry"); |
190 | } | 256 | } |
191 | 257 | ||
258 | #ifdef CONFIG_X86_PAE | ||
259 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | ||
260 | { | ||
261 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | ||
262 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | ||
263 | kill_guest(cpu, "bad page middle directory entry"); | ||
264 | } | ||
265 | #endif | ||
266 | |||
192 | /*H:330 | 267 | /*H:330 |
193 | * (i) Looking up a page table entry when the Guest faults. | 268 | * (i) Looking up a page table entry when the Guest faults. |
194 | * | 269 | * |
@@ -207,6 +282,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
207 | pte_t gpte; | 282 | pte_t gpte; |
208 | pte_t *spte; | 283 | pte_t *spte; |
209 | 284 | ||
285 | #ifdef CONFIG_X86_PAE | ||
286 | pmd_t *spmd; | ||
287 | pmd_t gpmd; | ||
288 | #endif | ||
289 | |||
210 | /* First step: get the top-level Guest page table entry. */ | 290 | /* First step: get the top-level Guest page table entry. */ |
211 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 291 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
212 | /* Toplevel not present? We can't map it in. */ | 292 | /* Toplevel not present? We can't map it in. */ |
@@ -228,12 +308,45 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
228 | check_gpgd(cpu, gpgd); | 308 | check_gpgd(cpu, gpgd); |
229 | /* And we copy the flags to the shadow PGD entry. The page | 309 | /* And we copy the flags to the shadow PGD entry. The page |
230 | * number in the shadow PGD is the page we just allocated. */ | 310 | * number in the shadow PGD is the page we just allocated. */ |
231 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); | 311 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); |
232 | } | 312 | } |
233 | 313 | ||
314 | #ifdef CONFIG_X86_PAE | ||
315 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
316 | /* middle level not present? We can't map it in. */ | ||
317 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
318 | return false; | ||
319 | |||
320 | /* Now look at the matching shadow entry. */ | ||
321 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
322 | |||
323 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | ||
324 | /* No shadow entry: allocate a new shadow PTE page. */ | ||
325 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | ||
326 | |||
327 | /* This is not really the Guest's fault, but killing it is | ||
328 | * simple for this corner case. */ | ||
329 | if (!ptepage) { | ||
330 | kill_guest(cpu, "out of memory allocating pte page"); | ||
331 | return false; | ||
332 | } | ||
333 | |||
334 | /* We check that the Guest pmd is OK. */ | ||
335 | check_gpmd(cpu, gpmd); | ||
336 | |||
337 | /* And we copy the flags to the shadow PMD entry. The page | ||
338 | * number in the shadow PMD is the page we just allocated. */ | ||
339 | native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | ||
340 | } | ||
341 | |||
342 | /* OK, now we look at the lower level in the Guest page table: keep its | ||
343 | * address, because we might update it later. */ | ||
344 | gpte_ptr = gpte_addr(cpu, gpmd, vaddr); | ||
345 | #else | ||
234 | /* OK, now we look at the lower level in the Guest page table: keep its | 346 | /* OK, now we look at the lower level in the Guest page table: keep its |
235 | * address, because we might update it later. */ | 347 | * address, because we might update it later. */ |
236 | gpte_ptr = gpte_addr(gpgd, vaddr); | 348 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
349 | #endif | ||
237 | gpte = lgread(cpu, gpte_ptr, pte_t); | 350 | gpte = lgread(cpu, gpte_ptr, pte_t); |
238 | 351 | ||
239 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 352 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
@@ -259,7 +372,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
259 | gpte = pte_mkdirty(gpte); | 372 | gpte = pte_mkdirty(gpte); |
260 | 373 | ||
261 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 374 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
262 | spte = spte_addr(*spgd, vaddr); | 375 | spte = spte_addr(cpu, *spgd, vaddr); |
263 | /* If there was a valid shadow PTE entry here before, we release it. | 376 | /* If there was a valid shadow PTE entry here before, we release it. |
264 | * This can happen with a write to a previously read-only entry. */ | 377 | * This can happen with a write to a previously read-only entry. */ |
265 | release_pte(*spte); | 378 | release_pte(*spte); |
@@ -273,7 +386,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
273 | * table entry, even if the Guest says it's writable. That way | 386 | * table entry, even if the Guest says it's writable. That way |
274 | * we will come back here when a write does actually occur, so | 387 | * we will come back here when a write does actually occur, so |
275 | * we can update the Guest's _PAGE_DIRTY flag. */ | 388 | * we can update the Guest's _PAGE_DIRTY flag. */ |
276 | *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); | 389 | native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); |
277 | 390 | ||
278 | /* Finally, we write the Guest PTE entry back: we've set the | 391 | /* Finally, we write the Guest PTE entry back: we've set the |
279 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 392 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
@@ -301,14 +414,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | |||
301 | pgd_t *spgd; | 414 | pgd_t *spgd; |
302 | unsigned long flags; | 415 | unsigned long flags; |
303 | 416 | ||
417 | #ifdef CONFIG_X86_PAE | ||
418 | pmd_t *spmd; | ||
419 | #endif | ||
304 | /* Look at the current top level entry: is it present? */ | 420 | /* Look at the current top level entry: is it present? */ |
305 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 421 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
306 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 422 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
307 | return false; | 423 | return false; |
308 | 424 | ||
425 | #ifdef CONFIG_X86_PAE | ||
426 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
427 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | ||
428 | return false; | ||
429 | #endif | ||
430 | |||
309 | /* Check the flags on the pte entry itself: it must be present and | 431 | /* Check the flags on the pte entry itself: it must be present and |
310 | * writable. */ | 432 | * writable. */ |
311 | flags = pte_flags(*(spte_addr(*spgd, vaddr))); | 433 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); |
312 | 434 | ||
313 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 435 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
314 | } | 436 | } |
@@ -322,8 +444,43 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
322 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 444 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
323 | } | 445 | } |
324 | 446 | ||
447 | #ifdef CONFIG_X86_PAE | ||
448 | static void release_pmd(pmd_t *spmd) | ||
449 | { | ||
450 | /* If the entry's not present, there's nothing to release. */ | ||
451 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { | ||
452 | unsigned int i; | ||
453 | pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); | ||
454 | /* For each entry in the page, we might need to release it. */ | ||
455 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
456 | release_pte(ptepage[i]); | ||
457 | /* Now we can free the page of PTEs */ | ||
458 | free_page((long)ptepage); | ||
459 | /* And zero out the PMD entry so we never release it twice. */ | ||
460 | native_set_pmd(spmd, __pmd(0)); | ||
461 | } | ||
462 | } | ||
463 | |||
464 | static void release_pgd(pgd_t *spgd) | ||
465 | { | ||
466 | /* If the entry's not present, there's nothing to release. */ | ||
467 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | ||
468 | unsigned int i; | ||
469 | pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
470 | |||
471 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
472 | release_pmd(&pmdpage[i]); | ||
473 | |||
474 | /* Now we can free the page of PMDs */ | ||
475 | free_page((long)pmdpage); | ||
476 | /* And zero out the PGD entry so we never release it twice. */ | ||
477 | set_pgd(spgd, __pgd(0)); | ||
478 | } | ||
479 | } | ||
480 | |||
481 | #else /* !CONFIG_X86_PAE */ | ||
325 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 482 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
326 | static void release_pgd(struct lguest *lg, pgd_t *spgd) | 483 | static void release_pgd(pgd_t *spgd) |
327 | { | 484 | { |
328 | /* If the entry's not present, there's nothing to release. */ | 485 | /* If the entry's not present, there's nothing to release. */ |
329 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 486 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
@@ -341,7 +498,7 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) | |||
341 | *spgd = __pgd(0); | 498 | *spgd = __pgd(0); |
342 | } | 499 | } |
343 | } | 500 | } |
344 | 501 | #endif | |
345 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() | 502 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() |
346 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. | 503 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
347 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ | 504 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
@@ -350,7 +507,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
350 | unsigned int i; | 507 | unsigned int i; |
351 | /* Release every pgd entry up to the kernel's address. */ | 508 | /* Release every pgd entry up to the kernel's address. */ |
352 | for (i = 0; i < pgd_index(lg->kernel_address); i++) | 509 | for (i = 0; i < pgd_index(lg->kernel_address); i++) |
353 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 510 | release_pgd(lg->pgdirs[idx].pgdir + i); |
354 | } | 511 | } |
355 | 512 | ||
356 | /*H:440 (v) Flushing (throwing away) page tables, | 513 | /*H:440 (v) Flushing (throwing away) page tables, |
@@ -369,7 +526,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
369 | { | 526 | { |
370 | pgd_t gpgd; | 527 | pgd_t gpgd; |
371 | pte_t gpte; | 528 | pte_t gpte; |
372 | 529 | #ifdef CONFIG_X86_PAE | |
530 | pmd_t gpmd; | ||
531 | #endif | ||
373 | /* First step: get the top-level Guest page table entry. */ | 532 | /* First step: get the top-level Guest page table entry. */ |
374 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 533 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
375 | /* Toplevel not present? We can't map it in. */ | 534 | /* Toplevel not present? We can't map it in. */ |
@@ -378,7 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
378 | return -1UL; | 537 | return -1UL; |
379 | } | 538 | } |
380 | 539 | ||
381 | gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); | 540 | #ifdef CONFIG_X86_PAE |
541 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
542 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
543 | kill_guest(cpu, "Bad address %#lx", vaddr); | ||
544 | gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); | ||
545 | #else | ||
546 | gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); | ||
547 | #endif | ||
382 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 548 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
383 | kill_guest(cpu, "Bad address %#lx", vaddr); | 549 | kill_guest(cpu, "Bad address %#lx", vaddr); |
384 | 550 | ||
@@ -405,6 +571,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
405 | int *blank_pgdir) | 571 | int *blank_pgdir) |
406 | { | 572 | { |
407 | unsigned int next; | 573 | unsigned int next; |
574 | #ifdef CONFIG_X86_PAE | ||
575 | pmd_t *pmd_table; | ||
576 | #endif | ||
408 | 577 | ||
409 | /* We pick one entry at random to throw out. Choosing the Least | 578 | /* We pick one entry at random to throw out. Choosing the Least |
410 | * Recently Used might be better, but this is easy. */ | 579 | * Recently Used might be better, but this is easy. */ |
@@ -416,10 +585,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
416 | /* If the allocation fails, just keep using the one we have */ | 585 | /* If the allocation fails, just keep using the one we have */ |
417 | if (!cpu->lg->pgdirs[next].pgdir) | 586 | if (!cpu->lg->pgdirs[next].pgdir) |
418 | next = cpu->cpu_pgd; | 587 | next = cpu->cpu_pgd; |
419 | else | 588 | else { |
420 | /* This is a blank page, so there are no kernel | 589 | #ifdef CONFIG_X86_PAE |
421 | * mappings: caller must map the stack! */ | 590 | /* In PAE mode, allocate a pmd page and populate the |
591 | * last pgd entry. */ | ||
592 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
593 | if (!pmd_table) { | ||
594 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
595 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
596 | next = cpu->cpu_pgd; | ||
597 | } else { | ||
598 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
599 | SWITCHER_PGD_INDEX, | ||
600 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
601 | /* This is a blank page, so there are no kernel | ||
602 | * mappings: caller must map the stack! */ | ||
603 | *blank_pgdir = 1; | ||
604 | } | ||
605 | #else | ||
422 | *blank_pgdir = 1; | 606 | *blank_pgdir = 1; |
607 | #endif | ||
608 | } | ||
423 | } | 609 | } |
424 | /* Record which Guest toplevel this shadows. */ | 610 | /* Record which Guest toplevel this shadows. */ |
425 | cpu->lg->pgdirs[next].gpgdir = gpgdir; | 611 | cpu->lg->pgdirs[next].gpgdir = gpgdir; |
@@ -431,7 +617,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
431 | 617 | ||
432 | /*H:430 (iv) Switching page tables | 618 | /*H:430 (iv) Switching page tables |
433 | * | 619 | * |
434 | * Now we've seen all the page table setting and manipulation, let's see what | 620 | * Now we've seen all the page table setting and manipulation, let's see |
435 | * what happens when the Guest changes page tables (ie. changes the top-level | 621 | * what happens when the Guest changes page tables (ie. changes the top-level |
436 | * pgdir). This occurs on almost every context switch. */ | 622 | * pgdir). This occurs on almost every context switch. */ |
437 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | 623 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) |
@@ -460,10 +646,25 @@ static void release_all_pagetables(struct lguest *lg) | |||
460 | 646 | ||
461 | /* Every shadow pagetable this Guest has */ | 647 | /* Every shadow pagetable this Guest has */ |
462 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 648 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
463 | if (lg->pgdirs[i].pgdir) | 649 | if (lg->pgdirs[i].pgdir) { |
650 | #ifdef CONFIG_X86_PAE | ||
651 | pgd_t *spgd; | ||
652 | pmd_t *pmdpage; | ||
653 | unsigned int k; | ||
654 | |||
655 | /* Get the last pmd page. */ | ||
656 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | ||
657 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
658 | |||
659 | /* And release the pmd entries of that pmd page, | ||
660 | * except for the switcher pmd. */ | ||
661 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
662 | release_pmd(&pmdpage[k]); | ||
663 | #endif | ||
464 | /* Every PGD entry except the Switcher at the top */ | 664 | /* Every PGD entry except the Switcher at the top */ |
465 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 665 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
466 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | 666 | release_pgd(lg->pgdirs[i].pgdir + j); |
667 | } | ||
467 | } | 668 | } |
468 | 669 | ||
469 | /* We also throw away everything when a Guest tells us it's changed a kernel | 670 | /* We also throw away everything when a Guest tells us it's changed a kernel |
@@ -504,24 +705,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
504 | { | 705 | { |
505 | /* Look up the matching shadow page directory entry. */ | 706 | /* Look up the matching shadow page directory entry. */ |
506 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); | 707 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); |
708 | #ifdef CONFIG_X86_PAE | ||
709 | pmd_t *spmd; | ||
710 | #endif | ||
507 | 711 | ||
508 | /* If the top level isn't present, there's no entry to update. */ | 712 | /* If the top level isn't present, there's no entry to update. */ |
509 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 713 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
510 | /* Otherwise, we start by releasing the existing entry. */ | 714 | #ifdef CONFIG_X86_PAE |
511 | pte_t *spte = spte_addr(*spgd, vaddr); | 715 | spmd = spmd_addr(cpu, *spgd, vaddr); |
512 | release_pte(*spte); | 716 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { |
513 | 717 | #endif | |
514 | /* If they're setting this entry as dirty or accessed, we might | 718 | /* Otherwise, we start by releasing |
515 | * as well put that entry they've given us in now. This shaves | 719 | * the existing entry. */ |
516 | * 10% off a copy-on-write micro-benchmark. */ | 720 | pte_t *spte = spte_addr(cpu, *spgd, vaddr); |
517 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 721 | release_pte(*spte); |
518 | check_gpte(cpu, gpte); | 722 | |
519 | *spte = gpte_to_spte(cpu, gpte, | 723 | /* If they're setting this entry as dirty or accessed, |
520 | pte_flags(gpte) & _PAGE_DIRTY); | 724 | * we might as well put that entry they've given us |
521 | } else | 725 | * in now. This shaves 10% off a |
522 | /* Otherwise kill it and we can demand_page() it in | 726 | * copy-on-write micro-benchmark. */ |
523 | * later. */ | 727 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
524 | *spte = __pte(0); | 728 | check_gpte(cpu, gpte); |
729 | native_set_pte(spte, | ||
730 | gpte_to_spte(cpu, gpte, | ||
731 | pte_flags(gpte) & _PAGE_DIRTY)); | ||
732 | } else | ||
733 | /* Otherwise kill it and we can demand_page() | ||
734 | * it in later. */ | ||
735 | native_set_pte(spte, __pte(0)); | ||
736 | #ifdef CONFIG_X86_PAE | ||
737 | } | ||
738 | #endif | ||
525 | } | 739 | } |
526 | } | 740 | } |
527 | 741 | ||
@@ -568,12 +782,10 @@ void guest_set_pte(struct lg_cpu *cpu, | |||
568 | * | 782 | * |
569 | * So with that in mind here's our code to update a (top-level) PGD entry: | 783 | * So with that in mind here's our code to update a (top-level) PGD entry: |
570 | */ | 784 | */ |
571 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | 785 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) |
572 | { | 786 | { |
573 | int pgdir; | 787 | int pgdir; |
574 | 788 | ||
575 | /* The kernel seems to try to initialize this early on: we ignore its | ||
576 | * attempts to map over the Switcher. */ | ||
577 | if (idx >= SWITCHER_PGD_INDEX) | 789 | if (idx >= SWITCHER_PGD_INDEX) |
578 | return; | 790 | return; |
579 | 791 | ||
@@ -581,8 +793,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
581 | pgdir = find_pgdir(lg, gpgdir); | 793 | pgdir = find_pgdir(lg, gpgdir); |
582 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 794 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
583 | /* ... throw it away. */ | 795 | /* ... throw it away. */ |
584 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 796 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
585 | } | 797 | } |
798 | #ifdef CONFIG_X86_PAE | ||
799 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | ||
800 | { | ||
801 | guest_pagetable_clear_all(&lg->cpus[0]); | ||
802 | } | ||
803 | #endif | ||
586 | 804 | ||
587 | /* Once we know how much memory we have we can construct simple identity | 805 | /* Once we know how much memory we have we can construct simple identity |
588 | * (which set virtual == physical) and linear mappings | 806 | * (which set virtual == physical) and linear mappings |
@@ -596,8 +814,16 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
596 | { | 814 | { |
597 | pgd_t __user *pgdir; | 815 | pgd_t __user *pgdir; |
598 | pte_t __user *linear; | 816 | pte_t __user *linear; |
599 | unsigned int mapped_pages, i, linear_pages, phys_linear; | ||
600 | unsigned long mem_base = (unsigned long)lg->mem_base; | 817 | unsigned long mem_base = (unsigned long)lg->mem_base; |
818 | unsigned int mapped_pages, i, linear_pages; | ||
819 | #ifdef CONFIG_X86_PAE | ||
820 | pmd_t __user *pmds; | ||
821 | unsigned int j; | ||
822 | pgd_t pgd; | ||
823 | pmd_t pmd; | ||
824 | #else | ||
825 | unsigned int phys_linear; | ||
826 | #endif | ||
601 | 827 | ||
602 | /* We have mapped_pages frames to map, so we need | 828 | /* We have mapped_pages frames to map, so we need |
603 | * linear_pages page tables to map them. */ | 829 | * linear_pages page tables to map them. */ |
@@ -610,6 +836,9 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
610 | /* Now we use the next linear_pages pages as pte pages */ | 836 | /* Now we use the next linear_pages pages as pte pages */ |
611 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 837 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
612 | 838 | ||
839 | #ifdef CONFIG_X86_PAE | ||
840 | pmds = (void *)linear - PAGE_SIZE; | ||
841 | #endif | ||
613 | /* Linear mapping is easy: put every page's address into the | 842 | /* Linear mapping is easy: put every page's address into the |
614 | * mapping in order. */ | 843 | * mapping in order. */ |
615 | for (i = 0; i < mapped_pages; i++) { | 844 | for (i = 0; i < mapped_pages; i++) { |
@@ -621,6 +850,22 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
621 | 850 | ||
622 | /* The top level points to the linear page table pages above. | 851 | /* The top level points to the linear page table pages above. |
623 | * We setup the identity and linear mappings here. */ | 852 | * We setup the identity and linear mappings here. */ |
853 | #ifdef CONFIG_X86_PAE | ||
854 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | ||
855 | i += PTRS_PER_PTE, j++) { | ||
856 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | ||
857 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
858 | |||
859 | if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) | ||
860 | return -EFAULT; | ||
861 | } | ||
862 | |||
863 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | ||
864 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | ||
865 | return -EFAULT; | ||
866 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | ||
867 | return -EFAULT; | ||
868 | #else | ||
624 | phys_linear = (unsigned long)linear - mem_base; | 869 | phys_linear = (unsigned long)linear - mem_base; |
625 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 870 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
626 | pgd_t pgd; | 871 | pgd_t pgd; |
@@ -633,6 +878,7 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
633 | &pgd, sizeof(pgd))) | 878 | &pgd, sizeof(pgd))) |
634 | return -EFAULT; | 879 | return -EFAULT; |
635 | } | 880 | } |
881 | #endif | ||
636 | 882 | ||
637 | /* We return the top level (guest-physical) address: remember where | 883 | /* We return the top level (guest-physical) address: remember where |
638 | * this is. */ | 884 | * this is. */ |
@@ -648,7 +894,10 @@ int init_guest_pagetable(struct lguest *lg) | |||
648 | u64 mem; | 894 | u64 mem; |
649 | u32 initrd_size; | 895 | u32 initrd_size; |
650 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; | 896 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; |
651 | 897 | #ifdef CONFIG_X86_PAE | |
898 | pgd_t *pgd; | ||
899 | pmd_t *pmd_table; | ||
900 | #endif | ||
652 | /* Get the Guest memory size and the ramdisk size from the boot header | 901 | /* Get the Guest memory size and the ramdisk size from the boot header |
653 | * located at lg->mem_base (Guest address 0). */ | 902 | * located at lg->mem_base (Guest address 0). */ |
654 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) | 903 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) |
@@ -663,6 +912,15 @@ int init_guest_pagetable(struct lguest *lg) | |||
663 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 912 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
664 | if (!lg->pgdirs[0].pgdir) | 913 | if (!lg->pgdirs[0].pgdir) |
665 | return -ENOMEM; | 914 | return -ENOMEM; |
915 | #ifdef CONFIG_X86_PAE | ||
916 | pgd = lg->pgdirs[0].pgdir; | ||
917 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | ||
918 | if (!pmd_table) | ||
919 | return -ENOMEM; | ||
920 | |||
921 | set_pgd(pgd + SWITCHER_PGD_INDEX, | ||
922 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
923 | #endif | ||
666 | lg->cpus[0].cpu_pgd = 0; | 924 | lg->cpus[0].cpu_pgd = 0; |
667 | return 0; | 925 | return 0; |
668 | } | 926 | } |
@@ -672,17 +930,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
672 | { | 930 | { |
673 | /* We get the kernel address: above this is all kernel memory. */ | 931 | /* We get the kernel address: above this is all kernel memory. */ |
674 | if (get_user(cpu->lg->kernel_address, | 932 | if (get_user(cpu->lg->kernel_address, |
675 | &cpu->lg->lguest_data->kernel_address) | 933 | &cpu->lg->lguest_data->kernel_address) |
676 | /* We tell the Guest that it can't use the top 4MB of virtual | 934 | /* We tell the Guest that it can't use the top 2 or 4 MB |
677 | * addresses used by the Switcher. */ | 935 | * of virtual addresses used by the Switcher. */ |
678 | || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) | 936 | || put_user(RESERVE_MEM * 1024 * 1024, |
679 | || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) | 937 | &cpu->lg->lguest_data->reserve_mem) |
938 | || put_user(cpu->lg->pgdirs[0].gpgdir, | ||
939 | &cpu->lg->lguest_data->pgdir)) | ||
680 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 940 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
681 | 941 | ||
682 | /* In flush_user_mappings() we loop from 0 to | 942 | /* In flush_user_mappings() we loop from 0 to |
683 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 943 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
684 | * Switcher mappings, so check that now. */ | 944 | * Switcher mappings, so check that now. */ |
945 | #ifdef CONFIG_X86_PAE | ||
946 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
947 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
948 | #else | ||
685 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | 949 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) |
950 | #endif | ||
686 | kill_guest(cpu, "bad kernel address %#lx", | 951 | kill_guest(cpu, "bad kernel address %#lx", |
687 | cpu->lg->kernel_address); | 952 | cpu->lg->kernel_address); |
688 | } | 953 | } |
@@ -708,16 +973,30 @@ void free_guest_pagetable(struct lguest *lg) | |||
708 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 973 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
709 | { | 974 | { |
710 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 975 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
711 | pgd_t switcher_pgd; | ||
712 | pte_t regs_pte; | 976 | pte_t regs_pte; |
713 | unsigned long pfn; | 977 | unsigned long pfn; |
714 | 978 | ||
979 | #ifdef CONFIG_X86_PAE | ||
980 | pmd_t switcher_pmd; | ||
981 | pmd_t *pmd_table; | ||
982 | |||
983 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | ||
984 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | ||
985 | |||
986 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
987 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
988 | << PAGE_SHIFT); | ||
989 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
990 | #else | ||
991 | pgd_t switcher_pgd; | ||
992 | |||
715 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 993 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
716 | * page for this CPU (with appropriate flags). */ | 994 | * page for this CPU (with appropriate flags). */ |
717 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); | 995 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); |
718 | 996 | ||
719 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 997 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
720 | 998 | ||
999 | #endif | ||
721 | /* We also change the Switcher PTE page. When we're running the Guest, | 1000 | /* We also change the Switcher PTE page. When we're running the Guest, |
722 | * we want the Guest's "regs" page to appear where the first Switcher | 1001 | * we want the Guest's "regs" page to appear where the first Switcher |
723 | * page for this CPU is. This is an optimization: when the Switcher | 1002 | * page for this CPU is. This is an optimization: when the Switcher |
@@ -726,8 +1005,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
726 | * page is already mapped there, we don't have to copy them out | 1005 | * page is already mapped there, we don't have to copy them out |
727 | * again. */ | 1006 | * again. */ |
728 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; | 1007 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; |
729 | regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); | 1008 | native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); |
730 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; | 1009 | native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], |
1010 | regs_pte); | ||
731 | } | 1011 | } |
732 | /*:*/ | 1012 | /*:*/ |
733 | 1013 | ||
@@ -752,21 +1032,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
752 | 1032 | ||
753 | /* The first entries are easy: they map the Switcher code. */ | 1033 | /* The first entries are easy: they map the Switcher code. */ |
754 | for (i = 0; i < pages; i++) { | 1034 | for (i = 0; i < pages; i++) { |
755 | pte[i] = mk_pte(switcher_page[i], | 1035 | native_set_pte(&pte[i], mk_pte(switcher_page[i], |
756 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1036 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
757 | } | 1037 | } |
758 | 1038 | ||
759 | /* The only other thing we map is this CPU's pair of pages. */ | 1039 | /* The only other thing we map is this CPU's pair of pages. */ |
760 | i = pages + cpu*2; | 1040 | i = pages + cpu*2; |
761 | 1041 | ||
762 | /* First page (Guest registers) is writable from the Guest */ | 1042 | /* First page (Guest registers) is writable from the Guest */ |
763 | pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), | 1043 | native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), |
764 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); | 1044 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); |
765 | 1045 | ||
766 | /* The second page contains the "struct lguest_ro_state", and is | 1046 | /* The second page contains the "struct lguest_ro_state", and is |
767 | * read-only. */ | 1047 | * read-only. */ |
768 | pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), | 1048 | native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), |
769 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1049 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
770 | } | 1050 | } |
771 | 1051 | ||
772 | /* We've made it through the page table code. Perhaps our tired brains are | 1052 | /* We've made it through the page table code. Perhaps our tired brains are |
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 7ede64ffeef9..482ed5a18750 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
@@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) | |||
150 | { | 150 | { |
151 | /* We assume the Guest has the same number of GDT entries as the | 151 | /* We assume the Guest has the same number of GDT entries as the |
152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ |
153 | if (num > ARRAY_SIZE(cpu->arch.gdt)) | 153 | if (num >= ARRAY_SIZE(cpu->arch.gdt)) |
154 | kill_guest(cpu, "too many gdt entries %i", num); | 154 | kill_guest(cpu, "too many gdt entries %i", num); |
155 | 155 | ||
156 | /* Set it up, then fix it. */ | 156 | /* Set it up, then fix it. */ |
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 1a83910f674f..eaf722fe309a 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -358,6 +358,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
358 | if (emulate_insn(cpu)) | 358 | if (emulate_insn(cpu)) |
359 | return; | 359 | return; |
360 | } | 360 | } |
361 | /* If KVM is active, the vmcall instruction triggers a | ||
362 | * General Protection Fault. Normally it triggers an | ||
363 | * invalid opcode fault (6): */ | ||
364 | case 6: | ||
365 | /* We need to check if ring == GUEST_PL and | ||
366 | * faulting instruction == vmcall. */ | ||
367 | if (is_hypercall(cpu)) { | ||
368 | rewrite_hypercall(cpu); | ||
369 | return; | ||
370 | } | ||
361 | break; | 371 | break; |
362 | case 14: /* We've intercepted a Page Fault. */ | 372 | case 14: /* We've intercepted a Page Fault. */ |
363 | /* The Guest accessed a virtual address that wasn't mapped. | 373 | /* The Guest accessed a virtual address that wasn't mapped. |
@@ -403,15 +413,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) | |||
403 | * up the pointer now to indicate a hypercall is pending. */ | 413 | * up the pointer now to indicate a hypercall is pending. */ |
404 | cpu->hcall = (struct hcall_args *)cpu->regs; | 414 | cpu->hcall = (struct hcall_args *)cpu->regs; |
405 | return; | 415 | return; |
406 | case 6: | ||
407 | /* kvm hypercalls trigger an invalid opcode fault (6). | ||
408 | * We need to check if ring == GUEST_PL and | ||
409 | * faulting instruction == vmcall. */ | ||
410 | if (is_hypercall(cpu)) { | ||
411 | rewrite_hypercall(cpu); | ||
412 | return; | ||
413 | } | ||
414 | break; | ||
415 | } | 416 | } |
416 | 417 | ||
417 | /* We didn't handle the trap, so it needs to go to the Guest. */ | 418 | /* We didn't handle the trap, so it needs to go to the Guest. */ |