aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/lguest
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-18 12:24:01 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-18 12:24:01 -0500
commit53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
treedc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio updates from Rusty Russell: "OK, this has the big virtio 1.0 implementation, as specified by OASIS. On top of that is the major rework of lguest, to use PCI and virtio 1.0, to double-check the implementation. Then comes the inevitable fixes and cleanups from that work" * tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (80 commits) virtio: don't set VIRTIO_CONFIG_S_DRIVER_OK twice. virtio_net: unconditionally define struct virtio_net_hdr_v1. tools/lguest: don't use legacy definitions for net device in example launcher. virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined. tools/lguest: use common error macros in the example launcher. tools/lguest: give virtqueues names for better error messages tools/lguest: more documentation and checking of virtio 1.0 compliance. lguest: don't look in console features to find emerg_wr. tools/lguest: don't start devices until DRIVER_OK status set. tools/lguest: handle indirect partway through chain. tools/lguest: insert driver references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: insert device references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: rename virtio_pci_cfg_cap field to match spec. tools/lguest: fix features_accepted logic in example launcher. tools/lguest: handle device reset correctly in example launcher. virtual: Documentation: simplify and generalize paravirt_ops.txt lguest: remove NOTIFY call and eventfd facility. lguest: remove NOTIFY facility from demonstration launcher. lguest: use the PCI console device's emerg_wr for early boot messages. lguest: always put console in PCI slot #1. ...
Diffstat (limited to 'drivers/lguest')
-rw-r--r--drivers/lguest/Makefile3
-rw-r--r--drivers/lguest/core.c29
-rw-r--r--drivers/lguest/hypercalls.c7
-rw-r--r--drivers/lguest/lg.h26
-rw-r--r--drivers/lguest/lguest_device.c540
-rw-r--r--drivers/lguest/lguest_user.c221
-rw-r--r--drivers/lguest/page_tables.c75
-rw-r--r--drivers/lguest/x86/core.c198
8 files changed, 252 insertions, 847 deletions
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index c4197503900e..16f52ee73994 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,6 +1,3 @@
1# Guest requires the device configuration and probing code.
2obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
3
4# Host requires the other files, which can be a module. 1# Host requires the other files, which can be a module.
5obj-$(CONFIG_LGUEST) += lg.o 2obj-$(CONFIG_LGUEST) += lg.o
6lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ 3lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 6590558d1d31..7dc93aa004c8 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
208 */ 208 */
209int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 209int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
210{ 210{
211 /* If the launcher asked for a register with LHREQ_GETREG */
212 if (cpu->reg_read) {
213 if (put_user(*cpu->reg_read, user))
214 return -EFAULT;
215 cpu->reg_read = NULL;
216 return sizeof(*cpu->reg_read);
217 }
218
211 /* We stop running once the Guest is dead. */ 219 /* We stop running once the Guest is dead. */
212 while (!cpu->lg->dead) { 220 while (!cpu->lg->dead) {
213 unsigned int irq; 221 unsigned int irq;
@@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
217 if (cpu->hcall) 225 if (cpu->hcall)
218 do_hypercalls(cpu); 226 do_hypercalls(cpu);
219 227
220 /* 228 /* Do we have to tell the Launcher about a trap? */
221 * It's possible the Guest did a NOTIFY hypercall to the 229 if (cpu->pending.trap) {
222 * Launcher. 230 if (copy_to_user(user, &cpu->pending,
223 */ 231 sizeof(cpu->pending)))
224 if (cpu->pending_notify) { 232 return -EFAULT;
225 /* 233 return sizeof(cpu->pending);
226 * Does it just needs to write to a registered
227 * eventfd (ie. the appropriate virtqueue thread)?
228 */
229 if (!send_notify_to_eventfd(cpu)) {
230 /* OK, we tell the main Launcher. */
231 if (put_user(cpu->pending_notify, user))
232 return -EFAULT;
233 return sizeof(cpu->pending_notify);
234 }
235 } 234 }
236 235
237 /* 236 /*
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 83511eb0923d..1219af493c0f 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
117 /* Similarly, this sets the halted flag for run_guest(). */ 117 /* Similarly, this sets the halted flag for run_guest(). */
118 cpu->halted = 1; 118 cpu->halted = 1;
119 break; 119 break;
120 case LHCALL_NOTIFY:
121 cpu->pending_notify = args->arg1;
122 break;
123 default: 120 default:
124 /* It should be an architecture-specific hypercall. */ 121 /* It should be an architecture-specific hypercall. */
125 if (lguest_arch_do_hcall(cpu, args)) 122 if (lguest_arch_do_hcall(cpu, args))
@@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu)
189 * Stop doing hypercalls if they want to notify the Launcher: 186 * Stop doing hypercalls if they want to notify the Launcher:
190 * it needs to service this first. 187 * it needs to service this first.
191 */ 188 */
192 if (cpu->pending_notify) 189 if (cpu->pending.trap)
193 break; 190 break;
194 } 191 }
195} 192}
@@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu)
280 * NOTIFY to the Launcher, we want to return now. Otherwise we do 277 * NOTIFY to the Launcher, we want to return now. Otherwise we do
281 * the hypercall. 278 * the hypercall.
282 */ 279 */
283 if (!cpu->pending_notify) { 280 if (!cpu->pending.trap) {
284 do_hcall(cpu, cpu->hcall); 281 do_hcall(cpu, cpu->hcall);
285 /* 282 /*
286 * Tricky point: we reset the hcall pointer to mark the 283 * Tricky point: we reset the hcall pointer to mark the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 2eef40be4c04..307e8b39e7d1 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -50,7 +50,10 @@ struct lg_cpu {
50 /* Bitmap of what has changed: see CHANGED_* above. */ 50 /* Bitmap of what has changed: see CHANGED_* above. */
51 int changed; 51 int changed;
52 52
53 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 53 /* Pending operation. */
54 struct lguest_pending pending;
55
56 unsigned long *reg_read; /* register from LHREQ_GETREG */
54 57
55 /* At end of a page shared mapped over lguest_pages in guest. */ 58 /* At end of a page shared mapped over lguest_pages in guest. */
56 unsigned long regs_page; 59 unsigned long regs_page;
@@ -78,24 +81,18 @@ struct lg_cpu {
78 struct lg_cpu_arch arch; 81 struct lg_cpu_arch arch;
79}; 82};
80 83
81struct lg_eventfd {
82 unsigned long addr;
83 struct eventfd_ctx *event;
84};
85
86struct lg_eventfd_map {
87 unsigned int num;
88 struct lg_eventfd map[];
89};
90
91/* The private info the thread maintains about the guest. */ 84/* The private info the thread maintains about the guest. */
92struct lguest { 85struct lguest {
93 struct lguest_data __user *lguest_data; 86 struct lguest_data __user *lguest_data;
94 struct lg_cpu cpus[NR_CPUS]; 87 struct lg_cpu cpus[NR_CPUS];
95 unsigned int nr_cpus; 88 unsigned int nr_cpus;
96 89
90 /* Valid guest memory pages must be < this. */
97 u32 pfn_limit; 91 u32 pfn_limit;
98 92
93 /* Device memory is >= pfn_limit and < device_limit. */
94 u32 device_limit;
95
99 /* 96 /*
100 * This provides the offset to the base of guest-physical memory in the 97 * This provides the offset to the base of guest-physical memory in the
101 * Launcher. 98 * Launcher.
@@ -110,8 +107,6 @@ struct lguest {
110 unsigned int stack_pages; 107 unsigned int stack_pages;
111 u32 tsc_khz; 108 u32 tsc_khz;
112 109
113 struct lg_eventfd_map *eventfds;
114
115 /* Dead? */ 110 /* Dead? */
116 const char *dead; 111 const char *dead;
117}; 112};
@@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
197void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, 192void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
198 unsigned long vaddr, pte_t val); 193 unsigned long vaddr, pte_t val);
199void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); 194void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
200bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); 195bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
196 unsigned long *iomem);
201void pin_page(struct lg_cpu *cpu, unsigned long vaddr); 197void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
198bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
202unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); 199unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
203void page_table_guest_data_init(struct lg_cpu *cpu); 200void page_table_guest_data_init(struct lg_cpu *cpu);
204 201
@@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu);
210int lguest_arch_init_hypercalls(struct lg_cpu *cpu); 207int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
211int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); 208int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
212void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); 209void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
210unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
213 211
214/* <arch>/switcher.S: */ 212/* <arch>/switcher.S: */
215extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; 213extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
deleted file mode 100644
index 89088d6538fd..000000000000
--- a/drivers/lguest/lguest_device.c
+++ /dev/null
@@ -1,540 +0,0 @@
1/*P:050
2 * Lguest guests use a very simple method to describe devices. It's a
3 * series of device descriptors contained just above the top of normal Guest
4 * memory.
5 *
6 * We use the standard "virtio" device infrastructure, which provides us with a
7 * console, a network and a block driver. Each one expects some configuration
8 * information and a "virtqueue" or two to send and receive data.
9:*/
10#include <linux/init.h>
11#include <linux/bootmem.h>
12#include <linux/lguest_launcher.h>
13#include <linux/virtio.h>
14#include <linux/virtio_config.h>
15#include <linux/interrupt.h>
16#include <linux/virtio_ring.h>
17#include <linux/err.h>
18#include <linux/export.h>
19#include <linux/slab.h>
20#include <asm/io.h>
21#include <asm/paravirt.h>
22#include <asm/lguest_hcall.h>
23
24/* The pointer to our (page) of device descriptions. */
25static void *lguest_devices;
26
27/*
28 * For Guests, device memory can be used as normal memory, so we cast away the
29 * __iomem to quieten sparse.
30 */
31static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
32{
33 return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
34}
35
36static inline void lguest_unmap(void *addr)
37{
38 iounmap((__force void __iomem *)addr);
39}
40
41/*D:100
42 * Each lguest device is just a virtio device plus a pointer to its entry
43 * in the lguest_devices page.
44 */
45struct lguest_device {
46 struct virtio_device vdev;
47
48 /* The entry in the lguest_devices page for this device. */
49 struct lguest_device_desc *desc;
50};
51
52/*
53 * Since the virtio infrastructure hands us a pointer to the virtio_device all
54 * the time, it helps to have a curt macro to get a pointer to the struct
55 * lguest_device it's enclosed in.
56 */
57#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
58
59/*D:130
60 * Device configurations
61 *
62 * The configuration information for a device consists of one or more
63 * virtqueues, a feature bitmap, and some configuration bytes. The
64 * configuration bytes don't really matter to us: the Launcher sets them up, and
65 * the driver will look at them during setup.
66 *
67 * A convenient routine to return the device's virtqueue config array:
68 * immediately after the descriptor.
69 */
70static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
71{
72 return (void *)(desc + 1);
73}
74
75/* The features come immediately after the virtqueues. */
76static u8 *lg_features(const struct lguest_device_desc *desc)
77{
78 return (void *)(lg_vq(desc) + desc->num_vq);
79}
80
81/* The config space comes after the two feature bitmasks. */
82static u8 *lg_config(const struct lguest_device_desc *desc)
83{
84 return lg_features(desc) + desc->feature_len * 2;
85}
86
87/* The total size of the config page used by this device (incl. desc) */
88static unsigned desc_size(const struct lguest_device_desc *desc)
89{
90 return sizeof(*desc)
91 + desc->num_vq * sizeof(struct lguest_vqconfig)
92 + desc->feature_len * 2
93 + desc->config_len;
94}
95
96/* This gets the device's feature bits. */
97static u64 lg_get_features(struct virtio_device *vdev)
98{
99 unsigned int i;
100 u32 features = 0;
101 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
102 u8 *in_features = lg_features(desc);
103
104 /* We do this the slow but generic way. */
105 for (i = 0; i < min(desc->feature_len * 8, 32); i++)
106 if (in_features[i / 8] & (1 << (i % 8)))
107 features |= (1 << i);
108
109 return features;
110}
111
112/*
113 * To notify on reset or feature finalization, we (ab)use the NOTIFY
114 * hypercall, with the descriptor address of the device.
115 */
116static void status_notify(struct virtio_device *vdev)
117{
118 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
119
120 hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
121}
122
123/*
124 * The virtio core takes the features the Host offers, and copies the ones
125 * supported by the driver into the vdev->features array. Once that's all
126 * sorted out, this routine is called so we can tell the Host which features we
127 * understand and accept.
128 */
129static int lg_finalize_features(struct virtio_device *vdev)
130{
131 unsigned int i, bits;
132 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
133 /* Second half of bitmap is features we accept. */
134 u8 *out_features = lg_features(desc) + desc->feature_len;
135
136 /* Give virtio_ring a chance to accept features. */
137 vring_transport_features(vdev);
138
139 /* Make sure we don't have any features > 32 bits! */
140 BUG_ON((u32)vdev->features != vdev->features);
141
142 /*
143 * Since lguest is currently x86-only, we're little-endian. That
144 * means we could just memcpy. But it's not time critical, and in
145 * case someone copies this code, we do it the slow, obvious way.
146 */
147 memset(out_features, 0, desc->feature_len);
148 bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
149 for (i = 0; i < bits; i++) {
150 if (__virtio_test_bit(vdev, i))
151 out_features[i / 8] |= (1 << (i % 8));
152 }
153
154 /* Tell Host we've finished with this device's feature negotiation */
155 status_notify(vdev);
156
157 return 0;
158}
159
160/* Once they've found a field, getting a copy of it is easy. */
161static void lg_get(struct virtio_device *vdev, unsigned int offset,
162 void *buf, unsigned len)
163{
164 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
165
166 /* Check they didn't ask for more than the length of the config! */
167 BUG_ON(offset + len > desc->config_len);
168 memcpy(buf, lg_config(desc) + offset, len);
169}
170
171/* Setting the contents is also trivial. */
172static void lg_set(struct virtio_device *vdev, unsigned int offset,
173 const void *buf, unsigned len)
174{
175 struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
176
177 /* Check they didn't ask for more than the length of the config! */
178 BUG_ON(offset + len > desc->config_len);
179 memcpy(lg_config(desc) + offset, buf, len);
180}
181
182/*
183 * The operations to get and set the status word just access the status field
184 * of the device descriptor.
185 */
186static u8 lg_get_status(struct virtio_device *vdev)
187{
188 return to_lgdev(vdev)->desc->status;
189}
190
191static void lg_set_status(struct virtio_device *vdev, u8 status)
192{
193 BUG_ON(!status);
194 to_lgdev(vdev)->desc->status = status;
195
196 /* Tell Host immediately if we failed. */
197 if (status & VIRTIO_CONFIG_S_FAILED)
198 status_notify(vdev);
199}
200
201static void lg_reset(struct virtio_device *vdev)
202{
203 /* 0 status means "reset" */
204 to_lgdev(vdev)->desc->status = 0;
205 status_notify(vdev);
206}
207
208/*
209 * Virtqueues
210 *
211 * The other piece of infrastructure virtio needs is a "virtqueue": a way of
212 * the Guest device registering buffers for the other side to read from or
213 * write into (ie. send and receive buffers). Each device can have multiple
214 * virtqueues: for example the console driver uses one queue for sending and
215 * another for receiving.
216 *
217 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
218 * already exists in virtio_ring.c. We just need to connect it up.
219 *
220 * We start with the information we need to keep about each virtqueue.
221 */
222
223/*D:140 This is the information we remember about each virtqueue. */
224struct lguest_vq_info {
225 /* A copy of the information contained in the device config. */
226 struct lguest_vqconfig config;
227
228 /* The address where we mapped the virtio ring, so we can unmap it. */
229 void *pages;
230};
231
232/*
233 * When the virtio_ring code wants to prod the Host, it calls us here and we
234 * make a hypercall. We hand the physical address of the virtqueue so the Host
235 * knows which virtqueue we're talking about.
236 */
237static bool lg_notify(struct virtqueue *vq)
238{
239 /*
240 * We store our virtqueue information in the "priv" pointer of the
241 * virtqueue structure.
242 */
243 struct lguest_vq_info *lvq = vq->priv;
244
245 hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
246 return true;
247}
248
249/* An extern declaration inside a C file is bad form. Don't do it. */
250extern int lguest_setup_irq(unsigned int irq);
251
252/*
253 * This routine finds the Nth virtqueue described in the configuration of
254 * this device and sets it up.
255 *
256 * This is kind of an ugly duckling. It'd be nicer to have a standard
257 * representation of a virtqueue in the configuration space, but it seems that
258 * everyone wants to do it differently. The KVM coders want the Guest to
259 * allocate its own pages and tell the Host where they are, but for lguest it's
260 * simpler for the Host to simply tell us where the pages are.
261 */
262static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
263 unsigned index,
264 void (*callback)(struct virtqueue *vq),
265 const char *name)
266{
267 struct lguest_device *ldev = to_lgdev(vdev);
268 struct lguest_vq_info *lvq;
269 struct virtqueue *vq;
270 int err;
271
272 if (!name)
273 return NULL;
274
275 /* We must have this many virtqueues. */
276 if (index >= ldev->desc->num_vq)
277 return ERR_PTR(-ENOENT);
278
279 lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
280 if (!lvq)
281 return ERR_PTR(-ENOMEM);
282
283 /*
284 * Make a copy of the "struct lguest_vqconfig" entry, which sits after
285 * the descriptor. We need a copy because the config space might not
286 * be aligned correctly.
287 */
288 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
289
290 printk("Mapping virtqueue %i addr %lx\n", index,
291 (unsigned long)lvq->config.pfn << PAGE_SHIFT);
292 /* Figure out how many pages the ring will take, and map that memory */
293 lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
294 DIV_ROUND_UP(vring_size(lvq->config.num,
295 LGUEST_VRING_ALIGN),
296 PAGE_SIZE));
297 if (!lvq->pages) {
298 err = -ENOMEM;
299 goto free_lvq;
300 }
301
302 /*
303 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
304 * and we've got a pointer to its pages. Note that we set weak_barriers
305 * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
306 * barriers.
307 */
308 vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
309 true, lvq->pages, lg_notify, callback, name);
310 if (!vq) {
311 err = -ENOMEM;
312 goto unmap;
313 }
314
315 /* Make sure the interrupt is allocated. */
316 err = lguest_setup_irq(lvq->config.irq);
317 if (err)
318 goto destroy_vring;
319
320 /*
321 * Tell the interrupt for this virtqueue to go to the virtio_ring
322 * interrupt handler.
323 *
324 * FIXME: We used to have a flag for the Host to tell us we could use
325 * the interrupt as a source of randomness: it'd be nice to have that
326 * back.
327 */
328 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
329 dev_name(&vdev->dev), vq);
330 if (err)
331 goto free_desc;
332
333 /*
334 * Last of all we hook up our 'struct lguest_vq_info" to the
335 * virtqueue's priv pointer.
336 */
337 vq->priv = lvq;
338 return vq;
339
340free_desc:
341 irq_free_desc(lvq->config.irq);
342destroy_vring:
343 vring_del_virtqueue(vq);
344unmap:
345 lguest_unmap(lvq->pages);
346free_lvq:
347 kfree(lvq);
348 return ERR_PTR(err);
349}
350/*:*/
351
352/* Cleaning up a virtqueue is easy */
353static void lg_del_vq(struct virtqueue *vq)
354{
355 struct lguest_vq_info *lvq = vq->priv;
356
357 /* Release the interrupt */
358 free_irq(lvq->config.irq, vq);
359 /* Tell virtio_ring.c to free the virtqueue. */
360 vring_del_virtqueue(vq);
361 /* Unmap the pages containing the ring. */
362 lguest_unmap(lvq->pages);
363 /* Free our own queue information. */
364 kfree(lvq);
365}
366
367static void lg_del_vqs(struct virtio_device *vdev)
368{
369 struct virtqueue *vq, *n;
370
371 list_for_each_entry_safe(vq, n, &vdev->vqs, list)
372 lg_del_vq(vq);
373}
374
375static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
376 struct virtqueue *vqs[],
377 vq_callback_t *callbacks[],
378 const char *names[])
379{
380 struct lguest_device *ldev = to_lgdev(vdev);
381 int i;
382
383 /* We must have this many virtqueues. */
384 if (nvqs > ldev->desc->num_vq)
385 return -ENOENT;
386
387 for (i = 0; i < nvqs; ++i) {
388 vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
389 if (IS_ERR(vqs[i]))
390 goto error;
391 }
392 return 0;
393
394error:
395 lg_del_vqs(vdev);
396 return PTR_ERR(vqs[i]);
397}
398
399static const char *lg_bus_name(struct virtio_device *vdev)
400{
401 return "";
402}
403
404/* The ops structure which hooks everything together. */
405static const struct virtio_config_ops lguest_config_ops = {
406 .get_features = lg_get_features,
407 .finalize_features = lg_finalize_features,
408 .get = lg_get,
409 .set = lg_set,
410 .get_status = lg_get_status,
411 .set_status = lg_set_status,
412 .reset = lg_reset,
413 .find_vqs = lg_find_vqs,
414 .del_vqs = lg_del_vqs,
415 .bus_name = lg_bus_name,
416};
417
418/*
419 * The root device for the lguest virtio devices. This makes them appear as
420 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
421 */
422static struct device *lguest_root;
423
424/*D:120
425 * This is the core of the lguest bus: actually adding a new device.
426 * It's a separate function because it's neater that way, and because an
427 * earlier version of the code supported hotplug and unplug. They were removed
428 * early on because they were never used.
429 *
430 * As Andrew Tridgell says, "Untested code is buggy code".
431 *
432 * It's worth reading this carefully: we start with a pointer to the new device
433 * descriptor in the "lguest_devices" page, and the offset into the device
434 * descriptor page so we can uniquely identify it if things go badly wrong.
435 */
436static void add_lguest_device(struct lguest_device_desc *d,
437 unsigned int offset)
438{
439 struct lguest_device *ldev;
440
441 /* Start with zeroed memory; Linux's device layer counts on it. */
442 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
443 if (!ldev) {
444 printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
445 offset, d->type);
446 return;
447 }
448
449 /* This devices' parent is the lguest/ dir. */
450 ldev->vdev.dev.parent = lguest_root;
451 /*
452 * The device type comes straight from the descriptor. There's also a
453 * device vendor field in the virtio_device struct, which we leave as
454 * 0.
455 */
456 ldev->vdev.id.device = d->type;
457 /*
458 * We have a simple set of routines for querying the device's
459 * configuration information and setting its status.
460 */
461 ldev->vdev.config = &lguest_config_ops;
462 /* And we remember the device's descriptor for lguest_config_ops. */
463 ldev->desc = d;
464
465 /*
466 * register_virtio_device() sets up the generic fields for the struct
467 * virtio_device and calls device_register(). This makes the bus
468 * infrastructure look for a matching driver.
469 */
470 if (register_virtio_device(&ldev->vdev) != 0) {
471 printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
472 offset, d->type);
473 kfree(ldev);
474 }
475}
476
477/*D:110
478 * scan_devices() simply iterates through the device page. The type 0 is
479 * reserved to mean "end of devices".
480 */
481static void scan_devices(void)
482{
483 unsigned int i;
484 struct lguest_device_desc *d;
485
486 /* We start at the page beginning, and skip over each entry. */
487 for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
488 d = lguest_devices + i;
489
490 /* Once we hit a zero, stop. */
491 if (d->type == 0)
492 break;
493
494 printk("Device at %i has size %u\n", i, desc_size(d));
495 add_lguest_device(d, i);
496 }
497}
498
499/*D:105
500 * Fairly early in boot, lguest_devices_init() is called to set up the
501 * lguest device infrastructure. We check that we are a Guest by checking
502 * pv_info.name: there are other ways of checking, but this seems most
503 * obvious to me.
504 *
505 * So we can access the "struct lguest_device_desc"s easily, we map that memory
506 * and store the pointer in the global "lguest_devices". Then we register a
507 * root device from which all our devices will hang (this seems to be the
508 * correct sysfs incantation).
509 *
510 * Finally we call scan_devices() which adds all the devices found in the
511 * lguest_devices page.
512 */
513static int __init lguest_devices_init(void)
514{
515 if (strcmp(pv_info.name, "lguest") != 0)
516 return 0;
517
518 lguest_root = root_device_register("lguest");
519 if (IS_ERR(lguest_root))
520 panic("Could not register lguest root");
521
522 /* Devices are in a single page above top of "normal" mem */
523 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
524
525 scan_devices();
526 return 0;
527}
528/* We do this after core stuff, but before the drivers. */
529postcore_initcall(lguest_devices_init);
530
531/*D:150
532 * At this point in the journey we used to now wade through the lguest
533 * devices themselves: net, block and console. Since they're all now virtio
534 * devices rather than lguest-specific, I've decided to ignore them. Mostly,
535 * they're kind of boring. But this does mean you'll never experience the
536 * thrill of reading the forbidden love scene buried deep in the block driver.
537 *
538 * "make Launcher" beckons, where we answer questions like "Where do Guests
539 * come from?", and "What do you do when someone asks for optimization?".
540 */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 4263f4cc8c55..c4c6113eb9a6 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -2,175 +2,62 @@
2 * launcher controls and communicates with the Guest. For example, 2 * launcher controls and communicates with the Guest. For example,
3 * the first write will tell us the Guest's memory layout and entry 3 * the first write will tell us the Guest's memory layout and entry
4 * point. A read will run the Guest until something happens, such as 4 * point. A read will run the Guest until something happens, such as
5 * a signal or the Guest doing a NOTIFY out to the Launcher. There is 5 * a signal or the Guest accessing a device.
6 * also a way for the Launcher to attach eventfds to particular NOTIFY
7 * values instead of returning from the read() call.
8:*/ 6:*/
9#include <linux/uaccess.h> 7#include <linux/uaccess.h>
10#include <linux/miscdevice.h> 8#include <linux/miscdevice.h>
11#include <linux/fs.h> 9#include <linux/fs.h>
12#include <linux/sched.h> 10#include <linux/sched.h>
13#include <linux/eventfd.h>
14#include <linux/file.h> 11#include <linux/file.h>
15#include <linux/slab.h> 12#include <linux/slab.h>
16#include <linux/export.h> 13#include <linux/export.h>
17#include "lg.h" 14#include "lg.h"
18 15
19/*L:056 16/*L:052
20 * Before we move on, let's jump ahead and look at what the kernel does when 17 The Launcher can get the registers, and also set some of them.
21 * it needs to look up the eventfds. That will complete our picture of how we 18*/
22 * use RCU. 19static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
23 *
24 * The notification value is in cpu->pending_notify: we return true if it went
25 * to an eventfd.
26 */
27bool send_notify_to_eventfd(struct lg_cpu *cpu)
28{
29 unsigned int i;
30 struct lg_eventfd_map *map;
31
32 /*
33 * This "rcu_read_lock()" helps track when someone is still looking at
34 * the (RCU-using) eventfds array. It's not actually a lock at all;
35 * indeed it's a noop in many configurations. (You didn't expect me to
36 * explain all the RCU secrets here, did you?)
37 */
38 rcu_read_lock();
39 /*
40 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
41 * makes sure we don't access the memory pointed to by
42 * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
43 * but Alpha allows this! Paul McKenney points out that a really
44 * aggressive compiler could have the same effect:
45 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
46 *
47 * So play safe, use rcu_dereference to get the rcu-protected pointer:
48 */
49 map = rcu_dereference(cpu->lg->eventfds);
50 /*
51 * Simple array search: even if they add an eventfd while we do this,
52 * we'll continue to use the old array and just won't see the new one.
53 */
54 for (i = 0; i < map->num; i++) {
55 if (map->map[i].addr == cpu->pending_notify) {
56 eventfd_signal(map->map[i].event, 1);
57 cpu->pending_notify = 0;
58 break;
59 }
60 }
61 /* We're done with the rcu-protected variable cpu->lg->eventfds. */
62 rcu_read_unlock();
63
64 /* If we cleared the notification, it's because we found a match. */
65 return cpu->pending_notify == 0;
66}
67
68/*L:055
69 * One of the more tricksy tricks in the Linux Kernel is a technique called
70 * Read Copy Update. Since one point of lguest is to teach lguest journeyers
71 * about kernel coding, I use it here. (In case you're curious, other purposes
72 * include learning about virtualization and instilling a deep appreciation for
73 * simplicity and puppies).
74 *
75 * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
76 * add new eventfds without ever blocking readers from accessing the array.
77 * The current Launcher only does this during boot, so that never happens. But
78 * Read Copy Update is cool, and adding a lock risks damaging even more puppies
79 * than this code does.
80 *
81 * We allocate a brand new one-larger array, copy the old one and add our new
82 * element. Then we make the lg eventfd pointer point to the new array.
83 * That's the easy part: now we need to free the old one, but we need to make
84 * sure no slow CPU somewhere is still looking at it. That's what
85 * synchronize_rcu does for us: waits until every CPU has indicated that it has
86 * moved on to know it's no longer using the old one.
87 *
88 * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
89 */
90static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
91{ 20{
92 struct lg_eventfd_map *new, *old = lg->eventfds; 21 unsigned long which;
93
94 /*
95 * We don't allow notifications on value 0 anyway (pending_notify of
96 * 0 means "nothing pending").
97 */
98 if (!addr)
99 return -EINVAL;
100
101 /*
102 * Replace the old array with the new one, carefully: others can
103 * be accessing it at the same time.
104 */
105 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
106 GFP_KERNEL);
107 if (!new)
108 return -ENOMEM;
109 22
110 /* First make identical copy. */ 23 /* We re-use the ptrace structure to specify which register to read. */
111 memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); 24 if (get_user(which, input) != 0)
112 new->num = old->num; 25 return -EFAULT;
113
114 /* Now append new entry. */
115 new->map[new->num].addr = addr;
116 new->map[new->num].event = eventfd_ctx_fdget(fd);
117 if (IS_ERR(new->map[new->num].event)) {
118 int err = PTR_ERR(new->map[new->num].event);
119 kfree(new);
120 return err;
121 }
122 new->num++;
123 26
124 /* 27 /*
125 * Now put new one in place: rcu_assign_pointer() is a fancy way of 28 * We set up the cpu register pointer, and their next read will
126 * doing "lg->eventfds = new", but it uses memory barriers to make 29 * actually get the value (instead of running the guest).
127 * absolutely sure that the contents of "new" written above is nailed
128 * down before we actually do the assignment.
129 * 30 *
130 * We have to think about these kinds of things when we're operating on 31 * The last argument 'true' says we can access any register.
131 * live data without locks.
132 */ 32 */
133 rcu_assign_pointer(lg->eventfds, new); 33 cpu->reg_read = lguest_arch_regptr(cpu, which, true);
34 if (!cpu->reg_read)
35 return -ENOENT;
134 36
135 /* 37 /* And because this is a write() call, we return the length used. */
136 * We're not in a big hurry. Wait until no one's looking at old 38 return sizeof(unsigned long) * 2;
137 * version, then free it.
138 */
139 synchronize_rcu();
140 kfree(old);
141
142 return 0;
143} 39}
144 40
145/*L:052 41static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
146 * Receiving notifications from the Guest is usually done by attaching a
147 * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will
148 * become readable when the Guest does an LHCALL_NOTIFY with that value.
149 *
150 * This is really convenient for processing each virtqueue in a separate
151 * thread.
152 */
153static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
154{ 42{
155 unsigned long addr, fd; 43 unsigned long which, value, *reg;
156 int err;
157 44
158 if (get_user(addr, input) != 0) 45 /* We re-use the ptrace structure to specify which register to set. */
46 if (get_user(which, input) != 0)
159 return -EFAULT; 47 return -EFAULT;
160 input++; 48 input++;
161 if (get_user(fd, input) != 0) 49 if (get_user(value, input) != 0)
162 return -EFAULT; 50 return -EFAULT;
163 51
164 /* 52 /* The last argument 'false' means we can't access all registers. */
165 * Just make sure two callers don't add eventfds at once. We really 53 reg = lguest_arch_regptr(cpu, which, false);
166 * only need to lock against callers adding to the same Guest, so using 54 if (!reg)
167 * the Big Lguest Lock is overkill. But this is setup, not a fast path. 55 return -ENOENT;
168 */
169 mutex_lock(&lguest_lock);
170 err = add_eventfd(lg, addr, fd);
171 mutex_unlock(&lguest_lock);
172 56
173 return err; 57 *reg = value;
58
59 /* And because this is a write() call, we return the length used. */
60 return sizeof(unsigned long) * 3;
174} 61}
175 62
176/*L:050 63/*L:050
@@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
194 return 0; 81 return 0;
195} 82}
196 83
84/*L:053
85 * Deliver a trap: this is used by the Launcher if it can't emulate
86 * an instruction.
87 */
88static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
89{
90 unsigned long trapnum;
91
92 if (get_user(trapnum, input) != 0)
93 return -EFAULT;
94
95 if (!deliver_trap(cpu, trapnum))
96 return -EINVAL;
97
98 return 0;
99}
100
197/*L:040 101/*L:040
198 * Once our Guest is initialized, the Launcher makes it run by reading 102 * Once our Guest is initialized, the Launcher makes it run by reading
199 * from /dev/lguest. 103 * from /dev/lguest.
@@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
237 * If we returned from read() last time because the Guest sent I/O, 141 * If we returned from read() last time because the Guest sent I/O,
238 * clear the flag. 142 * clear the flag.
239 */ 143 */
240 if (cpu->pending_notify) 144 if (cpu->pending.trap)
241 cpu->pending_notify = 0; 145 cpu->pending.trap = 0;
242 146
243 /* Run the Guest until something interesting happens. */ 147 /* Run the Guest until something interesting happens. */
244 return run_guest(cpu, (unsigned long __user *)user); 148 return run_guest(cpu, (unsigned long __user *)user);
@@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
319 /* "struct lguest" contains all we (the Host) know about a Guest. */ 223 /* "struct lguest" contains all we (the Host) know about a Guest. */
320 struct lguest *lg; 224 struct lguest *lg;
321 int err; 225 int err;
322 unsigned long args[3]; 226 unsigned long args[4];
323 227
324 /* 228 /*
325 * We grab the Big Lguest lock, which protects against multiple 229 * We grab the Big Lguest lock, which protects against multiple
@@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input)
343 goto unlock; 247 goto unlock;
344 } 248 }
345 249
346 lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
347 if (!lg->eventfds) {
348 err = -ENOMEM;
349 goto free_lg;
350 }
351 lg->eventfds->num = 0;
352
353 /* Populate the easy fields of our "struct lguest" */ 250 /* Populate the easy fields of our "struct lguest" */
354 lg->mem_base = (void __user *)args[0]; 251 lg->mem_base = (void __user *)args[0];
355 lg->pfn_limit = args[1]; 252 lg->pfn_limit = args[1];
253 lg->device_limit = args[3];
356 254
357 /* This is the first cpu (cpu 0) and it will start booting at args[2] */ 255 /* This is the first cpu (cpu 0) and it will start booting at args[2] */
358 err = lg_cpu_start(&lg->cpus[0], 0, args[2]); 256 err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
359 if (err) 257 if (err)
360 goto free_eventfds; 258 goto free_lg;
361 259
362 /* 260 /*
363 * Initialize the Guest's shadow page tables. This allocates 261 * Initialize the Guest's shadow page tables. This allocates
@@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
378free_regs: 276free_regs:
379 /* FIXME: This should be in free_vcpu */ 277 /* FIXME: This should be in free_vcpu */
380 free_page(lg->cpus[0].regs_page); 278 free_page(lg->cpus[0].regs_page);
381free_eventfds:
382 kfree(lg->eventfds);
383free_lg: 279free_lg:
384 kfree(lg); 280 kfree(lg);
385unlock: 281unlock:
@@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in,
432 return initialize(file, input); 328 return initialize(file, input);
433 case LHREQ_IRQ: 329 case LHREQ_IRQ:
434 return user_send_irq(cpu, input); 330 return user_send_irq(cpu, input);
435 case LHREQ_EVENTFD: 331 case LHREQ_GETREG:
436 return attach_eventfd(lg, input); 332 return getreg_setup(cpu, input);
333 case LHREQ_SETREG:
334 return setreg(cpu, input);
335 case LHREQ_TRAP:
336 return trap(cpu, input);
437 default: 337 default:
438 return -EINVAL; 338 return -EINVAL;
439 } 339 }
@@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file)
478 mmput(lg->cpus[i].mm); 378 mmput(lg->cpus[i].mm);
479 } 379 }
480 380
481 /* Release any eventfds they registered. */
482 for (i = 0; i < lg->eventfds->num; i++)
483 eventfd_ctx_put(lg->eventfds->map[i].event);
484 kfree(lg->eventfds);
485
486 /* 381 /*
487 * If lg->dead doesn't contain an error code it will be NULL or a 382 * If lg->dead doesn't contain an error code it will be NULL or a
488 * kmalloc()ed string, either of which is ok to hand to kfree(). 383 * kmalloc()ed string, either of which is ok to hand to kfree().
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index e8b55c3a6170..e3abebc912c0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -250,6 +250,16 @@ static void release_pte(pte_t pte)
250} 250}
251/*:*/ 251/*:*/
252 252
253static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
254{
255 /* We don't handle large pages. */
256 if (pte_flags(gpte) & _PAGE_PSE)
257 return false;
258
259 return (pte_pfn(gpte) >= cpu->lg->pfn_limit
260 && pte_pfn(gpte) < cpu->lg->device_limit);
261}
262
253static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) 263static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
254{ 264{
255 if ((pte_flags(gpte) & _PAGE_PSE) || 265 if ((pte_flags(gpte) & _PAGE_PSE) ||
@@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
374 * 384 *
375 * If we fixed up the fault (ie. we mapped the address), this routine returns 385 * If we fixed up the fault (ie. we mapped the address), this routine returns
376 * true. Otherwise, it was a real fault and we need to tell the Guest. 386 * true. Otherwise, it was a real fault and we need to tell the Guest.
387 *
388 * There's a corner case: they're trying to access memory between
389 * pfn_limit and device_limit, which is I/O memory. In this case, we
390 * return false and set @iomem to the physical address, so the
391 * Launcher can handle the instruction manually.
377 */ 392 */
378bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 393bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
394 unsigned long *iomem)
379{ 395{
380 unsigned long gpte_ptr; 396 unsigned long gpte_ptr;
381 pte_t gpte; 397 pte_t gpte;
@@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
383 pmd_t gpmd; 399 pmd_t gpmd;
384 pgd_t gpgd; 400 pgd_t gpgd;
385 401
402 *iomem = 0;
403
386 /* We never demand page the Switcher, so trying is a mistake. */ 404 /* We never demand page the Switcher, so trying is a mistake. */
387 if (vaddr >= switcher_addr) 405 if (vaddr >= switcher_addr)
388 return false; 406 return false;
@@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
459 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 477 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
460 return false; 478 return false;
461 479
480 /* If they're accessing io memory, we expect a fault. */
481 if (gpte_in_iomem(cpu, gpte)) {
482 *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
483 return false;
484 }
485
462 /* 486 /*
463 * Check that the Guest PTE flags are OK, and the page number is below 487 * Check that the Guest PTE flags are OK, and the page number is below
464 * the pfn_limit (ie. not mapping the Launcher binary). 488 * the pfn_limit (ie. not mapping the Launcher binary).
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
553 */ 577 */
554void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 578void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
555{ 579{
556 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 580 unsigned long iomem;
581
582 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
557 kill_guest(cpu, "bad stack page %#lx", vaddr); 583 kill_guest(cpu, "bad stack page %#lx", vaddr);
558} 584}
559/*:*/ 585/*:*/
@@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu)
647/*:*/ 673/*:*/
648 674
649/* We walk down the guest page tables to get a guest-physical address */ 675/* We walk down the guest page tables to get a guest-physical address */
650unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) 676bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
651{ 677{
652 pgd_t gpgd; 678 pgd_t gpgd;
653 pte_t gpte; 679 pte_t gpte;
@@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
656#endif 682#endif
657 683
658 /* Still not set up? Just map 1:1. */ 684 /* Still not set up? Just map 1:1. */
659 if (unlikely(cpu->linear_pages)) 685 if (unlikely(cpu->linear_pages)) {
660 return vaddr; 686 *paddr = vaddr;
687 return true;
688 }
661 689
662 /* First step: get the top-level Guest page table entry. */ 690 /* First step: get the top-level Guest page table entry. */
663 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 691 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
664 /* Toplevel not present? We can't map it in. */ 692 /* Toplevel not present? We can't map it in. */
665 if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) { 693 if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
666 kill_guest(cpu, "Bad address %#lx", vaddr); 694 goto fail;
667 return -1UL;
668 }
669 695
670#ifdef CONFIG_X86_PAE 696#ifdef CONFIG_X86_PAE
671 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 697 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
672 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) { 698 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
673 kill_guest(cpu, "Bad address %#lx", vaddr); 699 goto fail;
674 return -1UL;
675 }
676 gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); 700 gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
677#else 701#else
678 gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); 702 gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
679#endif 703#endif
680 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 704 if (!(pte_flags(gpte) & _PAGE_PRESENT))
681 kill_guest(cpu, "Bad address %#lx", vaddr); 705 goto fail;
706
707 *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
708 return true;
709
710fail:
711 *paddr = -1UL;
712 return false;
713}
682 714
683 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 715/*
716 * This is the version we normally use: kills the Guest if it uses a
717 * bad address
718 */
719unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
720{
721 unsigned long paddr;
722
723 if (!__guest_pa(cpu, vaddr, &paddr))
724 kill_guest(cpu, "Bad address %#lx", vaddr);
725 return paddr;
684} 726}
685 727
686/* 728/*
@@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx,
912 * now. This shaves 10% off a copy-on-write 954 * now. This shaves 10% off a copy-on-write
913 * micro-benchmark. 955 * micro-benchmark.
914 */ 956 */
915 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 957 if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
958 && !gpte_in_iomem(cpu, gpte)) {
916 if (!check_gpte(cpu, gpte)) 959 if (!check_gpte(cpu, gpte))
917 return; 960 return;
918 set_pte(spte, 961 set_pte(spte,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6adfd7ba4c97..30f2aef69d78 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
182} 182}
183/*:*/ 183/*:*/
184 184
185unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
186{
187 switch (reg_off) {
188 case offsetof(struct pt_regs, bx):
189 return &cpu->regs->ebx;
190 case offsetof(struct pt_regs, cx):
191 return &cpu->regs->ecx;
192 case offsetof(struct pt_regs, dx):
193 return &cpu->regs->edx;
194 case offsetof(struct pt_regs, si):
195 return &cpu->regs->esi;
196 case offsetof(struct pt_regs, di):
197 return &cpu->regs->edi;
198 case offsetof(struct pt_regs, bp):
199 return &cpu->regs->ebp;
200 case offsetof(struct pt_regs, ax):
201 return &cpu->regs->eax;
202 case offsetof(struct pt_regs, ip):
203 return &cpu->regs->eip;
204 case offsetof(struct pt_regs, sp):
205 return &cpu->regs->esp;
206 }
207
208 /* Launcher can read these, but we don't allow any setting. */
209 if (any) {
210 switch (reg_off) {
211 case offsetof(struct pt_regs, ds):
212 return &cpu->regs->ds;
213 case offsetof(struct pt_regs, es):
214 return &cpu->regs->es;
215 case offsetof(struct pt_regs, fs):
216 return &cpu->regs->fs;
217 case offsetof(struct pt_regs, gs):
218 return &cpu->regs->gs;
219 case offsetof(struct pt_regs, cs):
220 return &cpu->regs->cs;
221 case offsetof(struct pt_regs, flags):
222 return &cpu->regs->eflags;
223 case offsetof(struct pt_regs, ss):
224 return &cpu->regs->ss;
225 }
226 }
227
228 return NULL;
229}
230
185/*M:002 231/*M:002
186 * There are hooks in the scheduler which we can register to tell when we 232 * There are hooks in the scheduler which we can register to tell when we
187 * get kicked off the CPU (preempt_notifier_register()). This would allow us 233 * get kicked off the CPU (preempt_notifier_register()). This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
269 * usually attached to a PC. 315 * usually attached to a PC.
270 * 316 *
271 * When the Guest uses one of these instructions, we get a trap (General 317 * When the Guest uses one of these instructions, we get a trap (General
272 * Protection Fault) and come here. We see if it's one of those troublesome 318 * Protection Fault) and come here. We queue this to be sent out to the
273 * instructions and skip over it. We return true if we did. 319 * Launcher to handle.
274 */ 320 */
275static int emulate_insn(struct lg_cpu *cpu)
276{
277 u8 insn;
278 unsigned int insnlen = 0, in = 0, small_operand = 0;
279 /*
280 * The eip contains the *virtual* address of the Guest's instruction:
281 * walk the Guest's page tables to find the "physical" address.
282 */
283 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
284
285 /*
286 * This must be the Guest kernel trying to do something, not userspace!
287 * The bottom two bits of the CS segment register are the privilege
288 * level.
289 */
290 if ((cpu->regs->cs & 3) != GUEST_PL)
291 return 0;
292
293 /* Decoding x86 instructions is icky. */
294 insn = lgread(cpu, physaddr, u8);
295 321
296 /* 322/*
297 * Around 2.6.33, the kernel started using an emulation for the 323 * The eip contains the *virtual* address of the Guest's instruction:
298 * cmpxchg8b instruction in early boot on many configurations. This 324 * we copy the instruction here so the Launcher doesn't have to walk
299 * code isn't paravirtualized, and it tries to disable interrupts. 325 * the page tables to decode it. We handle the case (eg. in a kernel
300 * Ignore it, which will Mostly Work. 326 * module) where the instruction is over two pages, and the pages are
301 */ 327 * virtually but not physically contiguous.
302 if (insn == 0xfa) { 328 *
303 /* "cli", or Clear Interrupt Enable instruction. Skip it. */ 329 * The longest possible x86 instruction is 15 bytes, but we don't handle
304 cpu->regs->eip++; 330 * anything that strange.
305 return 1; 331 */
332static void copy_from_guest(struct lg_cpu *cpu,
333 void *dst, unsigned long vaddr, size_t len)
334{
335 size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
336 unsigned long paddr;
337
338 BUG_ON(len > PAGE_SIZE);
339
340 /* If it goes over a page, copy in two parts. */
341 if (len > to_page_end) {
342 /* But make sure the next page is mapped! */
343 if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
344 copy_from_guest(cpu, dst + to_page_end,
345 vaddr + to_page_end,
346 len - to_page_end);
347 else
348 /* Otherwise fill with zeroes. */
349 memset(dst + to_page_end, 0, len - to_page_end);
350 len = to_page_end;
306 } 351 }
307 352
308 /* 353 /* This will kill the guest if it isn't mapped, but that
309 * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 354 * shouldn't happen. */
310 */ 355 __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
311 if (insn == 0x66) { 356}
312 small_operand = 1;
313 /* The instruction is 1 byte so far, read the next byte. */
314 insnlen = 1;
315 insn = lgread(cpu, physaddr + insnlen, u8);
316 }
317 357
318 /*
319 * We can ignore the lower bit for the moment and decode the 4 opcodes
320 * we need to emulate.
321 */
322 switch (insn & 0xFE) {
323 case 0xE4: /* in <next byte>,%al */
324 insnlen += 2;
325 in = 1;
326 break;
327 case 0xEC: /* in (%dx),%al */
328 insnlen += 1;
329 in = 1;
330 break;
331 case 0xE6: /* out %al,<next byte> */
332 insnlen += 2;
333 break;
334 case 0xEE: /* out %al,(%dx) */
335 insnlen += 1;
336 break;
337 default:
338 /* OK, we don't know what this is, can't emulate. */
339 return 0;
340 }
341 358
342 /* 359static void setup_emulate_insn(struct lg_cpu *cpu)
343 * If it was an "IN" instruction, they expect the result to be read 360{
344 * into %eax, so we change %eax. We always return all-ones, which 361 cpu->pending.trap = 13;
345 * traditionally means "there's nothing there". 362 copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
346 */ 363 sizeof(cpu->pending.insn));
347 if (in) { 364}
348 /* Lower bit means it's a 32/16 bit access */
349 if (insn & 0x1) { 366static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
350 if (small_operand) 367{
351 cpu->regs->eax |= 0xFFFF; 368 cpu->pending.trap = 14;
352 else 369 cpu->pending.addr = iomem_addr;
353 cpu->regs->eax = 0xFFFFFFFF; 370 copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
354 } else 371 sizeof(cpu->pending.insn));
355 cpu->regs->eax |= 0xFF;
356 }
357 /* Finally, we've "done" the instruction, so move past it. */
358 cpu->regs->eip += insnlen;
359 /* Success! */
360 return 1;
361} 372}
362 373
363/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ 374/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
364void lguest_arch_handle_trap(struct lg_cpu *cpu) 375void lguest_arch_handle_trap(struct lg_cpu *cpu)
365{ 376{
377 unsigned long iomem_addr;
378
366 switch (cpu->regs->trapnum) { 379 switch (cpu->regs->trapnum) {
367 case 13: /* We've intercepted a General Protection Fault. */ 380 case 13: /* We've intercepted a General Protection Fault. */
368 /* 381 /* Hand to Launcher to emulate those pesky IN and OUT insns */
369 * Check if this was one of those annoying IN or OUT
370 * instructions which we need to emulate. If so, we just go
371 * back into the Guest after we've done it.
372 */
373 if (cpu->regs->errcode == 0) { 382 if (cpu->regs->errcode == 0) {
374 if (emulate_insn(cpu)) 383 setup_emulate_insn(cpu);
375 return; 384 return;
376 } 385 }
377 break; 386 break;
378 case 14: /* We've intercepted a Page Fault. */ 387 case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
387 * whether kernel or userspace code. 396 * whether kernel or userspace code.
388 */ 397 */
389 if (demand_page(cpu, cpu->arch.last_pagefault, 398 if (demand_page(cpu, cpu->arch.last_pagefault,
390 cpu->regs->errcode)) 399 cpu->regs->errcode, &iomem_addr))
391 return; 400 return;
392 401
402 /* Was this an access to memory mapped IO? */
403 if (iomem_addr) {
404 /* Tell Launcher, let it handle it. */
405 setup_iomem_insn(cpu, iomem_addr);
406 return;
407 }
408
393 /* 409 /*
394 * OK, it's really not there (or not OK): the Guest needs to 410 * OK, it's really not there (or not OK): the Guest needs to
395 * know. We write out the cr2 value so it knows where the 411 * know. We write out the cr2 value so it knows where the