diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2007-07-26 13:41:03 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-26 14:35:17 -0400 |
commit | dde797899ac17ebb812b7566044124d785e98dc7 (patch) | |
tree | 531ae7fd415d267e49acfedbbf4f03cf86e5eac1 /drivers/lguest | |
parent | e2c9784325490c878b7f69aeec1bed98b288bd97 (diff) |
lguest: documentation IV: Launcher
Documentation: The Launcher
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- | drivers/lguest/core.c | 24 | ||||
-rw-r--r-- | drivers/lguest/io.c | 247 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 25 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 159 |
4 files changed, 427 insertions, 28 deletions
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 2cea0c80c992..1eb05f9a56b6 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -208,24 +208,39 @@ static int emulate_insn(struct lguest *lg) | |||
208 | return 1; | 208 | return 1; |
209 | } | 209 | } |
210 | 210 | ||
211 | /*L:305 | ||
212 | * Dealing With Guest Memory. | ||
213 | * | ||
214 | * When the Guest gives us (what it thinks is) a physical address, we can use | ||
215 | * the normal copy_from_user() & copy_to_user() on that address: remember, | ||
216 | * Guest physical == Launcher virtual. | ||
217 | * | ||
218 | * But we can't trust the Guest: it might be trying to access the Launcher | ||
219 | * code. We have to check that the range is below the pfn_limit the Launcher | ||
220 | * gave us. We have to make sure that addr + len doesn't give us a false | ||
221 | * positive by overflowing, too. */ | ||
211 | int lguest_address_ok(const struct lguest *lg, | 222 | int lguest_address_ok(const struct lguest *lg, |
212 | unsigned long addr, unsigned long len) | 223 | unsigned long addr, unsigned long len) |
213 | { | 224 | { |
214 | return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); | 225 | return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); |
215 | } | 226 | } |
216 | 227 | ||
217 | /* Just like get_user, but don't let guest access lguest binary. */ | 228 | /* This is a convenient routine to get a 32-bit value from the Guest (a very |
229 | * common operation). Here we can see how useful the kill_lguest() routine we | ||
230 | * met in the Launcher can be: we return a random value (0) instead of needing | ||
231 | * to return an error. */ | ||
218 | u32 lgread_u32(struct lguest *lg, unsigned long addr) | 232 | u32 lgread_u32(struct lguest *lg, unsigned long addr) |
219 | { | 233 | { |
220 | u32 val = 0; | 234 | u32 val = 0; |
221 | 235 | ||
222 | /* Don't let them access lguest binary */ | 236 | /* Don't let them access lguest binary. */ |
223 | if (!lguest_address_ok(lg, addr, sizeof(val)) | 237 | if (!lguest_address_ok(lg, addr, sizeof(val)) |
224 | || get_user(val, (u32 __user *)addr) != 0) | 238 | || get_user(val, (u32 __user *)addr) != 0) |
225 | kill_guest(lg, "bad read address %#lx", addr); | 239 | kill_guest(lg, "bad read address %#lx", addr); |
226 | return val; | 240 | return val; |
227 | } | 241 | } |
228 | 242 | ||
243 | /* Same thing for writing a value. */ | ||
229 | void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) | 244 | void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) |
230 | { | 245 | { |
231 | if (!lguest_address_ok(lg, addr, sizeof(val)) | 246 | if (!lguest_address_ok(lg, addr, sizeof(val)) |
@@ -233,6 +248,9 @@ void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) | |||
233 | kill_guest(lg, "bad write address %#lx", addr); | 248 | kill_guest(lg, "bad write address %#lx", addr); |
234 | } | 249 | } |
235 | 250 | ||
251 | /* This routine is more generic, and copies a range of Guest bytes into a | ||
252 | * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so | ||
253 | * the caller doesn't end up using uninitialized kernel memory. */ | ||
236 | void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) | 254 | void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) |
237 | { | 255 | { |
238 | if (!lguest_address_ok(lg, addr, bytes) | 256 | if (!lguest_address_ok(lg, addr, bytes) |
@@ -243,6 +261,7 @@ void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) | |||
243 | } | 261 | } |
244 | } | 262 | } |
245 | 263 | ||
264 | /* Similarly, our generic routine to copy into a range of Guest bytes. */ | ||
246 | void lgwrite(struct lguest *lg, unsigned long addr, const void *b, | 265 | void lgwrite(struct lguest *lg, unsigned long addr, const void *b, |
247 | unsigned bytes) | 266 | unsigned bytes) |
248 | { | 267 | { |
@@ -250,6 +269,7 @@ void lgwrite(struct lguest *lg, unsigned long addr, const void *b, | |||
250 | || copy_to_user((void __user *)addr, b, bytes) != 0) | 269 | || copy_to_user((void __user *)addr, b, bytes) != 0) |
251 | kill_guest(lg, "bad write address %#lx len %u", addr, bytes); | 270 | kill_guest(lg, "bad write address %#lx len %u", addr, bytes); |
252 | } | 271 | } |
272 | /* (end of memory access helper routines) :*/ | ||
253 | 273 | ||
254 | static void set_ts(void) | 274 | static void set_ts(void) |
255 | { | 275 | { |
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c index d2f02f0653ca..da288128e44f 100644 --- a/drivers/lguest/io.c +++ b/drivers/lguest/io.c | |||
@@ -27,8 +27,36 @@ | |||
27 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
28 | #include "lg.h" | 28 | #include "lg.h" |
29 | 29 | ||
30 | /*L:300 | ||
31 | * I/O | ||
32 | * | ||
33 | * Getting data in and out of the Guest is quite an art. There are numerous | ||
34 | * ways to do it, and they all suck differently. We try to keep things fairly | ||
35 | * close to "real" hardware so our Guest's drivers don't look like an alien | ||
36 | * visitation in the middle of the Linux code, and yet make sure that Guests | ||
37 | * can talk directly to other Guests, not just the Launcher. | ||
38 | * | ||
39 | * To do this, the Guest gives us a key when it binds or sends DMA buffers. | ||
40 | * The key corresponds to a "physical" address inside the Guest (ie. a virtual | ||
41 | * address inside the Launcher process). We don't, however, use this key | ||
42 | * directly. | ||
43 | * | ||
44 | * We want Guests which share memory to be able to DMA to each other: two | ||
45 | * Launchers can mmap memory the same file, then the Guests can communicate. | ||
46 | * Fortunately, the futex code provides us with a way to get a "union | ||
47 | * futex_key" corresponding to the memory lying at a virtual address: if the | ||
48 | * two processes share memory, the "union futex_key" for that memory will match | ||
49 | * even if the memory is mapped at different addresses in each. So we always | ||
50 | * convert the keys to "union futex_key"s to compare them. | ||
51 | * | ||
52 | * Before we dive into this though, we need to look at another set of helper | ||
53 | * routines used throughout the Host kernel code to access Guest memory. | ||
54 | :*/ | ||
30 | static struct list_head dma_hash[61]; | 55 | static struct list_head dma_hash[61]; |
31 | 56 | ||
57 | /* An unfortunate side effect of the Linux double-linked list implementation is | ||
58 | * that there's no good way to statically initialize an array of linked | ||
59 | * lists. */ | ||
32 | void lguest_io_init(void) | 60 | void lguest_io_init(void) |
33 | { | 61 | { |
34 | unsigned int i; | 62 | unsigned int i; |
@@ -60,6 +88,19 @@ kill: | |||
60 | return 0; | 88 | return 0; |
61 | } | 89 | } |
62 | 90 | ||
91 | /*L:330 This is our hash function, using the wonderful Jenkins hash. | ||
92 | * | ||
93 | * The futex key is a union with three parts: an unsigned long word, a pointer, | ||
94 | * and an int "offset". We could use jhash_2words() which takes three u32s. | ||
95 | * (Ok, the hash functions are great: the naming sucks though). | ||
96 | * | ||
97 | * It's nice to be portable to 64-bit platforms, so we use the more generic | ||
98 | * jhash2(), which takes an array of u32, the number of u32s, and an initial | ||
99 | * u32 to roll in. This is uglier, but breaks down to almost the same code on | ||
100 | * 32-bit platforms like this one. | ||
101 | * | ||
102 | * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61). | ||
103 | */ | ||
63 | static unsigned int hash(const union futex_key *key) | 104 | static unsigned int hash(const union futex_key *key) |
64 | { | 105 | { |
65 | return jhash2((u32*)&key->both.word, | 106 | return jhash2((u32*)&key->both.word, |
@@ -68,6 +109,9 @@ static unsigned int hash(const union futex_key *key) | |||
68 | % ARRAY_SIZE(dma_hash); | 109 | % ARRAY_SIZE(dma_hash); |
69 | } | 110 | } |
70 | 111 | ||
112 | /* This is a convenience routine to compare two keys. It's a much bemoaned C | ||
113 | * weakness that it doesn't allow '==' on structures or unions, so we have to | ||
114 | * open-code it like this. */ | ||
71 | static inline int key_eq(const union futex_key *a, const union futex_key *b) | 115 | static inline int key_eq(const union futex_key *a, const union futex_key *b) |
72 | { | 116 | { |
73 | return (a->both.word == b->both.word | 117 | return (a->both.word == b->both.word |
@@ -75,22 +119,36 @@ static inline int key_eq(const union futex_key *a, const union futex_key *b) | |||
75 | && a->both.offset == b->both.offset); | 119 | && a->both.offset == b->both.offset); |
76 | } | 120 | } |
77 | 121 | ||
78 | /* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ | 122 | /*L:360 OK, when we need to actually free up a Guest's DMA array we do several |
123 | * things, so we have a convenient function to do it. | ||
124 | * | ||
125 | * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem | ||
126 | * for the drop_futex_key_refs(). */ | ||
79 | static void unlink_dma(struct lguest_dma_info *dmainfo) | 127 | static void unlink_dma(struct lguest_dma_info *dmainfo) |
80 | { | 128 | { |
129 | /* You locked this too, right? */ | ||
81 | BUG_ON(!mutex_is_locked(&lguest_lock)); | 130 | BUG_ON(!mutex_is_locked(&lguest_lock)); |
131 | /* This is how we know that the entry is free. */ | ||
82 | dmainfo->interrupt = 0; | 132 | dmainfo->interrupt = 0; |
133 | /* Remove it from the hash table. */ | ||
83 | list_del(&dmainfo->list); | 134 | list_del(&dmainfo->list); |
135 | /* Drop the references we were holding (to the inode or mm). */ | ||
84 | drop_futex_key_refs(&dmainfo->key); | 136 | drop_futex_key_refs(&dmainfo->key); |
85 | } | 137 | } |
86 | 138 | ||
139 | /*L:350 This is the routine which we call when the Guest asks to unregister a | ||
140 | * DMA array attached to a given key. Returns true if the array was found. */ | ||
87 | static int unbind_dma(struct lguest *lg, | 141 | static int unbind_dma(struct lguest *lg, |
88 | const union futex_key *key, | 142 | const union futex_key *key, |
89 | unsigned long dmas) | 143 | unsigned long dmas) |
90 | { | 144 | { |
91 | int i, ret = 0; | 145 | int i, ret = 0; |
92 | 146 | ||
147 | /* We don't bother with the hash table, just look through all this | ||
148 | * Guest's DMA arrays. */ | ||
93 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | 149 | for (i = 0; i < LGUEST_MAX_DMA; i++) { |
150 | /* In theory it could have more than one array on the same key, | ||
151 | * or one array on multiple keys, so we check both */ | ||
94 | if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { | 152 | if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { |
95 | unlink_dma(&lg->dma[i]); | 153 | unlink_dma(&lg->dma[i]); |
96 | ret = 1; | 154 | ret = 1; |
@@ -100,51 +158,91 @@ static int unbind_dma(struct lguest *lg, | |||
100 | return ret; | 158 | return ret; |
101 | } | 159 | } |
102 | 160 | ||
161 | /*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct | ||
162 | * lguest_dma" for receiving I/O. | ||
163 | * | ||
164 | * The Guest wants to bind an array of "struct lguest_dma"s to a particular key | ||
165 | * to receive input. This only happens when the Guest is setting up a new | ||
166 | * device, so it doesn't have to be very fast. | ||
167 | * | ||
168 | * It returns 1 on a successful registration (it can fail if we hit the limit | ||
169 | * of registrations for this Guest). | ||
170 | */ | ||
103 | int bind_dma(struct lguest *lg, | 171 | int bind_dma(struct lguest *lg, |
104 | unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) | 172 | unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) |
105 | { | 173 | { |
106 | unsigned int i; | 174 | unsigned int i; |
107 | int ret = 0; | 175 | int ret = 0; |
108 | union futex_key key; | 176 | union futex_key key; |
177 | /* Futex code needs the mmap_sem. */ | ||
109 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | 178 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; |
110 | 179 | ||
180 | /* Invalid interrupt? (We could kill the guest here). */ | ||
111 | if (interrupt >= LGUEST_IRQS) | 181 | if (interrupt >= LGUEST_IRQS) |
112 | return 0; | 182 | return 0; |
113 | 183 | ||
184 | /* We need to grab the Big Lguest Lock, because other Guests may be | ||
185 | * trying to look through this Guest's DMAs to send something while | ||
186 | * we're doing this. */ | ||
114 | mutex_lock(&lguest_lock); | 187 | mutex_lock(&lguest_lock); |
115 | down_read(fshared); | 188 | down_read(fshared); |
116 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 189 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
117 | kill_guest(lg, "bad dma key %#lx", ukey); | 190 | kill_guest(lg, "bad dma key %#lx", ukey); |
118 | goto unlock; | 191 | goto unlock; |
119 | } | 192 | } |
193 | |||
194 | /* We want to keep this key valid once we drop mmap_sem, so we have to | ||
195 | * hold a reference. */ | ||
120 | get_futex_key_refs(&key); | 196 | get_futex_key_refs(&key); |
121 | 197 | ||
198 | /* If the Guest specified an interrupt of 0, that means they want to | ||
199 | * unregister this array of "struct lguest_dma"s. */ | ||
122 | if (interrupt == 0) | 200 | if (interrupt == 0) |
123 | ret = unbind_dma(lg, &key, dmas); | 201 | ret = unbind_dma(lg, &key, dmas); |
124 | else { | 202 | else { |
203 | /* Look through this Guest's dma array for an unused entry. */ | ||
125 | for (i = 0; i < LGUEST_MAX_DMA; i++) { | 204 | for (i = 0; i < LGUEST_MAX_DMA; i++) { |
205 | /* If the interrupt is non-zero, the entry is already | ||
206 | * used. */ | ||
126 | if (lg->dma[i].interrupt) | 207 | if (lg->dma[i].interrupt) |
127 | continue; | 208 | continue; |
128 | 209 | ||
210 | /* OK, a free one! Fill in our details. */ | ||
129 | lg->dma[i].dmas = dmas; | 211 | lg->dma[i].dmas = dmas; |
130 | lg->dma[i].num_dmas = numdmas; | 212 | lg->dma[i].num_dmas = numdmas; |
131 | lg->dma[i].next_dma = 0; | 213 | lg->dma[i].next_dma = 0; |
132 | lg->dma[i].key = key; | 214 | lg->dma[i].key = key; |
133 | lg->dma[i].guestid = lg->guestid; | 215 | lg->dma[i].guestid = lg->guestid; |
134 | lg->dma[i].interrupt = interrupt; | 216 | lg->dma[i].interrupt = interrupt; |
217 | |||
218 | /* Now we add it to the hash table: the position | ||
219 | * depends on the futex key that we got. */ | ||
135 | list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); | 220 | list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); |
221 | /* Success! */ | ||
136 | ret = 1; | 222 | ret = 1; |
137 | goto unlock; | 223 | goto unlock; |
138 | } | 224 | } |
139 | } | 225 | } |
226 | /* If we didn't find a slot to put the key in, drop the reference | ||
227 | * again. */ | ||
140 | drop_futex_key_refs(&key); | 228 | drop_futex_key_refs(&key); |
141 | unlock: | 229 | unlock: |
230 | /* Unlock and out. */ | ||
142 | up_read(fshared); | 231 | up_read(fshared); |
143 | mutex_unlock(&lguest_lock); | 232 | mutex_unlock(&lguest_lock); |
144 | return ret; | 233 | return ret; |
145 | } | 234 | } |
146 | 235 | ||
147 | /* lgread from another guest */ | 236 | /*L:385 Note that our routines to access a different Guest's memory are called |
237 | * lgread_other() and lgwrite_other(): these names emphasize that they are only | ||
238 | * used when the Guest is *not* the current Guest. | ||
239 | * | ||
240 | * The interface for copying from another process's memory is called | ||
241 | * access_process_vm(), with a final argument of 0 for a read, and 1 for a | ||
242 | * write. | ||
243 | * | ||
244 | * We need lgread_other() to read the destination Guest's "struct lguest_dma" | ||
245 | * array. */ | ||
148 | static int lgread_other(struct lguest *lg, | 246 | static int lgread_other(struct lguest *lg, |
149 | void *buf, u32 addr, unsigned bytes) | 247 | void *buf, u32 addr, unsigned bytes) |
150 | { | 248 | { |
@@ -157,7 +255,8 @@ static int lgread_other(struct lguest *lg, | |||
157 | return 1; | 255 | return 1; |
158 | } | 256 | } |
159 | 257 | ||
160 | /* lgwrite to another guest */ | 258 | /* "lgwrite()" to another Guest: used to update the destination "used_len" once |
259 | * we've transferred data into the buffer. */ | ||
161 | static int lgwrite_other(struct lguest *lg, u32 addr, | 260 | static int lgwrite_other(struct lguest *lg, u32 addr, |
162 | const void *buf, unsigned bytes) | 261 | const void *buf, unsigned bytes) |
163 | { | 262 | { |
@@ -170,6 +269,15 @@ static int lgwrite_other(struct lguest *lg, u32 addr, | |||
170 | return 1; | 269 | return 1; |
171 | } | 270 | } |
172 | 271 | ||
272 | /*L:400 This is the generic engine which copies from a source "struct | ||
273 | * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The | ||
274 | * destination Guest's pages have already been mapped, as contained in the | ||
275 | * pages array. | ||
276 | * | ||
277 | * If you're wondering if there's a nice "copy from one process to another" | ||
278 | * routine, so was I. But Linux isn't really set up to copy between two | ||
279 | * unrelated processes, so we have to write it ourselves. | ||
280 | */ | ||
173 | static u32 copy_data(struct lguest *srclg, | 281 | static u32 copy_data(struct lguest *srclg, |
174 | const struct lguest_dma *src, | 282 | const struct lguest_dma *src, |
175 | const struct lguest_dma *dst, | 283 | const struct lguest_dma *dst, |
@@ -178,33 +286,59 @@ static u32 copy_data(struct lguest *srclg, | |||
178 | unsigned int totlen, si, di, srcoff, dstoff; | 286 | unsigned int totlen, si, di, srcoff, dstoff; |
179 | void *maddr = NULL; | 287 | void *maddr = NULL; |
180 | 288 | ||
289 | /* We return the total length transferred. */ | ||
181 | totlen = 0; | 290 | totlen = 0; |
291 | |||
292 | /* We keep indexes into the source and destination "struct lguest_dma", | ||
293 | * and an offset within each region. */ | ||
182 | si = di = 0; | 294 | si = di = 0; |
183 | srcoff = dstoff = 0; | 295 | srcoff = dstoff = 0; |
296 | |||
297 | /* We loop until the source or destination is exhausted. */ | ||
184 | while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] | 298 | while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] |
185 | && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { | 299 | && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { |
300 | /* We can only transfer the rest of the src buffer, or as much | ||
301 | * as will fit into the destination buffer. */ | ||
186 | u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); | 302 | u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); |
187 | 303 | ||
304 | /* For systems using "highmem" we need to use kmap() to access | ||
305 | * the page we want. We often use the same page over and over, | ||
306 | * so rather than kmap() it on every loop, we set the maddr | ||
307 | * pointer to NULL when we need to move to the next | ||
308 | * destination page. */ | ||
188 | if (!maddr) | 309 | if (!maddr) |
189 | maddr = kmap(pages[di]); | 310 | maddr = kmap(pages[di]); |
190 | 311 | ||
191 | /* FIXME: This is not completely portable, since | 312 | /* Copy directly from (this Guest's) source address to the |
192 | archs do different things for copy_to_user_page. */ | 313 | * destination Guest's kmap()ed buffer. Note that maddr points |
314 | * to the start of the page: we need to add the offset of the | ||
315 | * destination address and offset within the buffer. */ | ||
316 | |||
317 | /* FIXME: This is not completely portable. I looked at | ||
318 | * copy_to_user_page(), and some arch's seem to need special | ||
319 | * flushes. x86 is fine. */ | ||
193 | if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, | 320 | if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, |
194 | (void __user *)src->addr[si], len) != 0) { | 321 | (void __user *)src->addr[si], len) != 0) { |
322 | /* If a copy failed, it's the source's fault. */ | ||
195 | kill_guest(srclg, "bad address in sending DMA"); | 323 | kill_guest(srclg, "bad address in sending DMA"); |
196 | totlen = 0; | 324 | totlen = 0; |
197 | break; | 325 | break; |
198 | } | 326 | } |
199 | 327 | ||
328 | /* Increment the total and src & dst offsets */ | ||
200 | totlen += len; | 329 | totlen += len; |
201 | srcoff += len; | 330 | srcoff += len; |
202 | dstoff += len; | 331 | dstoff += len; |
332 | |||
333 | /* Presumably we reached the end of the src or dest buffers: */ | ||
203 | if (srcoff == src->len[si]) { | 334 | if (srcoff == src->len[si]) { |
335 | /* Move to the next buffer at offset 0 */ | ||
204 | si++; | 336 | si++; |
205 | srcoff = 0; | 337 | srcoff = 0; |
206 | } | 338 | } |
207 | if (dstoff == dst->len[di]) { | 339 | if (dstoff == dst->len[di]) { |
340 | /* We need to unmap that destination page and reset | ||
341 | * maddr ready for the next one. */ | ||
208 | kunmap(pages[di]); | 342 | kunmap(pages[di]); |
209 | maddr = NULL; | 343 | maddr = NULL; |
210 | di++; | 344 | di++; |
@@ -212,13 +346,15 @@ static u32 copy_data(struct lguest *srclg, | |||
212 | } | 346 | } |
213 | } | 347 | } |
214 | 348 | ||
349 | /* If we still had a page mapped at the end, unmap now. */ | ||
215 | if (maddr) | 350 | if (maddr) |
216 | kunmap(pages[di]); | 351 | kunmap(pages[di]); |
217 | 352 | ||
218 | return totlen; | 353 | return totlen; |
219 | } | 354 | } |
220 | 355 | ||
221 | /* Src is us, ie. current. */ | 356 | /*L:390 This is how we transfer a "struct lguest_dma" from the source Guest |
357 | * (the current Guest which called SEND_DMA) to another Guest. */ | ||
222 | static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, | 358 | static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, |
223 | struct lguest *dstlg, const struct lguest_dma *dst) | 359 | struct lguest *dstlg, const struct lguest_dma *dst) |
224 | { | 360 | { |
@@ -226,23 +362,31 @@ static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, | |||
226 | u32 ret; | 362 | u32 ret; |
227 | struct page *pages[LGUEST_MAX_DMA_SECTIONS]; | 363 | struct page *pages[LGUEST_MAX_DMA_SECTIONS]; |
228 | 364 | ||
365 | /* We check that both source and destination "struct lguest_dma"s are | ||
366 | * within the bounds of the source and destination Guests */ | ||
229 | if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) | 367 | if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) |
230 | return 0; | 368 | return 0; |
231 | 369 | ||
232 | /* First get the destination pages */ | 370 | /* We need to map the pages which correspond to each part of |
371 | * destination buffer. */ | ||
233 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { | 372 | for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { |
234 | if (dst->len[i] == 0) | 373 | if (dst->len[i] == 0) |
235 | break; | 374 | break; |
375 | /* get_user_pages() is a complicated function, especially since | ||
376 | * we only want a single page. But it works, and returns the | ||
377 | * number of pages. Note that we're holding the destination's | ||
378 | * mmap_sem, as get_user_pages() requires. */ | ||
236 | if (get_user_pages(dstlg->tsk, dstlg->mm, | 379 | if (get_user_pages(dstlg->tsk, dstlg->mm, |
237 | dst->addr[i], 1, 1, 1, pages+i, NULL) | 380 | dst->addr[i], 1, 1, 1, pages+i, NULL) |
238 | != 1) { | 381 | != 1) { |
382 | /* This means the destination gave us a bogus buffer */ | ||
239 | kill_guest(dstlg, "Error mapping DMA pages"); | 383 | kill_guest(dstlg, "Error mapping DMA pages"); |
240 | ret = 0; | 384 | ret = 0; |
241 | goto drop_pages; | 385 | goto drop_pages; |
242 | } | 386 | } |
243 | } | 387 | } |
244 | 388 | ||
245 | /* Now copy until we run out of src or dst. */ | 389 | /* Now copy the data until we run out of src or dst. */ |
246 | ret = copy_data(srclg, src, dst, pages); | 390 | ret = copy_data(srclg, src, dst, pages); |
247 | 391 | ||
248 | drop_pages: | 392 | drop_pages: |
@@ -251,6 +395,11 @@ drop_pages: | |||
251 | return ret; | 395 | return ret; |
252 | } | 396 | } |
253 | 397 | ||
398 | /*L:380 Transferring data from one Guest to another is not as simple as I'd | ||
399 | * like. We've found the "struct lguest_dma_info" bound to the same address as | ||
400 | * the send, we need to copy into it. | ||
401 | * | ||
402 | * This function returns true if the destination array was empty. */ | ||
254 | static int dma_transfer(struct lguest *srclg, | 403 | static int dma_transfer(struct lguest *srclg, |
255 | unsigned long udma, | 404 | unsigned long udma, |
256 | struct lguest_dma_info *dst) | 405 | struct lguest_dma_info *dst) |
@@ -259,15 +408,23 @@ static int dma_transfer(struct lguest *srclg, | |||
259 | struct lguest *dstlg; | 408 | struct lguest *dstlg; |
260 | u32 i, dma = 0; | 409 | u32 i, dma = 0; |
261 | 410 | ||
411 | /* From the "struct lguest_dma_info" we found in the hash, grab the | ||
412 | * Guest. */ | ||
262 | dstlg = &lguests[dst->guestid]; | 413 | dstlg = &lguests[dst->guestid]; |
263 | /* Get our dma list. */ | 414 | /* Read in the source "struct lguest_dma" handed to SEND_DMA. */ |
264 | lgread(srclg, &src_dma, udma, sizeof(src_dma)); | 415 | lgread(srclg, &src_dma, udma, sizeof(src_dma)); |
265 | 416 | ||
266 | /* We can't deadlock against them dmaing to us, because this | 417 | /* We need the destination's mmap_sem, and we already hold the source's |
267 | * is all under the lguest_lock. */ | 418 | * mmap_sem for the futex key lookup. Normally this would suggest that |
419 | * we could deadlock if the destination Guest was trying to send to | ||
420 | * this source Guest at the same time, which is another reason that all | ||
421 | * I/O is done under the big lguest_lock. */ | ||
268 | down_read(&dstlg->mm->mmap_sem); | 422 | down_read(&dstlg->mm->mmap_sem); |
269 | 423 | ||
424 | /* Look through the destination DMA array for an available buffer. */ | ||
270 | for (i = 0; i < dst->num_dmas; i++) { | 425 | for (i = 0; i < dst->num_dmas; i++) { |
426 | /* We keep a "next_dma" pointer which often helps us avoid | ||
427 | * looking at lots of previously-filled entries. */ | ||
271 | dma = (dst->next_dma + i) % dst->num_dmas; | 428 | dma = (dst->next_dma + i) % dst->num_dmas; |
272 | if (!lgread_other(dstlg, &dst_dma, | 429 | if (!lgread_other(dstlg, &dst_dma, |
273 | dst->dmas + dma * sizeof(struct lguest_dma), | 430 | dst->dmas + dma * sizeof(struct lguest_dma), |
@@ -277,30 +434,46 @@ static int dma_transfer(struct lguest *srclg, | |||
277 | if (!dst_dma.used_len) | 434 | if (!dst_dma.used_len) |
278 | break; | 435 | break; |
279 | } | 436 | } |
437 | |||
438 | /* If we found a buffer, we do the actual data copy. */ | ||
280 | if (i != dst->num_dmas) { | 439 | if (i != dst->num_dmas) { |
281 | unsigned long used_lenp; | 440 | unsigned long used_lenp; |
282 | unsigned int ret; | 441 | unsigned int ret; |
283 | 442 | ||
284 | ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); | 443 | ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); |
285 | /* Put used length in src. */ | 444 | /* Put used length in the source "struct lguest_dma"'s used_len |
445 | * field. It's a little tricky to figure out where that is, | ||
446 | * though. */ | ||
286 | lgwrite_u32(srclg, | 447 | lgwrite_u32(srclg, |
287 | udma+offsetof(struct lguest_dma, used_len), ret); | 448 | udma+offsetof(struct lguest_dma, used_len), ret); |
449 | /* Transferring 0 bytes is OK if the source buffer was empty. */ | ||
288 | if (ret == 0 && src_dma.len[0] != 0) | 450 | if (ret == 0 && src_dma.len[0] != 0) |
289 | goto fail; | 451 | goto fail; |
290 | 452 | ||
291 | /* Make sure destination sees contents before length. */ | 453 | /* The destination Guest might be running on a different CPU: |
454 | * we have to make sure that it will see the "used_len" field | ||
455 | * change to non-zero *after* it sees the data we copied into | ||
456 | * the buffer. Hence a write memory barrier. */ | ||
292 | wmb(); | 457 | wmb(); |
458 | /* Figuring out where the destination's used_len field for this | ||
459 | * "struct lguest_dma" in the array is also a little ugly. */ | ||
293 | used_lenp = dst->dmas | 460 | used_lenp = dst->dmas |
294 | + dma * sizeof(struct lguest_dma) | 461 | + dma * sizeof(struct lguest_dma) |
295 | + offsetof(struct lguest_dma, used_len); | 462 | + offsetof(struct lguest_dma, used_len); |
296 | lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); | 463 | lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); |
464 | /* Move the cursor for next time. */ | ||
297 | dst->next_dma++; | 465 | dst->next_dma++; |
298 | } | 466 | } |
299 | up_read(&dstlg->mm->mmap_sem); | 467 | up_read(&dstlg->mm->mmap_sem); |
300 | 468 | ||
301 | /* Do this last so dst doesn't simply sleep on lock. */ | 469 | /* We trigger the destination interrupt, even if the destination was |
470 | * empty and we didn't transfer anything: this gives them a chance to | ||
471 | * wake up and refill. */ | ||
302 | set_bit(dst->interrupt, dstlg->irqs_pending); | 472 | set_bit(dst->interrupt, dstlg->irqs_pending); |
473 | /* Wake up the destination process. */ | ||
303 | wake_up_process(dstlg->tsk); | 474 | wake_up_process(dstlg->tsk); |
475 | /* If we passed the last "struct lguest_dma", the receive had no | ||
476 | * buffers left. */ | ||
304 | return i == dst->num_dmas; | 477 | return i == dst->num_dmas; |
305 | 478 | ||
306 | fail: | 479 | fail: |
@@ -308,6 +481,8 @@ fail: | |||
308 | return 0; | 481 | return 0; |
309 | } | 482 | } |
310 | 483 | ||
484 | /*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA | ||
485 | * hypercall. We find out who's listening, and send to them. */ | ||
311 | void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) | 486 | void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) |
312 | { | 487 | { |
313 | union futex_key key; | 488 | union futex_key key; |
@@ -317,31 +492,43 @@ void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) | |||
317 | again: | 492 | again: |
318 | mutex_lock(&lguest_lock); | 493 | mutex_lock(&lguest_lock); |
319 | down_read(fshared); | 494 | down_read(fshared); |
495 | /* Get the futex key for the key the Guest gave us */ | ||
320 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 496 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
321 | kill_guest(lg, "bad sending DMA key"); | 497 | kill_guest(lg, "bad sending DMA key"); |
322 | goto unlock; | 498 | goto unlock; |
323 | } | 499 | } |
324 | /* Shared mapping? Look for other guests... */ | 500 | /* Since the key must be a multiple of 4, the futex key uses the lower |
501 | * bit of the "offset" field (which would always be 0) to indicate a | ||
502 | * mapping which is shared with other processes (ie. Guests). */ | ||
325 | if (key.shared.offset & 1) { | 503 | if (key.shared.offset & 1) { |
326 | struct lguest_dma_info *i; | 504 | struct lguest_dma_info *i; |
505 | /* Look through the hash for other Guests. */ | ||
327 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | 506 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { |
507 | /* Don't send to ourselves. */ | ||
328 | if (i->guestid == lg->guestid) | 508 | if (i->guestid == lg->guestid) |
329 | continue; | 509 | continue; |
330 | if (!key_eq(&key, &i->key)) | 510 | if (!key_eq(&key, &i->key)) |
331 | continue; | 511 | continue; |
332 | 512 | ||
513 | /* If dma_transfer() tells us the destination has no | ||
514 | * available buffers, we increment "empty". */ | ||
333 | empty += dma_transfer(lg, udma, i); | 515 | empty += dma_transfer(lg, udma, i); |
334 | break; | 516 | break; |
335 | } | 517 | } |
518 | /* If the destination is empty, we release our locks and | ||
519 | * give the destination Guest a brief chance to restock. */ | ||
336 | if (empty == 1) { | 520 | if (empty == 1) { |
337 | /* Give any recipients one chance to restock. */ | 521 | /* Give any recipients one chance to restock. */ |
338 | up_read(¤t->mm->mmap_sem); | 522 | up_read(¤t->mm->mmap_sem); |
339 | mutex_unlock(&lguest_lock); | 523 | mutex_unlock(&lguest_lock); |
524 | /* Next time, we won't try again. */ | ||
340 | empty++; | 525 | empty++; |
341 | goto again; | 526 | goto again; |
342 | } | 527 | } |
343 | } else { | 528 | } else { |
344 | /* Private mapping: tell our userspace. */ | 529 | /* Private mapping: Guest is sending to its Launcher. We set |
530 | * the "dma_is_pending" flag so that the main loop will exit | ||
531 | * and the Launcher's read() from /dev/lguest will return. */ | ||
345 | lg->dma_is_pending = 1; | 532 | lg->dma_is_pending = 1; |
346 | lg->pending_dma = udma; | 533 | lg->pending_dma = udma; |
347 | lg->pending_key = ukey; | 534 | lg->pending_key = ukey; |
@@ -350,6 +537,7 @@ unlock: | |||
350 | up_read(fshared); | 537 | up_read(fshared); |
351 | mutex_unlock(&lguest_lock); | 538 | mutex_unlock(&lguest_lock); |
352 | } | 539 | } |
540 | /*:*/ | ||
353 | 541 | ||
354 | void release_all_dma(struct lguest *lg) | 542 | void release_all_dma(struct lguest *lg) |
355 | { | 543 | { |
@@ -365,7 +553,8 @@ void release_all_dma(struct lguest *lg) | |||
365 | up_read(&lg->mm->mmap_sem); | 553 | up_read(&lg->mm->mmap_sem); |
366 | } | 554 | } |
367 | 555 | ||
368 | /* Userspace wants a dma buffer from this guest. */ | 556 | /*L:320 This routine looks for a DMA buffer registered by the Guest on the |
557 | * given key (using the BIND_DMA hypercall). */ | ||
369 | unsigned long get_dma_buffer(struct lguest *lg, | 558 | unsigned long get_dma_buffer(struct lguest *lg, |
370 | unsigned long ukey, unsigned long *interrupt) | 559 | unsigned long ukey, unsigned long *interrupt) |
371 | { | 560 | { |
@@ -374,15 +563,29 @@ unsigned long get_dma_buffer(struct lguest *lg, | |||
374 | struct lguest_dma_info *i; | 563 | struct lguest_dma_info *i; |
375 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; | 564 | struct rw_semaphore *fshared = ¤t->mm->mmap_sem; |
376 | 565 | ||
566 | /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA | ||
567 | * at the same time. */ | ||
377 | mutex_lock(&lguest_lock); | 568 | mutex_lock(&lguest_lock); |
569 | /* To match between Guests sharing the same underlying memory we steal | ||
570 | * code from the futex infrastructure. This requires that we hold the | ||
571 | * "mmap_sem" for our process (the Launcher), and pass it to the futex | ||
572 | * code. */ | ||
378 | down_read(fshared); | 573 | down_read(fshared); |
574 | |||
575 | /* This can fail if it's not a valid address, or if the address is not | ||
576 | * divisible by 4 (the futex code needs that, we don't really). */ | ||
379 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { | 577 | if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { |
380 | kill_guest(lg, "bad registered DMA buffer"); | 578 | kill_guest(lg, "bad registered DMA buffer"); |
381 | goto unlock; | 579 | goto unlock; |
382 | } | 580 | } |
581 | /* Search the hash table for matching entries (the Launcher can only | ||
582 | * send to its own Guest for the moment, so the entry must be for this | ||
583 | * Guest) */ | ||
383 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { | 584 | list_for_each_entry(i, &dma_hash[hash(&key)], list) { |
384 | if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { | 585 | if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { |
385 | unsigned int j; | 586 | unsigned int j; |
587 | /* Look through the registered DMA array for an | ||
588 | * available buffer. */ | ||
386 | for (j = 0; j < i->num_dmas; j++) { | 589 | for (j = 0; j < i->num_dmas; j++) { |
387 | struct lguest_dma dma; | 590 | struct lguest_dma dma; |
388 | 591 | ||
@@ -391,6 +594,8 @@ unsigned long get_dma_buffer(struct lguest *lg, | |||
391 | if (dma.used_len == 0) | 594 | if (dma.used_len == 0) |
392 | break; | 595 | break; |
393 | } | 596 | } |
597 | /* Store the interrupt the Guest wants when the buffer | ||
598 | * is used. */ | ||
394 | *interrupt = i->interrupt; | 599 | *interrupt = i->interrupt; |
395 | break; | 600 | break; |
396 | } | 601 | } |
@@ -400,4 +605,12 @@ unlock: | |||
400 | mutex_unlock(&lguest_lock); | 605 | mutex_unlock(&lguest_lock); |
401 | return ret; | 606 | return ret; |
402 | } | 607 | } |
608 | /*:*/ | ||
403 | 609 | ||
610 | /*L:410 This really has completed the Launcher. Not only have we now finished | ||
611 | * the longest chapter in our journey, but this also means we are over halfway | ||
612 | * through! | ||
613 | * | ||
614 | * Enough prevaricating around the bush: it is time for us to dive into the | ||
615 | * core of the Host, in "make Host". | ||
616 | */ | ||
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 3e2ddfbc816e..3b9dc123a7df 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -244,6 +244,30 @@ unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, | |||
244 | /* hypercalls.c: */ | 244 | /* hypercalls.c: */ |
245 | void do_hypercalls(struct lguest *lg); | 245 | void do_hypercalls(struct lguest *lg); |
246 | 246 | ||
247 | /*L:035 | ||
248 | * Let's step aside for the moment, to study one important routine that's used | ||
249 | * widely in the Host code. | ||
250 | * | ||
251 | * There are many cases where the Guest does something invalid, like pass crap | ||
252 | * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite | ||
253 | * acceptable to simply terminate the Guest and give the Launcher a nicely | ||
254 | * formatted reason. It's also simpler for the Guest itself, which doesn't | ||
255 | * need to check most hypercalls for "success"; if you're still running, it | ||
256 | * succeeded. | ||
257 | * | ||
258 | * Once this is called, the Guest will never run again, so most Host code can | ||
259 | * call this then continue as if nothing had happened. This means many | ||
260 | * functions don't have to explicitly return an error code, which keeps the | ||
261 | * code simple. | ||
262 | * | ||
263 | * It also means that this can be called more than once: only the first one is | ||
264 | * remembered. The only trick is that we still need to kill the Guest even if | ||
265 | * we can't allocate memory to store the reason. Linux has a neat way of | ||
266 | * packing error codes into invalid pointers, so we use that here. | ||
267 | * | ||
268 | * Like any macro which uses an "if", it is safely wrapped in a run-once "do { | ||
269 | * } while(0)". | ||
270 | */ | ||
247 | #define kill_guest(lg, fmt...) \ | 271 | #define kill_guest(lg, fmt...) \ |
248 | do { \ | 272 | do { \ |
249 | if (!(lg)->dead) { \ | 273 | if (!(lg)->dead) { \ |
@@ -252,6 +276,7 @@ do { \ | |||
252 | (lg)->dead = ERR_PTR(-ENOMEM); \ | 276 | (lg)->dead = ERR_PTR(-ENOMEM); \ |
253 | } \ | 277 | } \ |
254 | } while(0) | 278 | } while(0) |
279 | /* (End of aside) :*/ | ||
255 | 280 | ||
256 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | 281 | static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) |
257 | { | 282 | { |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 6ae86f20ce3d..80d1b58c7698 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -9,33 +9,62 @@ | |||
9 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
10 | #include "lg.h" | 10 | #include "lg.h" |
11 | 11 | ||
12 | /*L:030 setup_regs() doesn't really belong in this file, but it gives us an | ||
13 | * early glimpse deeper into the Host so it's worth having here. | ||
14 | * | ||
15 | * Most of the Guest's registers are left alone: we used get_zeroed_page() to | ||
16 | * allocate the structure, so they will be 0. */ | ||
12 | static void setup_regs(struct lguest_regs *regs, unsigned long start) | 17 | static void setup_regs(struct lguest_regs *regs, unsigned long start) |
13 | { | 18 | { |
14 | /* Write out stack in format lguest expects, so we can switch to it. */ | 19 | /* There are four "segment" registers which the Guest needs to boot: |
20 | * The "code segment" register (cs) refers to the kernel code segment | ||
21 | * __KERNEL_CS, and the "data", "extra" and "stack" segment registers | ||
22 | * refer to the kernel data segment __KERNEL_DS. | ||
23 | * | ||
24 | * The privilege level is packed into the lower bits. The Guest runs | ||
25 | * at privilege level 1 (GUEST_PL).*/ | ||
15 | regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; | 26 | regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; |
16 | regs->cs = __KERNEL_CS|GUEST_PL; | 27 | regs->cs = __KERNEL_CS|GUEST_PL; |
17 | regs->eflags = 0x202; /* Interrupts enabled. */ | 28 | |
29 | /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) | ||
30 | * is supposed to always be "1". Bit 9 (0x200) controls whether | ||
31 | * interrupts are enabled. We always leave interrupts enabled while | ||
32 | * running the Guest. */ | ||
33 | regs->eflags = 0x202; | ||
34 | |||
35 | /* The "Extended Instruction Pointer" register says where the Guest is | ||
36 | * running. */ | ||
18 | regs->eip = start; | 37 | regs->eip = start; |
19 | /* esi points to our boot information (physical address 0) */ | 38 | |
39 | /* %esi points to our boot information, at physical address 0, so don't | ||
40 | * touch it. */ | ||
20 | } | 41 | } |
21 | 42 | ||
22 | /* + addr */ | 43 | /*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a |
44 | * DMA buffer. This is done by writing LHREQ_GETDMA and the key to | ||
45 | * /dev/lguest. */ | ||
23 | static long user_get_dma(struct lguest *lg, const u32 __user *input) | 46 | static long user_get_dma(struct lguest *lg, const u32 __user *input) |
24 | { | 47 | { |
25 | unsigned long key, udma, irq; | 48 | unsigned long key, udma, irq; |
26 | 49 | ||
50 | /* Fetch the key they wrote to us. */ | ||
27 | if (get_user(key, input) != 0) | 51 | if (get_user(key, input) != 0) |
28 | return -EFAULT; | 52 | return -EFAULT; |
53 | /* Look for a free Guest DMA buffer bound to that key. */ | ||
29 | udma = get_dma_buffer(lg, key, &irq); | 54 | udma = get_dma_buffer(lg, key, &irq); |
30 | if (!udma) | 55 | if (!udma) |
31 | return -ENOENT; | 56 | return -ENOENT; |
32 | 57 | ||
33 | /* We put irq number in udma->used_len. */ | 58 | /* We need to tell the Launcher what interrupt the Guest expects after |
59 | * the buffer is filled. We stash it in udma->used_len. */ | ||
34 | lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); | 60 | lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); |
61 | |||
62 | /* The (guest-physical) address of the DMA buffer is returned from | ||
63 | * the write(). */ | ||
35 | return udma; | 64 | return udma; |
36 | } | 65 | } |
37 | 66 | ||
38 | /* To force the Guest to stop running and return to the Launcher, the | 67 | /*L:315 To force the Guest to stop running and return to the Launcher, the |
39 | * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The | 68 | * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The |
40 | * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ | 69 | * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ |
41 | static int break_guest_out(struct lguest *lg, const u32 __user *input) | 70 | static int break_guest_out(struct lguest *lg, const u32 __user *input) |
@@ -59,7 +88,8 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input) | |||
59 | } | 88 | } |
60 | } | 89 | } |
61 | 90 | ||
62 | /* + irq */ | 91 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt |
92 | * number to /dev/lguest. */ | ||
63 | static int user_send_irq(struct lguest *lg, const u32 __user *input) | 93 | static int user_send_irq(struct lguest *lg, const u32 __user *input) |
64 | { | 94 | { |
65 | u32 irq; | 95 | u32 irq; |
@@ -68,14 +98,19 @@ static int user_send_irq(struct lguest *lg, const u32 __user *input) | |||
68 | return -EFAULT; | 98 | return -EFAULT; |
69 | if (irq >= LGUEST_IRQS) | 99 | if (irq >= LGUEST_IRQS) |
70 | return -EINVAL; | 100 | return -EINVAL; |
101 | /* Next time the Guest runs, the core code will see if it can deliver | ||
102 | * this interrupt. */ | ||
71 | set_bit(irq, lg->irqs_pending); | 103 | set_bit(irq, lg->irqs_pending); |
72 | return 0; | 104 | return 0; |
73 | } | 105 | } |
74 | 106 | ||
107 | /*L:040 Once our Guest is initialized, the Launcher makes it run by reading | ||
108 | * from /dev/lguest. */ | ||
75 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | 109 | static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) |
76 | { | 110 | { |
77 | struct lguest *lg = file->private_data; | 111 | struct lguest *lg = file->private_data; |
78 | 112 | ||
113 | /* You must write LHREQ_INITIALIZE first! */ | ||
79 | if (!lg) | 114 | if (!lg) |
80 | return -EINVAL; | 115 | return -EINVAL; |
81 | 116 | ||
@@ -83,27 +118,52 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) | |||
83 | if (current != lg->tsk) | 118 | if (current != lg->tsk) |
84 | return -EPERM; | 119 | return -EPERM; |
85 | 120 | ||
121 | /* If the guest is already dead, we indicate why */ | ||
86 | if (lg->dead) { | 122 | if (lg->dead) { |
87 | size_t len; | 123 | size_t len; |
88 | 124 | ||
125 | /* lg->dead either contains an error code, or a string. */ | ||
89 | if (IS_ERR(lg->dead)) | 126 | if (IS_ERR(lg->dead)) |
90 | return PTR_ERR(lg->dead); | 127 | return PTR_ERR(lg->dead); |
91 | 128 | ||
129 | /* We can only return as much as the buffer they read with. */ | ||
92 | len = min(size, strlen(lg->dead)+1); | 130 | len = min(size, strlen(lg->dead)+1); |
93 | if (copy_to_user(user, lg->dead, len) != 0) | 131 | if (copy_to_user(user, lg->dead, len) != 0) |
94 | return -EFAULT; | 132 | return -EFAULT; |
95 | return len; | 133 | return len; |
96 | } | 134 | } |
97 | 135 | ||
136 | /* If we returned from read() last time because the Guest sent DMA, | ||
137 | * clear the flag. */ | ||
98 | if (lg->dma_is_pending) | 138 | if (lg->dma_is_pending) |
99 | lg->dma_is_pending = 0; | 139 | lg->dma_is_pending = 0; |
100 | 140 | ||
141 | /* Run the Guest until something interesting happens. */ | ||
101 | return run_guest(lg, (unsigned long __user *)user); | 142 | return run_guest(lg, (unsigned long __user *)user); |
102 | } | 143 | } |
103 | 144 | ||
104 | /* Take: pfnlimit, pgdir, start, pageoffset. */ | 145 | /*L:020 The initialization write supplies 4 32-bit values (in addition to the |
146 | * 32-bit LHREQ_INITIALIZE value). These are: | ||
147 | * | ||
148 | * pfnlimit: The highest (Guest-physical) page number the Guest should be | ||
149 | * allowed to access. The Launcher has to live in Guest memory, so it sets | ||
150 | * this to ensure the Guest can't reach it. | ||
151 | * | ||
152 | * pgdir: The (Guest-physical) address of the top of the initial Guest | ||
153 | * pagetables (which are set up by the Launcher). | ||
154 | * | ||
155 | * start: The first instruction to execute ("eip" in x86-speak). | ||
156 | * | ||
157 | * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should | ||
158 | * probably wean the code off this, but it's a very useful constant! Any | ||
159 | * address above this is within the Guest kernel, and any kernel address can | ||
160 | * quickly converted from physical to virtual by adding PAGE_OFFSET. It's | ||
161 | * 0xC0000000 (3G) by default, but it's configurable at kernel build time. | ||
162 | */ | ||
105 | static int initialize(struct file *file, const u32 __user *input) | 163 | static int initialize(struct file *file, const u32 __user *input) |
106 | { | 164 | { |
165 | /* "struct lguest" contains everything we (the Host) know about a | ||
166 | * Guest. */ | ||
107 | struct lguest *lg; | 167 | struct lguest *lg; |
108 | int err, i; | 168 | int err, i; |
109 | u32 args[4]; | 169 | u32 args[4]; |
@@ -111,7 +171,7 @@ static int initialize(struct file *file, const u32 __user *input) | |||
111 | /* We grab the Big Lguest lock, which protects the global array | 171 | /* We grab the Big Lguest lock, which protects the global array |
112 | * "lguests" and multiple simultaneous initializations. */ | 172 | * "lguests" and multiple simultaneous initializations. */ |
113 | mutex_lock(&lguest_lock); | 173 | mutex_lock(&lguest_lock); |
114 | 174 | /* You can't initialize twice! Close the device and start again... */ | |
115 | if (file->private_data) { | 175 | if (file->private_data) { |
116 | err = -EBUSY; | 176 | err = -EBUSY; |
117 | goto unlock; | 177 | goto unlock; |
@@ -122,37 +182,70 @@ static int initialize(struct file *file, const u32 __user *input) | |||
122 | goto unlock; | 182 | goto unlock; |
123 | } | 183 | } |
124 | 184 | ||
185 | /* Find an unused guest. */ | ||
125 | i = find_free_guest(); | 186 | i = find_free_guest(); |
126 | if (i < 0) { | 187 | if (i < 0) { |
127 | err = -ENOSPC; | 188 | err = -ENOSPC; |
128 | goto unlock; | 189 | goto unlock; |
129 | } | 190 | } |
191 | /* OK, we have an index into the "lguest" array: "lg" is a convenient | ||
192 | * pointer. */ | ||
130 | lg = &lguests[i]; | 193 | lg = &lguests[i]; |
194 | |||
195 | /* Populate the easy fields of our "struct lguest" */ | ||
131 | lg->guestid = i; | 196 | lg->guestid = i; |
132 | lg->pfn_limit = args[0]; | 197 | lg->pfn_limit = args[0]; |
133 | lg->page_offset = args[3]; | 198 | lg->page_offset = args[3]; |
199 | |||
200 | /* We need a complete page for the Guest registers: they are accessible | ||
201 | * to the Guest and we can only grant it access to whole pages. */ | ||
134 | lg->regs_page = get_zeroed_page(GFP_KERNEL); | 202 | lg->regs_page = get_zeroed_page(GFP_KERNEL); |
135 | if (!lg->regs_page) { | 203 | if (!lg->regs_page) { |
136 | err = -ENOMEM; | 204 | err = -ENOMEM; |
137 | goto release_guest; | 205 | goto release_guest; |
138 | } | 206 | } |
207 | /* We actually put the registers at the bottom of the page. */ | ||
139 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); | 208 | lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); |
140 | 209 | ||
210 | /* Initialize the Guest's shadow page tables, using the toplevel | ||
211 | * address the Launcher gave us. This allocates memory, so can | ||
212 | * fail. */ | ||
141 | err = init_guest_pagetable(lg, args[1]); | 213 | err = init_guest_pagetable(lg, args[1]); |
142 | if (err) | 214 | if (err) |
143 | goto free_regs; | 215 | goto free_regs; |
144 | 216 | ||
217 | /* Now we initialize the Guest's registers, handing it the start | ||
218 | * address. */ | ||
145 | setup_regs(lg->regs, args[2]); | 219 | setup_regs(lg->regs, args[2]); |
220 | |||
221 | /* There are a couple of GDT entries the Guest expects when first | ||
222 | * booting. */ | ||
146 | setup_guest_gdt(lg); | 223 | setup_guest_gdt(lg); |
224 | |||
225 | /* The timer for lguest's clock needs initialization. */ | ||
147 | init_clockdev(lg); | 226 | init_clockdev(lg); |
227 | |||
228 | /* We keep a pointer to the Launcher task (ie. current task) for when | ||
229 | * other Guests want to wake this one (inter-Guest I/O). */ | ||
148 | lg->tsk = current; | 230 | lg->tsk = current; |
231 | /* We need to keep a pointer to the Launcher's memory map, because if | ||
232 | * the Launcher dies we need to clean it up. If we don't keep a | ||
233 | * reference, it is destroyed before close() is called. */ | ||
149 | lg->mm = get_task_mm(lg->tsk); | 234 | lg->mm = get_task_mm(lg->tsk); |
235 | |||
236 | /* Initialize the queue for the waker to wait on */ | ||
150 | init_waitqueue_head(&lg->break_wq); | 237 | init_waitqueue_head(&lg->break_wq); |
238 | |||
239 | /* We remember which CPU's pages this Guest used last, for optimization | ||
240 | * when the same Guest runs on the same CPU twice. */ | ||
151 | lg->last_pages = NULL; | 241 | lg->last_pages = NULL; |
242 | |||
243 | /* We keep our "struct lguest" in the file's private_data. */ | ||
152 | file->private_data = lg; | 244 | file->private_data = lg; |
153 | 245 | ||
154 | mutex_unlock(&lguest_lock); | 246 | mutex_unlock(&lguest_lock); |
155 | 247 | ||
248 | /* And because this is a write() call, we return the length used. */ | ||
156 | return sizeof(args); | 249 | return sizeof(args); |
157 | 250 | ||
158 | free_regs: | 251 | free_regs: |
@@ -164,9 +257,15 @@ unlock: | |||
164 | return err; | 257 | return err; |
165 | } | 258 | } |
166 | 259 | ||
260 | /*L:010 The first operation the Launcher does must be a write. All writes | ||
261 | * start with a 32 bit number: for the first write this must be | ||
262 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use | ||
263 | * writes of other values to get DMA buffers and send interrupts. */ | ||
167 | static ssize_t write(struct file *file, const char __user *input, | 264 | static ssize_t write(struct file *file, const char __user *input, |
168 | size_t size, loff_t *off) | 265 | size_t size, loff_t *off) |
169 | { | 266 | { |
267 | /* Once the guest is initialized, we hold the "struct lguest" in the | ||
268 | * file private data. */ | ||
170 | struct lguest *lg = file->private_data; | 269 | struct lguest *lg = file->private_data; |
171 | u32 req; | 270 | u32 req; |
172 | 271 | ||
@@ -174,8 +273,11 @@ static ssize_t write(struct file *file, const char __user *input, | |||
174 | return -EFAULT; | 273 | return -EFAULT; |
175 | input += sizeof(req); | 274 | input += sizeof(req); |
176 | 275 | ||
276 | /* If you haven't initialized, you must do that first. */ | ||
177 | if (req != LHREQ_INITIALIZE && !lg) | 277 | if (req != LHREQ_INITIALIZE && !lg) |
178 | return -EINVAL; | 278 | return -EINVAL; |
279 | |||
280 | /* Once the Guest is dead, all you can do is read() why it died. */ | ||
179 | if (lg && lg->dead) | 281 | if (lg && lg->dead) |
180 | return -ENOENT; | 282 | return -ENOENT; |
181 | 283 | ||
@@ -197,33 +299,72 @@ static ssize_t write(struct file *file, const char __user *input, | |||
197 | } | 299 | } |
198 | } | 300 | } |
199 | 301 | ||
302 | /*L:060 The final piece of interface code is the close() routine. It reverses | ||
303 | * everything done in initialize(). This is usually called because the | ||
304 | * Launcher exited. | ||
305 | * | ||
306 | * Note that the close routine returns 0 or a negative error number: it can't | ||
307 | * really fail, but it can whine. I blame Sun for this wart, and K&R C for | ||
308 | * letting them do it. :*/ | ||
200 | static int close(struct inode *inode, struct file *file) | 309 | static int close(struct inode *inode, struct file *file) |
201 | { | 310 | { |
202 | struct lguest *lg = file->private_data; | 311 | struct lguest *lg = file->private_data; |
203 | 312 | ||
313 | /* If we never successfully initialized, there's nothing to clean up */ | ||
204 | if (!lg) | 314 | if (!lg) |
205 | return 0; | 315 | return 0; |
206 | 316 | ||
317 | /* We need the big lock, to protect from inter-guest I/O and other | ||
318 | * Launchers initializing guests. */ | ||
207 | mutex_lock(&lguest_lock); | 319 | mutex_lock(&lguest_lock); |
208 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ | 320 | /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ |
209 | hrtimer_cancel(&lg->hrt); | 321 | hrtimer_cancel(&lg->hrt); |
322 | /* Free any DMA buffers the Guest had bound. */ | ||
210 | release_all_dma(lg); | 323 | release_all_dma(lg); |
324 | /* Free up the shadow page tables for the Guest. */ | ||
211 | free_guest_pagetable(lg); | 325 | free_guest_pagetable(lg); |
326 | /* Now all the memory cleanups are done, it's safe to release the | ||
327 | * Launcher's memory management structure. */ | ||
212 | mmput(lg->mm); | 328 | mmput(lg->mm); |
329 | /* If lg->dead doesn't contain an error code it will be NULL or a | ||
330 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ | ||
213 | if (!IS_ERR(lg->dead)) | 331 | if (!IS_ERR(lg->dead)) |
214 | kfree(lg->dead); | 332 | kfree(lg->dead); |
333 | /* We can free up the register page we allocated. */ | ||
215 | free_page(lg->regs_page); | 334 | free_page(lg->regs_page); |
335 | /* We clear the entire structure, which also marks it as free for the | ||
336 | * next user. */ | ||
216 | memset(lg, 0, sizeof(*lg)); | 337 | memset(lg, 0, sizeof(*lg)); |
338 | /* Release lock and exit. */ | ||
217 | mutex_unlock(&lguest_lock); | 339 | mutex_unlock(&lguest_lock); |
340 | |||
218 | return 0; | 341 | return 0; |
219 | } | 342 | } |
220 | 343 | ||
344 | /*L:000 | ||
345 | * Welcome to our journey through the Launcher! | ||
346 | * | ||
347 | * The Launcher is the Host userspace program which sets up, runs and services | ||
348 | * the Guest. In fact, many comments in the Drivers which refer to "the Host" | ||
349 | * doing things are inaccurate: the Launcher does all the device handling for | ||
350 | * the Guest. The Guest can't tell what's done by the the Launcher and what by | ||
351 | * the Host. | ||
352 | * | ||
353 | * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we | ||
354 | * shall see more of that later. | ||
355 | * | ||
356 | * We begin our understanding with the Host kernel interface which the Launcher | ||
357 | * uses: reading and writing a character device called /dev/lguest. All the | ||
358 | * work happens in the read(), write() and close() routines: */ | ||
221 | static struct file_operations lguest_fops = { | 359 | static struct file_operations lguest_fops = { |
222 | .owner = THIS_MODULE, | 360 | .owner = THIS_MODULE, |
223 | .release = close, | 361 | .release = close, |
224 | .write = write, | 362 | .write = write, |
225 | .read = read, | 363 | .read = read, |
226 | }; | 364 | }; |
365 | |||
366 | /* This is a textbook example of a "misc" character device. Populate a "struct | ||
367 | * miscdevice" and register it with misc_register(). */ | ||
227 | static struct miscdevice lguest_dev = { | 368 | static struct miscdevice lguest_dev = { |
228 | .minor = MISC_DYNAMIC_MINOR, | 369 | .minor = MISC_DYNAMIC_MINOR, |
229 | .name = "lguest", | 370 | .name = "lguest", |