aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author Rusty Russell <rusty@rustcorp.com.au> 2007-07-26 13:41:03 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-26 14:35:17 -0400
commit e2c9784325490c878b7f69aeec1bed98b288bd97 (patch)
tree d474007607c713a30db818107ca0581269f059a2 /drivers
parent b2b47c214f4e85ce3968120d42e8b18eccb4f4e3 (diff)
lguest: documentation III: Drivers
Documentation: The Drivers

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/block/lguest_blk.c 169
-rw-r--r-- drivers/char/hvc_lguest.c 77
-rw-r--r-- drivers/lguest/lguest_bus.c 72
-rw-r--r-- drivers/net/lguest_net.c 218
4 files changed, 501 insertions, 35 deletions
diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c
index 5b79d0724171..93e3c4001bf5 100644
--- a/drivers/block/lguest_blk.c
+++ b/drivers/block/lguest_blk.c
@@ -1,6 +1,12 @@
1/* A simple block driver for lguest. 1/*D:400
2 * The Guest block driver
2 * 3 *
3 * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 4 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5 * The mechanism is simple: we place the information about the request in the
6 * device page, then use SEND_DMA (containing the data for a write, or an empty
7 * "ping" DMA for a read).
8 :*/
9/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 * 10 *
5 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
@@ -25,27 +31,50 @@
25 31
26static char next_block_index = 'a'; 32static char next_block_index = 'a';
27 33
34/*D:420 Here is the structure which holds all the information we need about
35 * each Guest block device.
36 *
37 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
38 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39 * my blog". I think Real adventures have boring bits, too, and you're in the
40 * middle of one. But it gets better. Just not quite yet. */
28struct blockdev 41struct blockdev
29{ 42{
43 /* The block queue infrastructure wants a spinlock: it is held while it
44 * calls our block request function. We grab it in our interrupt
45 * handler so the responses don't mess with new requests. */
30 spinlock_t lock; 46 spinlock_t lock;
31 47
32 /* The disk structure for the kernel. */ 48 /* The disk structure registered with the kernel. */
33 struct gendisk *disk; 49 struct gendisk *disk;
34 50
35 /* The major number for this disk. */ 51 /* The major device number for this disk, and the interrupt. We only
52 * really keep them here for completeness; we'd need them if we
53 * supported device unplugging. */
36 int major; 54 int major;
37 int irq; 55 int irq;
38 56
57 /* The physical address of this device's memory page */
39 unsigned long phys_addr; 58 unsigned long phys_addr;
40 /* The mapped block page. */ 59 /* The mapped memory page for convenient access. */
41 struct lguest_block_page *lb_page; 60 struct lguest_block_page *lb_page;
42 61
43 /* We only have a single request outstanding at a time. */ 62 /* We only have a single request outstanding at a time: this is it. */
44 struct lguest_dma dma; 63 struct lguest_dma dma;
45 struct request *req; 64 struct request *req;
46}; 65};
47 66
48/* Jens gave me this nice helper to end all chunks of a request. */ 67/*D:495 We originally used end_request() throughout the driver, but it turns
68 * out that end_request() is deprecated, and doesn't actually end the request
69 * (which seems like a good reason to deprecate it!). It simply ends the first
70 * bio. So if we had 3 bios in a "struct request" we would do all 3,
71 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72 * work as we needed to do.
73 *
74 * This reinforced to me that I do not understand the block layer.
75 *
76 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77 * request. This improved disk speed by 130%. */
49static void end_entire_request(struct request *req, int uptodate) 78static void end_entire_request(struct request *req, int uptodate)
50{ 79{
51 if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) 80 if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
@@ -55,30 +84,62 @@ static void end_entire_request(struct request *req, int uptodate)
55 end_that_request_last(req, uptodate); 84 end_that_request_last(req, uptodate);
56} 85}
57 86
87/* I'm told there are only two stories in the world worth telling: love and
88 * hate. So there used to be a love scene here like this:
89 *
90 * Launcher: We could make beautiful I/O together, you and I.
91 * Guest: My, that's a big disk!
92 *
93 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
94
95/*D:490 This is the interrupt handler, called when a block read or write has
96 * been completed for us. */
58static irqreturn_t lgb_irq(int irq, void *_bd) 97static irqreturn_t lgb_irq(int irq, void *_bd)
59{ 98{
99 /* We handed our "struct blockdev" as the argument to request_irq(), so
100 * it is passed through to us here. This tells us which device we're
101 * dealing with in case we have more than one. */
60 struct blockdev *bd = _bd; 102 struct blockdev *bd = _bd;
61 unsigned long flags; 103 unsigned long flags;
62 104
105 /* We weren't doing anything? Strange, but could happen if we shared
106 * interrupts (we don't!). */
63 if (!bd->req) { 107 if (!bd->req) {
64 pr_debug("No work!\n"); 108 pr_debug("No work!\n");
65 return IRQ_NONE; 109 return IRQ_NONE;
66 } 110 }
67 111
112 /* Not done yet? That's equally strange. */
68 if (!bd->lb_page->result) { 113 if (!bd->lb_page->result) {
69 pr_debug("No result!\n"); 114 pr_debug("No result!\n");
70 return IRQ_NONE; 115 return IRQ_NONE;
71 } 116 }
72 117
118 /* We have to grab the lock before ending the request. */
73 spin_lock_irqsave(&bd->lock, flags); 119 spin_lock_irqsave(&bd->lock, flags);
120 /* "result" is 1 for success, 2 for failure: end_entire_request() wants
121 * to know whether this succeeded or not. */
74 end_entire_request(bd->req, bd->lb_page->result == 1); 122 end_entire_request(bd->req, bd->lb_page->result == 1);
123 /* Clear out request, it's done. */
75 bd->req = NULL; 124 bd->req = NULL;
125 /* Reset incoming DMA for next time. */
76 bd->dma.used_len = 0; 126 bd->dma.used_len = 0;
127 /* Ready for more reads or writes */
77 blk_start_queue(bd->disk->queue); 128 blk_start_queue(bd->disk->queue);
78 spin_unlock_irqrestore(&bd->lock, flags); 129 spin_unlock_irqrestore(&bd->lock, flags);
130
131 /* The interrupt was for us, we dealt with it. */
79 return IRQ_HANDLED; 132 return IRQ_HANDLED;
80} 133}
81 134
135/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136 * each of which contains "struct bio_vec"s, each of which contains a page, an
137 * offset and a length.
138 *
139 * Fortunately there are iterators to help us walk through the "struct
140 * request". Even more fortunately, there were plenty of places to steal the
141 * code from. We pack the "struct request" into our "struct lguest_dma" and
142 * return the total length. */
82static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma) 143static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
83{ 144{
84 unsigned int i = 0, idx, len = 0; 145 unsigned int i = 0, idx, len = 0;
@@ -87,8 +148,13 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
87 rq_for_each_bio(bio, req) { 148 rq_for_each_bio(bio, req) {
88 struct bio_vec *bvec; 149 struct bio_vec *bvec;
89 bio_for_each_segment(bvec, bio, idx) { 150 bio_for_each_segment(bvec, bio, idx) {
151 /* We told the block layer not to give us too many. */
90 BUG_ON(i == LGUEST_MAX_DMA_SECTIONS); 152 BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
153 /* If we had a zero-length segment, it would look like
154 * the end of the data referred to by the "struct
155 * lguest_dma", so make sure that doesn't happen. */
91 BUG_ON(!bvec->bv_len); 156 BUG_ON(!bvec->bv_len);
157 /* Convert page & offset to a physical address */
92 dma->addr[i] = page_to_phys(bvec->bv_page) 158 dma->addr[i] = page_to_phys(bvec->bv_page)
93 + bvec->bv_offset; 159 + bvec->bv_offset;
94 dma->len[i] = bvec->bv_len; 160 dma->len[i] = bvec->bv_len;
@@ -96,26 +162,39 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
96 i++; 162 i++;
97 } 163 }
98 } 164 }
165 /* If the array isn't full, we mark the end with a 0 length */
99 if (i < LGUEST_MAX_DMA_SECTIONS) 166 if (i < LGUEST_MAX_DMA_SECTIONS)
100 dma->len[i] = 0; 167 dma->len[i] = 0;
101 return len; 168 return len;
102} 169}
103 170
171/* This creates an empty DMA, useful for prodding the Host without sending data
172 * (ie. when we want to do a read) */
104static void empty_dma(struct lguest_dma *dma) 173static void empty_dma(struct lguest_dma *dma)
105{ 174{
106 dma->len[0] = 0; 175 dma->len[0] = 0;
107} 176}
108 177
178/*D:470 Setting up a request is fairly easy: */
109static void setup_req(struct blockdev *bd, 179static void setup_req(struct blockdev *bd,
110 int type, struct request *req, struct lguest_dma *dma) 180 int type, struct request *req, struct lguest_dma *dma)
111{ 181{
182 /* The type is 1 (write) or 0 (read). */
112 bd->lb_page->type = type; 183 bd->lb_page->type = type;
184 /* The sector on disk where the read or write starts. */
113 bd->lb_page->sector = req->sector; 185 bd->lb_page->sector = req->sector;
186 /* The result is initialized to 0 (unfinished). */
114 bd->lb_page->result = 0; 187 bd->lb_page->result = 0;
188 /* The current request (so we can end it in the interrupt handler). */
115 bd->req = req; 189 bd->req = req;
190 /* The number of bytes: returned as a side-effect of req_to_dma(),
191 * which packs the block layer's "struct request" into our "struct
192 * lguest_dma" */
116 bd->lb_page->bytes = req_to_dma(req, dma); 193 bd->lb_page->bytes = req_to_dma(req, dma);
117} 194}
118 195
196/*D:450 Write is pretty straightforward: we pack the request into a "struct
197 * lguest_dma", then use SEND_DMA to send the request. */
119static void do_write(struct blockdev *bd, struct request *req) 198static void do_write(struct blockdev *bd, struct request *req)
120{ 199{
121 struct lguest_dma send; 200 struct lguest_dma send;
@@ -126,6 +205,9 @@ static void do_write(struct blockdev *bd, struct request *req)
126 lguest_send_dma(bd->phys_addr, &send); 205 lguest_send_dma(bd->phys_addr, &send);
127} 206}
128 207
208/* Read is similar to write, except we pack the request into our receive
209 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
210 * there's a request pending. */
129static void do_read(struct blockdev *bd, struct request *req) 211static void do_read(struct blockdev *bd, struct request *req)
130{ 212{
131 struct lguest_dma ping; 213 struct lguest_dma ping;
@@ -137,21 +219,30 @@ static void do_read(struct blockdev *bd, struct request *req)
137 lguest_send_dma(bd->phys_addr, &ping); 219 lguest_send_dma(bd->phys_addr, &ping);
138} 220}
139 221
222/*D:440 This is where requests come in: we get handed the request queue and are
223 * expected to pull a "struct request" off it until we've finished them or
224 * we're waiting for a reply: */
140static void do_lgb_request(struct request_queue *q) 225static void do_lgb_request(struct request_queue *q)
141{ 226{
142 struct blockdev *bd; 227 struct blockdev *bd;
143 struct request *req; 228 struct request *req;
144 229
145again: 230again:
231 /* This sometimes returns NULL even on the very first time around. I
232 * wonder if it's something to do with letting elves handle the request
233 * queue... */
146 req = elv_next_request(q); 234 req = elv_next_request(q);
147 if (!req) 235 if (!req)
148 return; 236 return;
149 237
238 /* We attached the struct blockdev to the disk: get it back */
150 bd = req->rq_disk->private_data; 239 bd = req->rq_disk->private_data;
151 /* Sometimes we get repeated requests after blk_stop_queue. */ 240 /* Sometimes we get repeated requests after blk_stop_queue(), but we
241 * can only handle one at a time. */
152 if (bd->req) 242 if (bd->req)
153 return; 243 return;
154 244
245 /* We only do reads and writes: no tricky business! */
155 if (!blk_fs_request(req)) { 246 if (!blk_fs_request(req)) {
156 pr_debug("Got non-command 0x%08x\n", req->cmd_type); 247 pr_debug("Got non-command 0x%08x\n", req->cmd_type);
157 req->errors++; 248 req->errors++;
@@ -164,20 +255,31 @@ again:
164 else 255 else
165 do_read(bd, req); 256 do_read(bd, req);
166 257
167 /* Wait for interrupt to tell us it's done. */ 258 /* We've put out the request, so stop any more coming in until we get
259 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
168 blk_stop_queue(q); 260 blk_stop_queue(q);
169} 261}
170 262
263/*D:430 This is the "struct block_device_operations" we attach to the disk at
264 * the end of lguestblk_probe(). It doesn't seem to want much. */
171static struct block_device_operations lguestblk_fops = { 265static struct block_device_operations lguestblk_fops = {
172 .owner = THIS_MODULE, 266 .owner = THIS_MODULE,
173}; 267};
174 268
269/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
270 * quite why. I do know that the IDE code sent two or three of the maintainers
271 * insane, perhaps this is the fringe of the same disease?
272 *
273 * As in the console code, the probe function gets handed the generic
274 * lguest_device from lguest_bus.c: */
175static int lguestblk_probe(struct lguest_device *lgdev) 275static int lguestblk_probe(struct lguest_device *lgdev)
176{ 276{
177 struct blockdev *bd; 277 struct blockdev *bd;
178 int err; 278 int err;
179 int irqflags = IRQF_SHARED; 279 int irqflags = IRQF_SHARED;
180 280
281 /* First we allocate our own "struct blockdev" and initialize the easy
282 * fields. */
181 bd = kmalloc(sizeof(*bd), GFP_KERNEL); 283 bd = kmalloc(sizeof(*bd), GFP_KERNEL);
182 if (!bd) 284 if (!bd)
183 return -ENOMEM; 285 return -ENOMEM;
@@ -187,59 +289,100 @@ static int lguestblk_probe(struct lguest_device *lgdev)
187 bd->req = NULL; 289 bd->req = NULL;
188 bd->dma.used_len = 0; 290 bd->dma.used_len = 0;
189 bd->dma.len[0] = 0; 291 bd->dma.len[0] = 0;
292 /* The descriptor in the lguest_devices array provided by the Host
293 * gives the Guest the physical page number of the device's page. */
190 bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT); 294 bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
191 295
296 /* We use lguest_map() to get a pointer to the device page */
192 bd->lb_page = lguest_map(bd->phys_addr, 1); 297 bd->lb_page = lguest_map(bd->phys_addr, 1);
193 if (!bd->lb_page) { 298 if (!bd->lb_page) {
194 err = -ENOMEM; 299 err = -ENOMEM;
195 goto out_free_bd; 300 goto out_free_bd;
196 } 301 }
197 302
303 /* We need a major device number: 0 means "assign one dynamically". */
198 bd->major = register_blkdev(0, "lguestblk"); 304 bd->major = register_blkdev(0, "lguestblk");
199 if (bd->major < 0) { 305 if (bd->major < 0) {
200 err = bd->major; 306 err = bd->major;
201 goto out_unmap; 307 goto out_unmap;
202 } 308 }
203 309
310 /* This allocates a "struct gendisk" where we pack all the information
311 * about the disk which the rest of Linux sees. We ask for one minor
312 * number; I do wonder if we should be asking for more. */
204 bd->disk = alloc_disk(1); 313 bd->disk = alloc_disk(1);
205 if (!bd->disk) { 314 if (!bd->disk) {
206 err = -ENOMEM; 315 err = -ENOMEM;
207 goto out_unregister_blkdev; 316 goto out_unregister_blkdev;
208 } 317 }
209 318
319 /* Every disk needs a queue for requests to come in: we set up the
320 * queue with a callback function (the core of our driver) and the lock
321 * to use. */
210 bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock); 322 bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
211 if (!bd->disk->queue) { 323 if (!bd->disk->queue) {
212 err = -ENOMEM; 324 err = -ENOMEM;
213 goto out_put_disk; 325 goto out_put_disk;
214 } 326 }
215 327
216 /* We can only handle a certain number of sg entries */ 328 /* We can only handle a certain number of pointers in our SEND_DMA
329 * call, so we set that with blk_queue_max_hw_segments(). This is not
330 * to be confused with blk_queue_max_phys_segments() of course! I
331 * know, who could possibly confuse the two?
332 *
333 * Well, it's simple to tell them apart: this one seems to work and the
334 * other one didn't. */
217 blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS); 335 blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
218 /* Buffers must not cross page boundaries */ 336
337 /* Due to technical limitations of our Host (and simple coding) we
338 * can't have a single buffer which crosses a page boundary. Tell it
339 * here. This means that our maximum request size is 16
340 * (LGUEST_MAX_DMA_SECTIONS) pages. */
219 blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1); 341 blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
220 342
343 /* We name our disk: this becomes the device name when udev does its
344 * magic thing and creates the device node, such as /dev/lgba.
345 * next_block_index is a global which starts at 'a'. Unfortunately
346 * this simple increment logic means that the 27th disk will be called
347 * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
348 * your /dev directory will be balanced. */
221 sprintf(bd->disk->disk_name, "lgb%c", next_block_index++); 349 sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
350
351 /* We look to the device descriptor again to see if this device's
352 * interrupts are expected to be random. If they are, we tell the irq
353 * subsystem. At the moment this bit is always set. */
222 if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) 354 if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
223 irqflags |= IRQF_SAMPLE_RANDOM; 355 irqflags |= IRQF_SAMPLE_RANDOM;
356
357 /* Now we have the name and irqflags, we can request the interrupt; we
358 * give it the "struct blockdev" we have set up to pass to lgb_irq()
359 * when there is an interrupt. */
224 err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd); 360 err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
225 if (err) 361 if (err)
226 goto out_cleanup_queue; 362 goto out_cleanup_queue;
227 363
364 /* We bind our one-entry DMA pool to the key for this block device so
365 * the Host can reply to our requests. The key is equal to the
366 * physical address of the device's page, which is conveniently
367 * unique. */
228 err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq); 368 err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
229 if (err) 369 if (err)
230 goto out_free_irq; 370 goto out_free_irq;
231 371
372 /* We finish our disk initialization and add the disk to the system. */
232 bd->disk->major = bd->major; 373 bd->disk->major = bd->major;
233 bd->disk->first_minor = 0; 374 bd->disk->first_minor = 0;
234 bd->disk->private_data = bd; 375 bd->disk->private_data = bd;
235 bd->disk->fops = &lguestblk_fops; 376 bd->disk->fops = &lguestblk_fops;
236 /* This is initialized to the disk size by the other end. */ 377 /* This is initialized to the disk size by the Launcher. */
237 set_capacity(bd->disk, bd->lb_page->num_sectors); 378 set_capacity(bd->disk, bd->lb_page->num_sectors);
238 add_disk(bd->disk); 379 add_disk(bd->disk);
239 380
240 printk(KERN_INFO "%s: device %i at major %d\n", 381 printk(KERN_INFO "%s: device %i at major %d\n",
241 bd->disk->disk_name, lgdev->index, bd->major); 382 bd->disk->disk_name, lgdev->index, bd->major);
242 383
384 /* We don't need to keep the "struct blockdev" around, but if we ever
385 * implemented device removal, we'd need this. */
243 lgdev->private = bd; 386 lgdev->private = bd;
244 return 0; 387 return 0;
245 388
@@ -258,6 +401,8 @@ out_free_bd:
258 return err; 401 return err;
259} 402}
260 403
404/*D:410 The boilerplate code for registering the lguest block driver is just
405 * like the console: */
261static struct lguest_driver lguestblk_drv = { 406static struct lguest_driver lguestblk_drv = {
262 .name = "lguestblk", 407 .name = "lguestblk",
263 .owner = THIS_MODULE, 408 .owner = THIS_MODULE,
diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c
index e7b889e404a7..1de8967cce06 100644
--- a/drivers/char/hvc_lguest.c
+++ b/drivers/char/hvc_lguest.c
@@ -1,6 +1,19 @@
1/* Simple console for lguest. 1/*D:300
2 * The Guest console driver
2 * 3 *
3 * Copyright (C) 2006 Rusty Russell, IBM Corporation 4 * This is a trivial console driver: we use lguest's DMA mechanism to send
5 * bytes out, and register a DMA buffer to receive bytes in. It is assumed to
6 * be present and available from the very beginning of boot.
7 *
8 * Writing console drivers is one of the few remaining Dark Arts in Linux.
9 * Fortunately for us, the path of virtual consoles has been well-trodden by
10 * the PowerPC folks, who wrote "hvc_console.c" to generically support any
11 * virtual console. We use that infrastructure which only requires us to write
12 * the basic put_chars and get_chars functions and call the right register
13 * functions.
14 :*/
15
16/* Copyright (C) 2006 Rusty Russell, IBM Corporation
4 * 17 *
5 * This program is free software; you can redistribute it and/or modify 18 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 19 * it under the terms of the GNU General Public License as published by
@@ -21,49 +34,81 @@
21#include <linux/lguest_bus.h> 34#include <linux/lguest_bus.h>
22#include "hvc_console.h" 35#include "hvc_console.h"
23 36
37/*D:340 This is our single console input buffer, with associated "struct
38 * lguest_dma" referring to it. Note the 0-terminated length array, and the
39 * use of physical address for the buffer itself. */
24static char inbuf[256]; 40static char inbuf[256];
25static struct lguest_dma cons_input = { .used_len = 0, 41static struct lguest_dma cons_input = { .used_len = 0,
26 .addr[0] = __pa(inbuf), 42 .addr[0] = __pa(inbuf),
27 .len[0] = sizeof(inbuf), 43 .len[0] = sizeof(inbuf),
28 .len[1] = 0 }; 44 .len[1] = 0 };
29 45
46/*D:310 The put_chars() callback is pretty straightforward.
47 *
48 * First we put the pointer and length in a "struct lguest_dma": we only have
49 * one pointer, so we set the second length to 0. Then we use SEND_DMA to send
50 * the data to (Host) buffers attached to the console key. Usually a device's
51 * key is a physical address within the device's memory, but because the
52 * console device doesn't have any associated physical memory, we use the
53 * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
30static int put_chars(u32 vtermno, const char *buf, int count) 54static int put_chars(u32 vtermno, const char *buf, int count)
31{ 55{
32 struct lguest_dma dma; 56 struct lguest_dma dma;
33 57
34 /* FIXME: what if it's over a page boundary? */ 58 /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
59 * to go over page boundaries. This never seems to happen,
60 * but if it did we'd need to fix this code. */
35 dma.len[0] = count; 61 dma.len[0] = count;
36 dma.len[1] = 0; 62 dma.len[1] = 0;
37 dma.addr[0] = __pa(buf); 63 dma.addr[0] = __pa(buf);
38 64
39 lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma); 65 lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma);
66 /* We're expected to return the amount of data we wrote: all of it. */
40 return count; 67 return count;
41} 68}
42 69
70/*D:350 get_chars() is the callback from the hvc_console infrastructure when
71 * an interrupt is received.
72 *
73 * Firstly we see if our buffer has been filled: if not, we return. The rest
74 * of the code deals with the fact that the hvc_console() infrastructure only
75 * asks us for 16 bytes at a time. We keep a "cons_offset" variable for
76 * partially-read buffers. */
43static int get_chars(u32 vtermno, char *buf, int count) 77static int get_chars(u32 vtermno, char *buf, int count)
44{ 78{
45 static int cons_offset; 79 static int cons_offset;
46 80
81 /* Nothing left to see here... */
47 if (!cons_input.used_len) 82 if (!cons_input.used_len)
48 return 0; 83 return 0;
49 84
85 /* You want more than we have to give? Well, try wanting less! */
50 if (cons_input.used_len - cons_offset < count) 86 if (cons_input.used_len - cons_offset < count)
51 count = cons_input.used_len - cons_offset; 87 count = cons_input.used_len - cons_offset;
52 88
89 /* Copy across to their buffer and increment offset. */
53 memcpy(buf, inbuf + cons_offset, count); 90 memcpy(buf, inbuf + cons_offset, count);
54 cons_offset += count; 91 cons_offset += count;
92
93 /* Finished? Zero offset, and reset cons_input so Host will use it
94 * again. */
55 if (cons_offset == cons_input.used_len) { 95 if (cons_offset == cons_input.used_len) {
56 cons_offset = 0; 96 cons_offset = 0;
57 cons_input.used_len = 0; 97 cons_input.used_len = 0;
58 } 98 }
59 return count; 99 return count;
60} 100}
101/*:*/
61 102
62static struct hv_ops lguest_cons = { 103static struct hv_ops lguest_cons = {
63 .get_chars = get_chars, 104 .get_chars = get_chars,
64 .put_chars = put_chars, 105 .put_chars = put_chars,
65}; 106};
66 107
108/*D:320 Console drivers are initialized very early so boot messages can go
109 * out. At this stage, the console is output-only. Our driver checks we're a
110 * Guest, and if so hands hvc_instantiate() the console number (0), priority
111 * (0), and the struct hv_ops containing the put_chars() function. */
67static int __init cons_init(void) 112static int __init cons_init(void)
68{ 113{
69 if (strcmp(paravirt_ops.name, "lguest") != 0) 114 if (strcmp(paravirt_ops.name, "lguest") != 0)
@@ -73,21 +118,46 @@ static int __init cons_init(void)
73} 118}
74console_initcall(cons_init); 119console_initcall(cons_init);
75 120
121/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
122 * stash the result in the private pointer of the "struct lguest_device".
123 * Since we never remove the console device we never need this pointer again,
124 * but using ->private is considered good form, and you never know who's going
125 * to copy your driver.
126 *
127 * Once the console is set up, we bind our input buffer ready for input. */
76static int lguestcons_probe(struct lguest_device *lgdev) 128static int lguestcons_probe(struct lguest_device *lgdev)
77{ 129{
78 int err; 130 int err;
79 131
132 /* The first argument of hvc_alloc() is the virtual console number, so
133 * we use zero. The second argument is the interrupt number.
134 *
135 * The third argument is a "struct hv_ops" containing the put_chars()
136 * and get_chars() pointers. The final argument is the output buffer
137 * size: we use 256 and expect the Host to have room for us to send
138 * that much. */
80 lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256); 139 lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
81 if (IS_ERR(lgdev->private)) 140 if (IS_ERR(lgdev->private))
82 return PTR_ERR(lgdev->private); 141 return PTR_ERR(lgdev->private);
83 142
143 /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
144 * "cons_input" is that statically-initialized global DMA buffer we saw
145 * above, and we also give the interrupt we want. */
84 err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1, 146 err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
85 lgdev_irq(lgdev)); 147 lgdev_irq(lgdev));
86 if (err) 148 if (err)
87 printk("lguest console: failed to bind buffer.\n"); 149 printk("lguest console: failed to bind buffer.\n");
88 return err; 150 return err;
89} 151}
152/* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc()
153 * to expect input when this interrupt is triggered, and then tell
154 * lguest_bind_dma() that is the interrupt to send us when input comes in. */
90 155
156/*D:360 From now on the console driver follows standard Guest driver form:
157 * register_lguest_driver() registers the device type and probe function, and
158 * the probe function sets up the device.
159 *
160 * The standard "struct lguest_driver": */
91static struct lguest_driver lguestcons_drv = { 161static struct lguest_driver lguestcons_drv = {
92 .name = "lguestcons", 162 .name = "lguestcons",
93 .owner = THIS_MODULE, 163 .owner = THIS_MODULE,
@@ -95,6 +165,7 @@ static struct lguest_driver lguestcons_drv = {
95 .probe = lguestcons_probe, 165 .probe = lguestcons_probe,
96}; 166};
97 167
168/* The standard init function */
98static int __init hvc_lguest_init(void) 169static int __init hvc_lguest_init(void)
99{ 170{
100 return register_lguest_driver(&lguestcons_drv); 171 return register_lguest_driver(&lguestcons_drv);
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
index 9a22d199502e..55a7940ca732 100644
--- a/drivers/lguest/lguest_bus.c
+++ b/drivers/lguest/lguest_bus.c
@@ -46,6 +46,10 @@ static struct device_attribute lguest_dev_attrs[] = {
46 __ATTR_NULL 46 __ATTR_NULL
47}; 47};
48 48
49/*D:130 The generic bus infrastructure requires a function which says whether a
50 * device matches a driver. For us, it is simple: "struct lguest_driver"
51 * contains a "device_type" field which indicates what type of device it can
52 * handle, so we just cast the args and compare: */
49static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) 53static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
50{ 54{
51 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 55 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
@@ -53,6 +57,7 @@ static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
53 57
54 return (drv->device_type == lguest_devices[dev->index].type); 58 return (drv->device_type == lguest_devices[dev->index].type);
55} 59}
60/*:*/
56 61
57struct lguest_bus { 62struct lguest_bus {
58 struct bus_type bus; 63 struct bus_type bus;
@@ -71,11 +76,24 @@ static struct lguest_bus lguest_bus = {
71 } 76 }
72}; 77};
73 78
79/*D:140 This is the callback which occurs once the bus infrastructure matches
80 * up a device and driver, ie. in response to add_lguest_device() calling
81 * device_register(), or register_lguest_driver() calling driver_register().
82 *
83 * At the moment it's always the latter: the devices are added first, since
84 * scan_devices() is called from a "core_initcall", and the drivers themselves
85 * called later as a normal "initcall". But it would work the other way too.
86 *
87 * So now we have the happy couple, we add the status bit to indicate that we
88 * found a driver. If the driver truly loves the device, it will return
89 * happiness from its probe function (ok, perhaps this wasn't my greatest
90 * analogy), and we set the final "driver ok" bit so the Host sees it's all
91 * green. */
74static int lguest_dev_probe(struct device *_dev) 92static int lguest_dev_probe(struct device *_dev)
75{ 93{
76 int ret; 94 int ret;
77 struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); 95 struct lguest_device*dev = container_of(_dev,struct lguest_device,dev);
78 struct lguest_driver *drv = container_of(dev->dev.driver, 96 struct lguest_driver*drv = container_of(dev->dev.driver,
79 struct lguest_driver, drv); 97 struct lguest_driver, drv);
80 98
81 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; 99 lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
@@ -85,6 +103,10 @@ static int lguest_dev_probe(struct device *_dev)
85 return ret; 103 return ret;
86} 104}
87 105
106/* The last part of the bus infrastructure is the function lguest drivers use
107 * to register themselves. Firstly, we do nothing if there's no lguest bus
108 * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct
109 * driver" fields and call the generic driver_register(). */
88int register_lguest_driver(struct lguest_driver *drv) 110int register_lguest_driver(struct lguest_driver *drv)
89{ 111{
90 if (!lguest_devices) 112 if (!lguest_devices)
@@ -97,12 +119,36 @@ int register_lguest_driver(struct lguest_driver *drv)
97 119
98 return driver_register(&drv->drv); 120 return driver_register(&drv->drv);
99} 121}
122
123/* At the moment we build all the drivers into the kernel because they're so
124 * simple: 8144 bytes for all three of them as I type this. And as the console
125 * really needs to be built in, it's actually only 3527 bytes for the network
126 * and block drivers.
127 *
128 * If they get complex it will make sense for them to be modularized, so we
129 * need to explicitly export the symbol.
130 *
131 * I don't think non-GPL modules make sense, so it's a GPL-only export.
132 */
100EXPORT_SYMBOL_GPL(register_lguest_driver); 133EXPORT_SYMBOL_GPL(register_lguest_driver);
101 134
135/*D:120 This is the core of the lguest bus: actually adding a new device.
136 * It's a separate function because it's neater that way, and because an
137 * earlier version of the code supported hotplug and unplug. They were removed
138 * early on because they were never used.
139 *
140 * As Andrew Tridgell says, "Untested code is buggy code".
141 *
142 * It's worth reading this carefully: we start with an index into the array of
143 * "struct lguest_device_desc"s indicating the device which is new: */
102static void add_lguest_device(unsigned int index) 144static void add_lguest_device(unsigned int index)
103{ 145{
104 struct lguest_device *new; 146 struct lguest_device *new;
105 147
148 /* Each "struct lguest_device_desc" has a "status" field, which the
149 * Guest updates as the device is probed. In the worst case, the Host
150 * can look at these bits to tell what part of device setup failed,
151 * even if the console isn't available. */
106 lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; 152 lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
107 new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); 153 new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
108 if (!new) { 154 if (!new) {
@@ -111,12 +157,17 @@ static void add_lguest_device(unsigned int index)
111 return; 157 return;
112 } 158 }
113 159
160 /* The "struct lguest_device" setup is pretty straight-forward example
161 * code. */
114 new->index = index; 162 new->index = index;
115 new->private = NULL; 163 new->private = NULL;
116 memset(&new->dev, 0, sizeof(new->dev)); 164 memset(&new->dev, 0, sizeof(new->dev));
117 new->dev.parent = &lguest_bus.dev; 165 new->dev.parent = &lguest_bus.dev;
118 new->dev.bus = &lguest_bus.bus; 166 new->dev.bus = &lguest_bus.bus;
119 sprintf(new->dev.bus_id, "%u", index); 167 sprintf(new->dev.bus_id, "%u", index);
168
169 /* device_register() causes the bus infrastructure to look for a
170 * matching driver. */
120 if (device_register(&new->dev) != 0) { 171 if (device_register(&new->dev) != 0) {
121 printk(KERN_EMERG "Cannot register lguest device %u\n", index); 172 printk(KERN_EMERG "Cannot register lguest device %u\n", index);
122 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; 173 lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
@@ -124,6 +175,9 @@ static void add_lguest_device(unsigned int index)
124 } 175 }
125} 176}
126 177
178/*D:110 scan_devices() simply iterates through the device array. The type 0
179 * is reserved to mean "no device", and anything else means we have found a
180 * device: add it. */
127static void scan_devices(void) 181static void scan_devices(void)
128{ 182{
129 unsigned int i; 183 unsigned int i;
@@ -133,12 +187,23 @@ static void scan_devices(void)
133 add_lguest_device(i); 187 add_lguest_device(i);
134} 188}
135 189
190/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest
191 * bus. We check that we are a Guest by checking paravirt_ops.name: there are
192 * other ways of checking, but this seems most obvious to me.
193 *
194 * So we can access the array of "struct lguest_device_desc"s easily, we map
195 * that memory and store the pointer in the global "lguest_devices". Then we
196 * register the bus with the core. Doing two registrations seems clunky to me,
197 * but it seems to be the correct sysfs incantation.
198 *
199 * Finally we call scan_devices() which adds all the devices found in the
200 * "struct lguest_device_desc" array. */
136static int __init lguest_bus_init(void) 201static int __init lguest_bus_init(void)
137{ 202{
138 if (strcmp(paravirt_ops.name, "lguest") != 0) 203 if (strcmp(paravirt_ops.name, "lguest") != 0)
139 return 0; 204 return 0;
140 205
141 /* Devices are in page above top of "normal" mem. */ 206 /* Devices are in a single page above top of "normal" mem */
142 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); 207 lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
143 208
144 if (bus_register(&lguest_bus.bus) != 0 209 if (bus_register(&lguest_bus.bus) != 0
@@ -148,4 +213,5 @@ static int __init lguest_bus_init(void)
148 scan_devices(); 213 scan_devices();
149 return 0; 214 return 0;
150} 215}
216/* Do this after core stuff, before devices. */
151postcore_initcall(lguest_bus_init); 217postcore_initcall(lguest_bus_init);
diff --git a/drivers/net/lguest_net.c b/drivers/net/lguest_net.c
index 112778652f7d..20df6a848923 100644
--- a/drivers/net/lguest_net.c
+++ b/drivers/net/lguest_net.c
@@ -1,6 +1,13 @@
1/* A simple network driver for lguest. 1/*D:500
2 * The Guest network driver.
2 * 3 *
3 * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation 4 * This is a very simple virtual network driver, and our last Guest driver.
5 * The only trick is that it can talk directly to multiple other recipients
6 * (ie. other Guests on the same network). It can also be used with only the
7 * Host on the network.
8 :*/
9
10/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
4 * 11 *
5 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
@@ -28,23 +35,28 @@
28#define MAX_LANS 4 35#define MAX_LANS 4
29#define NUM_SKBS 8 36#define NUM_SKBS 8
30 37
38/*D:530 The "struct lguestnet_info" contains all the information we need to
39 * know about the network device. */
31struct lguestnet_info 40struct lguestnet_info
32{ 41{
33 /* The shared page(s). */ 42 /* The mapped device page(s) (an array of "struct lguest_net"). */
34 struct lguest_net *peer; 43 struct lguest_net *peer;
44 /* The physical address of the device page(s) */
35 unsigned long peer_phys; 45 unsigned long peer_phys;
46 /* The size of the device page(s). */
36 unsigned long mapsize; 47 unsigned long mapsize;
37 48
38 /* The lguest_device I come from */ 49 /* The lguest_device I come from */
39 struct lguest_device *lgdev; 50 struct lguest_device *lgdev;
40 51
41 /* My peerid. */ 52 /* My peerid (ie. my slot in the array). */
42 unsigned int me; 53 unsigned int me;
43 54
44 /* Receive queue. */ 55 /* Receive queue: the network packets waiting to be filled. */
45 struct sk_buff *skb[NUM_SKBS]; 56 struct sk_buff *skb[NUM_SKBS];
46 struct lguest_dma dma[NUM_SKBS]; 57 struct lguest_dma dma[NUM_SKBS];
47}; 58};
59/*:*/
48 60
49/* How many bytes left in this page. */ 61/* How many bytes left in this page. */
50static unsigned int rest_of_page(void *data) 62static unsigned int rest_of_page(void *data)
@@ -52,39 +64,82 @@ static unsigned int rest_of_page(void *data)
52 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); 64 return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
53} 65}
54 66
55/* Simple convention: offset 4 * peernum. */ 67/*D:570 Each peer (ie. Guest or Host) on the network binds their receive
68 * buffers to a different key: we simply use the physical address of the
69 * device's memory page plus the peer number. The Host insists that all keys
70 * be a multiple of 4, so we multiply the peer number by 4. */
56static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) 71static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
57{ 72{
58 return info->peer_phys + 4 * peernum; 73 return info->peer_phys + 4 * peernum;
59} 74}
60 75
76/* This is the routine which sets up a "struct lguest_dma" to point to a
77 * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a
78 * "struct sk_buff" has grown complex over the years: it consists of a "head"
79 * linear section pointed to by "skb->data", and possibly an array of
80 * "fragments" in the case of a non-linear packet.
81 *
82 * Our receive buffers don't use fragments at all but outgoing skbs might, so
83 * we handle it. */
61static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, 84static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
62 struct lguest_dma *dma) 85 struct lguest_dma *dma)
63{ 86{
64 unsigned int i, seg; 87 unsigned int i, seg;
65 88
89 /* First, we put the linear region into the "struct lguest_dma". Each
90 * entry can't go over a page boundary, so even though all our packets
91 * are 1514 bytes or less, we might need to use two entries here: */
66 for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { 92 for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
67 dma->addr[seg] = virt_to_phys(skb->data + i); 93 dma->addr[seg] = virt_to_phys(skb->data + i);
68 dma->len[seg] = min((unsigned)(headlen - i), 94 dma->len[seg] = min((unsigned)(headlen - i),
69 rest_of_page(skb->data + i)); 95 rest_of_page(skb->data + i));
70 } 96 }
97
98 /* Now we handle the fragments: at least they're guaranteed not to go
99 * over a page. skb_shinfo(skb) returns a pointer to the structure
100 * which tells us about the number of fragments and the fragment
101 * array. */
71 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { 102 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
72 const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 103 const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
73 /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ 104 /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
74 if (seg == LGUEST_MAX_DMA_SECTIONS) { 105 if (seg == LGUEST_MAX_DMA_SECTIONS) {
106 /* We will end up sending a truncated packet should
107 * this ever happen. Plus, a cool log message! */
75 printk("Woah dude! Megapacket!\n"); 108 printk("Woah dude! Megapacket!\n");
76 break; 109 break;
77 } 110 }
78 dma->addr[seg] = page_to_phys(f->page) + f->page_offset; 111 dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
79 dma->len[seg] = f->size; 112 dma->len[seg] = f->size;
80 } 113 }
114
115 /* If after all that we didn't use the entire "struct lguest_dma"
116 * array, we terminate it with a 0 length. */
81 if (seg < LGUEST_MAX_DMA_SECTIONS) 117 if (seg < LGUEST_MAX_DMA_SECTIONS)
82 dma->len[seg] = 0; 118 dma->len[seg] = 0;
83} 119}
84 120
85/* We overload multicast bit to show promiscuous mode. */ 121/*
122 * Packet transmission.
123 *
124 * Our packet transmission is a little unusual. A real network card would just
125 * send out the packet and leave the receivers to decide if they're interested.
126 * Instead, we look through the network device memory page and see if any of
127 * the ethernet addresses match the packet destination, and if so we send it to
128 * that Guest.
129 *
130 * This is made a little more complicated in two cases. The first case is
131 * broadcast packets: for that we send the packet to all Guests on the network,
132 * one at a time. The second case is "promiscuous" mode, where a Guest wants
133 * to see all the packets on the network. We need a way for the Guest to tell
134 * us it wants to see all packets, so it sets the "multicast" bit on its
135 * published MAC address, which is never valid in a real ethernet address.
136 */
86#define PROMISC_BIT 0x01 137#define PROMISC_BIT 0x01
87 138
139/* This is the callback which is summoned whenever the network device's
140 * multicast or promiscuous state changes. If the card is in promiscuous mode,
141 * we advertise that in our ethernet address in the device's memory. We do the
142 * same if Linux wants any or all multicast traffic. */
88static void lguestnet_set_multicast(struct net_device *dev) 143static void lguestnet_set_multicast(struct net_device *dev)
89{ 144{
90 struct lguestnet_info *info = netdev_priv(dev); 145 struct lguestnet_info *info = netdev_priv(dev);
@@ -95,11 +150,14 @@ static void lguestnet_set_multicast(struct net_device *dev)
95 info->peer[info->me].mac[0] &= ~PROMISC_BIT; 150 info->peer[info->me].mac[0] &= ~PROMISC_BIT;
96} 151}
97 152
153/* A simple test function to see if a peer wants to see all packets.*/
98static int promisc(struct lguestnet_info *info, unsigned int peer) 154static int promisc(struct lguestnet_info *info, unsigned int peer)
99{ 155{
100 return info->peer[peer].mac[0] & PROMISC_BIT; 156 return info->peer[peer].mac[0] & PROMISC_BIT;
101} 157}
102 158
159/* Another simple function to see if a peer's advertised ethernet address
160 * matches a packet's destination ethernet address. */
103static int mac_eq(const unsigned char mac[ETH_ALEN], 161static int mac_eq(const unsigned char mac[ETH_ALEN],
104 struct lguestnet_info *info, unsigned int peer) 162 struct lguestnet_info *info, unsigned int peer)
105{ 163{
@@ -109,6 +167,8 @@ static int mac_eq(const unsigned char mac[ETH_ALEN],
109 return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; 167 return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
110} 168}
111 169
170/* This is the function which actually sends a packet once we've decided a
171 * peer wants it: */
112static void transfer_packet(struct net_device *dev, 172static void transfer_packet(struct net_device *dev,
113 struct sk_buff *skb, 173 struct sk_buff *skb,
114 unsigned int peernum) 174 unsigned int peernum)
@@ -116,76 +176,134 @@ static void transfer_packet(struct net_device *dev,
116 struct lguestnet_info *info = netdev_priv(dev); 176 struct lguestnet_info *info = netdev_priv(dev);
117 struct lguest_dma dma; 177 struct lguest_dma dma;
118 178
179 /* We use our handy "struct lguest_dma" packing function to prepare
180 * the skb for sending. */
119 skb_to_dma(skb, skb_headlen(skb), &dma); 181 skb_to_dma(skb, skb_headlen(skb), &dma);
120 pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); 182 pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
121 183
184 /* This is the actual send call which copies the packet. */
122 lguest_send_dma(peer_key(info, peernum), &dma); 185 lguest_send_dma(peer_key(info, peernum), &dma);
186
187 /* Check that the entire packet was transmitted. If not, it could mean
188 * that the other Guest registered a short receive buffer, but this
189 * driver should never do that. More likely, the peer is dead. */
123 if (dma.used_len != skb->len) { 190 if (dma.used_len != skb->len) {
124 dev->stats.tx_carrier_errors++; 191 dev->stats.tx_carrier_errors++;
125 pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", 192 pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
126 peernum, dma.used_len, skb->len, 193 peernum, dma.used_len, skb->len,
127 (void *)dma.addr[0], dma.len[0]); 194 (void *)dma.addr[0], dma.len[0]);
128 } else { 195 } else {
196 /* On success we update the stats. */
129 dev->stats.tx_bytes += skb->len; 197 dev->stats.tx_bytes += skb->len;
130 dev->stats.tx_packets++; 198 dev->stats.tx_packets++;
131 } 199 }
132} 200}
133 201
 202/* Another helper function to tell us if a slot in the device memory is unused.
203 * Since we always set the Local Assignment bit in the ethernet address, the
204 * first byte can never be 0. */
134static int unused_peer(const struct lguest_net peer[], unsigned int num) 205static int unused_peer(const struct lguest_net peer[], unsigned int num)
135{ 206{
136 return peer[num].mac[0] == 0; 207 return peer[num].mac[0] == 0;
137} 208}
138 209
210/* Finally, here is the routine which handles an outgoing packet. It's called
211 * "start_xmit" for traditional reasons. */
139static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) 212static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
140{ 213{
141 unsigned int i; 214 unsigned int i;
142 int broadcast; 215 int broadcast;
143 struct lguestnet_info *info = netdev_priv(dev); 216 struct lguestnet_info *info = netdev_priv(dev);
217 /* Extract the destination ethernet address from the packet. */
144 const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; 218 const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
145 219
146 pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n", 220 pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n",
147 dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]); 221 dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]);
148 222
223 /* If it's a multicast packet, we broadcast to everyone. That's not
224 * very efficient, but there are very few applications which actually
225 * use multicast, which is a shame really.
226 *
227 * As etherdevice.h points out: "By definition the broadcast address is
228 * also a multicast address." So we don't have to test for broadcast
229 * packets separately. */
149 broadcast = is_multicast_ether_addr(dest); 230 broadcast = is_multicast_ether_addr(dest);
231
232 /* Look through all the published ethernet addresses to see if we
233 * should send this packet. */
150 for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { 234 for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
235 /* We don't send to ourselves (we actually can't SEND_DMA to
236 * ourselves anyway), and don't send to unused slots.*/
151 if (i == info->me || unused_peer(info->peer, i)) 237 if (i == info->me || unused_peer(info->peer, i))
152 continue; 238 continue;
153 239
240 /* If it's broadcast we send it. If they want every packet we
241 * send it. If the destination matches their address we send
242 * it. Otherwise we go to the next peer. */
154 if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) 243 if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
155 continue; 244 continue;
156 245
157 pr_debug("lguestnet %s: sending from %i to %i\n", 246 pr_debug("lguestnet %s: sending from %i to %i\n",
158 dev->name, info->me, i); 247 dev->name, info->me, i);
248 /* Our routine which actually does the transfer. */
159 transfer_packet(dev, skb, i); 249 transfer_packet(dev, skb, i);
160 } 250 }
251
252 /* An xmit routine is expected to dispose of the packet, so we do. */
161 dev_kfree_skb(skb); 253 dev_kfree_skb(skb);
254
255 /* As per kernel convention, 0 means success. This is why I love
256 * networking: even if we never sent to anyone, that's still
257 * success! */
162 return 0; 258 return 0;
163} 259}
164 260
165/* Find a new skb to put in this slot in shared mem. */ 261/*D:560
262 * Packet receiving.
263 *
264 * First, here's a helper routine which fills one of our array of receive
265 * buffers: */
166static int fill_slot(struct net_device *dev, unsigned int slot) 266static int fill_slot(struct net_device *dev, unsigned int slot)
167{ 267{
168 struct lguestnet_info *info = netdev_priv(dev); 268 struct lguestnet_info *info = netdev_priv(dev);
169 /* Try to create and register a new one. */ 269
270 /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard
271 * ethernet header of ETH_HLEN (14) bytes. */
170 info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); 272 info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
171 if (!info->skb[slot]) { 273 if (!info->skb[slot]) {
172 printk("%s: could not fill slot %i\n", dev->name, slot); 274 printk("%s: could not fill slot %i\n", dev->name, slot);
173 return -ENOMEM; 275 return -ENOMEM;
174 } 276 }
175 277
278 /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to
279 * point to the data in the skb: we also use it for sending out a
280 * packet. */
176 skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); 281 skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);
282
283 /* This is a Write Memory Barrier: it ensures that the entry in the
284 * receive buffer array is written *before* we set the "used_len" entry
285 * to 0. If the Host were looking at the receive buffer array from a
286 * different CPU, it could potentially see "used_len = 0" and not see
287 * the updated receive buffer information. This would be a horribly
288 * nasty bug, so make sure the compiler and CPU know this has to happen
289 * first. */
177 wmb(); 290 wmb();
178 /* Now we tell hypervisor it can use the slot. */ 291 /* Writing 0 to "used_len" tells the Host it can use this receive
292 * buffer now. */
179 info->dma[slot].used_len = 0; 293 info->dma[slot].used_len = 0;
180 return 0; 294 return 0;
181} 295}
182 296
297/* This is the actual receive routine. When we receive an interrupt from the
298 * Host to tell us a packet has been delivered, we arrive here: */
183static irqreturn_t lguestnet_rcv(int irq, void *dev_id) 299static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
184{ 300{
185 struct net_device *dev = dev_id; 301 struct net_device *dev = dev_id;
186 struct lguestnet_info *info = netdev_priv(dev); 302 struct lguestnet_info *info = netdev_priv(dev);
187 unsigned int i, done = 0; 303 unsigned int i, done = 0;
188 304
305 /* Look through our entire receive array for an entry which has data
306 * in it. */
189 for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 307 for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
190 unsigned int length; 308 unsigned int length;
191 struct sk_buff *skb; 309 struct sk_buff *skb;
@@ -194,10 +312,16 @@ static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
194 if (length == 0) 312 if (length == 0)
195 continue; 313 continue;
196 314
315 /* We've found one! Remember the skb (we grabbed the length
316 * above), and immediately refill the slot we've taken it
317 * from. */
197 done++; 318 done++;
198 skb = info->skb[i]; 319 skb = info->skb[i];
199 fill_slot(dev, i); 320 fill_slot(dev, i);
200 321
322 /* This shouldn't happen: micropackets could be sent by a
323 * badly-behaved Guest on the network, but the Host will never
324 * stuff more data in the buffer than the buffer length. */
201 if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { 325 if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
202 pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", 326 pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n",
203 dev->name, length); 327 dev->name, length);
@@ -205,36 +329,72 @@ static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
205 continue; 329 continue;
206 } 330 }
207 331
332 /* skb_put(), what a great function! I've ranted about this
333 * function before (http://lkml.org/lkml/1999/9/26/24). You
334 * call it after you've added data to the end of an skb (in
335 * this case, it was the Host which wrote the data). */
208 skb_put(skb, length); 336 skb_put(skb, length);
337
338 /* The ethernet header contains a protocol field: we use the
339 * standard helper to extract it, and place the result in
340 * skb->protocol. The helper also sets up skb->pkt_type and
341 * eats up the ethernet header from the front of the packet. */
209 skb->protocol = eth_type_trans(skb, dev); 342 skb->protocol = eth_type_trans(skb, dev);
210 /* This is a reliable transport. */ 343
344 /* If this device doesn't need checksums for sending, we also
345 * don't need to check the packets when they come in. */
211 if (dev->features & NETIF_F_NO_CSUM) 346 if (dev->features & NETIF_F_NO_CSUM)
212 skb->ip_summed = CHECKSUM_UNNECESSARY; 347 skb->ip_summed = CHECKSUM_UNNECESSARY;
348
349 /* As a last resort for debugging the driver or the lguest I/O
350 * subsystem, you can uncomment the "#define DEBUG" at the top
351 * of this file, which turns all the pr_debug() into printk()
352 * and floods the logs. */
213 pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 353 pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
214 ntohs(skb->protocol), skb->len, skb->pkt_type); 354 ntohs(skb->protocol), skb->len, skb->pkt_type);
215 355
356 /* Update the packet and byte counts (visible from ifconfig,
357 * and good for debugging). */
216 dev->stats.rx_bytes += skb->len; 358 dev->stats.rx_bytes += skb->len;
217 dev->stats.rx_packets++; 359 dev->stats.rx_packets++;
360
361 /* Hand our fresh network packet into the stack's "network
362 * interface receive" routine. That will free the packet
363 * itself when it's finished. */
218 netif_rx(skb); 364 netif_rx(skb);
219 } 365 }
366
367 /* If we found any packets, we assume the interrupt was for us. */
220 return done ? IRQ_HANDLED : IRQ_NONE; 368 return done ? IRQ_HANDLED : IRQ_NONE;
221} 369}
222 370
371/*D:550 This is where we start: when the device is brought up by dhcpd or
372 * ifconfig. At this point we advertise our MAC address to the rest of the
373 * network, and register receive buffers ready for incoming packets. */
223static int lguestnet_open(struct net_device *dev) 374static int lguestnet_open(struct net_device *dev)
224{ 375{
225 int i; 376 int i;
226 struct lguestnet_info *info = netdev_priv(dev); 377 struct lguestnet_info *info = netdev_priv(dev);
227 378
228 /* Set up our MAC address */ 379 /* Copy our MAC address into the device page, so others on the network
380 * can find us. */
229 memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); 381 memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);
230 382
231 /* Turn on promisc mode if needed */ 383 /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our
384 * set_multicast callback handles this already, so we call it now. */
232 lguestnet_set_multicast(dev); 385 lguestnet_set_multicast(dev);
233 386
387 /* Allocate packets and put them into our "struct lguest_dma" array.
388 * If we fail to allocate all the packets we could still limp along,
389 * but it's a sign of real stress so we should probably give up now. */
234 for (i = 0; i < ARRAY_SIZE(info->dma); i++) { 390 for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
235 if (fill_slot(dev, i) != 0) 391 if (fill_slot(dev, i) != 0)
236 goto cleanup; 392 goto cleanup;
237 } 393 }
394
395 /* Finally we tell the Host where our array of "struct lguest_dma"
396 * receive buffers is, binding it to the key corresponding to the
397 * device's physical memory plus our peerid. */
238 if (lguest_bind_dma(peer_key(info,info->me), info->dma, 398 if (lguest_bind_dma(peer_key(info,info->me), info->dma,
239 NUM_SKBS, lgdev_irq(info->lgdev)) != 0) 399 NUM_SKBS, lgdev_irq(info->lgdev)) != 0)
240 goto cleanup; 400 goto cleanup;
@@ -245,22 +405,29 @@ cleanup:
245 dev_kfree_skb(info->skb[i]); 405 dev_kfree_skb(info->skb[i]);
246 return -ENOMEM; 406 return -ENOMEM;
247} 407}
408/*:*/
248 409
410/* The close routine is called when the device is no longer in use: we clean up
411 * elegantly. */
249static int lguestnet_close(struct net_device *dev) 412static int lguestnet_close(struct net_device *dev)
250{ 413{
251 unsigned int i; 414 unsigned int i;
252 struct lguestnet_info *info = netdev_priv(dev); 415 struct lguestnet_info *info = netdev_priv(dev);
253 416
254 /* Clear all trace: others might deliver packets, we'll ignore it. */ 417 /* Clear all trace of our existence out of the device memory by setting
418 * the slot which held our MAC address to 0 (unused). */
255 memset(&info->peer[info->me], 0, sizeof(info->peer[info->me])); 419 memset(&info->peer[info->me], 0, sizeof(info->peer[info->me]));
256 420
257 /* Deregister sg lists. */ 421 /* Unregister our array of receive buffers */
258 lguest_unbind_dma(peer_key(info, info->me), info->dma); 422 lguest_unbind_dma(peer_key(info, info->me), info->dma);
259 for (i = 0; i < ARRAY_SIZE(info->dma); i++) 423 for (i = 0; i < ARRAY_SIZE(info->dma); i++)
260 dev_kfree_skb(info->skb[i]); 424 dev_kfree_skb(info->skb[i]);
261 return 0; 425 return 0;
262} 426}
263 427
428/*D:510 The network device probe function is basically a standard ethernet
429 * device setup. It reads the "struct lguest_device_desc" and sets the "struct
430 * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/
264static int lguestnet_probe(struct lguest_device *lgdev) 431static int lguestnet_probe(struct lguest_device *lgdev)
265{ 432{
266 int err, irqf = IRQF_SHARED; 433 int err, irqf = IRQF_SHARED;
@@ -290,10 +457,16 @@ static int lguestnet_probe(struct lguest_device *lgdev)
290 dev->stop = lguestnet_close; 457 dev->stop = lguestnet_close;
291 dev->hard_start_xmit = lguestnet_start_xmit; 458 dev->hard_start_xmit = lguestnet_start_xmit;
292 459
293 /* Turning on/off promisc will call dev->set_multicast_list. 460 /* We don't actually support multicast yet, but turning on/off
294 * We don't actually support multicast yet */ 461 * promisc also calls dev->set_multicast_list. */
295 dev->set_multicast_list = lguestnet_set_multicast; 462 dev->set_multicast_list = lguestnet_set_multicast;
296 SET_NETDEV_DEV(dev, &lgdev->dev); 463 SET_NETDEV_DEV(dev, &lgdev->dev);
464
465 /* The network code complains if you have "scatter-gather" capability
466 * if you don't also handle checksums (it seem that would be
467 * "illogical"). So we use a lie of omission and don't tell it that we
468 * can handle scattered packets unless we also don't want checksums,
469 * even though to us they're completely independent. */
297 if (desc->features & LGUEST_NET_F_NOCSUM) 470 if (desc->features & LGUEST_NET_F_NOCSUM)
298 dev->features = NETIF_F_SG|NETIF_F_NO_CSUM; 471 dev->features = NETIF_F_SG|NETIF_F_NO_CSUM;
299 472
@@ -325,6 +498,9 @@ static int lguestnet_probe(struct lguest_device *lgdev)
325 } 498 }
326 499
327 pr_debug("lguestnet: registered device %s\n", dev->name); 500 pr_debug("lguestnet: registered device %s\n", dev->name);
501 /* Finally, we put the "struct net_device" in the generic "struct
502 * lguest_device"s private pointer. Again, it's not necessary, but
503 * makes sure the cool kernel kids don't tease us. */
328 lgdev->private = dev; 504 lgdev->private = dev;
329 return 0; 505 return 0;
330 506
@@ -352,3 +528,11 @@ module_init(lguestnet_init);
352 528
353MODULE_DESCRIPTION("Lguest network driver"); 529MODULE_DESCRIPTION("Lguest network driver");
354MODULE_LICENSE("GPL"); 530MODULE_LICENSE("GPL");
531
532/*D:580
533 * This is the last of the Drivers, and with this we have covered the many and
 534 * wondrous and fine (and boring) details of the Guest.
535 *
536 * "make Launcher" beckons, where we answer questions like "Where do Guests
537 * come from?", and "What do you do when someone asks for optimization?"
538 */