Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/lguest_blk.c | 169
 1 file changed, 157 insertions, 12 deletions
diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c
index 5b79d0724171..93e3c4001bf5 100644
--- a/drivers/block/lguest_blk.c
+++ b/drivers/block/lguest_blk.c
@@ -1,6 +1,12 @@
-/* A simple block driver for lguest.
+/*D:400
+ * The Guest block driver
  *
- * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
+ * The mechanism is simple: we place the information about the request in the
+ * device page, then use SEND_DMA (containing the data for a write, or an empty
+ * "ping" DMA for a read).
+:*/
+/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
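
The comment above summarizes the whole protocol: the Guest describes a request in the shared device page, then kicks the Host with SEND_DMA. As a rough user-space model of the device-page fields this driver touches (type, sector, bytes, result, num_sectors, all visible in the hunks below) — the field types here are assumptions, not taken from the lguest headers:

/* Hypothetical user-space model of the device-page fields used by this
 * driver; the real struct lguest_block_page lives in the lguest headers. */
#include <stdio.h>

struct model_block_page {
	int type;		   /* 1 = write, 0 = read (see setup_req()) */
	unsigned long sector;	   /* first sector of the request */
	unsigned long bytes;	   /* total length, filled by req_to_dma() */
	int result;		   /* 0 = pending, 1 = success, 2 = failure */
	unsigned long num_sectors; /* disk size, set by the Launcher */
};

int main(void)
{
	struct model_block_page page = {
		.type = 0, .sector = 64, .bytes = 4096, .result = 0,
	};

	/* The Guest fills these in, pings the Host with SEND_DMA, and the
	 * interrupt handler later inspects page.result. */
	printf("read of %lu bytes at sector %lu (result=%d)\n",
	       page.bytes, page.sector, page.result);
	return 0;
}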
@@ -25,27 +31,50 @@
 
 static char next_block_index = 'a';
 
+/*D:420 Here is the structure which holds all the information we need about
+ * each Guest block device.
+ *
+ * I'm sure at this stage, you're wondering "hey, where was the adventure I was
+ * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
+ * my blog". I think real adventures have boring bits, too, and you're in the
+ * middle of one. But it gets better. Just not quite yet. */
 struct blockdev
 {
+	/* The block queue infrastructure wants a spinlock: it is held while it
+	 * calls our block request function. We grab it in our interrupt
+	 * handler so the responses don't mess with new requests. */
 	spinlock_t lock;
 
-	/* The disk structure for the kernel. */
+	/* The disk structure registered with the kernel. */
 	struct gendisk *disk;
 
-	/* The major number for this disk. */
+	/* The major device number for this disk, and the interrupt. We only
+	 * really keep them here for completeness; we'd need them if we
+	 * supported device unplugging. */
 	int major;
 	int irq;
 
+	/* The physical address of this device's memory page. */
 	unsigned long phys_addr;
-	/* The mapped block page. */
+	/* The mapped memory page for convenient access. */
 	struct lguest_block_page *lb_page;
 
-	/* We only have a single request outstanding at a time. */
+	/* We only have a single request outstanding at a time: this is it. */
 	struct lguest_dma dma;
 	struct request *req;
 };
 
-/* Jens gave me this nice helper to end all chunks of a request. */
+/*D:495 We originally used end_request() throughout the driver, but it turns
+ * out that end_request() is deprecated, and doesn't actually end the request
+ * (which seems like a good reason to deprecate it!). It simply ends the first
+ * bio. So if we had 3 bios in a "struct request" we would do all 3,
+ * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
+ * work as we needed to do.
+ *
+ * This reinforced to me that I do not understand the block layer.
+ *
+ * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
+ * request. This improved disk speed by 130%. */
 static void end_entire_request(struct request *req, int uptodate)
 {
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
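
The arithmetic in that D:495 comment is easy to check: ending only the first bio per end_request() call means a request with n bios completes n + (n-1) + ... + 1 bios in total. A throwaway demonstration in plain C, nothing lguest-specific:

/* Plain C check of the end_request() arithmetic described above. */
#include <stdio.h>

int main(void)
{
	unsigned int nbios = 3, old_way = 0;

	/* Deprecated pattern: each end_request() only ends the first bio,
	 * so we re-process the remainder every time: do 3, do 2, do 1. */
	for (unsigned int left = nbios; left > 0; left--)
		old_way += left;

	printf("%u bios: %u completions the old way, %u with one helper\n",
	       nbios, old_way, nbios); /* 6 vs 3: twice the work */
	return 0;
}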
@@ -55,30 +84,62 @@ static void end_entire_request(struct request *req, int uptodate)
 	end_that_request_last(req, uptodate);
 }
 
+/* I'm told there are only two stories in the world worth telling: love and
+ * hate. So there used to be a love scene here like this:
+ *
+ * Launcher: We could make beautiful I/O together, you and I.
+ * Guest: My, that's a big disk!
+ *
+ * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
+
+/*D:490 This is the interrupt handler, called when a block read or write has
+ * been completed for us. */
 static irqreturn_t lgb_irq(int irq, void *_bd)
 {
+	/* We handed our "struct blockdev" as the argument to request_irq(), so
+	 * it is passed through to us here. This tells us which device we're
+	 * dealing with in case we have more than one. */
 	struct blockdev *bd = _bd;
 	unsigned long flags;
 
+	/* We weren't doing anything? Strange, but could happen if we shared
+	 * interrupts (we don't!). */
 	if (!bd->req) {
 		pr_debug("No work!\n");
 		return IRQ_NONE;
 	}
 
+	/* Not done yet? That's equally strange. */
 	if (!bd->lb_page->result) {
 		pr_debug("No result!\n");
 		return IRQ_NONE;
 	}
 
+	/* We have to grab the lock before ending the request. */
 	spin_lock_irqsave(&bd->lock, flags);
+	/* "result" is 1 for success, 2 for failure: end_entire_request() wants
+	 * to know whether this succeeded or not. */
 	end_entire_request(bd->req, bd->lb_page->result == 1);
+	/* Clear out request, it's done. */
 	bd->req = NULL;
+	/* Reset incoming DMA for next time. */
 	bd->dma.used_len = 0;
+	/* Ready for more reads or writes */
 	blk_start_queue(bd->disk->queue);
 	spin_unlock_irqrestore(&bd->lock, flags);
 
+	/* The interrupt was for us, we dealt with it. */
 	return IRQ_HANDLED;
 }
 
+/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
+ * each of which contains "struct bio_vec"s, each of which contains a page, an
+ * offset and a length.
+ *
+ * Fortunately there are iterators to help us walk through the "struct
+ * request". Even more fortunately, there were plenty of places to steal the
+ * code from. We pack the "struct request" into our "struct lguest_dma" and
+ * return the total length. */
 static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
 {
 	unsigned int i = 0, idx, len = 0;
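
For readers new to the block layer, here is a self-contained model of the request -> bio -> bio_vec nesting that req_to_dma() walks in the next hunk. The struct names are invented stand-ins; the real iterators are rq_for_each_bio() and bio_for_each_segment():

/* Invented stand-ins for the block layer's request -> bio -> bio_vec
 * nesting. */
#include <stdio.h>

struct seg { unsigned int offset, len; };	/* like struct bio_vec */
struct model_bio { int nsegs; struct seg segs[4]; };
struct model_req { int nbios; struct model_bio bios[2]; };

int main(void)
{
	struct model_req req = {
		.nbios = 2,
		.bios = {
			{ .nsegs = 2, .segs = { { 0, 512 }, { 512, 512 } } },
			{ .nsegs = 1, .segs = { { 0, 1024 } } },
		},
	};
	unsigned int len = 0;

	for (int b = 0; b < req.nbios; b++)		/* rq_for_each_bio */
		for (int s = 0; s < req.bios[b].nsegs; s++) /* ..segment */
			len += req.bios[b].segs[s].len;

	printf("total %u bytes in %d bios\n", len, req.nbios); /* 2048 */
	return 0;
}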
@@ -87,8 +148,13 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
 	rq_for_each_bio(bio, req) {
 		struct bio_vec *bvec;
 		bio_for_each_segment(bvec, bio, idx) {
+			/* We told the block layer not to give us too many. */
 			BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
+			/* If we had a zero-length segment, it would look like
+			 * the end of the data referred to by the "struct
+			 * lguest_dma", so make sure that doesn't happen. */
 			BUG_ON(!bvec->bv_len);
+			/* Convert page & offset to a physical address */
 			dma->addr[i] = page_to_phys(bvec->bv_page)
 				+ bvec->bv_offset;
 			dma->len[i] = bvec->bv_len;
@@ -96,26 +162,39 @@ static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
 			i++;
 		}
 	}
+	/* If the array isn't full, we mark the end with a 0 length */
 	if (i < LGUEST_MAX_DMA_SECTIONS)
 		dma->len[i] = 0;
 	return len;
 }
 
+/* This creates an empty DMA, useful for prodding the Host without sending data
+ * (ie. when we want to do a read) */
 static void empty_dma(struct lguest_dma *dma)
 {
 	dma->len[0] = 0;
 }
 
+/*D:470 Setting up a request is fairly easy: */
 static void setup_req(struct blockdev *bd,
 		      int type, struct request *req, struct lguest_dma *dma)
 {
+	/* The type is 1 (write) or 0 (read). */
 	bd->lb_page->type = type;
+	/* The sector on disk where the read or write starts. */
 	bd->lb_page->sector = req->sector;
+	/* The result is initialized to 0 (unfinished). */
 	bd->lb_page->result = 0;
+	/* The current request (so we can end it in the interrupt handler). */
 	bd->req = req;
+	/* The number of bytes: returned as a side-effect of req_to_dma(),
+	 * which packs the block layer's "struct request" into our "struct
+	 * lguest_dma" */
 	bd->lb_page->bytes = req_to_dma(req, dma);
 }
 
+/*D:450 Write is pretty straightforward: we pack the request into a "struct
+ * lguest_dma", then use SEND_DMA to send the request. */
 static void do_write(struct blockdev *bd, struct request *req)
 {
 	struct lguest_dma send;
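
The two BUG_ON()s in req_to_dma() and the trailing dma->len[i] = 0 all protect one convention: a zero-length entry marks the end of the array, which is also the only thing empty_dma() writes. A minimal sketch of that convention, with invented names:

/* Model of the zero-length end-marker convention in "struct lguest_dma":
 * a walker stops at the first len == 0, so a zero-length segment in the
 * middle would silently truncate the request (hence the BUG_ON above).
 * All names here are invented. */
#include <stdio.h>

#define MAX_SECTIONS 16	/* stand-in for LGUEST_MAX_DMA_SECTIONS */

struct model_dma {
	unsigned long addr[MAX_SECTIONS];
	unsigned short len[MAX_SECTIONS];
};

static unsigned long total_len(const struct model_dma *dma)
{
	unsigned long sum = 0;

	for (int i = 0; i < MAX_SECTIONS && dma->len[i]; i++)
		sum += dma->len[i];
	return sum;
}

int main(void)
{
	/* Two segments, then the 0-length marker (as req_to_dma() writes). */
	struct model_dma dma = {
		.addr = { 0x1000, 0x2000 },
		.len  = { 512, 1024, 0 },
	};

	printf("packed %lu bytes\n", total_len(&dma)); /* 1536 */
	return 0;
}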
@@ -126,6 +205,9 @@ static void do_write(struct blockdev *bd, struct request *req)
 	lguest_send_dma(bd->phys_addr, &send);
 }
 
+/* Read is similar to write, except we pack the request into our receive
+ * "struct lguest_dma" and send through an empty DMA just to tell the Host that
+ * there's a request pending. */
 static void do_read(struct blockdev *bd, struct request *req)
 {
 	struct lguest_dma ping;
@@ -137,21 +219,30 @@ static void do_read(struct blockdev *bd, struct request *req)
 	lguest_send_dma(bd->phys_addr, &ping);
 }
 
+/*D:440 This is where requests come in: we get handed the request queue and are
+ * expected to pull a "struct request" off it until we've finished them or
+ * we're waiting for a reply: */
 static void do_lgb_request(struct request_queue *q)
 {
 	struct blockdev *bd;
 	struct request *req;
 
 again:
+	/* This sometimes returns NULL even on the very first time around. I
+	 * wonder if it's something to do with letting elves handle the request
+	 * queue... */
 	req = elv_next_request(q);
 	if (!req)
 		return;
 
+	/* We attached the struct blockdev to the disk: get it back */
 	bd = req->rq_disk->private_data;
-	/* Sometimes we get repeated requests after blk_stop_queue. */
+	/* Sometimes we get repeated requests after blk_stop_queue(), but we
+	 * can only handle one at a time. */
 	if (bd->req)
 		return;
 
+	/* We only do reads and writes: no tricky business! */
 	if (!blk_fs_request(req)) {
 		pr_debug("Got non-command 0x%08x\n", req->cmd_type);
 		req->errors++;
@@ -164,20 +255,31 @@ again:
 	else
 		do_read(bd, req);
 
-	/* Wait for interrupt to tell us it's done. */
+	/* We've put out the request, so stop any more coming in until we get
+	 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
 	blk_stop_queue(q);
 }
 
+/*D:430 This is the "struct block_device_operations" we attach to the disk at
+ * the end of lguestblk_probe(). It doesn't seem to want much. */
 static struct block_device_operations lguestblk_fops = {
 	.owner = THIS_MODULE,
 };
 
+/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
+ * quite why. I do know that the IDE code sent two or three of the maintainers
+ * insane, perhaps this is the fringe of the same disease?
+ *
+ * As in the console code, the probe function gets handed the generic
+ * lguest_device from lguest_bus.c: */
 static int lguestblk_probe(struct lguest_device *lgdev)
 {
 	struct blockdev *bd;
 	int err;
 	int irqflags = IRQF_SHARED;
 
+	/* First we allocate our own "struct blockdev" and initialize the easy
+	 * fields. */
 	bd = kmalloc(sizeof(*bd), GFP_KERNEL);
 	if (!bd)
 		return -ENOMEM;
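
Taken together, blk_stop_queue() in do_lgb_request() and blk_start_queue() in lgb_irq() form the driver's whole one-request-at-a-time state machine. A toy model of that handshake (invented names, no locking):

/* Toy single-outstanding-request handshake modelled on the
 * blk_stop_queue()/blk_start_queue() pairing in this driver. */
#include <stdio.h>
#include <stdbool.h>

static bool queue_stopped;
static int outstanding = -1;	/* -1 means no request in flight */

static void submit(int req)
{
	if (queue_stopped || outstanding >= 0)
		return;		/* one at a time, like bd->req */
	outstanding = req;
	queue_stopped = true;	/* blk_stop_queue() */
	printf("submitted request %d\n", req);
}

static void irq_done(void)
{
	printf("finished request %d\n", outstanding);
	outstanding = -1;	/* bd->req = NULL */
	queue_stopped = false;	/* blk_start_queue() */
}

int main(void)
{
	submit(1);
	submit(2);	/* dropped: still waiting for the interrupt */
	irq_done();
	submit(2);	/* now it goes through */
	irq_done();
	return 0;
}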
@@ -187,59 +289,100 @@ static int lguestblk_probe(struct lguest_device *lgdev)
 	bd->req = NULL;
 	bd->dma.used_len = 0;
 	bd->dma.len[0] = 0;
+	/* The descriptor in the lguest_devices array provided by the Host
+	 * gives the Guest the physical page number of the device's page. */
 	bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
 
+	/* We use lguest_map() to get a pointer to the device page */
 	bd->lb_page = lguest_map(bd->phys_addr, 1);
 	if (!bd->lb_page) {
 		err = -ENOMEM;
 		goto out_free_bd;
 	}
 
+	/* We need a major device number: 0 means "assign one dynamically". */
 	bd->major = register_blkdev(0, "lguestblk");
 	if (bd->major < 0) {
 		err = bd->major;
 		goto out_unmap;
 	}
 
+	/* This allocates a "struct gendisk" where we pack all the information
+	 * about the disk which the rest of Linux sees. We ask for one minor
+	 * number; I do wonder if we should be asking for more. */
 	bd->disk = alloc_disk(1);
 	if (!bd->disk) {
 		err = -ENOMEM;
 		goto out_unregister_blkdev;
 	}
 
+	/* Every disk needs a queue for requests to come in: we set up the
+	 * queue with a callback function (the core of our driver) and the lock
+	 * to use. */
 	bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
 	if (!bd->disk->queue) {
 		err = -ENOMEM;
 		goto out_put_disk;
 	}
 
-	/* We can only handle a certain number of sg entries */
+	/* We can only handle a certain number of pointers in our SEND_DMA
+	 * call, so we set that with blk_queue_max_hw_segments(). This is not
+	 * to be confused with blk_queue_max_phys_segments() of course! I
+	 * know, who could possibly confuse the two?
+	 *
+	 * Well, it's simple to tell them apart: this one seems to work and the
+	 * other one didn't. */
 	blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
-	/* Buffers must not cross page boundaries */
+
+	/* Due to technical limitations of our Host (and simple coding) we
+	 * can't have a single buffer which crosses a page boundary. Tell it
+	 * here. This means that our maximum request size is 16
+	 * (LGUEST_MAX_DMA_SECTIONS) pages. */
 	blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
 
+	/* We name our disk: this becomes the device name when udev does its
+	 * magic thing and creates the device node, such as /dev/lgba.
+	 * next_block_index is a global which starts at 'a'. Unfortunately
+	 * this simple increment logic means that the 27th disk will be called
+	 * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
+	 * your /dev directory will be balanced. */
 	sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
+
+	/* We look to the device descriptor again to see if this device's
+	 * interrupts are expected to be random. If they are, we tell the irq
+	 * subsystem. At the moment this bit is always set. */
 	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
 		irqflags |= IRQF_SAMPLE_RANDOM;
+
+	/* Now we have the name and irqflags, we can request the interrupt; we
+	 * give it the "struct blockdev" we have set up to pass to lgb_irq()
+	 * when there is an interrupt. */
 	err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
 	if (err)
 		goto out_cleanup_queue;
 
+	/* We bind our one-entry DMA pool to the key for this block device so
+	 * the Host can reply to our requests. The key is equal to the
+	 * physical address of the device's page, which is conveniently
+	 * unique. */
 	err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
 	if (err)
 		goto out_free_irq;
 
+	/* We finish our disk initialization and add the disk to the system. */
 	bd->disk->major = bd->major;
 	bd->disk->first_minor = 0;
 	bd->disk->private_data = bd;
 	bd->disk->fops = &lguestblk_fops;
-	/* This is initialized to the disk size by the other end. */
+	/* This is initialized to the disk size by the Launcher. */
 	set_capacity(bd->disk, bd->lb_page->num_sectors);
 	add_disk(bd->disk);
 
 	printk(KERN_INFO "%s: device %i at major %d\n",
 	       bd->disk->disk_name, lgdev->index, bd->major);
 
+	/* We don't need to keep the "struct blockdev" around, but if we ever
+	 * implemented device removal, we'd need this. */
 	lgdev->private = bd;
 	return 0;
 
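
The "/dev/lgb{" joke above is plain ASCII arithmetic: 'z' + 1 is '{'. A quick check:

/* Why the 27th disk is "/dev/lgb{": the increment just walks the ASCII
 * table past 'z'. */
#include <stdio.h>

int main(void)
{
	char next_block_index = 'a';

	for (int i = 1; i <= 27; i++)
		printf("disk %2d -> /dev/lgb%c\n", i, next_block_index++);
	return 0;	/* disk 27 prints /dev/lgb{ */
}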
@@ -258,6 +401,8 @@ out_free_bd:
 	return err;
 }
 
+/*D:410 The boilerplate code for registering the lguest block driver is just
+ * like the console: */
 static struct lguest_driver lguestblk_drv = {
 	.name = "lguestblk",
 	.owner = THIS_MODULE,