aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeremy Fitzhardinge <jeremy@xensource.com>2007-07-17 21:37:06 -0400
committerJeremy Fitzhardinge <jeremy@goop.org>2007-07-18 11:47:45 -0400
commit9f27ee595038653ddf8bca871200d39247d6f4fc (patch)
tree602b4fe83ccfd6cb65b146ed80e84baec0fb6cb7
parent4bac07c993d03434ea902d3d4290d9e45944b66c (diff)
xen: add virtual block device driver.
The block device frontend driver allows the kernel to access block devices exported by a virtual machine containing a physical block device driver. Signed-off-by: Ian Pratt <ian.pratt@xensource.com> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Cc: Arjan van de Ven <arjan@infradead.org> Cc: Greg KH <greg@kroah.com> Cc: Jens Axboe <axboe@kernel.dk>
-rw-r--r--drivers/block/Kconfig9
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/xen-blkfront.c988
-rw-r--r--include/linux/major.h2
4 files changed, 1000 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8f65b88cf711..a4a311992408 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -427,4 +427,13 @@ config XILINX_SYSACE
427 help 427 help
428 Include support for the Xilinx SystemACE CompactFlash interface 428 Include support for the Xilinx SystemACE CompactFlash interface
429 429
430config XEN_BLKDEV_FRONTEND
431 tristate "Xen virtual block device support"
432 depends on XEN
433 default y
434 help
435 This driver implements the front-end of the Xen virtual
436 block device driver. It communicates with a back-end driver
437 in another domain which drives the actual block device.
438
430endif # BLK_DEV 439endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9ee08ab4ffa8..3e31532df0ed 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
30obj-$(CONFIG_BLK_DEV_UB) += ub.o 30obj-$(CONFIG_BLK_DEV_UB) += ub.o
31 31
32obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000000..6746c29181f8
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,988 @@
1/*
2 * blkfront.c
3 *
4 * XenLinux virtual block device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE.
36 */
37
38#include <linux/interrupt.h>
39#include <linux/blkdev.h>
40#include <linux/module.h>
41
42#include <xen/xenbus.h>
43#include <xen/grant_table.h>
44#include <xen/events.h>
45#include <xen/page.h>
46
47#include <xen/interface/grant_table.h>
48#include <xen/interface/io/blkif.h>
49
50#include <asm/xen/hypervisor.h>
51
/* Connection state of a frontend device, stored in blkfront_info.connected. */
enum blkif_state {
	BLKIF_STATE_DISCONNECTED = 0,	/* no usable ring to the backend */
	BLKIF_STATE_CONNECTED = 1,	/* ring is live; requests may be issued */
	BLKIF_STATE_SUSPENDED = 2,	/* torn down while connected; see blkif_free() */
};
57
58struct blk_shadow {
59 struct blkif_request req;
60 unsigned long request;
61 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
62};
63
64static struct block_device_operations xlvbd_block_fops;
65
66#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
67
68/*
69 * We have one of these per vbd, whether ide, scsi or 'other'. They
70 * hang in private_data off the gendisk structure. We may end up
71 * putting all kinds of interesting stuff here :-)
72 */
73struct blkfront_info
74{
75 struct xenbus_device *xbdev;
76 dev_t dev;
77 struct gendisk *gd;
78 int vdevice;
79 blkif_vdev_t handle;
80 enum blkif_state connected;
81 int ring_ref;
82 struct blkif_front_ring ring;
83 unsigned int evtchn, irq;
84 struct request_queue *rq;
85 struct work_struct work;
86 struct gnttab_free_callback callback;
87 struct blk_shadow shadow[BLK_RING_SIZE];
88 unsigned long shadow_free;
89 int feature_barrier;
90
91 /**
92 * The number of people holding this device open. We won't allow a
93 * hot-unplug unless this is 0.
94 */
95 int users;
96};
97
98static DEFINE_SPINLOCK(blkif_io_lock);
99
/* Segment count when every ring slot carries a maximally sized request. */
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)

/* Value stored in ring_ref while no grant reference is held. */
#define GRANT_INVALID_REF	0

/* Minors reserved for each whole disk. */
#define PARTS_PER_DISK		16

/* Split a virtual device number into its major and minor bytes. */
#define BLKIF_MAJOR(dev)	((dev)>>8)
#define BLKIF_MINOR(dev)	((dev) & 0xff)

/* Name prefix used in /dev. */
#define DEV_NAME	"xvd"

/* Information about our VBDs. */
#define MAX_VBDS	64
113static LIST_HEAD(vbds_list);
114
115static int get_id_from_freelist(struct blkfront_info *info)
116{
117 unsigned long free = info->shadow_free;
118 BUG_ON(free > BLK_RING_SIZE);
119 info->shadow_free = info->shadow[free].req.id;
120 info->shadow[free].req.id = 0x0fffffee; /* debug */
121 return free;
122}
123
124static void add_id_to_freelist(struct blkfront_info *info,
125 unsigned long id)
126{
127 info->shadow[id].req.id = info->shadow_free;
128 info->shadow[id].request = 0;
129 info->shadow_free = id;
130}
131
132static void blkif_restart_queue_callback(void *arg)
133{
134 struct blkfront_info *info = (struct blkfront_info *)arg;
135 schedule_work(&info->work);
136}
137
138/*
139 * blkif_queue_request
140 *
141 * request block io
142 *
143 * id: for guest use only.
144 * operation: BLKIF_OP_{READ,WRITE,PROBE}
145 * buffer: buffer to read/write into. this should be a
146 * virtual address in the guest os.
147 */
148static int blkif_queue_request(struct request *req)
149{
150 struct blkfront_info *info = req->rq_disk->private_data;
151 unsigned long buffer_mfn;
152 struct blkif_request *ring_req;
153 struct bio *bio;
154 struct bio_vec *bvec;
155 int idx;
156 unsigned long id;
157 unsigned int fsect, lsect;
158 int ref;
159 grant_ref_t gref_head;
160
161 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
162 return 1;
163
164 if (gnttab_alloc_grant_references(
165 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
166 gnttab_request_free_callback(
167 &info->callback,
168 blkif_restart_queue_callback,
169 info,
170 BLKIF_MAX_SEGMENTS_PER_REQUEST);
171 return 1;
172 }
173
174 /* Fill out a communications ring structure. */
175 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
176 id = get_id_from_freelist(info);
177 info->shadow[id].request = (unsigned long)req;
178
179 ring_req->id = id;
180 ring_req->sector_number = (blkif_sector_t)req->sector;
181 ring_req->handle = info->handle;
182
183 ring_req->operation = rq_data_dir(req) ?
184 BLKIF_OP_WRITE : BLKIF_OP_READ;
185 if (blk_barrier_rq(req))
186 ring_req->operation = BLKIF_OP_WRITE_BARRIER;
187
188 ring_req->nr_segments = 0;
189 rq_for_each_bio (bio, req) {
190 bio_for_each_segment (bvec, bio, idx) {
191 BUG_ON(ring_req->nr_segments
192 == BLKIF_MAX_SEGMENTS_PER_REQUEST);
193 buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
194 fsect = bvec->bv_offset >> 9;
195 lsect = fsect + (bvec->bv_len >> 9) - 1;
196 /* install a grant reference. */
197 ref = gnttab_claim_grant_reference(&gref_head);
198 BUG_ON(ref == -ENOSPC);
199
200 gnttab_grant_foreign_access_ref(
201 ref,
202 info->xbdev->otherend_id,
203 buffer_mfn,
204 rq_data_dir(req) );
205
206 info->shadow[id].frame[ring_req->nr_segments] =
207 mfn_to_pfn(buffer_mfn);
208
209 ring_req->seg[ring_req->nr_segments] =
210 (struct blkif_request_segment) {
211 .gref = ref,
212 .first_sect = fsect,
213 .last_sect = lsect };
214
215 ring_req->nr_segments++;
216 }
217 }
218
219 info->ring.req_prod_pvt++;
220
221 /* Keep a private copy so we can reissue requests when recovering. */
222 info->shadow[id].req = *ring_req;
223
224 gnttab_free_grant_references(gref_head);
225
226 return 0;
227}
228
229
230static inline void flush_requests(struct blkfront_info *info)
231{
232 int notify;
233
234 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
235
236 if (notify)
237 notify_remote_via_irq(info->irq);
238}
239
240/*
241 * do_blkif_request
242 * read a block; request is in a request queue
243 */
244static void do_blkif_request(request_queue_t *rq)
245{
246 struct blkfront_info *info = NULL;
247 struct request *req;
248 int queued;
249
250 pr_debug("Entered do_blkif_request\n");
251
252 queued = 0;
253
254 while ((req = elv_next_request(rq)) != NULL) {
255 info = req->rq_disk->private_data;
256 if (!blk_fs_request(req)) {
257 end_request(req, 0);
258 continue;
259 }
260
261 if (RING_FULL(&info->ring))
262 goto wait;
263
264 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
265 "(%u/%li) buffer:%p [%s]\n",
266 req, req->cmd, (unsigned long)req->sector,
267 req->current_nr_sectors,
268 req->nr_sectors, req->buffer,
269 rq_data_dir(req) ? "write" : "read");
270
271
272 blkdev_dequeue_request(req);
273 if (blkif_queue_request(req)) {
274 blk_requeue_request(rq, req);
275wait:
276 /* Avoid pointless unplugs. */
277 blk_stop_queue(rq);
278 break;
279 }
280
281 queued++;
282 }
283
284 if (queued != 0)
285 flush_requests(info);
286}
287
288static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
289{
290 request_queue_t *rq;
291
292 rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
293 if (rq == NULL)
294 return -1;
295
296 elevator_init(rq, "noop");
297
298 /* Hard sector size and max sectors impersonate the equiv. hardware. */
299 blk_queue_hardsect_size(rq, sector_size);
300 blk_queue_max_sectors(rq, 512);
301
302 /* Each segment in a request is up to an aligned page in size. */
303 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
304 blk_queue_max_segment_size(rq, PAGE_SIZE);
305
306 /* Ensure a merged request will fit in a single I/O ring slot. */
307 blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
308 blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
309
310 /* Make sure buffer addresses are sector-aligned. */
311 blk_queue_dma_alignment(rq, 511);
312
313 gd->queue = rq;
314
315 return 0;
316}
317
318
319static int xlvbd_barrier(struct blkfront_info *info)
320{
321 int err;
322
323 err = blk_queue_ordered(info->rq,
324 info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
325 NULL);
326
327 if (err)
328 return err;
329
330 printk(KERN_INFO "blkfront: %s: barriers %s\n",
331 info->gd->disk_name,
332 info->feature_barrier ? "enabled" : "disabled");
333 return 0;
334}
335
336
337static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
338 int vdevice, u16 vdisk_info, u16 sector_size,
339 struct blkfront_info *info)
340{
341 struct gendisk *gd;
342 int nr_minors = 1;
343 int err = -ENODEV;
344
345 BUG_ON(info->gd != NULL);
346 BUG_ON(info->rq != NULL);
347
348 if ((minor % PARTS_PER_DISK) == 0)
349 nr_minors = PARTS_PER_DISK;
350
351 gd = alloc_disk(nr_minors);
352 if (gd == NULL)
353 goto out;
354
355 if (nr_minors > 1)
356 sprintf(gd->disk_name, "%s%c", DEV_NAME,
357 'a' + minor / PARTS_PER_DISK);
358 else
359 sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
360 'a' + minor / PARTS_PER_DISK,
361 minor % PARTS_PER_DISK);
362
363 gd->major = XENVBD_MAJOR;
364 gd->first_minor = minor;
365 gd->fops = &xlvbd_block_fops;
366 gd->private_data = info;
367 gd->driverfs_dev = &(info->xbdev->dev);
368 set_capacity(gd, capacity);
369
370 if (xlvbd_init_blk_queue(gd, sector_size)) {
371 del_gendisk(gd);
372 goto out;
373 }
374
375 info->rq = gd->queue;
376 info->gd = gd;
377
378 if (info->feature_barrier)
379 xlvbd_barrier(info);
380
381 if (vdisk_info & VDISK_READONLY)
382 set_disk_ro(gd, 1);
383
384 if (vdisk_info & VDISK_REMOVABLE)
385 gd->flags |= GENHD_FL_REMOVABLE;
386
387 if (vdisk_info & VDISK_CDROM)
388 gd->flags |= GENHD_FL_CD;
389
390 return 0;
391
392 out:
393 return err;
394}
395
396static void kick_pending_request_queues(struct blkfront_info *info)
397{
398 if (!RING_FULL(&info->ring)) {
399 /* Re-enable calldowns. */
400 blk_start_queue(info->rq);
401 /* Kick things off immediately. */
402 do_blkif_request(info->rq);
403 }
404}
405
406static void blkif_restart_queue(struct work_struct *work)
407{
408 struct blkfront_info *info = container_of(work, struct blkfront_info, work);
409
410 spin_lock_irq(&blkif_io_lock);
411 if (info->connected == BLKIF_STATE_CONNECTED)
412 kick_pending_request_queues(info);
413 spin_unlock_irq(&blkif_io_lock);
414}
415
416static void blkif_free(struct blkfront_info *info, int suspend)
417{
418 /* Prevent new requests being issued until we fix things up. */
419 spin_lock_irq(&blkif_io_lock);
420 info->connected = suspend ?
421 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
422 /* No more blkif_request(). */
423 if (info->rq)
424 blk_stop_queue(info->rq);
425 /* No more gnttab callback work. */
426 gnttab_cancel_free_callback(&info->callback);
427 spin_unlock_irq(&blkif_io_lock);
428
429 /* Flush gnttab callback work. Must be done with no locks held. */
430 flush_scheduled_work();
431
432 /* Free resources associated with old device channel. */
433 if (info->ring_ref != GRANT_INVALID_REF) {
434 gnttab_end_foreign_access(info->ring_ref, 0,
435 (unsigned long)info->ring.sring);
436 info->ring_ref = GRANT_INVALID_REF;
437 info->ring.sring = NULL;
438 }
439 if (info->irq)
440 unbind_from_irqhandler(info->irq, info);
441 info->evtchn = info->irq = 0;
442
443}
444
445static void blkif_completion(struct blk_shadow *s)
446{
447 int i;
448 for (i = 0; i < s->req.nr_segments; i++)
449 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
450}
451
452static irqreturn_t blkif_interrupt(int irq, void *dev_id)
453{
454 struct request *req;
455 struct blkif_response *bret;
456 RING_IDX i, rp;
457 unsigned long flags;
458 struct blkfront_info *info = (struct blkfront_info *)dev_id;
459 int uptodate;
460
461 spin_lock_irqsave(&blkif_io_lock, flags);
462
463 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
464 spin_unlock_irqrestore(&blkif_io_lock, flags);
465 return IRQ_HANDLED;
466 }
467
468 again:
469 rp = info->ring.sring->rsp_prod;
470 rmb(); /* Ensure we see queued responses up to 'rp'. */
471
472 for (i = info->ring.rsp_cons; i != rp; i++) {
473 unsigned long id;
474 int ret;
475
476 bret = RING_GET_RESPONSE(&info->ring, i);
477 id = bret->id;
478 req = (struct request *)info->shadow[id].request;
479
480 blkif_completion(&info->shadow[id]);
481
482 add_id_to_freelist(info, id);
483
484 uptodate = (bret->status == BLKIF_RSP_OKAY);
485 switch (bret->operation) {
486 case BLKIF_OP_WRITE_BARRIER:
487 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
488 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
489 info->gd->disk_name);
490 uptodate = -EOPNOTSUPP;
491 info->feature_barrier = 0;
492 xlvbd_barrier(info);
493 }
494 /* fall through */
495 case BLKIF_OP_READ:
496 case BLKIF_OP_WRITE:
497 if (unlikely(bret->status != BLKIF_RSP_OKAY))
498 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
499 "request: %x\n", bret->status);
500
501 ret = end_that_request_first(req, uptodate,
502 req->hard_nr_sectors);
503 BUG_ON(ret);
504 end_that_request_last(req, uptodate);
505 break;
506 default:
507 BUG();
508 }
509 }
510
511 info->ring.rsp_cons = i;
512
513 if (i != info->ring.req_prod_pvt) {
514 int more_to_do;
515 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
516 if (more_to_do)
517 goto again;
518 } else
519 info->ring.sring->rsp_event = i + 1;
520
521 kick_pending_request_queues(info);
522
523 spin_unlock_irqrestore(&blkif_io_lock, flags);
524
525 return IRQ_HANDLED;
526}
527
528
529static int setup_blkring(struct xenbus_device *dev,
530 struct blkfront_info *info)
531{
532 struct blkif_sring *sring;
533 int err;
534
535 info->ring_ref = GRANT_INVALID_REF;
536
537 sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
538 if (!sring) {
539 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
540 return -ENOMEM;
541 }
542 SHARED_RING_INIT(sring);
543 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
544
545 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
546 if (err < 0) {
547 free_page((unsigned long)sring);
548 info->ring.sring = NULL;
549 goto fail;
550 }
551 info->ring_ref = err;
552
553 err = xenbus_alloc_evtchn(dev, &info->evtchn);
554 if (err)
555 goto fail;
556
557 err = bind_evtchn_to_irqhandler(info->evtchn,
558 blkif_interrupt,
559 IRQF_SAMPLE_RANDOM, "blkif", info);
560 if (err <= 0) {
561 xenbus_dev_fatal(dev, err,
562 "bind_evtchn_to_irqhandler failed");
563 goto fail;
564 }
565 info->irq = err;
566
567 return 0;
568fail:
569 blkif_free(info, 0);
570 return err;
571}
572
573
574/* Common code used when first setting up, and when resuming. */
575static int talk_to_backend(struct xenbus_device *dev,
576 struct blkfront_info *info)
577{
578 const char *message = NULL;
579 struct xenbus_transaction xbt;
580 int err;
581
582 /* Create shared ring, alloc event channel. */
583 err = setup_blkring(dev, info);
584 if (err)
585 goto out;
586
587again:
588 err = xenbus_transaction_start(&xbt);
589 if (err) {
590 xenbus_dev_fatal(dev, err, "starting transaction");
591 goto destroy_blkring;
592 }
593
594 err = xenbus_printf(xbt, dev->nodename,
595 "ring-ref", "%u", info->ring_ref);
596 if (err) {
597 message = "writing ring-ref";
598 goto abort_transaction;
599 }
600 err = xenbus_printf(xbt, dev->nodename,
601 "event-channel", "%u", info->evtchn);
602 if (err) {
603 message = "writing event-channel";
604 goto abort_transaction;
605 }
606
607 err = xenbus_transaction_end(xbt, 0);
608 if (err) {
609 if (err == -EAGAIN)
610 goto again;
611 xenbus_dev_fatal(dev, err, "completing transaction");
612 goto destroy_blkring;
613 }
614
615 xenbus_switch_state(dev, XenbusStateInitialised);
616
617 return 0;
618
619 abort_transaction:
620 xenbus_transaction_end(xbt, 1);
621 if (message)
622 xenbus_dev_fatal(dev, err, "%s", message);
623 destroy_blkring:
624 blkif_free(info, 0);
625 out:
626 return err;
627}
628
629
630/**
631 * Entry point to this code when a new device is created. Allocate the basic
632 * structures and the ring buffer for communication with the backend, and
633 * inform the backend of the appropriate details for those. Switch to
634 * Initialised state.
635 */
636static int blkfront_probe(struct xenbus_device *dev,
637 const struct xenbus_device_id *id)
638{
639 int err, vdevice, i;
640 struct blkfront_info *info;
641
642 /* FIXME: Use dynamic device id if this is not set. */
643 err = xenbus_scanf(XBT_NIL, dev->nodename,
644 "virtual-device", "%i", &vdevice);
645 if (err != 1) {
646 xenbus_dev_fatal(dev, err, "reading virtual-device");
647 return err;
648 }
649
650 info = kzalloc(sizeof(*info), GFP_KERNEL);
651 if (!info) {
652 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
653 return -ENOMEM;
654 }
655
656 info->xbdev = dev;
657 info->vdevice = vdevice;
658 info->connected = BLKIF_STATE_DISCONNECTED;
659 INIT_WORK(&info->work, blkif_restart_queue);
660
661 for (i = 0; i < BLK_RING_SIZE; i++)
662 info->shadow[i].req.id = i+1;
663 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
664
665 /* Front end dir is a number, which is used as the id. */
666 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
667 dev->dev.driver_data = info;
668
669 err = talk_to_backend(dev, info);
670 if (err) {
671 kfree(info);
672 dev->dev.driver_data = NULL;
673 return err;
674 }
675
676 return 0;
677}
678
679
680static int blkif_recover(struct blkfront_info *info)
681{
682 int i;
683 struct blkif_request *req;
684 struct blk_shadow *copy;
685 int j;
686
687 /* Stage 1: Make a safe copy of the shadow state. */
688 copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
689 if (!copy)
690 return -ENOMEM;
691 memcpy(copy, info->shadow, sizeof(info->shadow));
692
693 /* Stage 2: Set up free list. */
694 memset(&info->shadow, 0, sizeof(info->shadow));
695 for (i = 0; i < BLK_RING_SIZE; i++)
696 info->shadow[i].req.id = i+1;
697 info->shadow_free = info->ring.req_prod_pvt;
698 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
699
700 /* Stage 3: Find pending requests and requeue them. */
701 for (i = 0; i < BLK_RING_SIZE; i++) {
702 /* Not in use? */
703 if (copy[i].request == 0)
704 continue;
705
706 /* Grab a request slot and copy shadow state into it. */
707 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
708 *req = copy[i].req;
709
710 /* We get a new request id, and must reset the shadow state. */
711 req->id = get_id_from_freelist(info);
712 memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
713
714 /* Rewrite any grant references invalidated by susp/resume. */
715 for (j = 0; j < req->nr_segments; j++)
716 gnttab_grant_foreign_access_ref(
717 req->seg[j].gref,
718 info->xbdev->otherend_id,
719 pfn_to_mfn(info->shadow[req->id].frame[j]),
720 rq_data_dir(
721 (struct request *)
722 info->shadow[req->id].request));
723 info->shadow[req->id].req = *req;
724
725 info->ring.req_prod_pvt++;
726 }
727
728 kfree(copy);
729
730 xenbus_switch_state(info->xbdev, XenbusStateConnected);
731
732 spin_lock_irq(&blkif_io_lock);
733
734 /* Now safe for us to use the shared ring */
735 info->connected = BLKIF_STATE_CONNECTED;
736
737 /* Send off requeued requests */
738 flush_requests(info);
739
740 /* Kick any other new requests queued since we resumed */
741 kick_pending_request_queues(info);
742
743 spin_unlock_irq(&blkif_io_lock);
744
745 return 0;
746}
747
748/**
749 * We are reconnecting to the backend, due to a suspend/resume, or a backend
750 * driver restart. We tear down our blkif structure and recreate it, but
751 * leave the device-layer structures intact so that this is transparent to the
752 * rest of the kernel.
753 */
754static int blkfront_resume(struct xenbus_device *dev)
755{
756 struct blkfront_info *info = dev->dev.driver_data;
757 int err;
758
759 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
760
761 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
762
763 err = talk_to_backend(dev, info);
764 if (info->connected == BLKIF_STATE_SUSPENDED && !err)
765 err = blkif_recover(info);
766
767 return err;
768}
769
770
771/*
772 * Invoked when the backend is finally 'ready' (and has told produced
773 * the details about the physical device - #sectors, size, etc).
774 */
775static void blkfront_connect(struct blkfront_info *info)
776{
777 unsigned long long sectors;
778 unsigned long sector_size;
779 unsigned int binfo;
780 int err;
781
782 if ((info->connected == BLKIF_STATE_CONNECTED) ||
783 (info->connected == BLKIF_STATE_SUSPENDED) )
784 return;
785
786 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
787 __func__, info->xbdev->otherend);
788
789 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
790 "sectors", "%llu", &sectors,
791 "info", "%u", &binfo,
792 "sector-size", "%lu", &sector_size,
793 NULL);
794 if (err) {
795 xenbus_dev_fatal(info->xbdev, err,
796 "reading backend fields at %s",
797 info->xbdev->otherend);
798 return;
799 }
800
801 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
802 "feature-barrier", "%lu", &info->feature_barrier,
803 NULL);
804 if (err)
805 info->feature_barrier = 0;
806
807 err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
808 sectors, info->vdevice,
809 binfo, sector_size, info);
810 if (err) {
811 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
812 info->xbdev->otherend);
813 return;
814 }
815
816 xenbus_switch_state(info->xbdev, XenbusStateConnected);
817
818 /* Kick pending requests. */
819 spin_lock_irq(&blkif_io_lock);
820 info->connected = BLKIF_STATE_CONNECTED;
821 kick_pending_request_queues(info);
822 spin_unlock_irq(&blkif_io_lock);
823
824 add_disk(info->gd);
825}
826
827/**
828 * Handle the change of state of the backend to Closing. We must delete our
829 * device-layer structures now, to ensure that writes are flushed through to
830 * the backend. Once is this done, we can switch to Closed in
831 * acknowledgement.
832 */
833static void blkfront_closing(struct xenbus_device *dev)
834{
835 struct blkfront_info *info = dev->dev.driver_data;
836 unsigned long flags;
837
838 dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
839
840 if (info->rq == NULL)
841 goto out;
842
843 spin_lock_irqsave(&blkif_io_lock, flags);
844
845 del_gendisk(info->gd);
846
847 /* No more blkif_request(). */
848 blk_stop_queue(info->rq);
849
850 /* No more gnttab callback work. */
851 gnttab_cancel_free_callback(&info->callback);
852 spin_unlock_irqrestore(&blkif_io_lock, flags);
853
854 /* Flush gnttab callback work. Must be done with no locks held. */
855 flush_scheduled_work();
856
857 blk_cleanup_queue(info->rq);
858 info->rq = NULL;
859
860 out:
861 xenbus_frontend_closed(dev);
862}
863
864/**
865 * Callback received when the backend's state changes.
866 */
867static void backend_changed(struct xenbus_device *dev,
868 enum xenbus_state backend_state)
869{
870 struct blkfront_info *info = dev->dev.driver_data;
871 struct block_device *bd;
872
873 dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
874
875 switch (backend_state) {
876 case XenbusStateInitialising:
877 case XenbusStateInitWait:
878 case XenbusStateInitialised:
879 case XenbusStateUnknown:
880 case XenbusStateClosed:
881 break;
882
883 case XenbusStateConnected:
884 blkfront_connect(info);
885 break;
886
887 case XenbusStateClosing:
888 bd = bdget(info->dev);
889 if (bd == NULL)
890 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
891
892 mutex_lock(&bd->bd_mutex);
893 if (info->users > 0)
894 xenbus_dev_error(dev, -EBUSY,
895 "Device in use; refusing to close");
896 else
897 blkfront_closing(dev);
898 mutex_unlock(&bd->bd_mutex);
899 bdput(bd);
900 break;
901 }
902}
903
904static int blkfront_remove(struct xenbus_device *dev)
905{
906 struct blkfront_info *info = dev->dev.driver_data;
907
908 dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
909
910 blkif_free(info, 0);
911
912 kfree(info);
913
914 return 0;
915}
916
917static int blkif_open(struct inode *inode, struct file *filep)
918{
919 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
920 info->users++;
921 return 0;
922}
923
924static int blkif_release(struct inode *inode, struct file *filep)
925{
926 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
927 info->users--;
928 if (info->users == 0) {
929 /* Check whether we have been instructed to close. We will
930 have ignored this request initially, as the device was
931 still mounted. */
932 struct xenbus_device *dev = info->xbdev;
933 enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
934
935 if (state == XenbusStateClosing)
936 blkfront_closing(dev);
937 }
938 return 0;
939}
940
941static struct block_device_operations xlvbd_block_fops =
942{
943 .owner = THIS_MODULE,
944 .open = blkif_open,
945 .release = blkif_release,
946};
947
948
949static struct xenbus_device_id blkfront_ids[] = {
950 { "vbd" },
951 { "" }
952};
953
954static struct xenbus_driver blkfront = {
955 .name = "vbd",
956 .owner = THIS_MODULE,
957 .ids = blkfront_ids,
958 .probe = blkfront_probe,
959 .remove = blkfront_remove,
960 .resume = blkfront_resume,
961 .otherend_changed = backend_changed,
962};
963
964static int __init xlblk_init(void)
965{
966 if (!is_running_on_xen())
967 return -ENODEV;
968
969 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
970 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
971 XENVBD_MAJOR, DEV_NAME);
972 return -ENODEV;
973 }
974
975 return xenbus_register_frontend(&blkfront);
976}
977module_init(xlblk_init);
978
979
980static void xlblk_exit(void)
981{
982 return xenbus_unregister_driver(&blkfront);
983}
984module_exit(xlblk_exit);
985
986MODULE_DESCRIPTION("Xen virtual block device frontend");
987MODULE_LICENSE("GPL");
988MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
diff --git a/include/linux/major.h b/include/linux/major.h
index 7e7c9093919a..0cb98053537a 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -158,6 +158,8 @@
158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ 158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */
159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ 159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */
160 160
161#define XENVBD_MAJOR 202 /* Xen virtual block device */
162
161#define MSR_MAJOR 202 163#define MSR_MAJOR 202
162#define CPUID_MAJOR 203 164#define CPUID_MAJOR 203
163 165