author    Linus Torvalds <torvalds@linux-foundation.org>    2011-05-25 12:15:35 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-05-25 12:15:35 -0400
commit    929cfdd5d3bdc772aff32e5a3fb4e3894394aa75 (patch)
tree      f67202d079eaf1f8d65b2e1bfac70b768ae34bc4 /drivers/block
parent    798ce8f1cca29dcc3f4b55947f611f4ffb32ac2b (diff)
parent    a1c15c59feee36267c43142a41152fbf7402afb6 (diff)
Merge branch 'for-2.6.40/drivers' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.40/drivers' of git://git.kernel.dk/linux-2.6-block: (110 commits)
  loop: handle on-demand devices correctly
  loop: limit 'max_part' module param to DISK_MAX_PARTS
  drbd: fix warning
  drbd: fix warning
  drbd: Fix spelling
  drbd: fix schedule in atomic
  drbd: Take a more conservative approach when deciding max_bio_size
  drbd: Fixed state transitions after async outdate-peer-handler returned
  drbd: Disallow the peer_disk_state to be D_OUTDATED while connected
  drbd: Fix for the connection problems on high latency links
  drbd: fix potential activity log refcount imbalance in error path
  drbd: Only downgrade the disk state in case of disk failures
  drbd: fix disconnect/reconnect loop, if ping-timeout == ping-int
  drbd: fix potential distributed deadlock
  lru_cache.h: fix comments referring to ts_ instead of lc_
  drbd: Fix for application IO with the on-io-error=pass-on policy
  xen/p2m: Add EXPORT_SYMBOL_GPL to the M2P override functions.
  xen/p2m/m2p/gnttab: Support GNTMAP_host_map in the M2P override.
  xen/blkback: don't fail empty barrier requests
  xen/blkback: fix xenbus_transaction_start() hang caused by double xenbus_transaction_end()
  ...
Diffstat (limited to 'drivers/block')
-rw-r--r--   drivers/block/Kconfig                  21
-rw-r--r--   drivers/block/Makefile                  1
-rw-r--r--   drivers/block/cciss.c                 571
-rw-r--r--   drivers/block/cciss.h                  11
-rw-r--r--   drivers/block/cciss_cmd.h              11
-rw-r--r--   drivers/block/cciss_scsi.c             41
-rw-r--r--   drivers/block/cciss_scsi.h              4
-rw-r--r--   drivers/block/drbd/drbd_actlog.c        2
-rw-r--r--   drivers/block/drbd/drbd_bitmap.c        6
-rw-r--r--   drivers/block/drbd/drbd_int.h          19
-rw-r--r--   drivers/block/drbd/drbd_main.c         37
-rw-r--r--   drivers/block/drbd/drbd_nl.c          127
-rw-r--r--   drivers/block/drbd/drbd_receiver.c     68
-rw-r--r--   drivers/block/drbd/drbd_req.c          20
-rw-r--r--   drivers/block/drbd/drbd_req.h           5
-rw-r--r--   drivers/block/drbd/drbd_worker.c       98
-rw-r--r--   drivers/block/loop.c                   11
-rw-r--r--   drivers/block/xen-blkback/Makefile      3
-rw-r--r--   drivers/block/xen-blkback/blkback.c   824
-rw-r--r--   drivers/block/xen-blkback/common.h    233
-rw-r--r--   drivers/block/xen-blkback/xenbus.c    768
-rw-r--r--   drivers/block/xen-blkfront.c           51
22 files changed, 2637 insertions, 295 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 83c32cb72582..717d6e4e18d3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -470,6 +470,27 @@ config XEN_BLKDEV_FRONTEND
470 block device driver. It communicates with a back-end driver 470 block device driver. It communicates with a back-end driver
471 in another domain which drives the actual block device. 471 in another domain which drives the actual block device.
472 472
473config XEN_BLKDEV_BACKEND
474 tristate "Block-device backend driver"
475 depends on XEN_BACKEND
476 help
477 The block-device backend driver allows the kernel to export its
478 block devices to other guests via a high-performance shared-memory
479 interface.
480
481 The corresponding Linux frontend driver is enabled by the
482 CONFIG_XEN_BLKDEV_FRONTEND configuration option.
483
484 The backend driver attaches itself to any block device specified
485 in the XenBus configuration. There are no limits to what the block
486 device can be, as long as it has a major and minor number.
487
488 If you are compiling a kernel to run in a Xen block backend driver
489 domain (often this is domain 0) you should say Y here. To
490 compile this driver as a module, choose M here: the module
491 will be called xen-blkback.
492
493
473config VIRTIO_BLK 494config VIRTIO_BLK
474 tristate "Virtio block driver (EXPERIMENTAL)" 495 tristate "Virtio block driver (EXPERIMENTAL)"
475 depends on EXPERIMENTAL && VIRTIO 496 depends on EXPERIMENTAL && VIRTIO
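The help text above notes the option is a tristate: built in with Y, or built as the xen-blkback module with M. As a hedged illustration (not part of this patch; the macro below is made up for this note), other kernel code can test for either form of the option, since Kconfig defines CONFIG_XEN_BLKDEV_BACKEND for =y builds and CONFIG_XEN_BLKDEV_BACKEND_MODULE for =m builds:

/* Illustrative only: a compile-time check that covers both =y and =m. */
#if defined(CONFIG_XEN_BLKDEV_BACKEND) || defined(CONFIG_XEN_BLKDEV_BACKEND_MODULE)
#define HAVE_XEN_BLKBACK 1
#else
#define HAVE_XEN_BLKBACK 0
#endif
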
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 40528ba56d1b..76646e9a1c91 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
36obj-$(CONFIG_BLK_DEV_HD) += hd.o 36obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ 40obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
40obj-$(CONFIG_BLK_DEV_RBD) += rbd.o 41obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
41 42
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9bf13988f1a2..8f4ef656a1af 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -64,6 +64,10 @@ MODULE_DESCRIPTION("Driver for HP Smart Array Controllers");
64MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers"); 64MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
65MODULE_VERSION("3.6.26"); 65MODULE_VERSION("3.6.26");
66MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67static int cciss_tape_cmds = 6;
68module_param(cciss_tape_cmds, int, 0644);
69MODULE_PARM_DESC(cciss_tape_cmds,
70 "number of commands to allocate for tape devices (default: 6)");
67 71
68static DEFINE_MUTEX(cciss_mutex); 72static DEFINE_MUTEX(cciss_mutex);
69static struct proc_dir_entry *proc_cciss; 73static struct proc_dir_entry *proc_cciss;
@@ -194,6 +198,8 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev,
194static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, 198static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
195 unsigned long *memory_bar); 199 unsigned long *memory_bar);
196static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag); 200static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
201static __devinit int write_driver_ver_to_cfgtable(
202 CfgTable_struct __iomem *cfgtable);
197 203
198/* performant mode helper functions */ 204/* performant mode helper functions */
199static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, 205static void calc_bucket_map(int *bucket, int num_buckets, int nsgs,
@@ -556,7 +562,7 @@ static void __devinit cciss_procinit(ctlr_info_t *h)
556#define to_hba(n) container_of(n, struct ctlr_info, dev) 562#define to_hba(n) container_of(n, struct ctlr_info, dev)
557#define to_drv(n) container_of(n, drive_info_struct, dev) 563#define to_drv(n) container_of(n, drive_info_struct, dev)
558 564
559/* List of controllers which cannot be reset on kexec with reset_devices */ 565/* List of controllers which cannot be hard reset on kexec with reset_devices */
560static u32 unresettable_controller[] = { 566static u32 unresettable_controller[] = {
561 0x324a103C, /* Smart Array P712m */ 567 0x324a103C, /* Smart Array P712m */
562 0x324b103C, /* SmartArray P711m */ 568 0x324b103C, /* SmartArray P711m */
@@ -574,23 +580,45 @@ static u32 unresettable_controller[] = {
574 0x409D0E11, /* Smart Array 6400 EM */ 580 0x409D0E11, /* Smart Array 6400 EM */
575}; 581};
576 582
577static int ctlr_is_resettable(struct ctlr_info *h) 583/* List of controllers which cannot even be soft reset */
584static u32 soft_unresettable_controller[] = {
585 0x409C0E11, /* Smart Array 6400 */
586 0x409D0E11, /* Smart Array 6400 EM */
587};
588
589static int ctlr_is_hard_resettable(u32 board_id)
578{ 590{
579 int i; 591 int i;
580 592
581 for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++) 593 for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
582 if (unresettable_controller[i] == h->board_id) 594 if (unresettable_controller[i] == board_id)
583 return 0; 595 return 0;
584 return 1; 596 return 1;
585} 597}
586 598
599static int ctlr_is_soft_resettable(u32 board_id)
600{
601 int i;
602
603 for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++)
604 if (soft_unresettable_controller[i] == board_id)
605 return 0;
606 return 1;
607}
608
609static int ctlr_is_resettable(u32 board_id)
610{
611 return ctlr_is_hard_resettable(board_id) ||
612 ctlr_is_soft_resettable(board_id);
613}
614
587static ssize_t host_show_resettable(struct device *dev, 615static ssize_t host_show_resettable(struct device *dev,
588 struct device_attribute *attr, 616 struct device_attribute *attr,
589 char *buf) 617 char *buf)
590{ 618{
591 struct ctlr_info *h = to_hba(dev); 619 struct ctlr_info *h = to_hba(dev);
592 620
593 return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h)); 621 return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id));
594} 622}
595static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL); 623static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
596 624
@@ -2567,7 +2595,7 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2567 } 2595 }
2568 } else if (cmd_type == TYPE_MSG) { 2596 } else if (cmd_type == TYPE_MSG) {
2569 switch (cmd) { 2597 switch (cmd) {
2570 case 0: /* ABORT message */ 2598 case CCISS_ABORT_MSG:
2571 c->Request.CDBLen = 12; 2599 c->Request.CDBLen = 12;
2572 c->Request.Type.Attribute = ATTR_SIMPLE; 2600 c->Request.Type.Attribute = ATTR_SIMPLE;
2573 c->Request.Type.Direction = XFER_WRITE; 2601 c->Request.Type.Direction = XFER_WRITE;
@@ -2577,16 +2605,16 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2577 /* buff contains the tag of the command to abort */ 2605 /* buff contains the tag of the command to abort */
2578 memcpy(&c->Request.CDB[4], buff, 8); 2606 memcpy(&c->Request.CDB[4], buff, 8);
2579 break; 2607 break;
2580 case 1: /* RESET message */ 2608 case CCISS_RESET_MSG:
2581 c->Request.CDBLen = 16; 2609 c->Request.CDBLen = 16;
2582 c->Request.Type.Attribute = ATTR_SIMPLE; 2610 c->Request.Type.Attribute = ATTR_SIMPLE;
2583 c->Request.Type.Direction = XFER_NONE; 2611 c->Request.Type.Direction = XFER_NONE;
2584 c->Request.Timeout = 0; 2612 c->Request.Timeout = 0;
2585 memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); 2613 memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
2586 c->Request.CDB[0] = cmd; /* reset */ 2614 c->Request.CDB[0] = cmd; /* reset */
2587 c->Request.CDB[1] = 0x03; /* reset a target */ 2615 c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET;
2588 break; 2616 break;
2589 case 3: /* No-Op message */ 2617 case CCISS_NOOP_MSG:
2590 c->Request.CDBLen = 1; 2618 c->Request.CDBLen = 1;
2591 c->Request.Type.Attribute = ATTR_SIMPLE; 2619 c->Request.Type.Attribute = ATTR_SIMPLE;
2592 c->Request.Type.Direction = XFER_WRITE; 2620 c->Request.Type.Direction = XFER_WRITE;
@@ -2615,6 +2643,31 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
2615 return status; 2643 return status;
2616} 2644}
2617 2645
2646static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr,
2647 u8 reset_type)
2648{
2649 CommandList_struct *c;
2650 int return_status;
2651
2652 c = cmd_alloc(h);
2653 if (!c)
2654 return -ENOMEM;
2655 return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0,
2656 CTLR_LUNID, TYPE_MSG);
2657 c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */
2658 if (return_status != IO_OK) {
2659 cmd_special_free(h, c);
2660 return return_status;
2661 }
2662 c->waiting = NULL;
2663 enqueue_cmd_and_start_io(h, c);
2664 /* Don't wait for completion, the reset won't complete. Don't free
2665 * the command either. This is the last command we will send before
2666 * re-initializing everything, so it doesn't matter and won't leak.
2667 */
2668 return 0;
2669}
2670
2618static int check_target_status(ctlr_info_t *h, CommandList_struct *c) 2671static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
2619{ 2672{
2620 switch (c->err_info->ScsiStatus) { 2673 switch (c->err_info->ScsiStatus) {
@@ -3461,6 +3514,63 @@ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
3461 return next_command(h); 3514 return next_command(h);
3462} 3515}
3463 3516
3517/* Some controllers, like p400, will give us one interrupt
3518 * after a soft reset, even if we turned interrupts off.
3519 * Only need to check for this in the cciss_xxx_discard_completions
3520 * functions.
3521 */
3522static int ignore_bogus_interrupt(ctlr_info_t *h)
3523{
3524 if (likely(!reset_devices))
3525 return 0;
3526
3527 if (likely(h->interrupts_enabled))
3528 return 0;
3529
3530 dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled "
3531 "(known firmware bug.) Ignoring.\n");
3532
3533 return 1;
3534}
3535
3536static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id)
3537{
3538 ctlr_info_t *h = dev_id;
3539 unsigned long flags;
3540 u32 raw_tag;
3541
3542 if (ignore_bogus_interrupt(h))
3543 return IRQ_NONE;
3544
3545 if (interrupt_not_for_us(h))
3546 return IRQ_NONE;
3547 spin_lock_irqsave(&h->lock, flags);
3548 while (interrupt_pending(h)) {
3549 raw_tag = get_next_completion(h);
3550 while (raw_tag != FIFO_EMPTY)
3551 raw_tag = next_command(h);
3552 }
3553 spin_unlock_irqrestore(&h->lock, flags);
3554 return IRQ_HANDLED;
3555}
3556
3557static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id)
3558{
3559 ctlr_info_t *h = dev_id;
3560 unsigned long flags;
3561 u32 raw_tag;
3562
3563 if (ignore_bogus_interrupt(h))
3564 return IRQ_NONE;
3565
3566 spin_lock_irqsave(&h->lock, flags);
3567 raw_tag = get_next_completion(h);
3568 while (raw_tag != FIFO_EMPTY)
3569 raw_tag = next_command(h);
3570 spin_unlock_irqrestore(&h->lock, flags);
3571 return IRQ_HANDLED;
3572}
3573
3464static irqreturn_t do_cciss_intx(int irq, void *dev_id) 3574static irqreturn_t do_cciss_intx(int irq, void *dev_id)
3465{ 3575{
3466 ctlr_info_t *h = dev_id; 3576 ctlr_info_t *h = dev_id;
@@ -4078,6 +4188,9 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
4078 cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); 4188 cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable));
4079 if (!h->cfgtable) 4189 if (!h->cfgtable)
4080 return -ENOMEM; 4190 return -ENOMEM;
4191 rc = write_driver_ver_to_cfgtable(h->cfgtable);
4192 if (rc)
4193 return rc;
4081 /* Find performant mode table. */ 4194 /* Find performant mode table. */
4082 trans_offset = readl(&h->cfgtable->TransMethodOffset); 4195 trans_offset = readl(&h->cfgtable->TransMethodOffset);
4083 h->transtable = remap_pci_mem(pci_resource_start(h->pdev, 4196 h->transtable = remap_pci_mem(pci_resource_start(h->pdev,
@@ -4112,7 +4225,7 @@ static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
4112static void __devinit cciss_find_board_params(ctlr_info_t *h) 4225static void __devinit cciss_find_board_params(ctlr_info_t *h)
4113{ 4226{
4114 cciss_get_max_perf_mode_cmds(h); 4227 cciss_get_max_perf_mode_cmds(h);
4115 h->nr_cmds = h->max_commands - 4; /* Allow room for some ioctls */ 4228 h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds;
4116 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); 4229 h->maxsgentries = readl(&(h->cfgtable->MaxSGElements));
4117 /* 4230 /*
4118 * Limit in-command s/g elements to 32 save dma'able memory. 4231 * Limit in-command s/g elements to 32 save dma'able memory.
@@ -4348,7 +4461,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4348 tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); 4461 tag = readl(vaddr + SA5_REPLY_PORT_OFFSET);
4349 if ((tag & ~3) == paddr32) 4462 if ((tag & ~3) == paddr32)
4350 break; 4463 break;
4351 schedule_timeout_uninterruptible(HZ); 4464 msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS);
4352 } 4465 }
4353 4466
4354 iounmap(vaddr); 4467 iounmap(vaddr);
@@ -4375,11 +4488,10 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4375 return 0; 4488 return 0;
4376} 4489}
4377 4490
4378#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
4379#define cciss_noop(p) cciss_message(p, 3, 0) 4491#define cciss_noop(p) cciss_message(p, 3, 0)
4380 4492
4381static int cciss_controller_hard_reset(struct pci_dev *pdev, 4493static int cciss_controller_hard_reset(struct pci_dev *pdev,
4382 void * __iomem vaddr, bool use_doorbell) 4494 void * __iomem vaddr, u32 use_doorbell)
4383{ 4495{
4384 u16 pmcsr; 4496 u16 pmcsr;
4385 int pos; 4497 int pos;
@@ -4390,8 +4502,7 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4390 * other way using the doorbell register. 4502 * other way using the doorbell register.
4391 */ 4503 */
4392 dev_info(&pdev->dev, "using doorbell to reset controller\n"); 4504 dev_info(&pdev->dev, "using doorbell to reset controller\n");
4393 writel(DOORBELL_CTLR_RESET, vaddr + SA5_DOORBELL); 4505 writel(use_doorbell, vaddr + SA5_DOORBELL);
4394 msleep(1000);
4395 } else { /* Try to do it the PCI power state way */ 4506 } else { /* Try to do it the PCI power state way */
4396 4507
4397 /* Quoting from the Open CISS Specification: "The Power 4508 /* Quoting from the Open CISS Specification: "The Power
@@ -4422,12 +4533,64 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4422 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 4533 pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
4423 pmcsr |= PCI_D0; 4534 pmcsr |= PCI_D0;
4424 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); 4535 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
4425
4426 msleep(500);
4427 } 4536 }
4428 return 0; 4537 return 0;
4429} 4538}
4430 4539
4540static __devinit void init_driver_version(char *driver_version, int len)
4541{
4542 memset(driver_version, 0, len);
4543 strncpy(driver_version, "cciss " DRIVER_NAME, len - 1);
4544}
4545
4546static __devinit int write_driver_ver_to_cfgtable(
4547 CfgTable_struct __iomem *cfgtable)
4548{
4549 char *driver_version;
4550 int i, size = sizeof(cfgtable->driver_version);
4551
4552 driver_version = kmalloc(size, GFP_KERNEL);
4553 if (!driver_version)
4554 return -ENOMEM;
4555
4556 init_driver_version(driver_version, size);
4557 for (i = 0; i < size; i++)
4558 writeb(driver_version[i], &cfgtable->driver_version[i]);
4559 kfree(driver_version);
4560 return 0;
4561}
4562
4563static __devinit void read_driver_ver_from_cfgtable(
4564 CfgTable_struct __iomem *cfgtable, unsigned char *driver_ver)
4565{
4566 int i;
4567
4568 for (i = 0; i < sizeof(cfgtable->driver_version); i++)
4569 driver_ver[i] = readb(&cfgtable->driver_version[i]);
4570}
4571
4572static __devinit int controller_reset_failed(
4573 CfgTable_struct __iomem *cfgtable)
4574{
4575
4576 char *driver_ver, *old_driver_ver;
4577 int rc, size = sizeof(cfgtable->driver_version);
4578
4579 old_driver_ver = kmalloc(2 * size, GFP_KERNEL);
4580 if (!old_driver_ver)
4581 return -ENOMEM;
4582 driver_ver = old_driver_ver + size;
4583
4584 /* After a reset, the 32 bytes of "driver version" in the cfgtable
4585 * should have been changed, otherwise we know the reset failed.
4586 */
4587 init_driver_version(old_driver_ver, size);
4588 read_driver_ver_from_cfgtable(cfgtable, driver_ver);
4589 rc = !memcmp(driver_ver, old_driver_ver, size);
4590 kfree(old_driver_ver);
4591 return rc;
4592}
4593
4431/* This does a hard reset of the controller using PCI power management 4594/* This does a hard reset of the controller using PCI power management
4432 * states or using the doorbell register. */ 4595 * states or using the doorbell register. */
4433static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) 4596static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
@@ -4437,10 +4600,10 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4437 u64 cfg_base_addr_index; 4600 u64 cfg_base_addr_index;
4438 void __iomem *vaddr; 4601 void __iomem *vaddr;
4439 unsigned long paddr; 4602 unsigned long paddr;
4440 u32 misc_fw_support, active_transport; 4603 u32 misc_fw_support;
4441 int rc; 4604 int rc;
4442 CfgTable_struct __iomem *cfgtable; 4605 CfgTable_struct __iomem *cfgtable;
4443 bool use_doorbell; 4606 u32 use_doorbell;
4444 u32 board_id; 4607 u32 board_id;
4445 u16 command_register; 4608 u16 command_register;
4446 4609
@@ -4464,12 +4627,16 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4464 * likely not be happy. Just forbid resetting this conjoined mess. 4627 * likely not be happy. Just forbid resetting this conjoined mess.
4465 */ 4628 */
4466 cciss_lookup_board_id(pdev, &board_id); 4629 cciss_lookup_board_id(pdev, &board_id);
4467 if (board_id == 0x409C0E11 || board_id == 0x409D0E11) { 4630 if (!ctlr_is_resettable(board_id)) {
4468 dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " 4631 dev_warn(&pdev->dev, "Cannot reset Smart Array 640x "
4469 "due to shared cache module."); 4632 "due to shared cache module.");
4470 return -ENODEV; 4633 return -ENODEV;
4471 } 4634 }
4472 4635
4636 /* if controller is soft- but not hard resettable... */
4637 if (!ctlr_is_hard_resettable(board_id))
4638 return -ENOTSUPP; /* try soft reset later. */
4639
4473 /* Save the PCI command register */ 4640 /* Save the PCI command register */
4474 pci_read_config_word(pdev, 4, &command_register); 4641 pci_read_config_word(pdev, 4, &command_register);
4475 /* Turn the board off. This is so that later pci_restore_state() 4642 /* Turn the board off. This is so that later pci_restore_state()
@@ -4497,16 +4664,28 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4497 rc = -ENOMEM; 4664 rc = -ENOMEM;
4498 goto unmap_vaddr; 4665 goto unmap_vaddr;
4499 } 4666 }
4667 rc = write_driver_ver_to_cfgtable(cfgtable);
4668 if (rc)
4669 goto unmap_vaddr;
4500 4670
4501 /* If reset via doorbell register is supported, use that. */ 4671 /* If reset via doorbell register is supported, use that.
4502 misc_fw_support = readl(&cfgtable->misc_fw_support); 4672 * There are two such methods. Favor the newest method.
4503 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
4504
4505 /* The doorbell reset seems to cause lockups on some Smart
4506 * Arrays (e.g. P410, P410i, maybe others). Until this is
4507 * fixed or at least isolated, avoid the doorbell reset.
4508 */ 4673 */
4509 use_doorbell = 0; 4674 misc_fw_support = readl(&cfgtable->misc_fw_support);
4675 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2;
4676 if (use_doorbell) {
4677 use_doorbell = DOORBELL_CTLR_RESET2;
4678 } else {
4679 use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
4680 if (use_doorbell) {
4681 dev_warn(&pdev->dev, "Controller claims that "
4682 "'Bit 2 doorbell reset' is "
4683 "supported, but not 'bit 5 doorbell reset'. "
4684 "Firmware update is recommended.\n");
4685 rc = -ENOTSUPP; /* use the soft reset */
4686 goto unmap_cfgtable;
4687 }
4688 }
4510 4689
4511 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); 4690 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
4512 if (rc) 4691 if (rc)
@@ -4524,30 +4703,31 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4524 msleep(CCISS_POST_RESET_PAUSE_MSECS); 4703 msleep(CCISS_POST_RESET_PAUSE_MSECS);
4525 4704
4526 /* Wait for board to become not ready, then ready. */ 4705 /* Wait for board to become not ready, then ready. */
4527 dev_info(&pdev->dev, "Waiting for board to become ready.\n"); 4706 dev_info(&pdev->dev, "Waiting for board to reset.\n");
4528 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); 4707 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
4529 if (rc) /* Don't bail, might be E500, etc. which can't be reset */ 4708 if (rc) {
4530 dev_warn(&pdev->dev, 4709 dev_warn(&pdev->dev, "Failed waiting for board to hard reset."
4531 "failed waiting for board to become not ready\n"); 4710 " Will try soft reset.\n");
4711 rc = -ENOTSUPP; /* Not expected, but try soft reset later */
4712 goto unmap_cfgtable;
4713 }
4532 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); 4714 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
4533 if (rc) { 4715 if (rc) {
4534 dev_warn(&pdev->dev, 4716 dev_warn(&pdev->dev,
4535 "failed waiting for board to become ready\n"); 4717 "failed waiting for board to become ready "
4718 "after hard reset\n");
4536 goto unmap_cfgtable; 4719 goto unmap_cfgtable;
4537 } 4720 }
4538 dev_info(&pdev->dev, "board ready.\n");
4539 4721
4540 /* Controller should be in simple mode at this point. If it's not, 4722 rc = controller_reset_failed(vaddr);
4541 * It means we're on one of those controllers which doesn't support 4723 if (rc < 0)
4542 * the doorbell reset method and on which the PCI power management reset 4724 goto unmap_cfgtable;
4543 * method doesn't work (P800, for example.) 4725 if (rc) {
4544 * In those cases, don't try to proceed, as it generally doesn't work. 4726 dev_warn(&pdev->dev, "Unable to successfully hard reset "
4545 */ 4727 "controller. Will try soft reset.\n");
4546 active_transport = readl(&cfgtable->TransportActive); 4728 rc = -ENOTSUPP; /* Not expected, but try soft reset later */
4547 if (active_transport & PERFORMANT_MODE) { 4729 } else {
4548 dev_warn(&pdev->dev, "Unable to successfully reset controller," 4730 dev_info(&pdev->dev, "Board ready after hard reset.\n");
4549 " Ignoring controller.\n");
4550 rc = -ENODEV;
4551 } 4731 }
4552 4732
4553unmap_cfgtable: 4733unmap_cfgtable:
@@ -4574,11 +4754,12 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4574 * due to concerns about shared bbwc between 6402/6404 pair. 4754 * due to concerns about shared bbwc between 6402/6404 pair.
4575 */ 4755 */
4576 if (rc == -ENOTSUPP) 4756 if (rc == -ENOTSUPP)
4577 return 0; /* just try to do the kdump anyhow. */ 4757 return rc; /* just try to do the kdump anyhow. */
4578 if (rc) 4758 if (rc)
4579 return -ENODEV; 4759 return -ENODEV;
4580 4760
4581 /* Now try to get the controller to respond to a no-op */ 4761 /* Now try to get the controller to respond to a no-op */
4762 dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n");
4582 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { 4763 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
4583 if (cciss_noop(pdev) == 0) 4764 if (cciss_noop(pdev) == 0)
4584 break; 4765 break;
@@ -4591,6 +4772,148 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4591 return 0; 4772 return 0;
4592} 4773}
4593 4774
4775static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h)
4776{
4777 h->cmd_pool_bits = kmalloc(
4778 DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) *
4779 sizeof(unsigned long), GFP_KERNEL);
4780 h->cmd_pool = pci_alloc_consistent(h->pdev,
4781 h->nr_cmds * sizeof(CommandList_struct),
4782 &(h->cmd_pool_dhandle));
4783 h->errinfo_pool = pci_alloc_consistent(h->pdev,
4784 h->nr_cmds * sizeof(ErrorInfo_struct),
4785 &(h->errinfo_pool_dhandle));
4786 if ((h->cmd_pool_bits == NULL)
4787 || (h->cmd_pool == NULL)
4788 || (h->errinfo_pool == NULL)) {
4789 dev_err(&h->pdev->dev, "out of memory");
4790 return -ENOMEM;
4791 }
4792 return 0;
4793}
4794
4795static __devinit int cciss_allocate_scatterlists(ctlr_info_t *h)
4796{
4797 int i;
4798
4799 /* zero it, so that on free we need not know how many were alloc'ed */
4800 h->scatter_list = kzalloc(h->max_commands *
4801 sizeof(struct scatterlist *), GFP_KERNEL);
4802 if (!h->scatter_list)
4803 return -ENOMEM;
4804
4805 for (i = 0; i < h->nr_cmds; i++) {
4806 h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) *
4807 h->maxsgentries, GFP_KERNEL);
4808 if (h->scatter_list[i] == NULL) {
4809 dev_err(&h->pdev->dev, "could not allocate "
4810 "s/g lists\n");
4811 return -ENOMEM;
4812 }
4813 }
4814 return 0;
4815}
4816
4817static void cciss_free_scatterlists(ctlr_info_t *h)
4818{
4819 int i;
4820
4821 if (h->scatter_list) {
4822 for (i = 0; i < h->nr_cmds; i++)
4823 kfree(h->scatter_list[i]);
4824 kfree(h->scatter_list);
4825 }
4826}
4827
4828static void cciss_free_cmd_pool(ctlr_info_t *h)
4829{
4830 kfree(h->cmd_pool_bits);
4831 if (h->cmd_pool)
4832 pci_free_consistent(h->pdev,
4833 h->nr_cmds * sizeof(CommandList_struct),
4834 h->cmd_pool, h->cmd_pool_dhandle);
4835 if (h->errinfo_pool)
4836 pci_free_consistent(h->pdev,
4837 h->nr_cmds * sizeof(ErrorInfo_struct),
4838 h->errinfo_pool, h->errinfo_pool_dhandle);
4839}
4840
4841static int cciss_request_irq(ctlr_info_t *h,
4842 irqreturn_t (*msixhandler)(int, void *),
4843 irqreturn_t (*intxhandler)(int, void *))
4844{
4845 if (h->msix_vector || h->msi_vector) {
4846 if (!request_irq(h->intr[PERF_MODE_INT], msixhandler,
4847 IRQF_DISABLED, h->devname, h))
4848 return 0;
4849 dev_err(&h->pdev->dev, "Unable to get msi irq %d"
4850 " for %s\n", h->intr[PERF_MODE_INT],
4851 h->devname);
4852 return -1;
4853 }
4854
4855 if (!request_irq(h->intr[PERF_MODE_INT], intxhandler,
4856 IRQF_DISABLED, h->devname, h))
4857 return 0;
4858 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4859 h->intr[PERF_MODE_INT], h->devname);
4860 return -1;
4861}
4862
4863static int __devinit cciss_kdump_soft_reset(ctlr_info_t *h)
4864{
4865 if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) {
4866 dev_warn(&h->pdev->dev, "Resetting array controller failed.\n");
4867 return -EIO;
4868 }
4869
4870 dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n");
4871 if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) {
4872 dev_warn(&h->pdev->dev, "Soft reset had no effect.\n");
4873 return -1;
4874 }
4875
4876 dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n");
4877 if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) {
4878 dev_warn(&h->pdev->dev, "Board failed to become ready "
4879 "after soft reset.\n");
4880 return -1;
4881 }
4882
4883 return 0;
4884}
4885
4886static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
4887{
4888 int ctlr = h->ctlr;
4889
4890 free_irq(h->intr[PERF_MODE_INT], h);
4891#ifdef CONFIG_PCI_MSI
4892 if (h->msix_vector)
4893 pci_disable_msix(h->pdev);
4894 else if (h->msi_vector)
4895 pci_disable_msi(h->pdev);
4896#endif /* CONFIG_PCI_MSI */
4897 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
4898 cciss_free_scatterlists(h);
4899 cciss_free_cmd_pool(h);
4900 kfree(h->blockFetchTable);
4901 if (h->reply_pool)
4902 pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
4903 h->reply_pool, h->reply_pool_dhandle);
4904 if (h->transtable)
4905 iounmap(h->transtable);
4906 if (h->cfgtable)
4907 iounmap(h->cfgtable);
4908 if (h->vaddr)
4909 iounmap(h->vaddr);
4910 unregister_blkdev(h->major, h->devname);
4911 cciss_destroy_hba_sysfs_entry(h);
4912 pci_release_regions(h->pdev);
4913 kfree(h);
4914 hba[ctlr] = NULL;
4915}
4916
4594/* 4917/*
4595 * This is it. Find all the controllers and register them. I really hate 4918 * This is it. Find all the controllers and register them. I really hate
4596 * stealing all these major device numbers. 4919 * stealing all these major device numbers.
@@ -4601,15 +4924,28 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4601{ 4924{
4602 int i; 4925 int i;
4603 int j = 0; 4926 int j = 0;
4604 int k = 0;
4605 int rc; 4927 int rc;
4928 int try_soft_reset = 0;
4606 int dac, return_code; 4929 int dac, return_code;
4607 InquiryData_struct *inq_buff; 4930 InquiryData_struct *inq_buff;
4608 ctlr_info_t *h; 4931 ctlr_info_t *h;
4932 unsigned long flags;
4609 4933
4610 rc = cciss_init_reset_devices(pdev); 4934 rc = cciss_init_reset_devices(pdev);
4611 if (rc) 4935 if (rc) {
4612 return rc; 4936 if (rc != -ENOTSUPP)
4937 return rc;
4938 /* If the reset fails in a particular way (it has no way to do
4939 * a proper hard reset, so returns -ENOTSUPP) we can try to do
4940 * a soft reset once we get the controller configured up to the
4941 * point that it can accept a command.
4942 */
4943 try_soft_reset = 1;
4944 rc = 0;
4945 }
4946
4947reinit_after_soft_reset:
4948
4613 i = alloc_cciss_hba(pdev); 4949 i = alloc_cciss_hba(pdev);
4614 if (i < 0) 4950 if (i < 0)
4615 return -1; 4951 return -1;
@@ -4627,6 +4963,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4627 sprintf(h->devname, "cciss%d", i); 4963 sprintf(h->devname, "cciss%d", i);
4628 h->ctlr = i; 4964 h->ctlr = i;
4629 4965
4966 if (cciss_tape_cmds < 2)
4967 cciss_tape_cmds = 2;
4968 if (cciss_tape_cmds > 16)
4969 cciss_tape_cmds = 16;
4970
4630 init_completion(&h->scan_wait); 4971 init_completion(&h->scan_wait);
4631 4972
4632 if (cciss_create_hba_sysfs_entry(h)) 4973 if (cciss_create_hba_sysfs_entry(h))
@@ -4662,62 +5003,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4662 5003
4663 /* make sure the board interrupts are off */ 5004 /* make sure the board interrupts are off */
4664 h->access.set_intr_mask(h, CCISS_INTR_OFF); 5005 h->access.set_intr_mask(h, CCISS_INTR_OFF);
4665 if (h->msi_vector || h->msix_vector) { 5006 rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx);
4666 if (request_irq(h->intr[PERF_MODE_INT], 5007 if (rc)
4667 do_cciss_msix_intr, 5008 goto clean2;
4668 IRQF_DISABLED, h->devname, h)) {
4669 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4670 h->intr[PERF_MODE_INT], h->devname);
4671 goto clean2;
4672 }
4673 } else {
4674 if (request_irq(h->intr[PERF_MODE_INT], do_cciss_intx,
4675 IRQF_DISABLED, h->devname, h)) {
4676 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
4677 h->intr[PERF_MODE_INT], h->devname);
4678 goto clean2;
4679 }
4680 }
4681 5009
4682 dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", 5010 dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
4683 h->devname, pdev->device, pci_name(pdev), 5011 h->devname, pdev->device, pci_name(pdev),
4684 h->intr[PERF_MODE_INT], dac ? "" : " not"); 5012 h->intr[PERF_MODE_INT], dac ? "" : " not");
4685 5013
4686 h->cmd_pool_bits = 5014 if (cciss_allocate_cmd_pool(h))
4687 kmalloc(DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG)
4688 * sizeof(unsigned long), GFP_KERNEL);
4689 h->cmd_pool = (CommandList_struct *)
4690 pci_alloc_consistent(h->pdev,
4691 h->nr_cmds * sizeof(CommandList_struct),
4692 &(h->cmd_pool_dhandle));
4693 h->errinfo_pool = (ErrorInfo_struct *)
4694 pci_alloc_consistent(h->pdev,
4695 h->nr_cmds * sizeof(ErrorInfo_struct),
4696 &(h->errinfo_pool_dhandle));
4697 if ((h->cmd_pool_bits == NULL)
4698 || (h->cmd_pool == NULL)
4699 || (h->errinfo_pool == NULL)) {
4700 dev_err(&h->pdev->dev, "out of memory");
4701 goto clean4; 5015 goto clean4;
4702 }
4703 5016
4704 /* Need space for temp scatter list */ 5017 if (cciss_allocate_scatterlists(h))
4705 h->scatter_list = kmalloc(h->max_commands *
4706 sizeof(struct scatterlist *),
4707 GFP_KERNEL);
4708 if (!h->scatter_list)
4709 goto clean4; 5018 goto clean4;
4710 5019
4711 for (k = 0; k < h->nr_cmds; k++) {
4712 h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
4713 h->maxsgentries,
4714 GFP_KERNEL);
4715 if (h->scatter_list[k] == NULL) {
4716 dev_err(&h->pdev->dev,
4717 "could not allocate s/g lists\n");
4718 goto clean4;
4719 }
4720 }
4721 h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, 5020 h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
4722 h->chainsize, h->nr_cmds); 5021 h->chainsize, h->nr_cmds);
4723 if (!h->cmd_sg_list && h->chainsize > 0) 5022 if (!h->cmd_sg_list && h->chainsize > 0)
@@ -4741,6 +5040,62 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4741 h->gendisk[j] = NULL; 5040 h->gendisk[j] = NULL;
4742 } 5041 }
4743 5042
5043 /* At this point, the controller is ready to take commands.
5044 * Now, if reset_devices and the hard reset didn't work, try
5045 * the soft reset and see if that works.
5046 */
5047 if (try_soft_reset) {
5048
5049 /* This is kind of gross. We may or may not get a completion
5050 * from the soft reset command, and if we do, then the value
5051 * from the fifo may or may not be valid. So, we wait 10 secs
5052 * after the reset throwing away any completions we get during
5053 * that time. Unregister the interrupt handler and register
5054 * fake ones to scoop up any residual completions.
5055 */
5056 spin_lock_irqsave(&h->lock, flags);
5057 h->access.set_intr_mask(h, CCISS_INTR_OFF);
5058 spin_unlock_irqrestore(&h->lock, flags);
5059 free_irq(h->intr[PERF_MODE_INT], h);
5060 rc = cciss_request_irq(h, cciss_msix_discard_completions,
5061 cciss_intx_discard_completions);
5062 if (rc) {
5063 dev_warn(&h->pdev->dev, "Failed to request_irq after "
5064 "soft reset.\n");
5065 goto clean4;
5066 }
5067
5068 rc = cciss_kdump_soft_reset(h);
5069 if (rc) {
5070 dev_warn(&h->pdev->dev, "Soft reset failed.\n");
5071 goto clean4;
5072 }
5073
5074 dev_info(&h->pdev->dev, "Board READY.\n");
5075 dev_info(&h->pdev->dev,
5076 "Waiting for stale completions to drain.\n");
5077 h->access.set_intr_mask(h, CCISS_INTR_ON);
5078 msleep(10000);
5079 h->access.set_intr_mask(h, CCISS_INTR_OFF);
5080
5081 rc = controller_reset_failed(h->cfgtable);
5082 if (rc)
5083 dev_info(&h->pdev->dev,
5084 "Soft reset appears to have failed.\n");
5085
5086 /* since the controller's reset, we have to go back and re-init
5087 * everything. Easiest to just forget what we've done and do it
5088 * all over again.
5089 */
5090 cciss_undo_allocations_after_kdump_soft_reset(h);
5091 try_soft_reset = 0;
5092 if (rc)
5093 /* don't go to clean4, we already unallocated */
5094 return -ENODEV;
5095
5096 goto reinit_after_soft_reset;
5097 }
5098
4744 cciss_scsi_setup(h); 5099 cciss_scsi_setup(h);
4745 5100
4746 /* Turn the interrupts on so we can service requests */ 5101 /* Turn the interrupts on so we can service requests */
@@ -4775,21 +5130,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
4775 return 1; 5130 return 1;
4776 5131
4777clean4: 5132clean4:
4778 kfree(h->cmd_pool_bits); 5133 cciss_free_cmd_pool(h);
4779 /* Free up sg elements */ 5134 cciss_free_scatterlists(h);
4780 for (k-- ; k >= 0; k--)
4781 kfree(h->scatter_list[k]);
4782 kfree(h->scatter_list);
4783 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); 5135 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
4784 if (h->cmd_pool)
4785 pci_free_consistent(h->pdev,
4786 h->nr_cmds * sizeof(CommandList_struct),
4787 h->cmd_pool, h->cmd_pool_dhandle);
4788 if (h->errinfo_pool)
4789 pci_free_consistent(h->pdev,
4790 h->nr_cmds * sizeof(ErrorInfo_struct),
4791 h->errinfo_pool,
4792 h->errinfo_pool_dhandle);
4793 free_irq(h->intr[PERF_MODE_INT], h); 5136 free_irq(h->intr[PERF_MODE_INT], h);
4794clean2: 5137clean2:
4795 unregister_blkdev(h->major, h->devname); 5138 unregister_blkdev(h->major, h->devname);
@@ -4887,16 +5230,16 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
4887 iounmap(h->cfgtable); 5230 iounmap(h->cfgtable);
4888 iounmap(h->vaddr); 5231 iounmap(h->vaddr);
4889 5232
4890 pci_free_consistent(h->pdev, h->nr_cmds * sizeof(CommandList_struct), 5233 cciss_free_cmd_pool(h);
4891 h->cmd_pool, h->cmd_pool_dhandle);
4892 pci_free_consistent(h->pdev, h->nr_cmds * sizeof(ErrorInfo_struct),
4893 h->errinfo_pool, h->errinfo_pool_dhandle);
4894 kfree(h->cmd_pool_bits);
4895 /* Free up sg elements */ 5234 /* Free up sg elements */
4896 for (j = 0; j < h->nr_cmds; j++) 5235 for (j = 0; j < h->nr_cmds; j++)
4897 kfree(h->scatter_list[j]); 5236 kfree(h->scatter_list[j]);
4898 kfree(h->scatter_list); 5237 kfree(h->scatter_list);
4899 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); 5238 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
5239 kfree(h->blockFetchTable);
5240 if (h->reply_pool)
5241 pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
5242 h->reply_pool, h->reply_pool_dhandle);
4900 /* 5243 /*
4901 * Deliberately omit pci_disable_device(): it does something nasty to 5244 * Deliberately omit pci_disable_device(): it does something nasty to
4902 * Smart Array controllers that pci_enable_device does not undo 5245 * Smart Array controllers that pci_enable_device does not undo
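The write_driver_ver_to_cfgtable()/controller_reset_failed() pair added above detects a failed reset by stamping a known 32-byte string into the controller's config table before the reset and checking afterwards whether the stamp is still there. Below is a minimal user-space sketch of the same round-trip idea, with a plain buffer and a placeholder string standing in for the real cfgtable (illustrative only, not code from the patch):

#include <stdio.h>
#include <string.h>

#define VER_LEN 32
static const char known_ver[] = "cciss-editor-sketch";   /* placeholder stamp */

/* Before the reset: stamp the (simulated) config table. */
static void stamp_version(char table_ver[VER_LEN])
{
        memset(table_ver, 0, VER_LEN);
        strncpy(table_ver, known_ver, VER_LEN - 1);
}

/* After the reset: if the stamp survived untouched, the reset did not happen. */
static int reset_failed(const char table_ver[VER_LEN])
{
        char expected[VER_LEN];

        memset(expected, 0, VER_LEN);
        strncpy(expected, known_ver, VER_LEN - 1);
        return memcmp(table_ver, expected, VER_LEN) == 0;
}

int main(void)
{
        char table_ver[VER_LEN];

        stamp_version(table_ver);
        /* a real controller reset would clear table_ver here */
        printf("reset %s\n", reset_failed(table_ver) ? "failed" : "succeeded");
        return 0;
}
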
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 554bbd907d14..16b4d58d84dd 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -200,7 +200,7 @@ struct ctlr_info
200 * the above. 200 * the above.
201 */ 201 */
202#define CCISS_BOARD_READY_WAIT_SECS (120) 202#define CCISS_BOARD_READY_WAIT_SECS (120)
203#define CCISS_BOARD_NOT_READY_WAIT_SECS (10) 203#define CCISS_BOARD_NOT_READY_WAIT_SECS (100)
204#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) 204#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
205#define CCISS_BOARD_READY_ITERATIONS \ 205#define CCISS_BOARD_READY_ITERATIONS \
206 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ 206 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
@@ -209,8 +209,9 @@ struct ctlr_info
209 ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ 209 ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
210 CCISS_BOARD_READY_POLL_INTERVAL_MSECS) 210 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
211#define CCISS_POST_RESET_PAUSE_MSECS (3000) 211#define CCISS_POST_RESET_PAUSE_MSECS (3000)
212#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) 212#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000)
213#define CCISS_POST_RESET_NOOP_RETRIES (12) 213#define CCISS_POST_RESET_NOOP_RETRIES (12)
214#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000)
214 215
215/* 216/*
216 Send the command to the hardware 217 Send the command to the hardware
@@ -239,11 +240,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
239 { /* Turn interrupts on */ 240 { /* Turn interrupts on */
240 h->interrupts_enabled = 1; 241 h->interrupts_enabled = 1;
241 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 242 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
243 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
242 } else /* Turn them off */ 244 } else /* Turn them off */
243 { 245 {
244 h->interrupts_enabled = 0; 246 h->interrupts_enabled = 0;
245 writel( SA5_INTR_OFF, 247 writel( SA5_INTR_OFF,
246 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 248 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
249 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
247 } 250 }
248} 251}
249/* 252/*
@@ -257,11 +260,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val)
257 { /* Turn interrupts on */ 260 { /* Turn interrupts on */
258 h->interrupts_enabled = 1; 261 h->interrupts_enabled = 1;
259 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 262 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
263 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
260 } else /* Turn them off */ 264 } else /* Turn them off */
261 { 265 {
262 h->interrupts_enabled = 0; 266 h->interrupts_enabled = 0;
263 writel( SA5B_INTR_OFF, 267 writel( SA5B_INTR_OFF,
264 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 268 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
269 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
265 } 270 }
266} 271}
267 272
@@ -271,10 +276,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val)
271 if (val) { /* turn on interrupts */ 276 if (val) { /* turn on interrupts */
272 h->interrupts_enabled = 1; 277 h->interrupts_enabled = 1;
273 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 278 writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
279 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
274 } else { 280 } else {
275 h->interrupts_enabled = 0; 281 h->interrupts_enabled = 0;
276 writel(SA5_PERF_INTR_OFF, 282 writel(SA5_PERF_INTR_OFF,
277 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); 283 h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
284 (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
278 } 285 }
279} 286}
280 287
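Each interrupt-mask helper above gains a dummy readl() immediately after the writel(). MMIO writes over PCI can be posted (buffered on their way to the device), and reading a register back from the same device is the conventional way to make sure the mask change has actually reached the controller before the driver relies on it. A minimal sketch of that pattern in isolation (regs and INTR_MASK_OFFSET are placeholders, not the driver's names):

#include <linux/io.h>

#define INTR_MASK_OFFSET 0x34   /* placeholder register offset */

static void mask_board_interrupts(void __iomem *regs)
{
        writel(~0U, regs + INTR_MASK_OFFSET);   /* may sit in a PCI posting buffer */
        (void) readl(regs + INTR_MASK_OFFSET);  /* read-back flushes the posted write */
}
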
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index cd441bef031f..d9be6b4d49a6 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -53,6 +53,7 @@
53#define CFGTBL_ChangeReq 0x00000001l 53#define CFGTBL_ChangeReq 0x00000001l
54#define CFGTBL_AccCmds 0x00000001l 54#define CFGTBL_AccCmds 0x00000001l
55#define DOORBELL_CTLR_RESET 0x00000004l 55#define DOORBELL_CTLR_RESET 0x00000004l
56#define DOORBELL_CTLR_RESET2 0x00000020l
56 57
57#define CFGTBL_Trans_Simple 0x00000002l 58#define CFGTBL_Trans_Simple 0x00000002l
58#define CFGTBL_Trans_Performant 0x00000004l 59#define CFGTBL_Trans_Performant 0x00000004l
@@ -142,6 +143,14 @@ typedef struct _ReadCapdata_struct_16
142#define BMIC_CACHE_FLUSH 0xc2 143#define BMIC_CACHE_FLUSH 0xc2
143#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ 144#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */
144 145
146#define CCISS_ABORT_MSG 0x00
147#define CCISS_RESET_MSG 0x01
148#define CCISS_RESET_TYPE_CONTROLLER 0x00
149#define CCISS_RESET_TYPE_BUS 0x01
150#define CCISS_RESET_TYPE_TARGET 0x03
151#define CCISS_RESET_TYPE_LUN 0x04
152#define CCISS_NOOP_MSG 0x03
153
145/* Command List Structure */ 154/* Command List Structure */
146#define CTLR_LUNID "\0\0\0\0\0\0\0\0" 155#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
147 156
@@ -235,6 +244,8 @@ typedef struct _CfgTable_struct {
235 u8 reserved[0x78 - 0x58]; 244 u8 reserved[0x78 - 0x58];
236 u32 misc_fw_support; /* offset 0x78 */ 245 u32 misc_fw_support; /* offset 0x78 */
237#define MISC_FW_DOORBELL_RESET (0x02) 246#define MISC_FW_DOORBELL_RESET (0x02)
247#define MISC_FW_DOORBELL_RESET2 (0x10)
248 u8 driver_version[32];
238} CfgTable_struct; 249} CfgTable_struct;
239 250
240struct TransTable_struct { 251struct TransTable_struct {
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index df793803f5ae..696100241a6f 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = {
84 .proc_name = "cciss", 84 .proc_name = "cciss",
85 .proc_info = cciss_scsi_proc_info, 85 .proc_info = cciss_scsi_proc_info,
86 .queuecommand = cciss_scsi_queue_command, 86 .queuecommand = cciss_scsi_queue_command,
87 .can_queue = SCSI_CCISS_CAN_QUEUE,
88 .this_id = 7, 87 .this_id = 7,
89 .cmd_per_lun = 1, 88 .cmd_per_lun = 1,
90 .use_clustering = DISABLE_CLUSTERING, 89 .use_clustering = DISABLE_CLUSTERING,
@@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t {
108 107
109#pragma pack() 108#pragma pack()
110 109
111#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \
112 CCISS_MAX_SCSI_DEVS_PER_HBA + 2)
113 // plus two for init time usage
114
115#pragma pack(1) 110#pragma pack(1)
116struct cciss_scsi_cmd_stack_t { 111struct cciss_scsi_cmd_stack_t {
117 struct cciss_scsi_cmd_stack_elem_t *pool; 112 struct cciss_scsi_cmd_stack_elem_t *pool;
118 struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE]; 113 struct cciss_scsi_cmd_stack_elem_t **elem;
119 dma_addr_t cmd_pool_handle; 114 dma_addr_t cmd_pool_handle;
120 int top; 115 int top;
116 int nelems;
121}; 117};
122#pragma pack() 118#pragma pack()
123 119
@@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
191 sa = h->scsi_ctlr; 187 sa = h->scsi_ctlr;
192 stk = &sa->cmd_stack; 188 stk = &sa->cmd_stack;
193 stk->top++; 189 stk->top++;
194 if (stk->top >= CMD_STACK_SIZE) { 190 if (stk->top >= stk->nelems) {
195 dev_err(&h->pdev->dev, 191 dev_err(&h->pdev->dev,
196 "scsi_cmd_free called too many times.\n"); 192 "scsi_cmd_free called too many times.\n");
197 BUG(); 193 BUG();
@@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
206 struct cciss_scsi_cmd_stack_t *stk; 202 struct cciss_scsi_cmd_stack_t *stk;
207 size_t size; 203 size_t size;
208 204
205 stk = &sa->cmd_stack;
206 stk->nelems = cciss_tape_cmds + 2;
209 sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, 207 sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
210 h->chainsize, CMD_STACK_SIZE); 208 h->chainsize, stk->nelems);
211 if (!sa->cmd_sg_list && h->chainsize > 0) 209 if (!sa->cmd_sg_list && h->chainsize > 0)
212 return -ENOMEM; 210 return -ENOMEM;
213 211
214 stk = &sa->cmd_stack; 212 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
215 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
216 213
217 /* Check alignment, see cciss_cmd.h near CommandList_struct def. */ 214 /* Check alignment, see cciss_cmd.h near CommandList_struct def. */
218 BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); 215 BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
@@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
221 pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); 218 pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle);
222 219
223 if (stk->pool == NULL) { 220 if (stk->pool == NULL) {
224 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); 221 cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
225 sa->cmd_sg_list = NULL; 222 sa->cmd_sg_list = NULL;
226 return -ENOMEM; 223 return -ENOMEM;
227 } 224 }
228 225 stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL);
229 for (i=0; i<CMD_STACK_SIZE; i++) { 226 if (!stk->elem) {
227 pci_free_consistent(h->pdev, size, stk->pool,
228 stk->cmd_pool_handle);
229 return -1;
230 }
231 for (i = 0; i < stk->nelems; i++) {
230 stk->elem[i] = &stk->pool[i]; 232 stk->elem[i] = &stk->pool[i];
231 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + 233 stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
232 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); 234 (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
233 stk->elem[i]->cmdindex = i; 235 stk->elem[i]->cmdindex = i;
234 } 236 }
235 stk->top = CMD_STACK_SIZE-1; 237 stk->top = stk->nelems-1;
236 return 0; 238 return 0;
237} 239}
238 240
@@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h)
245 247
246 sa = h->scsi_ctlr; 248 sa = h->scsi_ctlr;
247 stk = &sa->cmd_stack; 249 stk = &sa->cmd_stack;
248 if (stk->top != CMD_STACK_SIZE-1) { 250 if (stk->top != stk->nelems-1) {
249 dev_warn(&h->pdev->dev, 251 dev_warn(&h->pdev->dev,
250 "bug: %d scsi commands are still outstanding.\n", 252 "bug: %d scsi commands are still outstanding.\n",
251 CMD_STACK_SIZE - stk->top); 253 stk->nelems - stk->top);
252 } 254 }
253 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; 255 size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
254 256
255 pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); 257 pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle);
256 stk->pool = NULL; 258 stk->pool = NULL;
257 cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); 259 cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
260 kfree(stk->elem);
261 stk->elem = NULL;
258} 262}
259 263
260#if 0 264#if 0
@@ -859,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h)
859 sh->io_port = 0; // good enough? FIXME, 863 sh->io_port = 0; // good enough? FIXME,
860 sh->n_io_port = 0; // I don't think we use these two... 864 sh->n_io_port = 0; // I don't think we use these two...
861 sh->this_id = SELF_SCSI_ID; 865 sh->this_id = SELF_SCSI_ID;
866 sh->can_queue = cciss_tape_cmds;
862 sh->sg_tablesize = h->maxsgentries; 867 sh->sg_tablesize = h->maxsgentries;
863 sh->max_cmd_len = MAX_COMMAND_SIZE; 868 sh->max_cmd_len = MAX_COMMAND_SIZE;
864 869
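The command-stack changes above replace the compile-time CMD_STACK_SIZE array with an elem[] array allocated at run time and sized from cciss_tape_cmds (stk->nelems). As a stand-alone sketch of that "stack of preallocated elements, sized at init time" shape in plain C, with made-up names rather than the driver's:

#include <stdlib.h>

struct cmd_stack {
        void **elem;    /* one slot per preallocated command */
        int top;        /* index of the next free slot */
        int nelems;     /* chosen at init time, not compile time */
};

static int cmd_stack_init(struct cmd_stack *stk, void *pool,
                          size_t elem_size, int nelems)
{
        int i;

        stk->elem = malloc(sizeof(stk->elem[0]) * nelems);
        if (!stk->elem)
                return -1;
        for (i = 0; i < nelems; i++)
                stk->elem[i] = (char *)pool + i * elem_size;
        stk->nelems = nelems;
        stk->top = nelems - 1;  /* stack starts full: every command is free */
        return 0;
}
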
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
index 6d5822fe851a..e71d986727ca 100644
--- a/drivers/block/cciss_scsi.h
+++ b/drivers/block/cciss_scsi.h
@@ -36,13 +36,9 @@
36 addressible natively, and may in fact turn 36 addressible natively, and may in fact turn
37 out to be not scsi at all. */ 37 out to be not scsi at all. */
38 38
39#define SCSI_CCISS_CAN_QUEUE 2
40 39
41/* 40/*
42 41
43Note, cmd_per_lun could give us some trouble, so I'm setting it very low.
44Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively.
45
46If the upper scsi layer tries to track how many commands we have 42If the upper scsi layer tries to track how many commands we have
47outstanding, it will be operating under the misapprehension that it is 43outstanding, it will be operating under the misapprehension that it is
48the only one sending us requests. We also have the block interface, 44the only one sending us requests. We also have the block interface,
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index c6828b68d77b..09ef9a878ef0 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -28,7 +28,7 @@
28#include "drbd_int.h" 28#include "drbd_int.h"
29#include "drbd_wrappers.h" 29#include "drbd_wrappers.h"
30 30
31/* We maintain a trivial check sum in our on disk activity log. 31/* We maintain a trivial checksum in our on disk activity log.
32 * With that we can ensure correct operation even when the storage 32 * With that we can ensure correct operation even when the storage
33 * device might do a partial (last) sector write while losing power. 33 * device might do a partial (last) sector write while losing power.
34 */ 34 */
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 76210ba401ac..f440a02dfdb1 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -74,7 +74,7 @@
74 * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage 74 * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
75 * seems excessive. 75 * seems excessive.
76 * 76 *
77 * We plan to reduce the amount of in-core bitmap pages by pageing them in 77 * We plan to reduce the amount of in-core bitmap pages by paging them in
78 * and out against their on-disk location as necessary, but need to make 78 * and out against their on-disk location as necessary, but need to make
79 * sure we don't cause too much meta data IO, and must not deadlock in 79 * sure we don't cause too much meta data IO, and must not deadlock in
80 * tight memory situations. This needs some more work. 80 * tight memory situations. This needs some more work.
@@ -200,7 +200,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
200 * we if bits have been cleared since last IO. */ 200 * we if bits have been cleared since last IO. */
201#define BM_PAGE_LAZY_WRITEOUT 28 201#define BM_PAGE_LAZY_WRITEOUT 28
202 202
203/* store_page_idx uses non-atomic assingment. It is only used directly after 203/* store_page_idx uses non-atomic assignment. It is only used directly after
204 * allocating the page. All other bm_set_page_* and bm_clear_page_* need to 204 * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
205 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap 205 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
206 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit 206 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
@@ -318,7 +318,7 @@ static void bm_unmap(unsigned long *p_addr)
318/* word offset from start of bitmap to word number _in_page_ 318/* word offset from start of bitmap to word number _in_page_
319 * modulo longs per page 319 * modulo longs per page
320#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) 320#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
321 hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) 321 hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
322 so do it explicitly: 322 so do it explicitly:
323 */ 323 */
324#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) 324#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d871b14ed5a1..ef2ceed3be4b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -700,7 +700,7 @@ struct drbd_request {
700 * see drbd_endio_pri(). */ 700 * see drbd_endio_pri(). */
701 struct bio *private_bio; 701 struct bio *private_bio;
702 702
703 struct hlist_node colision; 703 struct hlist_node collision;
704 sector_t sector; 704 sector_t sector;
705 unsigned int size; 705 unsigned int size;
706 unsigned int epoch; /* barrier_nr */ 706 unsigned int epoch; /* barrier_nr */
@@ -766,7 +766,7 @@ struct digest_info {
766 766
767struct drbd_epoch_entry { 767struct drbd_epoch_entry {
768 struct drbd_work w; 768 struct drbd_work w;
769 struct hlist_node colision; 769 struct hlist_node collision;
770 struct drbd_epoch *epoch; /* for writes */ 770 struct drbd_epoch *epoch; /* for writes */
771 struct drbd_conf *mdev; 771 struct drbd_conf *mdev;
772 struct page *pages; 772 struct page *pages;
@@ -1129,6 +1129,8 @@ struct drbd_conf {
1129 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ 1129 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
1130 int rs_planed; /* resync sectors already planned */ 1130 int rs_planed; /* resync sectors already planned */
1131 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ 1131 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1132 int peer_max_bio_size;
1133 int local_max_bio_size;
1132}; 1134};
1133 1135
1134static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1136static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1218,8 +1220,6 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
1218extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, 1220extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
1219 unsigned int set_size); 1221 unsigned int set_size);
1220extern void tl_clear(struct drbd_conf *mdev); 1222extern void tl_clear(struct drbd_conf *mdev);
1221enum drbd_req_event;
1222extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
1223extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); 1223extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
1224extern void drbd_free_sock(struct drbd_conf *mdev); 1224extern void drbd_free_sock(struct drbd_conf *mdev);
1225extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, 1225extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@ -1434,6 +1434,7 @@ struct bm_extent {
1434 * hash table. */ 1434 * hash table. */
1435#define HT_SHIFT 8 1435#define HT_SHIFT 8
1436#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) 1436#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
1437#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
1437 1438
1438#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ 1439#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
1439 1440
@@ -1518,9 +1519,9 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
1518extern char *ppsize(char *buf, unsigned long long size); 1519extern char *ppsize(char *buf, unsigned long long size);
1519extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); 1520extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
1520enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1521enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1521extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1522extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1522extern void resync_after_online_grow(struct drbd_conf *); 1523extern void resync_after_online_grow(struct drbd_conf *);
1523extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1524extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
1524extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, 1525extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
1525 enum drbd_role new_role, 1526 enum drbd_role new_role,
1526 int force); 1527 int force);
@@ -1828,6 +1829,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
1828 if (!forcedetach) { 1829 if (!forcedetach) {
1829 if (__ratelimit(&drbd_ratelimit_state)) 1830 if (__ratelimit(&drbd_ratelimit_state))
1830 dev_err(DEV, "Local IO failed in %s.\n", where); 1831 dev_err(DEV, "Local IO failed in %s.\n", where);
1832 if (mdev->state.disk > D_INCONSISTENT)
1833 _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
1831 break; 1834 break;
1832 } 1835 }
1833 /* NOTE fall through to detach case if forcedetach set */ 1836 /* NOTE fall through to detach case if forcedetach set */
@@ -2153,6 +2156,10 @@ static inline int get_net_conf(struct drbd_conf *mdev)
2153static inline void put_ldev(struct drbd_conf *mdev) 2156static inline void put_ldev(struct drbd_conf *mdev)
2154{ 2157{
2155 int i = atomic_dec_return(&mdev->local_cnt); 2158 int i = atomic_dec_return(&mdev->local_cnt);
2159
2160 /* This may be called from some endio handler,
2161 * so we must not sleep here. */
2162
2156 __release(local); 2163 __release(local);
2157 D_ASSERT(i >= 0); 2164 D_ASSERT(i >= 0);
2158 if (i == 0) { 2165 if (i == 0) {
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 5b525c179f39..0358e55356c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -745,6 +745,9 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
745 mdev->agreed_pro_version < 88) 745 mdev->agreed_pro_version < 88)
746 rv = SS_NOT_SUPPORTED; 746 rv = SS_NOT_SUPPORTED;
747 747
748 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
749 rv = SS_CONNECTED_OUTDATES;
750
748 return rv; 751 return rv;
749} 752}
750 753
@@ -1565,6 +1568,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1565 put_ldev(mdev); 1568 put_ldev(mdev);
1566 } 1569 }
1567 1570
 1571 /* Notify peer that I had a local IO error, and did not detach. */
1572 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1573 drbd_send_state(mdev);
1574
1568 /* Disks got bigger while they were detached */ 1575 /* Disks got bigger while they were detached */
1569 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && 1576 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1570 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { 1577 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
@@ -2064,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2064{ 2071{
2065 struct p_sizes p; 2072 struct p_sizes p;
2066 sector_t d_size, u_size; 2073 sector_t d_size, u_size;
2067 int q_order_type; 2074 int q_order_type, max_bio_size;
2068 int ok; 2075 int ok;
2069 2076
2070 if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 2077 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -2072,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2072 d_size = drbd_get_max_capacity(mdev->ldev); 2079 d_size = drbd_get_max_capacity(mdev->ldev);
2073 u_size = mdev->ldev->dc.disk_size; 2080 u_size = mdev->ldev->dc.disk_size;
2074 q_order_type = drbd_queue_order_type(mdev); 2081 q_order_type = drbd_queue_order_type(mdev);
2082 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2075 put_ldev(mdev); 2084 put_ldev(mdev);
2076 } else { 2085 } else {
2077 d_size = 0; 2086 d_size = 0;
2078 u_size = 0; 2087 u_size = 0;
2079 q_order_type = QUEUE_ORDERED_NONE; 2088 q_order_type = QUEUE_ORDERED_NONE;
2089 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2080 } 2090 }
2081 2091
2082 p.d_size = cpu_to_be64(d_size); 2092 p.d_size = cpu_to_be64(d_size);
2083 p.u_size = cpu_to_be64(u_size); 2093 p.u_size = cpu_to_be64(u_size);
2084 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 2094 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2085 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9); 2095 p.max_bio_size = cpu_to_be32(max_bio_size);
2086 p.queue_order_type = cpu_to_be16(q_order_type); 2096 p.queue_order_type = cpu_to_be16(q_order_type);
2087 p.dds_flags = cpu_to_be16(flags); 2097 p.dds_flags = cpu_to_be16(flags);
2088 2098
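The hunk above converts the backing queue's limit from 512-byte sectors to bytes (the << 9) and clamps it to DRBD_MAX_BIO_SIZE before putting it on the wire. A minimal user-space sketch of that arithmetic, using the constant values defined elsewhere in this patch; the sample queue limit is hypothetical:

#include <stdio.h>

#define DRBD_MAX_BIO_SIZE (1U << (9 + 8))   /* 128 KiB, from drbd_int.h (HT_SHIFT == 8) */

int main(void)
{
	unsigned int max_hw_sectors = 1024;               /* hypothetical queue_max_hw_sectors() */
	unsigned int max_bio_size = max_hw_sectors << 9;  /* 512-byte sectors -> bytes */

	if (max_bio_size > DRBD_MAX_BIO_SIZE)             /* the min_t() in the hunk */
		max_bio_size = DRBD_MAX_BIO_SIZE;

	printf("advertised max_bio_size = %u bytes\n", max_bio_size);
	return 0;
}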
@@ -2722,7 +2732,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2722 2732
2723 /* double check digest, sometimes buffers have been modified in flight. */ 2733 /* double check digest, sometimes buffers have been modified in flight. */
2724 if (dgs > 0 && dgs <= 64) { 2734 if (dgs > 0 && dgs <= 64) {
2725 /* 64 byte, 512 bit, is the larges digest size 2735 /* 64 byte, 512 bit, is the largest digest size
2726 * currently supported in kernel crypto. */ 2736 * currently supported in kernel crypto. */
2727 unsigned char digest[64]; 2737 unsigned char digest[64];
2728 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); 2738 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
@@ -3041,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
3041 mdev->agreed_pro_version = PRO_VERSION_MAX; 3051 mdev->agreed_pro_version = PRO_VERSION_MAX;
3042 mdev->write_ordering = WO_bdev_flush; 3052 mdev->write_ordering = WO_bdev_flush;
3043 mdev->resync_wenr = LC_FREE; 3053 mdev->resync_wenr = LC_FREE;
3054 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3044} 3056}
3045 3057
3046void drbd_mdev_cleanup(struct drbd_conf *mdev) 3058void drbd_mdev_cleanup(struct drbd_conf *mdev)
@@ -3275,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor)
3275 3287
3276 drbd_release_ee_lists(mdev); 3288 drbd_release_ee_lists(mdev);
3277 3289
3278 /* should be free'd on disconnect? */ 3290 /* should be freed on disconnect? */
3279 kfree(mdev->ee_hash); 3291 kfree(mdev->ee_hash);
3280 /* 3292 /*
3281 mdev->ee_hash_s = 0; 3293 mdev->ee_hash_s = 0;
@@ -3415,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3415 q->backing_dev_info.congested_data = mdev; 3427 q->backing_dev_info.congested_data = mdev;
3416 3428
3417 blk_queue_make_request(q, drbd_make_request); 3429 blk_queue_make_request(q, drbd_make_request);
 3418 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9); 3430 /* Set max_hw_sectors to an odd value of 8 KiB here.
 3431 This triggers a max_bio_size message upon first attach or connect. */
3432 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3419 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 3433 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3420 blk_queue_merge_bvec(q, drbd_merge_bvec); 3434 blk_queue_merge_bvec(q, drbd_merge_bvec);
3421 q->queue_lock = &mdev->req_lock; 3435 q->queue_lock = &mdev->req_lock;
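The replacement call seeds the queue with a limit that matches neither the 4 KiB safe value nor any negotiated one, so the first attach or connect is guaranteed to change it and therefore to announce the real max_bio_size. The arithmetic, as a tiny sketch using the constants from this patch:

#include <stdio.h>

#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12)   /* 4 KiB, always works */

int main(void)
{
	unsigned int sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8;   /* 4096 >> 8 = 16 sectors */

	/* 16 sectors * 512 bytes = 8 KiB, deliberately unequal to 4 KiB or 128 KiB */
	printf("initial max_hw_sectors = %u (%u bytes)\n", sectors, sectors << 9);
	return 0;
}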
@@ -3627,7 +3641,8 @@ struct meta_data_on_disk {
3627 /* `-- act_log->nr_elements <-- sync_conf.al_extents */ 3641 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3628 u32 bm_offset; /* offset to the bitmap, from here */ 3642 u32 bm_offset; /* offset to the bitmap, from here */
3629 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 3643 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3630 u32 reserved_u32[4]; 3644 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3645 u32 reserved_u32[3];
3631 3646
3632} __packed; 3647} __packed;
3633 3648
@@ -3668,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
3668 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); 3683 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3669 3684
3670 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 3685 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3686 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3671 3687
3672 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 3688 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3673 sector = mdev->ldev->md.md_offset; 3689 sector = mdev->ldev->md.md_offset;
@@ -3751,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3751 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); 3767 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3752 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); 3768 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3753 3769
3770 spin_lock_irq(&mdev->req_lock);
3771 if (mdev->state.conn < C_CONNECTED) {
3772 int peer;
3773 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3774 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3775 mdev->peer_max_bio_size = peer;
3776 }
3777 spin_unlock_irq(&mdev->req_lock);
3778
3754 if (mdev->sync_conf.al_extents < 7) 3779 if (mdev->sync_conf.al_extents < 7)
3755 mdev->sync_conf.al_extents = 127; 3780 mdev->sync_conf.al_extents = 127;
3756 3781
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 03b29f78a37d..515bcd948a43 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -272,9 +272,28 @@ static int _try_outdate_peer_async(void *data)
272{ 272{
273 struct drbd_conf *mdev = (struct drbd_conf *)data; 273 struct drbd_conf *mdev = (struct drbd_conf *)data;
274 enum drbd_disk_state nps; 274 enum drbd_disk_state nps;
275 union drbd_state ns;
275 276
276 nps = drbd_try_outdate_peer(mdev); 277 nps = drbd_try_outdate_peer(mdev);
277 drbd_request_state(mdev, NS(pdsk, nps)); 278
279 /* Not using
280 drbd_request_state(mdev, NS(pdsk, nps));
 281 here, because we might have been able to re-establish the connection
 282 in the meantime. This can only partially be solved by the state
 283 engine's is_valid_state() and is_valid_state_transition()
284 functions.
285
286 nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
287 pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
288 therefore we have to have the pre state change check here.
289 */
290 spin_lock_irq(&mdev->req_lock);
291 ns = mdev->state;
292 if (ns.conn < C_WF_REPORT_PARAMS) {
293 ns.pdsk = nps;
294 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
295 }
296 spin_unlock_irq(&mdev->req_lock);
278 297
279 return 0; 298 return 0;
280} 299}
@@ -577,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
577 * Returns 0 on success, negative return values indicate errors. 596 * Returns 0 on success, negative return values indicate errors.
578 * You should call drbd_md_sync() after calling this function. 597 * You should call drbd_md_sync() after calling this function.
579 */ 598 */
580enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 599enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
581{ 600{
582 sector_t prev_first_sect, prev_size; /* previous meta location */ 601 sector_t prev_first_sect, prev_size; /* previous meta location */
583 sector_t la_size; 602 sector_t la_size;
@@ -773,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
773 return 0; 792 return 0;
774} 793}
775 794
776void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local) 795static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
777{ 796{
778 struct request_queue * const q = mdev->rq_queue; 797 struct request_queue * const q = mdev->rq_queue;
779 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 798 int max_hw_sectors = max_bio_size >> 9;
780 int max_segments = mdev->ldev->dc.max_bio_bvecs; 799 int max_segments = 0;
781 int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 800
801 if (get_ldev_if_state(mdev, D_ATTACHING)) {
802 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
803
804 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
805 max_segments = mdev->ldev->dc.max_bio_bvecs;
806 put_ldev(mdev);
807 }
782 808
783 blk_queue_logical_block_size(q, 512); 809 blk_queue_logical_block_size(q, 512);
784 blk_queue_max_hw_sectors(q, max_hw_sectors); 810 blk_queue_max_hw_sectors(q, max_hw_sectors);
785 /* This is the workaround for "bio would need to, but cannot, be split" */ 811 /* This is the workaround for "bio would need to, but cannot, be split" */
786 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); 812 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
787 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); 813 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
788 blk_queue_stack_limits(q, b);
789 814
790 dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9); 815 if (get_ldev_if_state(mdev, D_ATTACHING)) {
816 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
817
818 blk_queue_stack_limits(q, b);
791 819
792 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 820 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
793 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 821 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
794 q->backing_dev_info.ra_pages, 822 q->backing_dev_info.ra_pages,
795 b->backing_dev_info.ra_pages); 823 b->backing_dev_info.ra_pages);
796 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 824 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
825 }
826 put_ldev(mdev);
797 } 827 }
798} 828}
799 829
830void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
831{
832 int now, new, local, peer;
833
834 now = queue_max_hw_sectors(mdev->rq_queue) << 9;
 835 local = mdev->local_max_bio_size; /* Possibly the last known value, from volatile memory */
 836 peer = mdev->peer_max_bio_size; /* Possibly the last known value, from meta data */
837
838 if (get_ldev_if_state(mdev, D_ATTACHING)) {
839 local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
840 mdev->local_max_bio_size = local;
841 put_ldev(mdev);
842 }
843
 844 /* We may ignore peer limits if the peer is modern enough.
 845 From 8.3.8 onwards the peer can use multiple
 846 BIOs for a single peer_request. */
847 if (mdev->state.conn >= C_CONNECTED) {
848 if (mdev->agreed_pro_version < 94)
849 peer = mdev->peer_max_bio_size;
850 else if (mdev->agreed_pro_version == 94)
851 peer = DRBD_MAX_SIZE_H80_PACKET;
852 else /* drbd 8.3.8 onwards */
853 peer = DRBD_MAX_BIO_SIZE;
854 }
855
856 new = min_t(int, local, peer);
857
858 if (mdev->state.role == R_PRIMARY && new < now)
859 dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
860
861 if (new != now)
862 dev_info(DEV, "max BIO size = %u\n", new);
863
864 drbd_setup_queue_param(mdev, new);
865}
866
800/* serialize deconfig (worker exiting, doing cleanup) 867/* serialize deconfig (worker exiting, doing cleanup)
801 * and reconfig (drbdsetup disk, drbdsetup net) 868 * and reconfig (drbdsetup disk, drbdsetup net)
802 * 869 *
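The new drbd_reconsider_max_bio_size() boils down to taking the smaller of the local backing-device limit and whatever the peer can handle, where the peer limit depends on the agreed protocol version. A compact user-space sketch of that decision; the constants come from this patch, the sample inputs are hypothetical:

#include <stdio.h>

#define DRBD_MAX_BIO_SIZE        (1U << (9 + 8))   /* 128 KiB, protocol >= 95 */
#define DRBD_MAX_SIZE_H80_PACKET (1 << 15)         /* 32 KiB, protocol == 94 */

static int negotiated_max_bio_size(int local, int last_known_peer,
				   int connected, int agreed_pro_version)
{
	int peer = last_known_peer;                /* fall back to the stored value */

	if (connected) {
		if (agreed_pro_version == 94)
			peer = DRBD_MAX_SIZE_H80_PACKET;
		else if (agreed_pro_version >= 95)
			peer = DRBD_MAX_BIO_SIZE;  /* peer can split into multiple BIOs */
		/* < 94: keep the value the peer announced */
	}
	return local < peer ? local : peer;
}

int main(void)
{
	/* hypothetical: 64 KiB local disk limit, modern peer, connected */
	printf("max BIO size = %d\n", negotiated_max_bio_size(64 << 10, 4096, 1, 96));
	return 0;
}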
@@ -865,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
865 struct block_device *bdev; 932 struct block_device *bdev;
866 struct lru_cache *resync_lru = NULL; 933 struct lru_cache *resync_lru = NULL;
867 union drbd_state ns, os; 934 union drbd_state ns, os;
868 unsigned int max_bio_size;
869 enum drbd_state_rv rv; 935 enum drbd_state_rv rv;
870 int cp_discovered = 0; 936 int cp_discovered = 0;
871 int logical_block_size; 937 int logical_block_size;
@@ -1117,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1117 mdev->read_cnt = 0; 1183 mdev->read_cnt = 0;
1118 mdev->writ_cnt = 0; 1184 mdev->writ_cnt = 0;
1119 1185
1120 max_bio_size = DRBD_MAX_BIO_SIZE; 1186 drbd_reconsider_max_bio_size(mdev);
1121 if (mdev->state.conn == C_CONNECTED) {
1122 /* We are Primary, Connected, and now attach a new local
1123 * backing store. We must not increase the user visible maximum
1124 * bio size on this device to something the peer may not be
1125 * able to handle. */
1126 if (mdev->agreed_pro_version < 94)
1127 max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
1128 else if (mdev->agreed_pro_version == 94)
1129 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
1130 /* else: drbd 8.3.9 and later, stay with default */
1131 }
1132
1133 drbd_setup_queue_param(mdev, max_bio_size);
1134 1187
1135 /* If I am currently not R_PRIMARY, 1188 /* If I am currently not R_PRIMARY,
1136 * but meta data primary indicator is set, 1189 * but meta data primary indicator is set,
@@ -1152,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1152 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) 1205 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1153 set_bit(USE_DEGR_WFC_T, &mdev->flags); 1206 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1154 1207
1155 dd = drbd_determin_dev_size(mdev, 0); 1208 dd = drbd_determine_dev_size(mdev, 0);
1156 if (dd == dev_size_error) { 1209 if (dd == dev_size_error) {
1157 retcode = ERR_NOMEM_BITMAP; 1210 retcode = ERR_NOMEM_BITMAP;
1158 goto force_diskless_dec; 1211 goto force_diskless_dec;
@@ -1281,11 +1334,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1281static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1334static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1282 struct drbd_nl_cfg_reply *reply) 1335 struct drbd_nl_cfg_reply *reply)
1283{ 1336{
1337 enum drbd_ret_code retcode;
1338 int ret;
1284 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ 1339 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1285 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1340 retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
1286 if (mdev->state.disk == D_DISKLESS) 1341 /* D_FAILED will transition to DISKLESS. */
1287 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); 1342 ret = wait_event_interruptible(mdev->misc_wait,
1343 mdev->state.disk != D_FAILED);
1288 drbd_resume_io(mdev); 1344 drbd_resume_io(mdev);
1345 if ((int)retcode == (int)SS_IS_DISKLESS)
1346 retcode = SS_NOTHING_TO_DO;
1347 if (ret)
1348 retcode = ERR_INTR;
1349 reply->ret_code = retcode;
1289 return 0; 1350 return 0;
1290} 1351}
1291 1352
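drbd_nl_detach() now requests the intermediate D_FAILED state and then sleeps until the disk has left that state, rather than asking for D_DISKLESS directly; the shape is "request a transition, then wait for the state machine to move on". A hedged pthread sketch of that shape, illustrative only and not the kernel wait_event primitives:

#include <pthread.h>
#include <stdio.h>

enum disk_state { D_UP_TO_DATE, D_FAILED, D_DISKLESS };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static enum disk_state disk = D_UP_TO_DATE;

/* Stand-in for the state-machine worker: D_FAILED cleans up and then
 * transitions to D_DISKLESS, waking anyone waiting on the state. */
static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (disk == D_FAILED) {
		disk = D_DISKLESS;
		pthread_cond_broadcast(&cond);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_mutex_lock(&lock);
	disk = D_FAILED;                  /* like drbd_request_state(NS(disk, D_FAILED)) */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);        /* like waiting for state.disk != D_FAILED */
	while (disk == D_FAILED)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("disk left D_FAILED, now %s\n", disk == D_DISKLESS ? "D_DISKLESS" : "?");
	return 0;
}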
@@ -1658,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1658 1719
1659 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; 1720 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1660 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 1721 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
1661 dd = drbd_determin_dev_size(mdev, ddsf); 1722 dd = drbd_determine_dev_size(mdev, ddsf);
1662 drbd_md_sync(mdev); 1723 drbd_md_sync(mdev);
1663 put_ldev(mdev); 1724 put_ldev(mdev);
1664 if (dd == dev_size_error) { 1725 if (dd == dev_size_error) {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index fd26666c0b08..25d32c5aa50a 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -333,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
333 if (!page) 333 if (!page)
334 goto fail; 334 goto fail;
335 335
336 INIT_HLIST_NODE(&e->colision); 336 INIT_HLIST_NODE(&e->collision);
337 e->epoch = NULL; 337 e->epoch = NULL;
338 e->mdev = mdev; 338 e->mdev = mdev;
339 e->pages = page; 339 e->pages = page;
@@ -356,7 +356,7 @@ void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int i
356 kfree(e->digest); 356 kfree(e->digest);
357 drbd_pp_free(mdev, e->pages, is_net); 357 drbd_pp_free(mdev, e->pages, is_net);
358 D_ASSERT(atomic_read(&e->pending_bios) == 0); 358 D_ASSERT(atomic_read(&e->pending_bios) == 0);
359 D_ASSERT(hlist_unhashed(&e->colision)); 359 D_ASSERT(hlist_unhashed(&e->collision));
360 mempool_free(e, drbd_ee_mempool); 360 mempool_free(e, drbd_ee_mempool);
361} 361}
362 362
@@ -787,7 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev)
787 } 787 }
788 788
789 if (sock && msock) { 789 if (sock && msock) {
790 schedule_timeout_interruptible(HZ / 10); 790 schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
791 ok = drbd_socket_okay(mdev, &sock); 791 ok = drbd_socket_okay(mdev, &sock);
792 ok = drbd_socket_okay(mdev, &msock) && ok; 792 ok = drbd_socket_okay(mdev, &msock) && ok;
793 if (ok) 793 if (ok)
@@ -899,11 +899,6 @@ retry:
899 899
900 drbd_thread_start(&mdev->asender); 900 drbd_thread_start(&mdev->asender);
901 901
902 if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
903 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
904 put_ldev(mdev);
905 }
906
907 if (drbd_send_protocol(mdev) == -1) 902 if (drbd_send_protocol(mdev) == -1)
908 return -1; 903 return -1;
909 drbd_send_sync_param(mdev, &mdev->sync_conf); 904 drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -1418,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
1418 sector_t sector = e->sector; 1413 sector_t sector = e->sector;
1419 int ok; 1414 int ok;
1420 1415
1421 D_ASSERT(hlist_unhashed(&e->colision)); 1416 D_ASSERT(hlist_unhashed(&e->collision));
1422 1417
1423 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1418 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1424 drbd_set_in_sync(mdev, sector, e->size); 1419 drbd_set_in_sync(mdev, sector, e->size);
@@ -1487,7 +1482,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
1487 return false; 1482 return false;
1488 } 1483 }
1489 1484
1490 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid 1485 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
1491 * special casing it there for the various failure cases. 1486 * special casing it there for the various failure cases.
1492 * still no race with drbd_fail_pending_reads */ 1487 * still no race with drbd_fail_pending_reads */
1493 ok = recv_dless_read(mdev, req, sector, data_size); 1488 ok = recv_dless_read(mdev, req, sector, data_size);
@@ -1558,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1558 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 1553 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1559 if (mdev->net_conf->two_primaries) { 1554 if (mdev->net_conf->two_primaries) {
1560 spin_lock_irq(&mdev->req_lock); 1555 spin_lock_irq(&mdev->req_lock);
1561 D_ASSERT(!hlist_unhashed(&e->colision)); 1556 D_ASSERT(!hlist_unhashed(&e->collision));
1562 hlist_del_init(&e->colision); 1557 hlist_del_init(&e->collision);
1563 spin_unlock_irq(&mdev->req_lock); 1558 spin_unlock_irq(&mdev->req_lock);
1564 } else { 1559 } else {
1565 D_ASSERT(hlist_unhashed(&e->colision)); 1560 D_ASSERT(hlist_unhashed(&e->collision));
1566 } 1561 }
1567 1562
1568 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 1563 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
@@ -1579,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
1579 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); 1574 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1580 1575
1581 spin_lock_irq(&mdev->req_lock); 1576 spin_lock_irq(&mdev->req_lock);
1582 D_ASSERT(!hlist_unhashed(&e->colision)); 1577 D_ASSERT(!hlist_unhashed(&e->collision));
1583 hlist_del_init(&e->colision); 1578 hlist_del_init(&e->collision);
1584 spin_unlock_irq(&mdev->req_lock); 1579 spin_unlock_irq(&mdev->req_lock);
1585 1580
1586 dec_unacked(mdev); 1581 dec_unacked(mdev);
@@ -1755,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1755 1750
1756 spin_lock_irq(&mdev->req_lock); 1751 spin_lock_irq(&mdev->req_lock);
1757 1752
1758 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); 1753 hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
1759 1754
1760#define OVERLAPS overlaps(i->sector, i->size, sector, size) 1755#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1761 slot = tl_hash_slot(mdev, sector); 1756 slot = tl_hash_slot(mdev, sector);
@@ -1765,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1765 int have_conflict = 0; 1760 int have_conflict = 0;
1766 prepare_to_wait(&mdev->misc_wait, &wait, 1761 prepare_to_wait(&mdev->misc_wait, &wait,
1767 TASK_INTERRUPTIBLE); 1762 TASK_INTERRUPTIBLE);
1768 hlist_for_each_entry(i, n, slot, colision) { 1763 hlist_for_each_entry(i, n, slot, collision) {
1769 if (OVERLAPS) { 1764 if (OVERLAPS) {
1770 /* only ALERT on first iteration, 1765 /* only ALERT on first iteration,
1771 * we may be woken up early... */ 1766 * we may be woken up early... */
@@ -1804,7 +1799,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1804 } 1799 }
1805 1800
1806 if (signal_pending(current)) { 1801 if (signal_pending(current)) {
1807 hlist_del_init(&e->colision); 1802 hlist_del_init(&e->collision);
1808 1803
1809 spin_unlock_irq(&mdev->req_lock); 1804 spin_unlock_irq(&mdev->req_lock);
1810 1805
@@ -1862,7 +1857,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1862 dev_err(DEV, "submit failed, triggering re-connect\n"); 1857 dev_err(DEV, "submit failed, triggering re-connect\n");
1863 spin_lock_irq(&mdev->req_lock); 1858 spin_lock_irq(&mdev->req_lock);
1864 list_del(&e->w.list); 1859 list_del(&e->w.list);
1865 hlist_del_init(&e->colision); 1860 hlist_del_init(&e->collision);
1866 spin_unlock_irq(&mdev->req_lock); 1861 spin_unlock_irq(&mdev->req_lock);
1867 if (e->flags & EE_CALL_AL_COMPLETE_IO) 1862 if (e->flags & EE_CALL_AL_COMPLETE_IO)
1868 drbd_al_complete_io(mdev, e->sector); 1863 drbd_al_complete_io(mdev, e->sector);
@@ -2916,12 +2911,6 @@ disconnect:
2916 return false; 2911 return false;
2917} 2912}
2918 2913
2919static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
2920{
2921 /* sorry, we currently have no working implementation
2922 * of distributed TCQ */
2923}
2924
2925/* warn if the arguments differ by more than 12.5% */ 2914/* warn if the arguments differ by more than 12.5% */
2926static void warn_if_differ_considerably(struct drbd_conf *mdev, 2915static void warn_if_differ_considerably(struct drbd_conf *mdev,
2927 const char *s, sector_t a, sector_t b) 2916 const char *s, sector_t a, sector_t b)
@@ -2939,7 +2928,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2939{ 2928{
2940 struct p_sizes *p = &mdev->data.rbuf.sizes; 2929 struct p_sizes *p = &mdev->data.rbuf.sizes;
2941 enum determine_dev_size dd = unchanged; 2930 enum determine_dev_size dd = unchanged;
2942 unsigned int max_bio_size;
2943 sector_t p_size, p_usize, my_usize; 2931 sector_t p_size, p_usize, my_usize;
2944 int ldsc = 0; /* local disk size changed */ 2932 int ldsc = 0; /* local disk size changed */
2945 enum dds_flags ddsf; 2933 enum dds_flags ddsf;
@@ -2994,7 +2982,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2994 2982
2995 ddsf = be16_to_cpu(p->dds_flags); 2983 ddsf = be16_to_cpu(p->dds_flags);
2996 if (get_ldev(mdev)) { 2984 if (get_ldev(mdev)) {
2997 dd = drbd_determin_dev_size(mdev, ddsf); 2985 dd = drbd_determine_dev_size(mdev, ddsf);
2998 put_ldev(mdev); 2986 put_ldev(mdev);
2999 if (dd == dev_size_error) 2987 if (dd == dev_size_error)
3000 return false; 2988 return false;
@@ -3004,23 +2992,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3004 drbd_set_my_capacity(mdev, p_size); 2992 drbd_set_my_capacity(mdev, p_size);
3005 } 2993 }
3006 2994
2995 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
2996 drbd_reconsider_max_bio_size(mdev);
2997
3007 if (get_ldev(mdev)) { 2998 if (get_ldev(mdev)) {
3008 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 2999 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3009 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 3000 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3010 ldsc = 1; 3001 ldsc = 1;
3011 } 3002 }
3012 3003
3013 if (mdev->agreed_pro_version < 94)
3014 max_bio_size = be32_to_cpu(p->max_bio_size);
3015 else if (mdev->agreed_pro_version == 94)
3016 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
3017 else /* drbd 8.3.8 onwards */
3018 max_bio_size = DRBD_MAX_BIO_SIZE;
3019
3020 if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
3021 drbd_setup_queue_param(mdev, max_bio_size);
3022
3023 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3024 put_ldev(mdev); 3004 put_ldev(mdev);
3025 } 3005 }
3026 3006
@@ -4275,7 +4255,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4275 struct hlist_node *n; 4255 struct hlist_node *n;
4276 struct drbd_request *req; 4256 struct drbd_request *req;
4277 4257
4278 hlist_for_each_entry(req, n, slot, colision) { 4258 hlist_for_each_entry(req, n, slot, collision) {
4279 if ((unsigned long)req == (unsigned long)id) { 4259 if ((unsigned long)req == (unsigned long)id) {
4280 if (req->sector != sector) { 4260 if (req->sector != sector) {
4281 dev_err(DEV, "_ack_id_to_req: found req %p but it has " 4261 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
@@ -4554,6 +4534,7 @@ int drbd_asender(struct drbd_thread *thi)
4554 int received = 0; 4534 int received = 0;
4555 int expect = sizeof(struct p_header80); 4535 int expect = sizeof(struct p_header80);
4556 int empty; 4536 int empty;
4537 int ping_timeout_active = 0;
4557 4538
4558 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); 4539 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4559 4540
@@ -4566,6 +4547,7 @@ int drbd_asender(struct drbd_thread *thi)
4566 ERR_IF(!drbd_send_ping(mdev)) goto reconnect; 4547 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4567 mdev->meta.socket->sk->sk_rcvtimeo = 4548 mdev->meta.socket->sk->sk_rcvtimeo =
4568 mdev->net_conf->ping_timeo*HZ/10; 4549 mdev->net_conf->ping_timeo*HZ/10;
4550 ping_timeout_active = 1;
4569 } 4551 }
4570 4552
4571 /* conditionally cork; 4553 /* conditionally cork;
@@ -4620,8 +4602,7 @@ int drbd_asender(struct drbd_thread *thi)
4620 dev_err(DEV, "meta connection shut down by peer.\n"); 4602 dev_err(DEV, "meta connection shut down by peer.\n");
4621 goto reconnect; 4603 goto reconnect;
4622 } else if (rv == -EAGAIN) { 4604 } else if (rv == -EAGAIN) {
4623 if (mdev->meta.socket->sk->sk_rcvtimeo == 4605 if (ping_timeout_active) {
4624 mdev->net_conf->ping_timeo*HZ/10) {
4625 dev_err(DEV, "PingAck did not arrive in time.\n"); 4606 dev_err(DEV, "PingAck did not arrive in time.\n");
4626 goto reconnect; 4607 goto reconnect;
4627 } 4608 }
@@ -4660,6 +4641,11 @@ int drbd_asender(struct drbd_thread *thi)
4660 if (!cmd->process(mdev, h)) 4641 if (!cmd->process(mdev, h))
4661 goto reconnect; 4642 goto reconnect;
4662 4643
4644 /* the idle_timeout (ping-int)
4645 * has been restored in got_PingAck() */
4646 if (cmd == get_asender_cmd(P_PING_ACK))
4647 ping_timeout_active = 0;
4648
4663 buf = h; 4649 buf = h;
4664 received = 0; 4650 received = 0;
4665 expect = sizeof(struct p_header80); 4651 expect = sizeof(struct p_header80);
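Before this change the asender decided "the PingAck is overdue" by comparing sk_rcvtimeo against ping-timeout, which misfires whenever ping-timeout happens to equal ping-int; the explicit ping_timeout_active flag removes that ambiguity. A small illustrative sketch of the flag-based decision, not the DRBD socket code itself:

#include <stdio.h>

/* A receive timeout only means "PingAck overdue" while a ping is actually
 * outstanding, tracked by an explicit flag instead of inferring it from the
 * currently programmed receive timeout. */
static const char *on_rx_timeout(int ping_timeout_active)
{
	return ping_timeout_active ? "reconnect: PingAck did not arrive in time"
				   : "keep going: just an idle timeout";
}

int main(void)
{
	int ping_timeout_active = 0;

	ping_timeout_active = 1;          /* ping sent, short ping-timeout armed */
	printf("%s\n", on_rx_timeout(ping_timeout_active));

	ping_timeout_active = 0;          /* PingAck processed, ping-int restored */
	printf("%s\n", on_rx_timeout(ping_timeout_active));
	return 0;
}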
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 5c0c8be1bb0a..3424d675b769 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -163,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
163 * they must have been failed on the spot */ 163 * they must have been failed on the spot */
164#define OVERLAPS overlaps(sector, size, i->sector, i->size) 164#define OVERLAPS overlaps(sector, size, i->sector, i->size)
165 slot = tl_hash_slot(mdev, sector); 165 slot = tl_hash_slot(mdev, sector);
166 hlist_for_each_entry(i, n, slot, colision) { 166 hlist_for_each_entry(i, n, slot, collision) {
167 if (OVERLAPS) { 167 if (OVERLAPS) {
168 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " 168 dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
169 "other: %p %llus +%u\n", 169 "other: %p %llus +%u\n",
@@ -187,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
187#undef OVERLAPS 187#undef OVERLAPS
188#define OVERLAPS overlaps(sector, size, e->sector, e->size) 188#define OVERLAPS overlaps(sector, size, e->sector, e->size)
189 slot = ee_hash_slot(mdev, req->sector); 189 slot = ee_hash_slot(mdev, req->sector);
190 hlist_for_each_entry(e, n, slot, colision) { 190 hlist_for_each_entry(e, n, slot, collision) {
191 if (OVERLAPS) { 191 if (OVERLAPS) {
192 wake_up(&mdev->misc_wait); 192 wake_up(&mdev->misc_wait);
193 break; 193 break;
@@ -260,8 +260,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
260 260
261 /* remove the request from the conflict detection 261 /* remove the request from the conflict detection
262 * respective block_id verification hash */ 262 * respective block_id verification hash */
263 if (!hlist_unhashed(&req->colision)) 263 if (!hlist_unhashed(&req->collision))
264 hlist_del(&req->colision); 264 hlist_del(&req->collision);
265 else 265 else
266 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); 266 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
267 267
@@ -329,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req)
329 struct hlist_node *n; 329 struct hlist_node *n;
330 struct hlist_head *slot; 330 struct hlist_head *slot;
331 331
332 D_ASSERT(hlist_unhashed(&req->colision)); 332 D_ASSERT(hlist_unhashed(&req->collision));
333 333
334 if (!get_net_conf(mdev)) 334 if (!get_net_conf(mdev))
335 return 0; 335 return 0;
@@ -341,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req)
341 341
342#define OVERLAPS overlaps(i->sector, i->size, sector, size) 342#define OVERLAPS overlaps(i->sector, i->size, sector, size)
343 slot = tl_hash_slot(mdev, sector); 343 slot = tl_hash_slot(mdev, sector);
344 hlist_for_each_entry(i, n, slot, colision) { 344 hlist_for_each_entry(i, n, slot, collision) {
345 if (OVERLAPS) { 345 if (OVERLAPS) {
346 dev_alert(DEV, "%s[%u] Concurrent local write detected! " 346 dev_alert(DEV, "%s[%u] Concurrent local write detected! "
347 "[DISCARD L] new: %llus +%u; " 347 "[DISCARD L] new: %llus +%u; "
@@ -359,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req)
359#undef OVERLAPS 359#undef OVERLAPS
360#define OVERLAPS overlaps(e->sector, e->size, sector, size) 360#define OVERLAPS overlaps(e->sector, e->size, sector, size)
361 slot = ee_hash_slot(mdev, sector); 361 slot = ee_hash_slot(mdev, sector);
362 hlist_for_each_entry(e, n, slot, colision) { 362 hlist_for_each_entry(e, n, slot, collision) {
363 if (OVERLAPS) { 363 if (OVERLAPS) {
364 dev_alert(DEV, "%s[%u] Concurrent remote write detected!" 364 dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
365 " [DISCARD L] new: %llus +%u; " 365 " [DISCARD L] new: %llus +%u; "
@@ -491,7 +491,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
491 491
492 /* so we can verify the handle in the answer packet 492 /* so we can verify the handle in the answer packet
493 * corresponding hlist_del is in _req_may_be_done() */ 493 * corresponding hlist_del is in _req_may_be_done() */
494 hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); 494 hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
495 495
496 set_bit(UNPLUG_REMOTE, &mdev->flags); 496 set_bit(UNPLUG_REMOTE, &mdev->flags);
497 497
@@ -507,7 +507,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
507 /* assert something? */ 507 /* assert something? */
508 /* from drbd_make_request_common only */ 508 /* from drbd_make_request_common only */
509 509
510 hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); 510 hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
511 /* corresponding hlist_del is in _req_may_be_done() */ 511 /* corresponding hlist_del is in _req_may_be_done() */
512 512
513 /* NOTE 513 /* NOTE
@@ -1033,7 +1033,7 @@ fail_conflicting:
1033 err = 0; 1033 err = 0;
1034 1034
1035fail_free_complete: 1035fail_free_complete:
1036 if (rw == WRITE && local) 1036 if (req->rq_state & RQ_IN_ACT_LOG)
1037 drbd_al_complete_io(mdev, sector); 1037 drbd_al_complete_io(mdev, sector);
1038fail_and_free_req: 1038fail_and_free_req:
1039 if (local) { 1039 if (local) {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 32e2c3e6a813..68a234a5fdc5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -256,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
256 struct hlist_node *n; 256 struct hlist_node *n;
257 struct drbd_request *req; 257 struct drbd_request *req;
258 258
259 hlist_for_each_entry(req, n, slot, colision) { 259 hlist_for_each_entry(req, n, slot, collision) {
260 if ((unsigned long)req == (unsigned long)id) { 260 if ((unsigned long)req == (unsigned long)id) {
261 D_ASSERT(req->sector == sector); 261 D_ASSERT(req->sector == sector);
262 return req; 262 return req;
@@ -291,7 +291,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
291 req->epoch = 0; 291 req->epoch = 0;
292 req->sector = bio_src->bi_sector; 292 req->sector = bio_src->bi_sector;
293 req->size = bio_src->bi_size; 293 req->size = bio_src->bi_size;
294 INIT_HLIST_NODE(&req->colision); 294 INIT_HLIST_NODE(&req->collision);
295 INIT_LIST_HEAD(&req->tl_requests); 295 INIT_LIST_HEAD(&req->tl_requests);
296 INIT_LIST_HEAD(&req->w.list); 296 INIT_LIST_HEAD(&req->w.list);
297 } 297 }
@@ -323,6 +323,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
323extern void complete_master_bio(struct drbd_conf *mdev, 323extern void complete_master_bio(struct drbd_conf *mdev,
324 struct bio_and_error *m); 324 struct bio_and_error *m);
325extern void request_timer_fn(unsigned long data); 325extern void request_timer_fn(unsigned long data);
326extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
326 327
327/* use this if you don't want to deal with calling complete_master_bio() 328/* use this if you don't want to deal with calling complete_master_bio()
328 * outside the spinlock, e.g. when walking some list on cleanup. */ 329 * outside the spinlock, e.g. when walking some list on cleanup. */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f7e6c92f8d03..4d76b06b6b20 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -126,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
126 list_del(&e->w.list); /* has been on active_ee or sync_ee */ 126 list_del(&e->w.list); /* has been on active_ee or sync_ee */
127 list_add_tail(&e->w.list, &mdev->done_ee); 127 list_add_tail(&e->w.list, &mdev->done_ee);
128 128
129 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, 129 /* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
130 * neither did we wake possibly waiting conflicting requests. 130 * neither did we wake possibly waiting conflicting requests.
131 * done from "drbd_process_done_ee" within the appropriate w.cb 131 * done from "drbd_process_done_ee" within the appropriate w.cb
132 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ 132 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
@@ -297,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
297 crypto_hash_final(&desc, digest); 297 crypto_hash_final(&desc, digest);
298} 298}
299 299
300static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 300/* TODO merge common code with w_e_end_ov_req */
301int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
301{ 302{
302 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 303 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
303 int digest_size; 304 int digest_size;
304 void *digest; 305 void *digest;
305 int ok; 306 int ok = 1;
306 307
307 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); 308 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
308 309
309 if (unlikely(cancel)) { 310 if (unlikely(cancel))
310 drbd_free_ee(mdev, e); 311 goto out;
311 return 1;
312 }
313 312
314 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 313 if (likely((e->flags & EE_WAS_ERROR) != 0))
315 digest_size = crypto_hash_digestsize(mdev->csums_tfm); 314 goto out;
316 digest = kmalloc(digest_size, GFP_NOIO);
317 if (digest) {
318 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
319 315
320 inc_rs_pending(mdev); 316 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
321 ok = drbd_send_drequest_csum(mdev, 317 digest = kmalloc(digest_size, GFP_NOIO);
322 e->sector, 318 if (digest) {
323 e->size, 319 sector_t sector = e->sector;
324 digest, 320 unsigned int size = e->size;
325 digest_size, 321 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
326 P_CSUM_RS_REQUEST); 322 /* Free e and pages before send.
327 kfree(digest); 323 * In case we block on congestion, we could otherwise run into
328 } else { 324 * some distributed deadlock, if the other side blocks on
329 dev_err(DEV, "kmalloc() of digest failed.\n"); 325 * congestion as well, because our receiver blocks in
330 ok = 0; 326 * drbd_pp_alloc due to pp_in_use > max_buffers. */
331 } 327 drbd_free_ee(mdev, e);
332 } else 328 e = NULL;
333 ok = 1; 329 inc_rs_pending(mdev);
330 ok = drbd_send_drequest_csum(mdev, sector, size,
331 digest, digest_size,
332 P_CSUM_RS_REQUEST);
333 kfree(digest);
334 } else {
335 dev_err(DEV, "kmalloc() of digest failed.\n");
336 ok = 0;
337 }
334 338
335 drbd_free_ee(mdev, e); 339out:
340 if (e)
341 drbd_free_ee(mdev, e);
336 342
337 if (unlikely(!ok)) 343 if (unlikely(!ok))
338 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); 344 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
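The rewritten w_e_send_csum (and the two ov_* siblings further down) all follow the same rule: copy what is still needed (sector, size, digest) out of the epoch entry, free the entry and its pages, and only then issue the possibly blocking send, so neither node sits on receive buffers while waiting for the other. A hedged sketch of that ordering with hypothetical buffer and send helpers, not the DRBD API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {                           /* stand-in for drbd_epoch_entry */
	unsigned long sector;
	unsigned int size;
	char *pages;
};

/* Hypothetical blocking send: while it waits, the pages must already be
 * back in the pool so the receiver on this side cannot starve. */
static int send_csum(unsigned long sector, unsigned int size,
		     const unsigned char *digest, size_t digest_len)
{
	printf("send csum for %lu+%u (%zu byte digest)\n", sector, size, digest_len);
	return 1;
}

static int send_entry_csum(struct entry *e)
{
	unsigned char digest[64];            /* largest digest size the patch allows */
	unsigned long sector = e->sector;    /* copy out before freeing */
	unsigned int size = e->size;

	memset(digest, 0, sizeof(digest));   /* real code: drbd_csum_ee() */

	free(e->pages);                      /* free pages and entry first ... */
	free(e);

	return send_csum(sector, size, digest, sizeof(digest));  /* ... then block in send */
}

int main(void)
{
	struct entry *e = malloc(sizeof(*e));

	if (!e)
		return 1;
	e->sector = 2048;
	e->size = 4096;
	e->pages = malloc(e->size);
	if (!e->pages) {
		free(e);
		return 1;
	}
	return send_entry_csum(e) ? 0 : 1;
}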
@@ -834,7 +840,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
834 const int ratio = 840 const int ratio =
835 (t == 0) ? 0 : 841 (t == 0) ? 0 :
836 (t < 100000) ? ((s*100)/t) : (s/(t/100)); 842 (t < 100000) ? ((s*100)/t) : (s/(t/100));
837 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " 843 dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
838 "transferred %luK total %luK\n", 844 "transferred %luK total %luK\n",
839 ratio, 845 ratio,
840 Bit2KB(mdev->rs_same_csum), 846 Bit2KB(mdev->rs_same_csum),
@@ -1071,9 +1077,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1071 return ok; 1077 return ok;
1072} 1078}
1073 1079
1080/* TODO merge common code with w_e_send_csum */
1074int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1081int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1075{ 1082{
1076 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 1083 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1084 sector_t sector = e->sector;
1085 unsigned int size = e->size;
1077 int digest_size; 1086 int digest_size;
1078 void *digest; 1087 void *digest;
1079 int ok = 1; 1088 int ok = 1;
@@ -1093,17 +1102,25 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1093 else 1102 else
1094 memset(digest, 0, digest_size); 1103 memset(digest, 0, digest_size);
1095 1104
1105 /* Free e and pages before send.
1106 * In case we block on congestion, we could otherwise run into
1107 * some distributed deadlock, if the other side blocks on
1108 * congestion as well, because our receiver blocks in
1109 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1110 drbd_free_ee(mdev, e);
1111 e = NULL;
1096 inc_rs_pending(mdev); 1112 inc_rs_pending(mdev);
1097 ok = drbd_send_drequest_csum(mdev, e->sector, e->size, 1113 ok = drbd_send_drequest_csum(mdev, sector, size,
1098 digest, digest_size, P_OV_REPLY); 1114 digest, digest_size,
1115 P_OV_REPLY);
1099 if (!ok) 1116 if (!ok)
1100 dec_rs_pending(mdev); 1117 dec_rs_pending(mdev);
1101 kfree(digest); 1118 kfree(digest);
1102 1119
1103out: 1120out:
1104 drbd_free_ee(mdev, e); 1121 if (e)
1122 drbd_free_ee(mdev, e);
1105 dec_unacked(mdev); 1123 dec_unacked(mdev);
1106
1107 return ok; 1124 return ok;
1108} 1125}
1109 1126
@@ -1122,8 +1139,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1122{ 1139{
1123 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); 1140 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1124 struct digest_info *di; 1141 struct digest_info *di;
1125 int digest_size;
1126 void *digest; 1142 void *digest;
1143 sector_t sector = e->sector;
1144 unsigned int size = e->size;
1145 int digest_size;
1127 int ok, eq = 0; 1146 int ok, eq = 0;
1128 1147
1129 if (unlikely(cancel)) { 1148 if (unlikely(cancel)) {
@@ -1153,16 +1172,21 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1153 } 1172 }
1154 } 1173 }
1155 1174
1156 dec_unacked(mdev); 1175 /* Free e and pages before send.
1176 * In case we block on congestion, we could otherwise run into
1177 * some distributed deadlock, if the other side blocks on
1178 * congestion as well, because our receiver blocks in
1179 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1180 drbd_free_ee(mdev, e);
1157 if (!eq) 1181 if (!eq)
1158 drbd_ov_oos_found(mdev, e->sector, e->size); 1182 drbd_ov_oos_found(mdev, sector, size);
1159 else 1183 else
1160 ov_oos_print(mdev); 1184 ov_oos_print(mdev);
1161 1185
1162 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, 1186 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
1163 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); 1187 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1164 1188
1165 drbd_free_ee(mdev, e); 1189 dec_unacked(mdev);
1166 1190
1167 --mdev->ov_left; 1191 --mdev->ov_left;
1168 1192
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a076a14ca72d..c59a672a3de0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1658,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1658 struct kobject *kobj; 1658 struct kobject *kobj;
1659 1659
1660 mutex_lock(&loop_devices_mutex); 1660 mutex_lock(&loop_devices_mutex);
1661 lo = loop_init_one(dev & MINORMASK); 1661 lo = loop_init_one(MINOR(dev) >> part_shift);
1662 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); 1662 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
1663 mutex_unlock(&loop_devices_mutex); 1663 mutex_unlock(&loop_devices_mutex);
1664 1664
@@ -1691,15 +1691,18 @@ static int __init loop_init(void)
1691 if (max_part > 0) 1691 if (max_part > 0)
1692 part_shift = fls(max_part); 1692 part_shift = fls(max_part);
1693 1693
1694 if ((1UL << part_shift) > DISK_MAX_PARTS)
1695 return -EINVAL;
1696
1694 if (max_loop > 1UL << (MINORBITS - part_shift)) 1697 if (max_loop > 1UL << (MINORBITS - part_shift))
1695 return -EINVAL; 1698 return -EINVAL;
1696 1699
1697 if (max_loop) { 1700 if (max_loop) {
1698 nr = max_loop; 1701 nr = max_loop;
1699 range = max_loop; 1702 range = max_loop << part_shift;
1700 } else { 1703 } else {
1701 nr = 8; 1704 nr = 8;
1702 range = 1UL << (MINORBITS - part_shift); 1705 range = 1UL << MINORBITS;
1703 } 1706 }
1704 1707
1705 if (register_blkdev(LOOP_MAJOR, "loop")) 1708 if (register_blkdev(LOOP_MAJOR, "loop"))
@@ -1738,7 +1741,7 @@ static void __exit loop_exit(void)
1738 unsigned long range; 1741 unsigned long range;
1739 struct loop_device *lo, *next; 1742 struct loop_device *lo, *next;
1740 1743
1741 range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); 1744 range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
1742 1745
1743 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1746 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1744 loop_del_one(lo); 1747 loop_del_one(lo);
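With max_part partitions per loop device, the minor space is carved up so that device i owns minors [i << part_shift, (i + 1) << part_shift); loop_probe therefore recovers the device index with MINOR(dev) >> part_shift, and the registered minor range grows to max_loop << part_shift. A small sketch of that arithmetic with hypothetical values:

#include <stdio.h>

/* fls(x): position of the highest set bit, as used for part_shift = fls(max_part) */
static int fls_compat(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int max_part = 15;              /* hypothetical module parameter */
	int part_shift = fls_compat(max_part);   /* 4: 16 minors per loop device */
	unsigned int max_loop = 8;
	unsigned int minor = 37;                 /* some /dev/loopXpY minor */

	printf("device index = %u\n", minor >> part_shift);               /* 2 */
	printf("partition    = %u\n", minor & ((1U << part_shift) - 1));  /* 5 */
	printf("minor range  = %u\n", max_loop << part_shift);            /* 128 */
	return 0;
}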
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile
new file mode 100644
index 000000000000..e491c1b76878
--- /dev/null
+++ b/drivers/block/xen-blkback/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
2
3xen-blkback-y := blkback.o xenbus.o
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
new file mode 100644
index 000000000000..c73910cc28c9
--- /dev/null
+++ b/drivers/block/xen-blkback/blkback.c
@@ -0,0 +1,824 @@
1/******************************************************************************
2 *
3 * Back-end of the driver for virtual block devices. This portion of the
4 * driver exports a 'unified' block-device interface that can be accessed
5 * by any operating system that implements a compatible front end. A
6 * reference front-end implementation can be found in:
7 * drivers/block/xen-blkfront.c
8 *
9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#include <linux/spinlock.h>
38#include <linux/kthread.h>
39#include <linux/list.h>
40#include <linux/delay.h>
41#include <linux/freezer.h>
42
43#include <xen/events.h>
44#include <xen/page.h>
45#include <asm/xen/hypervisor.h>
46#include <asm/xen/hypercall.h>
47#include "common.h"
48
49/*
50 * These are rather arbitrary. They are fairly large because adjacent requests
51 * pulled from a communication ring are quite likely to end up being part of
52 * the same scatter/gather request at the disc.
53 *
54 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
55 *
56 * This will increase the chances of being able to write whole tracks.
57 * 64 should be enough to keep us competitive with Linux.
58 */
59static int xen_blkif_reqs = 64;
60module_param_named(reqs, xen_blkif_reqs, int, 0);
61MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
62
63/* Run-time switchable: /sys/module/blkback/parameters/ */
64static unsigned int log_stats;
65module_param(log_stats, int, 0644);
66
67/*
68 * Each outstanding request that we've passed to the lower device layers has a
69 * 'pending_req' allocated to it. Each buffer_head that completes decrements
70 * the pendcnt towards zero. When it hits zero, the specified domain has a
71 * response queued for it, with the saved 'id' passed back.
72 */
73struct pending_req {
74 struct xen_blkif *blkif;
75 u64 id;
76 int nr_pages;
77 atomic_t pendcnt;
78 unsigned short operation;
79 int status;
80 struct list_head free_list;
81};
82
83#define BLKBACK_INVALID_HANDLE (~0)
84
85struct xen_blkbk {
86 struct pending_req *pending_reqs;
87 /* List of all 'pending_req' available */
88 struct list_head pending_free;
89 /* And its spinlock. */
90 spinlock_t pending_free_lock;
91 wait_queue_head_t pending_free_wq;
92 /* The list of all pages that are available. */
93 struct page **pending_pages;
94 /* And the grant handles that are available. */
95 grant_handle_t *pending_grant_handles;
96};
97
98static struct xen_blkbk *blkbk;
99
100/*
101 * Little helpful macro to figure out the index and virtual address of the
 102 * pending_pages[..]. For each 'pending_req' we have up to
 103 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg ranges from 0 through
 104 * 10 and indexes into pending_pages[..].
105 */
106static inline int vaddr_pagenr(struct pending_req *req, int seg)
107{
108 return (req - blkbk->pending_reqs) *
109 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
110}
111
112#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
113
114static inline unsigned long vaddr(struct pending_req *req, int seg)
115{
116 unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
117 return (unsigned long)pfn_to_kaddr(pfn);
118}
119
120#define pending_handle(_req, _seg) \
121 (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
122
123
124static int do_block_io_op(struct xen_blkif *blkif);
125static int dispatch_rw_block_io(struct xen_blkif *blkif,
126 struct blkif_request *req,
127 struct pending_req *pending_req);
128static void make_response(struct xen_blkif *blkif, u64 id,
129 unsigned short op, int st);
130
131/*
132 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
133 */
134static struct pending_req *alloc_req(void)
135{
136 struct pending_req *req = NULL;
137 unsigned long flags;
138
139 spin_lock_irqsave(&blkbk->pending_free_lock, flags);
140 if (!list_empty(&blkbk->pending_free)) {
141 req = list_entry(blkbk->pending_free.next, struct pending_req,
142 free_list);
143 list_del(&req->free_list);
144 }
145 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
146 return req;
147}
148
149/*
150 * Return the 'pending_req' structure back to the freepool. We also
151 * wake up the thread if it was waiting for a free page.
152 */
153static void free_req(struct pending_req *req)
154{
155 unsigned long flags;
156 int was_empty;
157
158 spin_lock_irqsave(&blkbk->pending_free_lock, flags);
159 was_empty = list_empty(&blkbk->pending_free);
160 list_add(&req->free_list, &blkbk->pending_free);
161 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
162 if (was_empty)
163 wake_up(&blkbk->pending_free_wq);
164}
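/*
 * Rough usage sketch of the free pool above, mirroring what
 * xen_blkif_schedule() and do_block_io_op() do further down: a consumer
 * sleeps on pending_free_wq until a request can be allocated, and every
 * completed request is handed back via free_req(), which only wakes the
 * waiter on the empty -> non-empty transition:
 *
 *	wait_event_interruptible(blkbk->pending_free_wq,
 *				 !list_empty(&blkbk->pending_free) ||
 *				 kthread_should_stop());
 *	pending_req = alloc_req();
 *	if (pending_req) {
 *		... submit the I/O; free_req() runs when the last bio
 *		completes (see __end_block_io_op() below) ...
 *	}
 */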
165
166/*
167 * Routines for managing virtual block devices (vbds).
168 */
169static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
170 int operation)
171{
172 struct xen_vbd *vbd = &blkif->vbd;
173 int rc = -EACCES;
174
175 if ((operation != READ) && vbd->readonly)
176 goto out;
177
178 if (likely(req->nr_sects)) {
179 blkif_sector_t end = req->sector_number + req->nr_sects;
180
181 if (unlikely(end < req->sector_number))
182 goto out;
183 if (unlikely(end > vbd_sz(vbd)))
184 goto out;
185 }
186
187 req->dev = vbd->pdevice;
188 req->bdev = vbd->bdev;
189 rc = 0;
190
191 out:
192 return rc;
193}
194
195static void xen_vbd_resize(struct xen_blkif *blkif)
196{
197 struct xen_vbd *vbd = &blkif->vbd;
198 struct xenbus_transaction xbt;
199 int err;
200 struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
201 unsigned long long new_size = vbd_sz(vbd);
202
203 pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
204 blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
205 pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
206 vbd->size = new_size;
207again:
208 err = xenbus_transaction_start(&xbt);
209 if (err) {
210 pr_warn(DRV_PFX "Error starting transaction");
211 return;
212 }
213 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
214 (unsigned long long)vbd_sz(vbd));
215 if (err) {
216 pr_warn(DRV_PFX "Error writing new size");
217 goto abort;
218 }
219 /*
220 * Write the current state; we will use this to synchronize
221 * the front-end. If the current state is "connected" the
222 * front-end will get the new size information online.
223 */
224 err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
225 if (err) {
226 pr_warn(DRV_PFX "Error writing the state");
227 goto abort;
228 }
229
230 err = xenbus_transaction_end(xbt, 0);
231 if (err == -EAGAIN)
232 goto again;
233 if (err)
234 pr_warn(DRV_PFX "Error ending transaction");
235 return;
236abort:
237 xenbus_transaction_end(xbt, 1);
238}
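/*
 * The transaction pattern above (start, write the keys, end, retry from
 * the top on -EAGAIN) is the same one connect() in xenbus.c uses:
 * xenbus_transaction_end() returns -EAGAIN when the transaction raced
 * with another xenstore writer, so the whole read-modify-write sequence
 * is simply replayed until it either commits or fails outright.
 */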
239
240/*
241 * Notification from the guest OS.
242 */
243static void blkif_notify_work(struct xen_blkif *blkif)
244{
245 blkif->waiting_reqs = 1;
246 wake_up(&blkif->wq);
247}
248
249irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
250{
251 blkif_notify_work(dev_id);
252 return IRQ_HANDLED;
253}
254
255/*
256 * SCHEDULER FUNCTIONS
257 */
258
259static void print_stats(struct xen_blkif *blkif)
260{
261 pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n",
262 current->comm, blkif->st_oo_req,
263 blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
264 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
265 blkif->st_rd_req = 0;
266 blkif->st_wr_req = 0;
267 blkif->st_oo_req = 0;
268}
269
270int xen_blkif_schedule(void *arg)
271{
272 struct xen_blkif *blkif = arg;
273 struct xen_vbd *vbd = &blkif->vbd;
274
275 xen_blkif_get(blkif);
276
277 while (!kthread_should_stop()) {
278 if (try_to_freeze())
279 continue;
280 if (unlikely(vbd->size != vbd_sz(vbd)))
281 xen_vbd_resize(blkif);
282
283 wait_event_interruptible(
284 blkif->wq,
285 blkif->waiting_reqs || kthread_should_stop());
286 wait_event_interruptible(
287 blkbk->pending_free_wq,
288 !list_empty(&blkbk->pending_free) ||
289 kthread_should_stop());
290
291 blkif->waiting_reqs = 0;
292 smp_mb(); /* clear flag *before* checking for work */
293
294 if (do_block_io_op(blkif))
295 blkif->waiting_reqs = 1;
296
297 if (log_stats && time_after(jiffies, blkif->st_print))
298 print_stats(blkif);
299 }
300
301 if (log_stats)
302 print_stats(blkif);
303
304 blkif->xenblkd = NULL;
305 xen_blkif_put(blkif);
306
307 return 0;
308}
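/*
 * A note on the flag handling in the loop above: waiting_reqs is cleared
 * before do_block_io_op() runs (the smp_mb() orders the clear ahead of
 * the ring scan), so a notification that arrives mid-scan simply sets
 * the flag again and the next wait_event_interruptible() falls straight
 * through instead of sleeping past the new work.
 */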
309
310struct seg_buf {
311 unsigned long buf;
312 unsigned int nsec;
313};
314/*
315 * Unmap the grant references, and also remove the M2P overrides
316 * used in the 'pending_req'.
317 */
318static void xen_blkbk_unmap(struct pending_req *req)
319{
320 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321 unsigned int i, invcount = 0;
322 grant_handle_t handle;
323 int ret;
324
325 for (i = 0; i < req->nr_pages; i++) {
326 handle = pending_handle(req, i);
327 if (handle == BLKBACK_INVALID_HANDLE)
328 continue;
329 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
330 GNTMAP_host_map, handle);
331 pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
332 invcount++;
333 }
334
335 ret = HYPERVISOR_grant_table_op(
336 GNTTABOP_unmap_grant_ref, unmap, invcount);
337 BUG_ON(ret);
338 /*
339 * Note, we use invcount, not req->nr_pages, so we can't index
340 * using vaddr(req, i).
341 */
342 for (i = 0; i < invcount; i++) {
343 ret = m2p_remove_override(
344 virt_to_page(unmap[i].host_addr), false);
345 if (ret) {
346 pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
347 (unsigned long)unmap[i].host_addr);
348 continue;
349 }
350 }
351}
352
353static int xen_blkbk_map(struct blkif_request *req,
354 struct pending_req *pending_req,
355 struct seg_buf seg[])
356{
357 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
358 int i;
359 int nseg = req->nr_segments;
360 int ret = 0;
361
362 /*
363 * Fill out preq.nr_sects with the proper number of sectors, and set up
364 * map[..] with the address of the page in our domain and the
365 * corresponding grant reference for each page.
366 */
367 for (i = 0; i < nseg; i++) {
368 uint32_t flags;
369
370 flags = GNTMAP_host_map;
371 if (pending_req->operation != BLKIF_OP_READ)
372 flags |= GNTMAP_readonly;
373 gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
374 req->u.rw.seg[i].gref,
375 pending_req->blkif->domid);
376 }
377
378 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
379 BUG_ON(ret);
380
381 /*
382 * Now swizzle the MFN in our domain with the MFN from the other domain
383 * so that when we access vaddr(pending_req,i) it has the contents of
384 * the page from the other domain.
385 */
386 for (i = 0; i < nseg; i++) {
387 if (unlikely(map[i].status != 0)) {
388 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
389 map[i].handle = BLKBACK_INVALID_HANDLE;
390 ret |= 1;
391 }
392
393 pending_handle(pending_req, i) = map[i].handle;
394
395 if (ret)
396 continue;
397
398 ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
399 blkbk->pending_page(pending_req, i), false);
400 if (ret) {
401 pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
402 (unsigned long)map[i].dev_bus_addr, ret);
403 /* We could switch over to GNTTABOP_copy */
404 continue;
405 }
406
407 seg[i].buf = map[i].dev_bus_addr |
408 (req->u.rw.seg[i].first_sect << 9);
409 }
410 return ret;
411}
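/*
 * Illustrative note on seg[i].buf above: map[i].dev_bus_addr is
 * page-aligned, so OR-ing in (first_sect << 9) encodes the byte offset
 * of the first sector within the mapped page.  E.g. with first_sect = 2
 * the segment starts 2 * 512 = 1024 bytes into the page, an offset that
 * dispatch_rw_block_io() later recovers with "seg[i].buf & ~PAGE_MASK"
 * when building the bios.
 */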
412
413/*
414 * Completion callback on the bios. Called via bio->bi_end_io().
415 */
416
417static void __end_block_io_op(struct pending_req *pending_req, int error)
418{
419 /* An error fails the entire request. */
420 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
421 (error == -EOPNOTSUPP)) {
422 pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
423 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
424 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
425 } else if (error) {
426 pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
427 " error=%d\n", error);
428 pending_req->status = BLKIF_RSP_ERROR;
429 }
430
431 /*
432	 * If all of the bios have completed it is time to unmap
433 * the grant references associated with 'request' and provide
434 * the proper response on the ring.
435 */
436 if (atomic_dec_and_test(&pending_req->pendcnt)) {
437 xen_blkbk_unmap(pending_req);
438 make_response(pending_req->blkif, pending_req->id,
439 pending_req->operation, pending_req->status);
440 xen_blkif_put(pending_req->blkif);
441 free_req(pending_req);
442 }
443}
444
445/*
446 * bio callback.
447 */
448static void end_block_io_op(struct bio *bio, int error)
449{
450 __end_block_io_op(bio->bi_private, error);
451 bio_put(bio);
452}
453
454
455
456/*
457 * Function to copy the 'struct blkif_request' from the ring buffer
458 * (which has the sectors we want, the number of them, grant references, etc.)
459 * and transmute it to the block API to hand it over to the proper block disk.
460 */
461static int do_block_io_op(struct xen_blkif *blkif)
462{
463 union blkif_back_rings *blk_rings = &blkif->blk_rings;
464 struct blkif_request req;
465 struct pending_req *pending_req;
466 RING_IDX rc, rp;
467 int more_to_do = 0;
468
469 rc = blk_rings->common.req_cons;
470 rp = blk_rings->common.sring->req_prod;
471 rmb(); /* Ensure we see queued requests up to 'rp'. */
472
473 while (rc != rp) {
474
475 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
476 break;
477
478 if (kthread_should_stop()) {
479 more_to_do = 1;
480 break;
481 }
482
483 pending_req = alloc_req();
484 if (NULL == pending_req) {
485 blkif->st_oo_req++;
486 more_to_do = 1;
487 break;
488 }
489
490 switch (blkif->blk_protocol) {
491 case BLKIF_PROTOCOL_NATIVE:
492 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
493 break;
494 case BLKIF_PROTOCOL_X86_32:
495 blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
496 break;
497 case BLKIF_PROTOCOL_X86_64:
498 blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
499 break;
500 default:
501 BUG();
502 }
503 blk_rings->common.req_cons = ++rc; /* before make_response() */
504
505 /* Apply all sanity checks to /private copy/ of request. */
506 barrier();
507
508 if (dispatch_rw_block_io(blkif, &req, pending_req))
509 break;
510
511 /* Yield point for this unbounded loop. */
512 cond_resched();
513 }
514
515 return more_to_do;
516}
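/*
 * A note on the consumer loop above: req_cons is advanced *before*
 * dispatch_rw_block_io() can call make_response(), and each request is
 * first copied into the private 'req' (with a barrier()) so that all
 * sanity checks run against a snapshot the frontend can no longer
 * modify.  RING_REQUEST_CONS_OVERFLOW() additionally bounds how far the
 * consumer may run ahead of its own responses.
 */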
517
518/*
519 * Transmute the 'struct blkif_request' into a proper 'struct bio'
520 * and call submit_bio() to pass it to the underlying storage.
521 */
522static int dispatch_rw_block_io(struct xen_blkif *blkif,
523 struct blkif_request *req,
524 struct pending_req *pending_req)
525{
526 struct phys_req preq;
527 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
528 unsigned int nseg;
529 struct bio *bio = NULL;
530 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
531 int i, nbio = 0;
532 int operation;
533 struct blk_plug plug;
534
535 switch (req->operation) {
536 case BLKIF_OP_READ:
537 blkif->st_rd_req++;
538 operation = READ;
539 break;
540 case BLKIF_OP_WRITE:
541 blkif->st_wr_req++;
542 operation = WRITE_ODIRECT;
543 break;
544 case BLKIF_OP_FLUSH_DISKCACHE:
545 blkif->st_f_req++;
546 operation = WRITE_FLUSH;
547 break;
548 case BLKIF_OP_WRITE_BARRIER:
549 default:
550 operation = 0; /* make gcc happy */
551 goto fail_response;
552 break;
553 }
554
555 /* Check that the number of segments is sane. */
556 nseg = req->nr_segments;
557 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
558 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
559 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
560 nseg);
561 /* Haven't submitted any bio's yet. */
562 goto fail_response;
563 }
564
565 preq.dev = req->handle;
566 preq.sector_number = req->u.rw.sector_number;
567 preq.nr_sects = 0;
568
569 pending_req->blkif = blkif;
570 pending_req->id = req->id;
571 pending_req->operation = req->operation;
572 pending_req->status = BLKIF_RSP_OKAY;
573 pending_req->nr_pages = nseg;
574
575 for (i = 0; i < nseg; i++) {
576 seg[i].nsec = req->u.rw.seg[i].last_sect -
577 req->u.rw.seg[i].first_sect + 1;
578 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
579 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
580 goto fail_response;
581 preq.nr_sects += seg[i].nsec;
582
583 }
584
585 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
586 pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
587 operation == READ ? "read" : "write",
588 preq.sector_number,
589 preq.sector_number + preq.nr_sects, preq.dev);
590 goto fail_response;
591 }
592
593 /*
594 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
595 * is set there.
596 */
597 for (i = 0; i < nseg; i++) {
598 if (((int)preq.sector_number|(int)seg[i].nsec) &
599 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
600 pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
601 blkif->domid);
602 goto fail_response;
603 }
604 }
605
606 /*
607	 * If the mapping below fails, we need to undo the M2P override,
608	 * call gnttab_set_unmap_op on all of the grant references and perform
609	 * the hypercall to unmap the grants - all of which is done in
610	 * xen_blkbk_unmap.
611 */
612 if (xen_blkbk_map(req, pending_req, seg))
613 goto fail_flush;
614
615	/* The corresponding xen_blkif_put() is done in __end_block_io_op(). */
616 xen_blkif_get(blkif);
617
618 for (i = 0; i < nseg; i++) {
619 while ((bio == NULL) ||
620 (bio_add_page(bio,
621 blkbk->pending_page(pending_req, i),
622 seg[i].nsec << 9,
623 seg[i].buf & ~PAGE_MASK) == 0)) {
624
625 bio = bio_alloc(GFP_KERNEL, nseg-i);
626 if (unlikely(bio == NULL))
627 goto fail_put_bio;
628
629 biolist[nbio++] = bio;
630 bio->bi_bdev = preq.bdev;
631 bio->bi_private = pending_req;
632 bio->bi_end_io = end_block_io_op;
633 bio->bi_sector = preq.sector_number;
634 }
635
636 preq.sector_number += seg[i].nsec;
637 }
638
639 /* This will be hit if the operation was a flush. */
640 if (!bio) {
641 BUG_ON(operation != WRITE_FLUSH);
642
643 bio = bio_alloc(GFP_KERNEL, 0);
644 if (unlikely(bio == NULL))
645 goto fail_put_bio;
646
647 biolist[nbio++] = bio;
648 bio->bi_bdev = preq.bdev;
649 bio->bi_private = pending_req;
650 bio->bi_end_io = end_block_io_op;
651 }
652
653 /*
654	 * We set it to nbio up front so that the last submit_bio() does not
655	 * have to call atomic_inc().
656 */
657 atomic_set(&pending_req->pendcnt, nbio);
658
659	/* Plug the queue and start sending the I/O to the disk. */
660 blk_start_plug(&plug);
661
662 for (i = 0; i < nbio; i++)
663 submit_bio(operation, biolist[i]);
664
665 /* Let the I/Os go.. */
666 blk_finish_plug(&plug);
667
668 if (operation == READ)
669 blkif->st_rd_sect += preq.nr_sects;
670 else if (operation == WRITE || operation == WRITE_FLUSH)
671 blkif->st_wr_sect += preq.nr_sects;
672
673 return 0;
674
675 fail_flush:
676 xen_blkbk_unmap(pending_req);
677 fail_response:
678 /* Haven't submitted any bio's yet. */
679 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
680 free_req(pending_req);
681 msleep(1); /* back off a bit */
682 return -EIO;
683
684 fail_put_bio:
685 for (i = 0; i < nbio; i++)
686 bio_put(biolist[i]);
687 __end_block_io_op(pending_req, -EINVAL);
688 msleep(1); /* back off a bit */
689 return -EIO;
690}
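/*
 * Rough worked example of the bio construction above, assuming 4 KB
 * pages (PAGE_SIZE >> 9 == 8 sectors per page): a write with nseg = 11
 * full-page segments (first_sect = 0, last_sect = 7) gives
 * seg[i].nsec = 8 and preq.nr_sects = 88.  bio_add_page() is attempted
 * once per segment; whenever it refuses a page (or no bio exists yet) a
 * fresh bio is allocated with room for the remaining nseg - i segments,
 * so contiguous segments are merged into as few bios as the queue
 * limits allow.  pendcnt is then set to the final nbio before all the
 * bios are submitted under a single blk_plug.
 */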
691
692
693
694/*
695 * Put a response on the ring on how the operation fared.
696 */
697static void make_response(struct xen_blkif *blkif, u64 id,
698 unsigned short op, int st)
699{
700 struct blkif_response resp;
701 unsigned long flags;
702 union blkif_back_rings *blk_rings = &blkif->blk_rings;
703 int more_to_do = 0;
704 int notify;
705
706 resp.id = id;
707 resp.operation = op;
708 resp.status = st;
709
710 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
711 /* Place on the response ring for the relevant domain. */
712 switch (blkif->blk_protocol) {
713 case BLKIF_PROTOCOL_NATIVE:
714 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
715 &resp, sizeof(resp));
716 break;
717 case BLKIF_PROTOCOL_X86_32:
718 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
719 &resp, sizeof(resp));
720 break;
721 case BLKIF_PROTOCOL_X86_64:
722 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
723 &resp, sizeof(resp));
724 break;
725 default:
726 BUG();
727 }
728 blk_rings->common.rsp_prod_pvt++;
729 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
730 if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
731 /*
732 * Tail check for pending requests. Allows frontend to avoid
733 * notifications if requests are already in flight (lower
734 * overheads and promotes batching).
735 */
736 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
737
738 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
739 more_to_do = 1;
740 }
741
742 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
743
744 if (more_to_do)
745 blkif_notify_work(blkif);
746 if (notify)
747 notify_remote_via_irq(blkif->irq);
748}
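/*
 * A note on the notification handling above:
 * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() only asks us to kick the event
 * channel when the frontend may actually be waiting for this response,
 * and the tail-check path sets more_to_do so that a request which
 * arrived while blk_ring_lock was held wakes our own thread via
 * blkif_notify_work() rather than being lost.
 */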
749
750static int __init xen_blkif_init(void)
751{
752 int i, mmap_pages;
753 int rc = 0;
754
755 if (!xen_pv_domain())
756 return -ENODEV;
757
758 blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
759 if (!blkbk) {
760 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
761 return -ENOMEM;
762 }
763
764 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
765
766 blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) *
767 xen_blkif_reqs, GFP_KERNEL);
768 blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
769 mmap_pages, GFP_KERNEL);
770 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
771 mmap_pages, GFP_KERNEL);
772
773 if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
774 !blkbk->pending_pages) {
775 rc = -ENOMEM;
776 goto out_of_memory;
777 }
778
779 for (i = 0; i < mmap_pages; i++) {
780 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
781 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
782 if (blkbk->pending_pages[i] == NULL) {
783 rc = -ENOMEM;
784 goto out_of_memory;
785 }
786 }
787 rc = xen_blkif_interface_init();
788 if (rc)
789 goto failed_init;
790
791 memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
792
793 INIT_LIST_HEAD(&blkbk->pending_free);
794 spin_lock_init(&blkbk->pending_free_lock);
795 init_waitqueue_head(&blkbk->pending_free_wq);
796
797 for (i = 0; i < xen_blkif_reqs; i++)
798 list_add_tail(&blkbk->pending_reqs[i].free_list,
799 &blkbk->pending_free);
800
801 rc = xen_blkif_xenbus_init();
802 if (rc)
803 goto failed_init;
804
805 return 0;
806
807 out_of_memory:
808 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
809 failed_init:
810 kfree(blkbk->pending_reqs);
811 kfree(blkbk->pending_grant_handles);
812 for (i = 0; i < mmap_pages; i++) {
813 if (blkbk->pending_pages[i])
814 __free_page(blkbk->pending_pages[i]);
815 }
816 kfree(blkbk->pending_pages);
817 kfree(blkbk);
818 blkbk = NULL;
819 return rc;
820}
821
822module_init(xen_blkif_init);
823
824MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
new file mode 100644
index 000000000000..9e40b283a468
--- /dev/null
+++ b/drivers/block/xen-blkback/common.h
@@ -0,0 +1,233 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License version 2
4 * as published by the Free Software Foundation; or, when distributed
5 * separately from the Linux kernel or incorporated into other
6 * software packages, subject to the following license:
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this source file (the "Software"), to deal in the Software without
10 * restriction, including without limitation the rights to use, copy, modify,
11 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
12 * and to permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * IN THE SOFTWARE.
25 */
26
27#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
28#define __XEN_BLKIF__BACKEND__COMMON_H__
29
30#include <linux/version.h>
31#include <linux/module.h>
32#include <linux/interrupt.h>
33#include <linux/slab.h>
34#include <linux/blkdev.h>
35#include <linux/vmalloc.h>
36#include <linux/wait.h>
37#include <linux/io.h>
38#include <asm/setup.h>
39#include <asm/pgalloc.h>
40#include <asm/hypervisor.h>
41#include <xen/grant_table.h>
42#include <xen/xenbus.h>
43#include <xen/interface/io/ring.h>
44#include <xen/interface/io/blkif.h>
45#include <xen/interface/io/protocols.h>
46
47#define DRV_PFX "xen-blkback:"
48#define DPRINTK(fmt, args...) \
49 pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \
50 __func__, __LINE__, ##args)
51
52
53/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can
56 * avoid using switch(protocol) in a number of places. */
57struct blkif_common_request {
58 char dummy;
59};
60struct blkif_common_response {
61 char dummy;
62};
63
64/* i386 protocol version */
65#pragma pack(push, 4)
66struct blkif_x86_32_request {
67 uint8_t operation; /* BLKIF_OP_??? */
68 uint8_t nr_segments; /* number of segments */
69 blkif_vdev_t handle; /* only for read/write requests */
70 uint64_t id; /* private guest value, echoed in resp */
71 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
72 struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
73};
74struct blkif_x86_32_response {
75 uint64_t id; /* copied from request */
76 uint8_t operation; /* copied from request */
77 int16_t status; /* BLKIF_RSP_??? */
78};
79#pragma pack(pop)
80
81/* x86_64 protocol version */
82struct blkif_x86_64_request {
83 uint8_t operation; /* BLKIF_OP_??? */
84 uint8_t nr_segments; /* number of segments */
85 blkif_vdev_t handle; /* only for read/write requests */
86 uint64_t __attribute__((__aligned__(8))) id;
87 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
88 struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
89};
90struct blkif_x86_64_response {
91 uint64_t __attribute__((__aligned__(8))) id;
92 uint8_t operation; /* copied from request */
93 int16_t status; /* BLKIF_RSP_??? */
94};
95
96DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
97 struct blkif_common_response);
98DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
99 struct blkif_x86_32_response);
100DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
101 struct blkif_x86_64_response);
102
103union blkif_back_rings {
104 struct blkif_back_ring native;
105 struct blkif_common_back_ring common;
106 struct blkif_x86_32_back_ring x86_32;
107 struct blkif_x86_64_back_ring x86_64;
108};
109
110enum blkif_protocol {
111 BLKIF_PROTOCOL_NATIVE = 1,
112 BLKIF_PROTOCOL_X86_32 = 2,
113 BLKIF_PROTOCOL_X86_64 = 3,
114};
115
116struct xen_vbd {
117 /* What the domain refers to this vbd as. */
118 blkif_vdev_t handle;
119 /* Non-zero -> read-only */
120 unsigned char readonly;
121 /* VDISK_xxx */
122 unsigned char type;
123 /* phys device that this vbd maps to. */
124 u32 pdevice;
125 struct block_device *bdev;
126 /* Cached size parameter. */
127 sector_t size;
128 bool flush_support;
129};
130
131struct backend_info;
132
133struct xen_blkif {
134 /* Unique identifier for this interface. */
135 domid_t domid;
136 unsigned int handle;
137 /* Physical parameters of the comms window. */
138 unsigned int irq;
139 /* Comms information. */
140 enum blkif_protocol blk_protocol;
141 union blkif_back_rings blk_rings;
142 struct vm_struct *blk_ring_area;
143 /* The VBD attached to this interface. */
144 struct xen_vbd vbd;
145 /* Back pointer to the backend_info. */
146 struct backend_info *be;
147 /* Private fields. */
148 spinlock_t blk_ring_lock;
149 atomic_t refcnt;
150
151 wait_queue_head_t wq;
152 /* One thread per one blkif. */
153 struct task_struct *xenblkd;
154 unsigned int waiting_reqs;
155
156 /* statistics */
157 unsigned long st_print;
158 int st_rd_req;
159 int st_wr_req;
160 int st_oo_req;
161 int st_f_req;
162 int st_rd_sect;
163 int st_wr_sect;
164
165 wait_queue_head_t waiting_to_free;
166
167 grant_handle_t shmem_handle;
168 grant_ref_t shmem_ref;
169};
170
171
172#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
173 (_v)->bdev->bd_part->nr_sects : \
174 get_capacity((_v)->bdev->bd_disk))
175
176#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
177#define xen_blkif_put(_b) \
178 do { \
179 if (atomic_dec_and_test(&(_b)->refcnt)) \
180 wake_up(&(_b)->waiting_to_free);\
181 } while (0)
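/*
 * How the reference count above is used in this series: the creator
 * holds the initial reference (atomic_set(&blkif->refcnt, 1) in
 * xen_blkif_alloc()), xen_blkif_schedule() holds one for the lifetime
 * of the kthread, and dispatch_rw_block_io() takes one per in-flight
 * request which __end_block_io_op() drops again once all of its bios
 * have completed.
 */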
182
183struct phys_req {
184 unsigned short dev;
185 unsigned short nr_sects;
186 struct block_device *bdev;
187 blkif_sector_t sector_number;
188};
189int xen_blkif_interface_init(void);
190
191int xen_blkif_xenbus_init(void);
192
193irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
194int xen_blkif_schedule(void *arg);
195
196int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
197 struct backend_info *be, int state);
198
199struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
200
201static inline void blkif_get_x86_32_req(struct blkif_request *dst,
202 struct blkif_x86_32_request *src)
203{
204 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
205 dst->operation = src->operation;
206 dst->nr_segments = src->nr_segments;
207 dst->handle = src->handle;
208 dst->id = src->id;
209 dst->u.rw.sector_number = src->sector_number;
210 barrier();
211 if (n > dst->nr_segments)
212 n = dst->nr_segments;
213 for (i = 0; i < n; i++)
214 dst->u.rw.seg[i] = src->seg[i];
215}
216
217static inline void blkif_get_x86_64_req(struct blkif_request *dst,
218 struct blkif_x86_64_request *src)
219{
220 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
221 dst->operation = src->operation;
222 dst->nr_segments = src->nr_segments;
223 dst->handle = src->handle;
224 dst->id = src->id;
225 dst->u.rw.sector_number = src->sector_number;
226 barrier();
227 if (n > dst->nr_segments)
228 n = dst->nr_segments;
229 for (i = 0; i < n; i++)
230 dst->u.rw.seg[i] = src->seg[i];
231}
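/*
 * A note on the two copy helpers above: nr_segments is copied first and,
 * after the barrier(), 'n' is clamped against the *copied* value rather
 * than against src->nr_segments.  The source lives on a ring page shared
 * with the (untrusted) frontend, so this guards against the frontend
 * bumping nr_segments after the segment copy loop has been sized.
 */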
232
233#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
new file mode 100644
index 000000000000..34570823355b
--- /dev/null
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -0,0 +1,768 @@
1/* Xenbus code for blkif backend
2 Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
3 Copyright (C) 2005 XenSource Ltd
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15*/
16
17#include <stdarg.h>
18#include <linux/module.h>
19#include <linux/kthread.h>
20#include <xen/events.h>
21#include <xen/grant_table.h>
22#include "common.h"
23
24struct backend_info {
25 struct xenbus_device *dev;
26 struct xen_blkif *blkif;
27 struct xenbus_watch backend_watch;
28 unsigned major;
29 unsigned minor;
30 char *mode;
31};
32
33static struct kmem_cache *xen_blkif_cachep;
34static void connect(struct backend_info *);
35static int connect_ring(struct backend_info *);
36static void backend_changed(struct xenbus_watch *, const char **,
37 unsigned int);
38
39struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
40{
41 return be->dev;
42}
43
44static int blkback_name(struct xen_blkif *blkif, char *buf)
45{
46 char *devpath, *devname;
47 struct xenbus_device *dev = blkif->be->dev;
48
49 devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
50 if (IS_ERR(devpath))
51 return PTR_ERR(devpath);
52
53 devname = strstr(devpath, "/dev/");
54 if (devname != NULL)
55 devname += strlen("/dev/");
56 else
57 devname = devpath;
58
59 snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
60 kfree(devpath);
61
62 return 0;
63}
64
65static void xen_update_blkif_status(struct xen_blkif *blkif)
66{
67 int err;
68 char name[TASK_COMM_LEN];
69
70 /* Not ready to connect? */
71 if (!blkif->irq || !blkif->vbd.bdev)
72 return;
73
74 /* Already connected? */
75 if (blkif->be->dev->state == XenbusStateConnected)
76 return;
77
78 /* Attempt to connect: exit if we fail to. */
79 connect(blkif->be);
80 if (blkif->be->dev->state != XenbusStateConnected)
81 return;
82
83 err = blkback_name(blkif, name);
84 if (err) {
85 xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
86 return;
87 }
88
89 err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
90 if (err) {
91 xenbus_dev_error(blkif->be->dev, err, "block flush");
92 return;
93 }
94 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
95
96 blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, name);
97 if (IS_ERR(blkif->xenblkd)) {
98 err = PTR_ERR(blkif->xenblkd);
99 blkif->xenblkd = NULL;
100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
101 }
102}
103
104static struct xen_blkif *xen_blkif_alloc(domid_t domid)
105{
106 struct xen_blkif *blkif;
107
108 blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL);
109 if (!blkif)
110 return ERR_PTR(-ENOMEM);
111
112 memset(blkif, 0, sizeof(*blkif));
113 blkif->domid = domid;
114 spin_lock_init(&blkif->blk_ring_lock);
115 atomic_set(&blkif->refcnt, 1);
116 init_waitqueue_head(&blkif->wq);
117 blkif->st_print = jiffies;
118 init_waitqueue_head(&blkif->waiting_to_free);
119
120 return blkif;
121}
122
123static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page)
124{
125 struct gnttab_map_grant_ref op;
126
127 gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
128 GNTMAP_host_map, shared_page, blkif->domid);
129
130 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
131 BUG();
132
133 if (op.status) {
134 DPRINTK("Grant table operation failure !\n");
135 return op.status;
136 }
137
138 blkif->shmem_ref = shared_page;
139 blkif->shmem_handle = op.handle;
140
141 return 0;
142}
143
144static void unmap_frontend_page(struct xen_blkif *blkif)
145{
146 struct gnttab_unmap_grant_ref op;
147
148 gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
149 GNTMAP_host_map, blkif->shmem_handle);
150
151 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
152 BUG();
153}
154
155static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
156 unsigned int evtchn)
157{
158 int err;
159
160 /* Already connected through? */
161 if (blkif->irq)
162 return 0;
163
164 blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE);
165 if (!blkif->blk_ring_area)
166 return -ENOMEM;
167
168 err = map_frontend_page(blkif, shared_page);
169 if (err) {
170 free_vm_area(blkif->blk_ring_area);
171 return err;
172 }
173
174 switch (blkif->blk_protocol) {
175 case BLKIF_PROTOCOL_NATIVE:
176 {
177 struct blkif_sring *sring;
178 sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
179 BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
180 break;
181 }
182 case BLKIF_PROTOCOL_X86_32:
183 {
184 struct blkif_x86_32_sring *sring_x86_32;
185 sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
186 BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
187 break;
188 }
189 case BLKIF_PROTOCOL_X86_64:
190 {
191 struct blkif_x86_64_sring *sring_x86_64;
192 sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
193 BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
194 break;
195 }
196 default:
197 BUG();
198 }
199
200 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
201 xen_blkif_be_int, 0,
202 "blkif-backend", blkif);
203 if (err < 0) {
204 unmap_frontend_page(blkif);
205 free_vm_area(blkif->blk_ring_area);
206 blkif->blk_rings.common.sring = NULL;
207 return err;
208 }
209 blkif->irq = err;
210
211 return 0;
212}
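/*
 * Connection sequence used above: reserve a kernel virtual area, map the
 * frontend's shared ring page into it via the grant table, initialise
 * the back ring for whichever ABI the frontend advertised, and only then
 * bind the interdomain event channel.  The error paths unwind in the
 * opposite order so a half-connected blkif never leaks the grant
 * mapping.
 */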
213
214static void xen_blkif_disconnect(struct xen_blkif *blkif)
215{
216 if (blkif->xenblkd) {
217 kthread_stop(blkif->xenblkd);
218 blkif->xenblkd = NULL;
219 }
220
221 atomic_dec(&blkif->refcnt);
222 wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
223 atomic_inc(&blkif->refcnt);
224
225 if (blkif->irq) {
226 unbind_from_irqhandler(blkif->irq, blkif);
227 blkif->irq = 0;
228 }
229
230 if (blkif->blk_rings.common.sring) {
231 unmap_frontend_page(blkif);
232 free_vm_area(blkif->blk_ring_area);
233 blkif->blk_rings.common.sring = NULL;
234 }
235}
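/*
 * The dec/wait/inc sequence above deliberately drops our own base
 * reference so that waiting_to_free can fire once every per-request
 * xen_blkif_put() has run, and then re-takes it; the actual free is left
 * to xen_blkif_free(), which expects to drop the final reference.
 */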
236
237void xen_blkif_free(struct xen_blkif *blkif)
238{
239 if (!atomic_dec_and_test(&blkif->refcnt))
240 BUG();
241 kmem_cache_free(xen_blkif_cachep, blkif);
242}
243
244int __init xen_blkif_interface_init(void)
245{
246 xen_blkif_cachep = kmem_cache_create("blkif_cache",
247 sizeof(struct xen_blkif),
248 0, 0, NULL);
249 if (!xen_blkif_cachep)
250 return -ENOMEM;
251
252 return 0;
253}
254
255/*
256 * sysfs interface for VBD I/O requests
257 */
258
259#define VBD_SHOW(name, format, args...) \
260 static ssize_t show_##name(struct device *_dev, \
261 struct device_attribute *attr, \
262 char *buf) \
263 { \
264 struct xenbus_device *dev = to_xenbus_device(_dev); \
265 struct backend_info *be = dev_get_drvdata(&dev->dev); \
266 \
267 return sprintf(buf, format, ##args); \
268 } \
269 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
270
271VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
272VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
273VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
274VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req);
275VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
276VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
277
278static struct attribute *xen_vbdstat_attrs[] = {
279 &dev_attr_oo_req.attr,
280 &dev_attr_rd_req.attr,
281 &dev_attr_wr_req.attr,
282 &dev_attr_f_req.attr,
283 &dev_attr_rd_sect.attr,
284 &dev_attr_wr_sect.attr,
285 NULL
286};
287
288static struct attribute_group xen_vbdstat_group = {
289 .name = "statistics",
290 .attrs = xen_vbdstat_attrs,
291};
292
293VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
294VBD_SHOW(mode, "%s\n", be->mode);
295
296int xenvbd_sysfs_addif(struct xenbus_device *dev)
297{
298 int error;
299
300 error = device_create_file(&dev->dev, &dev_attr_physical_device);
301 if (error)
302 goto fail1;
303
304 error = device_create_file(&dev->dev, &dev_attr_mode);
305 if (error)
306 goto fail2;
307
308 error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group);
309 if (error)
310 goto fail3;
311
312 return 0;
313
314fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
315fail2: device_remove_file(&dev->dev, &dev_attr_mode);
316fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
317 return error;
318}
319
320void xenvbd_sysfs_delif(struct xenbus_device *dev)
321{
322 sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
323 device_remove_file(&dev->dev, &dev_attr_mode);
324 device_remove_file(&dev->dev, &dev_attr_physical_device);
325}
326
327
328static void xen_vbd_free(struct xen_vbd *vbd)
329{
330 if (vbd->bdev)
331 blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
332 vbd->bdev = NULL;
333}
334
335static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
336 unsigned major, unsigned minor, int readonly,
337 int cdrom)
338{
339 struct xen_vbd *vbd;
340 struct block_device *bdev;
341 struct request_queue *q;
342
343 vbd = &blkif->vbd;
344 vbd->handle = handle;
345 vbd->readonly = readonly;
346 vbd->type = 0;
347
348 vbd->pdevice = MKDEV(major, minor);
349
350 bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
351 FMODE_READ : FMODE_WRITE, NULL);
352
353 if (IS_ERR(bdev)) {
354 DPRINTK("xen_vbd_create: device %08x could not be opened.\n",
355 vbd->pdevice);
356 return -ENOENT;
357 }
358
359 vbd->bdev = bdev;
360 vbd->size = vbd_sz(vbd);
361
362 if (vbd->bdev->bd_disk == NULL) {
363 DPRINTK("xen_vbd_create: device %08x doesn't exist.\n",
364 vbd->pdevice);
365 xen_vbd_free(vbd);
366 return -ENOENT;
367 }
368
369 if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
370 vbd->type |= VDISK_CDROM;
371 if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
372 vbd->type |= VDISK_REMOVABLE;
373
374 q = bdev_get_queue(bdev);
375 if (q && q->flush_flags)
376 vbd->flush_support = true;
377
378 DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
379 handle, blkif->domid);
380 return 0;
381}
382static int xen_blkbk_remove(struct xenbus_device *dev)
383{
384 struct backend_info *be = dev_get_drvdata(&dev->dev);
385
386 DPRINTK("");
387
388 if (be->major || be->minor)
389 xenvbd_sysfs_delif(dev);
390
391 if (be->backend_watch.node) {
392 unregister_xenbus_watch(&be->backend_watch);
393 kfree(be->backend_watch.node);
394 be->backend_watch.node = NULL;
395 }
396
397 if (be->blkif) {
398 xen_blkif_disconnect(be->blkif);
399 xen_vbd_free(&be->blkif->vbd);
400 xen_blkif_free(be->blkif);
401 be->blkif = NULL;
402 }
403
404 kfree(be);
405 dev_set_drvdata(&dev->dev, NULL);
406 return 0;
407}
408
409int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
410 struct backend_info *be, int state)
411{
412 struct xenbus_device *dev = be->dev;
413 int err;
414
415 err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
416 "%d", state);
417 if (err)
418 xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
419
420 return err;
421}
422
423/*
424 * Entry point to this code when a new device is created. Allocate the basic
425 * structures, and watch the store waiting for the hotplug scripts to tell us
426 * the device's physical major and minor numbers. Switch to InitWait.
427 */
428static int xen_blkbk_probe(struct xenbus_device *dev,
429 const struct xenbus_device_id *id)
430{
431 int err;
432 struct backend_info *be = kzalloc(sizeof(struct backend_info),
433 GFP_KERNEL);
434 if (!be) {
435 xenbus_dev_fatal(dev, -ENOMEM,
436 "allocating backend structure");
437 return -ENOMEM;
438 }
439 be->dev = dev;
440 dev_set_drvdata(&dev->dev, be);
441
442 be->blkif = xen_blkif_alloc(dev->otherend_id);
443 if (IS_ERR(be->blkif)) {
444 err = PTR_ERR(be->blkif);
445 be->blkif = NULL;
446 xenbus_dev_fatal(dev, err, "creating block interface");
447 goto fail;
448 }
449
450 /* setup back pointer */
451 be->blkif->be = be;
452
453 err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
454 "%s/%s", dev->nodename, "physical-device");
455 if (err)
456 goto fail;
457
458 err = xenbus_switch_state(dev, XenbusStateInitWait);
459 if (err)
460 goto fail;
461
462 return 0;
463
464fail:
465 DPRINTK("failed");
466 xen_blkbk_remove(dev);
467 return err;
468}
469
470
471/*
472 * Callback received when the hotplug scripts have placed the physical-device
473 * node. Read it and the mode node, and create a vbd. If the frontend is
474 * ready, connect.
475 */
476static void backend_changed(struct xenbus_watch *watch,
477 const char **vec, unsigned int len)
478{
479 int err;
480 unsigned major;
481 unsigned minor;
482 struct backend_info *be
483 = container_of(watch, struct backend_info, backend_watch);
484 struct xenbus_device *dev = be->dev;
485 int cdrom = 0;
486 char *device_type;
487
488 DPRINTK("");
489
490 err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
491 &major, &minor);
492 if (XENBUS_EXIST_ERR(err)) {
493 /*
494 * Since this watch will fire once immediately after it is
495 * registered, we expect this. Ignore it, and wait for the
496 * hotplug scripts.
497 */
498 return;
499 }
500 if (err != 2) {
501 xenbus_dev_fatal(dev, err, "reading physical-device");
502 return;
503 }
504
505 if ((be->major || be->minor) &&
506 ((be->major != major) || (be->minor != minor))) {
507 pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n",
508 be->major, be->minor, major, minor);
509 return;
510 }
511
512 be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
513 if (IS_ERR(be->mode)) {
514 err = PTR_ERR(be->mode);
515 be->mode = NULL;
516 xenbus_dev_fatal(dev, err, "reading mode");
517 return;
518 }
519
520 device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
521 if (!IS_ERR(device_type)) {
522 cdrom = strcmp(device_type, "cdrom") == 0;
523 kfree(device_type);
524 }
525
526 if (be->major == 0 && be->minor == 0) {
527 /* Front end dir is a number, which is used as the handle. */
528
529 char *p = strrchr(dev->otherend, '/') + 1;
530 long handle;
531 err = strict_strtoul(p, 0, &handle);
532 if (err)
533 return;
534
535 be->major = major;
536 be->minor = minor;
537
538 err = xen_vbd_create(be->blkif, handle, major, minor,
539 (NULL == strchr(be->mode, 'w')), cdrom);
540 if (err) {
541 be->major = 0;
542 be->minor = 0;
543 xenbus_dev_fatal(dev, err, "creating vbd structure");
544 return;
545 }
546
547 err = xenvbd_sysfs_addif(dev);
548 if (err) {
549 xen_vbd_free(&be->blkif->vbd);
550 be->major = 0;
551 be->minor = 0;
552 xenbus_dev_fatal(dev, err, "creating sysfs entries");
553 return;
554 }
555
556 /* We're potentially connected now */
557 xen_update_blkif_status(be->blkif);
558 }
559}
560
561
562/*
563 * Callback received when the frontend's state changes.
564 */
565static void frontend_changed(struct xenbus_device *dev,
566 enum xenbus_state frontend_state)
567{
568 struct backend_info *be = dev_get_drvdata(&dev->dev);
569 int err;
570
571 DPRINTK("%s", xenbus_strstate(frontend_state));
572
573 switch (frontend_state) {
574 case XenbusStateInitialising:
575 if (dev->state == XenbusStateClosed) {
576 pr_info(DRV_PFX "%s: prepare for reconnect\n",
577 dev->nodename);
578 xenbus_switch_state(dev, XenbusStateInitWait);
579 }
580 break;
581
582 case XenbusStateInitialised:
583 case XenbusStateConnected:
584 /*
585 * Ensure we connect even when two watches fire in
586		 * close succession and we miss the intermediate value
587 * of frontend_state.
588 */
589 if (dev->state == XenbusStateConnected)
590 break;
591
592 /*
593 * Enforce precondition before potential leak point.
594		 * xen_blkif_disconnect() is idempotent.
595 */
596 xen_blkif_disconnect(be->blkif);
597
598 err = connect_ring(be);
599 if (err)
600 break;
601 xen_update_blkif_status(be->blkif);
602 break;
603
604 case XenbusStateClosing:
605 xen_blkif_disconnect(be->blkif);
606 xenbus_switch_state(dev, XenbusStateClosing);
607 break;
608
609 case XenbusStateClosed:
610 xenbus_switch_state(dev, XenbusStateClosed);
611 if (xenbus_dev_is_online(dev))
612 break;
613 /* fall through if not online */
614 case XenbusStateUnknown:
615		/* implies xen_blkif_disconnect() via xen_blkbk_remove() */
616 device_unregister(&dev->dev);
617 break;
618
619 default:
620 xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
621 frontend_state);
622 break;
623 }
624}
625
626
627/* ** Connection ** */
628
629
630/*
631 * Write the physical details regarding the block device to the store, and
632 * switch to Connected state.
633 */
634static void connect(struct backend_info *be)
635{
636 struct xenbus_transaction xbt;
637 int err;
638 struct xenbus_device *dev = be->dev;
639
640 DPRINTK("%s", dev->otherend);
641
642 /* Supply the information about the device the frontend needs */
643again:
644 err = xenbus_transaction_start(&xbt);
645 if (err) {
646 xenbus_dev_fatal(dev, err, "starting transaction");
647 return;
648 }
649
650 err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
651 if (err)
652 goto abort;
653
654 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
655 (unsigned long long)vbd_sz(&be->blkif->vbd));
656 if (err) {
657 xenbus_dev_fatal(dev, err, "writing %s/sectors",
658 dev->nodename);
659 goto abort;
660 }
661
662 /* FIXME: use a typename instead */
663 err = xenbus_printf(xbt, dev->nodename, "info", "%u",
664 be->blkif->vbd.type |
665 (be->blkif->vbd.readonly ? VDISK_READONLY : 0));
666 if (err) {
667 xenbus_dev_fatal(dev, err, "writing %s/info",
668 dev->nodename);
669 goto abort;
670 }
671 err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
672 (unsigned long)
673 bdev_logical_block_size(be->blkif->vbd.bdev));
674 if (err) {
675 xenbus_dev_fatal(dev, err, "writing %s/sector-size",
676 dev->nodename);
677 goto abort;
678 }
679
680 err = xenbus_transaction_end(xbt, 0);
681 if (err == -EAGAIN)
682 goto again;
683 if (err)
684 xenbus_dev_fatal(dev, err, "ending transaction");
685
686 err = xenbus_switch_state(dev, XenbusStateConnected);
687 if (err)
688 xenbus_dev_fatal(dev, err, "switching to Connected state",
689 dev->nodename);
690
691 return;
692 abort:
693 xenbus_transaction_end(xbt, 1);
694}
695
696
697static int connect_ring(struct backend_info *be)
698{
699 struct xenbus_device *dev = be->dev;
700 unsigned long ring_ref;
701 unsigned int evtchn;
702 char protocol[64] = "";
703 int err;
704
705 DPRINTK("%s", dev->otherend);
706
707 err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
708 &ring_ref, "event-channel", "%u", &evtchn, NULL);
709 if (err) {
710 xenbus_dev_fatal(dev, err,
711 "reading %s/ring-ref and event-channel",
712 dev->otherend);
713 return err;
714 }
715
716 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
717 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
718 "%63s", protocol, NULL);
719 if (err)
720 strcpy(protocol, "unspecified, assuming native");
721 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
722 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
723 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
724 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
725 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
726 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
727 else {
728 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
729 return -1;
730 }
731 pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
732 ring_ref, evtchn, be->blkif->blk_protocol, protocol);
733
734 /* Map the shared frame, irq etc. */
735 err = xen_blkif_map(be->blkif, ring_ref, evtchn);
736 if (err) {
737 xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
738 ring_ref, evtchn);
739 return err;
740 }
741
742 return 0;
743}
744
745
746/* ** Driver Registration ** */
747
748
749static const struct xenbus_device_id xen_blkbk_ids[] = {
750 { "vbd" },
751 { "" }
752};
753
754
755static struct xenbus_driver xen_blkbk = {
756 .name = "vbd",
757 .owner = THIS_MODULE,
758 .ids = xen_blkbk_ids,
759 .probe = xen_blkbk_probe,
760 .remove = xen_blkbk_remove,
761 .otherend_changed = frontend_changed
762};
763
764
765int xen_blkif_xenbus_init(void)
766{
767 return xenbus_register_backend(&xen_blkbk);
768}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9cb8668ff5f4..b536a9cef917 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -97,6 +97,7 @@ struct blkfront_info
97 struct blk_shadow shadow[BLK_RING_SIZE]; 97 struct blk_shadow shadow[BLK_RING_SIZE];
98 unsigned long shadow_free; 98 unsigned long shadow_free;
99 unsigned int feature_flush; 99 unsigned int feature_flush;
100 unsigned int flush_op;
100 int is_ready; 101 int is_ready;
101}; 102};
102 103
@@ -250,8 +251,7 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
250 251
251/* 252/*
252 * Generate a Xen blkfront IO request from a blk layer request. Reads 253 * Generate a Xen blkfront IO request from a blk layer request. Reads
253 * and writes are handled as expected. Since we lack a loose flush 254 * and writes are handled as expected.
254 * request, we map flushes into a full ordered barrier.
255 * 255 *
256 * @req: a request struct 256 * @req: a request struct
257 */ 257 */
@@ -293,14 +293,13 @@ static int blkif_queue_request(struct request *req)
293 293
294 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { 294 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
295 /* 295 /*
296 * Ideally we could just do an unordered 296 * Ideally we can do an unordered flush-to-disk. In case the
297 * flush-to-disk, but all we have is a full write 297 * backend only supports barriers, use that. A barrier request
298 * barrier at the moment. However, a barrier write is
299 * a superset of FUA, so we can implement it the same 298 * a superset of FUA, so we can implement it the same
300 * way. (It's also a FLUSH+FUA, since it is 299 * way. (It's also a FLUSH+FUA, since it is
301 * guaranteed ordered WRT previous writes.) 300 * guaranteed ordered WRT previous writes.)
302 */ 301 */
303 ring_req->operation = BLKIF_OP_WRITE_BARRIER; 302 ring_req->operation = info->flush_op;
304 } 303 }
305 304
306 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 305 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
@@ -433,8 +432,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
433static void xlvbd_flush(struct blkfront_info *info) 432static void xlvbd_flush(struct blkfront_info *info)
434{ 433{
435 blk_queue_flush(info->rq, info->feature_flush); 434 blk_queue_flush(info->rq, info->feature_flush);
436 printk(KERN_INFO "blkfront: %s: barriers %s\n", 435 printk(KERN_INFO "blkfront: %s: %s: %s\n",
437 info->gd->disk_name, 436 info->gd->disk_name,
437 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
438 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
439 "flush diskcache" : "barrier or flush"),
438 info->feature_flush ? "enabled" : "disabled"); 440 info->feature_flush ? "enabled" : "disabled");
439} 441}
440 442
@@ -720,15 +722,20 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
720 722
721 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; 723 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
722 switch (bret->operation) { 724 switch (bret->operation) {
725 case BLKIF_OP_FLUSH_DISKCACHE:
723 case BLKIF_OP_WRITE_BARRIER: 726 case BLKIF_OP_WRITE_BARRIER:
724 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 727 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
725 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 728 printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
729 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
730 "barrier" : "flush disk cache",
726 info->gd->disk_name); 731 info->gd->disk_name);
727 error = -EOPNOTSUPP; 732 error = -EOPNOTSUPP;
728 } 733 }
729 if (unlikely(bret->status == BLKIF_RSP_ERROR && 734 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
730 info->shadow[id].req.nr_segments == 0)) { 735 info->shadow[id].req.nr_segments == 0)) {
731 printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n", 736 printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
737 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
738 "barrier" : "flush disk cache",
732 info->gd->disk_name); 739 info->gd->disk_name);
733 error = -EOPNOTSUPP; 740 error = -EOPNOTSUPP;
734 } 741 }
@@ -736,6 +743,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
736 if (error == -EOPNOTSUPP) 743 if (error == -EOPNOTSUPP)
737 error = 0; 744 error = 0;
738 info->feature_flush = 0; 745 info->feature_flush = 0;
746 info->flush_op = 0;
739 xlvbd_flush(info); 747 xlvbd_flush(info);
740 } 748 }
741 /* fall through */ 749 /* fall through */
@@ -1100,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info)
1100 unsigned long sector_size; 1108 unsigned long sector_size;
1101 unsigned int binfo; 1109 unsigned int binfo;
1102 int err; 1110 int err;
1103 int barrier; 1111 int barrier, flush;
1104 1112
1105 switch (info->connected) { 1113 switch (info->connected) {
1106 case BLKIF_STATE_CONNECTED: 1114 case BLKIF_STATE_CONNECTED:
@@ -1140,8 +1148,11 @@ static void blkfront_connect(struct blkfront_info *info)
1140 return; 1148 return;
1141 } 1149 }
1142 1150
1151 info->feature_flush = 0;
1152 info->flush_op = 0;
1153
1143 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1154 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1144 "feature-barrier", "%lu", &barrier, 1155 "feature-barrier", "%d", &barrier,
1145 NULL); 1156 NULL);
1146 1157
1147 /* 1158 /*
@@ -1151,11 +1162,23 @@ static void blkfront_connect(struct blkfront_info *info)
1151 * 1162 *
1152 * If there are barriers, then we use flush. 1163 * If there are barriers, then we use flush.
1153 */ 1164 */
1154 info->feature_flush = 0; 1165 if (!err && barrier) {
1155
1156 if (!err && barrier)
1157 info->feature_flush = REQ_FLUSH | REQ_FUA; 1166 info->feature_flush = REQ_FLUSH | REQ_FUA;
1167 info->flush_op = BLKIF_OP_WRITE_BARRIER;
1168 }
1169 /*
1170	 * And if there is "feature-flush-cache" use that in preference
1171	 * to barriers.
1172 */
1173 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1174 "feature-flush-cache", "%d", &flush,
1175 NULL);
1158 1176
1177 if (!err && flush) {
1178 info->feature_flush = REQ_FLUSH;
1179 info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1180 }
1181
1159 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1182 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1160 if (err) { 1183 if (err) {
1161 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1184 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",