about summary refs log tree commit diff stats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/amiflop.c 2
-rw-r--r-- drivers/block/aoe/aoeblk.c 3
-rw-r--r-- drivers/block/ataflop.c 2
-rw-r--r-- drivers/block/cciss.c 136
-rw-r--r-- drivers/block/cciss.h 4
-rw-r--r-- drivers/block/cciss_scsi.c 8
-rw-r--r-- drivers/block/drbd/drbd_actlog.c 42
-rw-r--r-- drivers/block/drbd/drbd_int.h 52
-rw-r--r-- drivers/block/drbd/drbd_main.c 148
-rw-r--r-- drivers/block/drbd/drbd_nl.c 25
-rw-r--r-- drivers/block/drbd/drbd_proc.c 1
-rw-r--r-- drivers/block/drbd/drbd_receiver.c 232
-rw-r--r-- drivers/block/drbd/drbd_req.c 38
-rw-r--r-- drivers/block/drbd/drbd_req.h 3
-rw-r--r-- drivers/block/drbd/drbd_worker.c 34
-rw-r--r-- drivers/block/floppy.c 4
-rw-r--r-- drivers/block/loop.c 6
-rw-r--r-- drivers/block/rbd.c 748
-rw-r--r-- drivers/block/xen-blkfront.c 57
19 files changed, 802 insertions, 743 deletions
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index a1725e6488d3..7888501ad9ee 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1341,7 +1341,7 @@ static struct request *set_next_request(void)
1341{ 1341{
1342 struct request_queue *q; 1342 struct request_queue *q;
1343 int cnt = FD_MAX_UNITS; 1343 int cnt = FD_MAX_UNITS;
1344 struct request *rq; 1344 struct request *rq = NULL;
1345 1345
1346 /* Find next queue we can dispatch from */ 1346 /* Find next queue we can dispatch from */
1347 fdc_queue = fdc_queue + 1; 1347 fdc_queue = fdc_queue + 1;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 541e18879965..528f6318ded1 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -180,9 +180,6 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
180 BUG(); 180 BUG();
181 bio_endio(bio, -ENXIO); 181 bio_endio(bio, -ENXIO);
182 return 0; 182 return 0;
183 } else if (bio->bi_rw & REQ_HARDBARRIER) {
184 bio_endio(bio, -EOPNOTSUPP);
185 return 0;
186 } else if (bio->bi_io_vec == NULL) { 183 } else if (bio->bi_io_vec == NULL) {
187 printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); 184 printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
188 BUG(); 185 BUG();
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 4e4cc6c828cb..605a67e40bbf 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1399,7 +1399,7 @@ static struct request *set_next_request(void)
1399{ 1399{
1400 struct request_queue *q; 1400 struct request_queue *q;
1401 int old_pos = fdc_queue; 1401 int old_pos = fdc_queue;
1402 struct request *rq; 1402 struct request *rq = NULL;
1403 1403
1404 do { 1404 do {
1405 q = unit[fdc_queue].disk->queue; 1405 q = unit[fdc_queue].disk->queue;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 2cc4dda46279..8e0f9256eb58 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -66,6 +66,7 @@ MODULE_VERSION("3.6.26");
66MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
67 67
68static DEFINE_MUTEX(cciss_mutex); 68static DEFINE_MUTEX(cciss_mutex);
69static struct proc_dir_entry *proc_cciss;
69 70
70#include "cciss_cmd.h" 71#include "cciss_cmd.h"
71#include "cciss.h" 72#include "cciss.h"
@@ -113,6 +114,8 @@ static struct board_type products[] = {
113 {0x409D0E11, "Smart Array 6400 EM", &SA5_access}, 114 {0x409D0E11, "Smart Array 6400 EM", &SA5_access},
114 {0x40910E11, "Smart Array 6i", &SA5_access}, 115 {0x40910E11, "Smart Array 6i", &SA5_access},
115 {0x3225103C, "Smart Array P600", &SA5_access}, 116 {0x3225103C, "Smart Array P600", &SA5_access},
117 {0x3223103C, "Smart Array P800", &SA5_access},
118 {0x3234103C, "Smart Array P400", &SA5_access},
116 {0x3235103C, "Smart Array P400i", &SA5_access}, 119 {0x3235103C, "Smart Array P400i", &SA5_access},
117 {0x3211103C, "Smart Array E200i", &SA5_access}, 120 {0x3211103C, "Smart Array E200i", &SA5_access},
118 {0x3212103C, "Smart Array E200", &SA5_access}, 121 {0x3212103C, "Smart Array E200", &SA5_access},
@@ -361,8 +364,6 @@ static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
361#define ENG_GIG_FACTOR (ENG_GIG/512) 364#define ENG_GIG_FACTOR (ENG_GIG/512)
362#define ENGAGE_SCSI "engage scsi" 365#define ENGAGE_SCSI "engage scsi"
363 366
364static struct proc_dir_entry *proc_cciss;
365
366static void cciss_seq_show_header(struct seq_file *seq) 367static void cciss_seq_show_header(struct seq_file *seq)
367{ 368{
368 ctlr_info_t *h = seq->private; 369 ctlr_info_t *h = seq->private;
@@ -2833,6 +2834,8 @@ static int cciss_revalidate(struct gendisk *disk)
2833 InquiryData_struct *inq_buff = NULL; 2834 InquiryData_struct *inq_buff = NULL;
2834 2835
2835 for (logvol = 0; logvol < CISS_MAX_LUN; logvol++) { 2836 for (logvol = 0; logvol < CISS_MAX_LUN; logvol++) {
2837 if (!h->drv[logvol])
2838 continue;
2836 if (memcmp(h->drv[logvol]->LunID, drv->LunID, 2839 if (memcmp(h->drv[logvol]->LunID, drv->LunID,
2837 sizeof(drv->LunID)) == 0) { 2840 sizeof(drv->LunID)) == 0) {
2838 FOUND = 1; 2841 FOUND = 1;
@@ -3753,7 +3756,7 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
3753 for (i = 0; i < MAX_CONFIG_WAIT; i++) { 3756 for (i = 0; i < MAX_CONFIG_WAIT; i++) {
3754 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) 3757 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
3755 break; 3758 break;
3756 msleep(10); 3759 usleep_range(10000, 20000);
3757 } 3760 }
3758} 3761}
3759 3762
@@ -3937,10 +3940,9 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
3937 *board_id = ((subsystem_device_id << 16) & 0xffff0000) | 3940 *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
3938 subsystem_vendor_id; 3941 subsystem_vendor_id;
3939 3942
3940 for (i = 0; i < ARRAY_SIZE(products); i++) { 3943 for (i = 0; i < ARRAY_SIZE(products); i++)
3941 if (*board_id == products[i].board_id) 3944 if (*board_id == products[i].board_id)
3942 return i; 3945 return i;
3943 }
3944 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", 3946 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
3945 *board_id); 3947 *board_id);
3946 return -ENODEV; 3948 return -ENODEV;
@@ -3971,18 +3973,31 @@ static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
3971 return -ENODEV; 3973 return -ENODEV;
3972} 3974}
3973 3975
3974static int __devinit cciss_wait_for_board_ready(ctlr_info_t *h) 3976static int __devinit cciss_wait_for_board_state(struct pci_dev *pdev,
3977 void __iomem *vaddr, int wait_for_ready)
3978#define BOARD_READY 1
3979#define BOARD_NOT_READY 0
3975{ 3980{
3976 int i; 3981 int i, iterations;
3977 u32 scratchpad; 3982 u32 scratchpad;
3978 3983
3979 for (i = 0; i < CCISS_BOARD_READY_ITERATIONS; i++) { 3984 if (wait_for_ready)
3980 scratchpad = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); 3985 iterations = CCISS_BOARD_READY_ITERATIONS;
3981 if (scratchpad == CCISS_FIRMWARE_READY) 3986 else
3982 return 0; 3987 iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
3988
3989 for (i = 0; i < iterations; i++) {
3990 scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
3991 if (wait_for_ready) {
3992 if (scratchpad == CCISS_FIRMWARE_READY)
3993 return 0;
3994 } else {
3995 if (scratchpad != CCISS_FIRMWARE_READY)
3996 return 0;
3997 }
3983 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); 3998 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
3984 } 3999 }
3985 dev_warn(&h->pdev->dev, "board not ready, timed out.\n"); 4000 dev_warn(&pdev->dev, "board not ready, timed out.\n");
3986 return -ENODEV; 4001 return -ENODEV;
3987} 4002}
3988 4003
@@ -4031,6 +4046,11 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
4031static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) 4046static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
4032{ 4047{
4033 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands)); 4048 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
4049
4050 /* Limit commands in memory limited kdump scenario. */
4051 if (reset_devices && h->max_commands > 32)
4052 h->max_commands = 32;
4053
4034 if (h->max_commands < 16) { 4054 if (h->max_commands < 16) {
4035 dev_warn(&h->pdev->dev, "Controller reports " 4055 dev_warn(&h->pdev->dev, "Controller reports "
4036 "max supported commands of %d, an obvious lie. " 4056 "max supported commands of %d, an obvious lie. "
@@ -4148,7 +4168,7 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
4148 err = -ENOMEM; 4168 err = -ENOMEM;
4149 goto err_out_free_res; 4169 goto err_out_free_res;
4150 } 4170 }
4151 err = cciss_wait_for_board_ready(h); 4171 err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
4152 if (err) 4172 if (err)
4153 goto err_out_free_res; 4173 goto err_out_free_res;
4154 err = cciss_find_cfgtables(h); 4174 err = cciss_find_cfgtables(h);
@@ -4313,36 +4333,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4313#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) 4333#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
4314#define cciss_noop(p) cciss_message(p, 3, 0) 4334#define cciss_noop(p) cciss_message(p, 3, 0)
4315 4335
4316static __devinit int cciss_reset_msi(struct pci_dev *pdev)
4317{
4318/* the #defines are stolen from drivers/pci/msi.h. */
4319#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
4320#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
4321
4322 int pos;
4323 u16 control = 0;
4324
4325 pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
4326 if (pos) {
4327 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4328 if (control & PCI_MSI_FLAGS_ENABLE) {
4329 dev_info(&pdev->dev, "resetting MSI\n");
4330 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE);
4331 }
4332 }
4333
4334 pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
4335 if (pos) {
4336 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4337 if (control & PCI_MSIX_FLAGS_ENABLE) {
4338 dev_info(&pdev->dev, "resetting MSI-X\n");
4339 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE);
4340 }
4341 }
4342
4343 return 0;
4344}
4345
4346static int cciss_controller_hard_reset(struct pci_dev *pdev, 4336static int cciss_controller_hard_reset(struct pci_dev *pdev,
4347 void * __iomem vaddr, bool use_doorbell) 4337 void * __iomem vaddr, bool use_doorbell)
4348{ 4338{
@@ -4397,17 +4387,17 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4397 * states or using the doorbell register. */ 4387 * states or using the doorbell register. */
4398static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) 4388static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4399{ 4389{
4400 u16 saved_config_space[32];
4401 u64 cfg_offset; 4390 u64 cfg_offset;
4402 u32 cfg_base_addr; 4391 u32 cfg_base_addr;
4403 u64 cfg_base_addr_index; 4392 u64 cfg_base_addr_index;
4404 void __iomem *vaddr; 4393 void __iomem *vaddr;
4405 unsigned long paddr; 4394 unsigned long paddr;
4406 u32 misc_fw_support, active_transport; 4395 u32 misc_fw_support, active_transport;
4407 int rc, i; 4396 int rc;
4408 CfgTable_struct __iomem *cfgtable; 4397 CfgTable_struct __iomem *cfgtable;
4409 bool use_doorbell; 4398 bool use_doorbell;
4410 u32 board_id; 4399 u32 board_id;
4400 u16 command_register;
4411 4401
4412 /* For controllers as old as the p600, this is very nearly 4402 /* For controllers as old as the p600, this is very nearly
4413 * the same thing as 4403 * the same thing as
@@ -4417,14 +4407,6 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4417 * pci_set_power_state(pci_dev, PCI_D0); 4407 * pci_set_power_state(pci_dev, PCI_D0);
4418 * pci_restore_state(pci_dev); 4408 * pci_restore_state(pci_dev);
4419 * 4409 *
4420 * but we can't use these nice canned kernel routines on
4421 * kexec, because they also check the MSI/MSI-X state in PCI
4422 * configuration space and do the wrong thing when it is
4423 * set/cleared. Also, the pci_save/restore_state functions
4424 * violate the ordering requirements for restoring the
4425 * configuration space from the CCISS document (see the
4426 * comment below). So we roll our own ....
4427 *
4428 * For controllers newer than the P600, the pci power state 4410 * For controllers newer than the P600, the pci power state
4429 * method of resetting doesn't work so we have another way 4411 * method of resetting doesn't work so we have another way
4430 * using the doorbell register. 4412 * using the doorbell register.
@@ -4443,8 +4425,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4443 return -ENODEV; 4425 return -ENODEV;
4444 } 4426 }
4445 4427
4446 for (i = 0; i < 32; i++) 4428 /* Save the PCI command register */
4447 pci_read_config_word(pdev, 2*i, &saved_config_space[i]); 4429 pci_read_config_word(pdev, 4, &command_register);
4430 /* Turn the board off. This is so that later pci_restore_state()
4431 * won't turn the board on before the rest of config space is ready.
4432 */
4433 pci_disable_device(pdev);
4434 pci_save_state(pdev);
4448 4435
4449 /* find the first memory BAR, so we can find the cfg table */ 4436 /* find the first memory BAR, so we can find the cfg table */
4450 rc = cciss_pci_find_memory_BAR(pdev, &paddr); 4437 rc = cciss_pci_find_memory_BAR(pdev, &paddr);
@@ -4479,26 +4466,32 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4479 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); 4466 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
4480 if (rc) 4467 if (rc)
4481 goto unmap_cfgtable; 4468 goto unmap_cfgtable;
4482 4469 pci_restore_state(pdev);
4483 /* Restore the PCI configuration space. The Open CISS 4470 rc = pci_enable_device(pdev);
4484 * Specification says, "Restore the PCI Configuration 4471 if (rc) {
4485 * Registers, offsets 00h through 60h. It is important to 4472 dev_warn(&pdev->dev, "failed to enable device.\n");
4486 * restore the command register, 16-bits at offset 04h, 4473 goto unmap_cfgtable;
4487 * last. Do not restore the configuration status register,
4488 * 16-bits at offset 06h." Note that the offset is 2*i.
4489 */
4490 for (i = 0; i < 32; i++) {
4491 if (i == 2 || i == 3)
4492 continue;
4493 pci_write_config_word(pdev, 2*i, saved_config_space[i]);
4494 } 4474 }
4495 wmb(); 4475 pci_write_config_word(pdev, 4, command_register);
4496 pci_write_config_word(pdev, 4, saved_config_space[2]);
4497 4476
4498 /* Some devices (notably the HP Smart Array 5i Controller) 4477 /* Some devices (notably the HP Smart Array 5i Controller)
4499 need a little pause here */ 4478 need a little pause here */
4500 msleep(CCISS_POST_RESET_PAUSE_MSECS); 4479 msleep(CCISS_POST_RESET_PAUSE_MSECS);
4501 4480
4481 /* Wait for board to become not ready, then ready. */
4482 dev_info(&pdev->dev, "Waiting for board to become ready.\n");
4483 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
4484 if (rc) /* Don't bail, might be E500, etc. which can't be reset */
4485 dev_warn(&pdev->dev,
4486 "failed waiting for board to become not ready\n");
4487 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
4488 if (rc) {
4489 dev_warn(&pdev->dev,
4490 "failed waiting for board to become ready\n");
4491 goto unmap_cfgtable;
4492 }
4493 dev_info(&pdev->dev, "board ready.\n");
4494
4502 /* Controller should be in simple mode at this point. If it's not, 4495 /* Controller should be in simple mode at this point. If it's not,
4503 * It means we're on one of those controllers which doesn't support 4496 * It means we're on one of those controllers which doesn't support
4504 * the doorbell reset method and on which the PCI power management reset 4497 * the doorbell reset method and on which the PCI power management reset
@@ -4539,8 +4532,6 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4539 return 0; /* just try to do the kdump anyhow. */ 4532 return 0; /* just try to do the kdump anyhow. */
4540 if (rc) 4533 if (rc)
4541 return -ENODEV; 4534 return -ENODEV;
4542 if (cciss_reset_msi(pdev))
4543 return -ENODEV;
4544 4535
4545 /* Now try to get the controller to respond to a no-op */ 4536 /* Now try to get the controller to respond to a no-op */
4546 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { 4537 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
@@ -4936,7 +4927,8 @@ static void __exit cciss_cleanup(void)
4936 } 4927 }
4937 } 4928 }
4938 kthread_stop(cciss_scan_thread); 4929 kthread_stop(cciss_scan_thread);
4939 remove_proc_entry("driver/cciss", NULL); 4930 if (proc_cciss)
4931 remove_proc_entry("driver/cciss", NULL);
4940 bus_unregister(&cciss_bus_type); 4932 bus_unregister(&cciss_bus_type);
4941} 4933}
4942 4934
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index ae340ffc8f81..4b8933d778f1 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -200,10 +200,14 @@ struct ctlr_info
200 * the above. 200 * the above.
201 */ 201 */
202#define CCISS_BOARD_READY_WAIT_SECS (120) 202#define CCISS_BOARD_READY_WAIT_SECS (120)
203#define CCISS_BOARD_NOT_READY_WAIT_SECS (10)
203#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) 204#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
204#define CCISS_BOARD_READY_ITERATIONS \ 205#define CCISS_BOARD_READY_ITERATIONS \
205 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ 206 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
206 CCISS_BOARD_READY_POLL_INTERVAL_MSECS) 207 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
208#define CCISS_BOARD_NOT_READY_ITERATIONS \
209 ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
210 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
207#define CCISS_POST_RESET_PAUSE_MSECS (3000) 211#define CCISS_POST_RESET_PAUSE_MSECS (3000)
208#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) 212#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000)
209#define CCISS_POST_RESET_NOOP_RETRIES (12) 213#define CCISS_POST_RESET_NOOP_RETRIES (12)
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 575495f3c4b8..727d0225b7d0 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -62,8 +62,8 @@ static int cciss_scsi_proc_info(
62 int length, /* length of data in buffer */ 62 int length, /* length of data in buffer */
63 int func); /* 0 == read, 1 == write */ 63 int func); /* 0 == read, 1 == write */
64 64
65static int cciss_scsi_queue_command (struct scsi_cmnd *cmd, 65static int cciss_scsi_queue_command (struct Scsi_Host *h,
66 void (* done)(struct scsi_cmnd *)); 66 struct scsi_cmnd *cmd);
67static int cciss_eh_device_reset_handler(struct scsi_cmnd *); 67static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
68static int cciss_eh_abort_handler(struct scsi_cmnd *); 68static int cciss_eh_abort_handler(struct scsi_cmnd *);
69 69
@@ -1406,7 +1406,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
1406 1406
1407 1407
1408static int 1408static int
1409cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd *)) 1409cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
1410{ 1410{
1411 ctlr_info_t *h; 1411 ctlr_info_t *h;
1412 int rc; 1412 int rc;
@@ -1504,6 +1504,8 @@ cciss_scsi_queue_command (struct scsi_cmnd *cmd, void (* done)(struct scsi_cmnd
1504 return 0; 1504 return 0;
1505} 1505}
1506 1506
1507static DEF_SCSI_QCMD(cciss_scsi_queue_command)
1508
1507static void cciss_unregister_scsi(ctlr_info_t *h) 1509static void cciss_unregister_scsi(ctlr_info_t *h)
1508{ 1510{
1509 struct cciss_scsi_adapter_data_t *sa; 1511 struct cciss_scsi_adapter_data_t *sa;
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ac04ef97eac2..ba95cba192be 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -78,11 +78,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
78 init_completion(&md_io.event); 78 init_completion(&md_io.event);
79 md_io.error = 0; 79 md_io.error = 0;
80 80
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) 81 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
82 rw |= REQ_HARDBARRIER; 82 rw |= REQ_FUA;
83 rw |= REQ_UNPLUG | REQ_SYNC; 83 rw |= REQ_UNPLUG | REQ_SYNC;
84 84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1); 85 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev; 86 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector; 87 bio->bi_sector = sector;
@@ -100,17 +99,6 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
100 wait_for_completion(&md_io.event); 99 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; 100 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102 101
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
106 if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~REQ_HARDBARRIER;
111 bio_put(bio);
112 goto retry;
113 }
114 out: 102 out:
115 bio_put(bio); 103 bio_put(bio);
116 return ok; 104 return ok;
@@ -284,18 +272,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
284 u32 xor_sum = 0; 272 u32 xor_sum = 0;
285 273
286 if (!get_ldev(mdev)) { 274 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); 275 dev_err(DEV,
276 "disk is %s, cannot start al transaction (-%d +%d)\n",
277 drbd_disk_str(mdev->state.disk), evicted, new_enr);
288 complete(&((struct update_al_work *)w)->event); 278 complete(&((struct update_al_work *)w)->event);
289 return 1; 279 return 1;
290 } 280 }
291 /* do we have to do a bitmap write, first? 281 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency: 282 * TODO reduce maximum latency:
293 * submit both bios, then wait for both, 283 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */ 284 * instead of doing two synchronous sector writes.
285 * For now, we must not write the transaction,
286 * if we cannot write out the bitmap of the evicted extent. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) 287 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); 288 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
297 289
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ 290 /* The bitmap write may have failed, causing a state change. */
291 if (mdev->state.disk < D_INCONSISTENT) {
292 dev_err(DEV,
293 "disk is %s, cannot write al transaction (-%d +%d)\n",
294 drbd_disk_str(mdev->state.disk), evicted, new_enr);
295 complete(&((struct update_al_work *)w)->event);
296 put_ldev(mdev);
297 return 1;
298 }
299
300 mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page); 301 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300 302
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); 303 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +741,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
739 unsigned int enr; 741 unsigned int enr;
740 unsigned long add = 0; 742 unsigned long add = 0;
741 char ppb[10]; 743 char ppb[10];
742 int i; 744 int i, tmp;
743 745
744 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 746 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
745 747
@@ -747,7 +749,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
747 enr = lc_element_by_index(mdev->act_log, i)->lc_number; 749 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
748 if (enr == LC_FREE) 750 if (enr == LC_FREE)
749 continue; 751 continue;
750 add += drbd_bm_ALe_set_all(mdev, enr); 752 tmp = drbd_bm_ALe_set_all(mdev, enr);
753 dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
754 add += tmp;
751 } 755 }
752 756
753 lc_unlock(mdev->act_log); 757 lc_unlock(mdev->act_log);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9bdcf4393c0a..1ea1a34e78b2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -114,11 +114,11 @@ struct drbd_conf;
114#define D_ASSERT(exp) if (!(exp)) \ 114#define D_ASSERT(exp) if (!(exp)) \
115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) 115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
116 116
117#define ERR_IF(exp) if (({ \ 117#define ERR_IF(exp) if (({ \
118 int _b = (exp) != 0; \ 118 int _b = (exp) != 0; \
119 if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ 119 if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
120 __func__, #exp, __FILE__, __LINE__); \ 120 __func__, #exp, __FILE__, __LINE__); \
121 _b; \ 121 _b; \
122 })) 122 }))
123 123
124/* Defines to control fault insertion */ 124/* Defines to control fault insertion */
@@ -749,17 +749,12 @@ struct drbd_epoch {
749 749
750/* drbd_epoch flag bits */ 750/* drbd_epoch flag bits */
751enum { 751enum {
752 DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
753 DE_BARRIER_IN_NEXT_EPOCH_DONE,
754 DE_CONTAINS_A_BARRIER,
755 DE_HAVE_BARRIER_NUMBER, 752 DE_HAVE_BARRIER_NUMBER,
756 DE_IS_FINISHING,
757}; 753};
758 754
759enum epoch_event { 755enum epoch_event {
760 EV_PUT, 756 EV_PUT,
761 EV_GOT_BARRIER_NR, 757 EV_GOT_BARRIER_NR,
762 EV_BARRIER_DONE,
763 EV_BECAME_LAST, 758 EV_BECAME_LAST,
764 EV_CLEANUP = 32, /* used as flag */ 759 EV_CLEANUP = 32, /* used as flag */
765}; 760};
@@ -801,11 +796,6 @@ enum {
801 __EE_CALL_AL_COMPLETE_IO, 796 __EE_CALL_AL_COMPLETE_IO,
802 __EE_MAY_SET_IN_SYNC, 797 __EE_MAY_SET_IN_SYNC,
803 798
804 /* This epoch entry closes an epoch using a barrier.
805 * On successful completion, the epoch is released,
806 * and the P_BARRIER_ACK sent. */
807 __EE_IS_BARRIER,
808
809 /* In case a barrier failed, 799 /* In case a barrier failed,
810 * we need to resubmit without the barrier flag. */ 800 * we need to resubmit without the barrier flag. */
811 __EE_RESUBMITTED, 801 __EE_RESUBMITTED,
@@ -820,7 +810,6 @@ enum {
820}; 810};
821#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 811#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
822#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 812#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
823#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
824#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 813#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
825#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 814#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
826#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) 815#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
@@ -843,16 +832,15 @@ enum {
843 * Gets cleared when the state.conn 832 * Gets cleared when the state.conn
844 * goes into C_CONNECTED state. */ 833 * goes into C_CONNECTED state. */
845 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ 834 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
846 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
847 CONSIDER_RESYNC, 835 CONSIDER_RESYNC,
848 836
849 MD_NO_BARRIER, /* meta data device does not support barriers, 837 MD_NO_FUA, /* User wants us not to use FUA/FLUSH on meta data dev */
850 so don't even try */
851 SUSPEND_IO, /* suspend application io */ 838 SUSPEND_IO, /* suspend application io */
852 BITMAP_IO, /* suspend application io; 839 BITMAP_IO, /* suspend application io;
853 once no more io in flight, start bitmap io */ 840 once no more io in flight, start bitmap io */
854 BITMAP_IO_QUEUED, /* Started bitmap IO */ 841 BITMAP_IO_QUEUED, /* Started bitmap IO */
855 GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ 842 GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
843 WAS_IO_ERROR, /* Local disk failed (returned an IO error) */
856 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ 844 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
857 NET_CONGESTED, /* The data socket is congested */ 845 NET_CONGESTED, /* The data socket is congested */
858 846
@@ -947,7 +935,6 @@ enum write_ordering_e {
947 WO_none, 935 WO_none,
948 WO_drain_io, 936 WO_drain_io,
949 WO_bdev_flush, 937 WO_bdev_flush,
950 WO_bio_barrier
951}; 938};
952 939
953struct fifo_buffer { 940struct fifo_buffer {
@@ -1281,6 +1268,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1281extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1268extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1282extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); 1269extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1283extern void drbd_go_diskless(struct drbd_conf *mdev); 1270extern void drbd_go_diskless(struct drbd_conf *mdev);
1271extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1284 1272
1285 1273
1286/* Meta data layout 1274/* Meta data layout
@@ -1798,17 +1786,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
1798 case EP_PASS_ON: 1786 case EP_PASS_ON:
1799 if (!forcedetach) { 1787 if (!forcedetach) {
1800 if (__ratelimit(&drbd_ratelimit_state)) 1788 if (__ratelimit(&drbd_ratelimit_state))
1801 dev_err(DEV, "Local IO failed in %s." 1789 dev_err(DEV, "Local IO failed in %s.\n", where);
1802 "Passing error on...\n", where);
1803 break; 1790 break;
1804 } 1791 }
1805 /* NOTE fall through to detach case if forcedetach set */ 1792 /* NOTE fall through to detach case if forcedetach set */
1806 case EP_DETACH: 1793 case EP_DETACH:
1807 case EP_CALL_HELPER: 1794 case EP_CALL_HELPER:
1795 set_bit(WAS_IO_ERROR, &mdev->flags);
1808 if (mdev->state.disk > D_FAILED) { 1796 if (mdev->state.disk > D_FAILED) {
1809 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); 1797 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1810 dev_err(DEV, "Local IO failed in %s." 1798 dev_err(DEV,
1811 "Detaching...\n", where); 1799 "Local IO failed in %s. Detaching...\n", where);
1812 } 1800 }
1813 break; 1801 break;
1814 } 1802 }
@@ -1874,7 +1862,7 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1874static inline sector_t drbd_get_capacity(struct block_device *bdev) 1862static inline sector_t drbd_get_capacity(struct block_device *bdev)
1875{ 1863{
1876 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ 1864 /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
1877 return bdev ? bdev->bd_inode->i_size >> 9 : 0; 1865 return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
1878} 1866}
1879 1867
1880/** 1868/**
@@ -2127,7 +2115,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
2127 __release(local); 2115 __release(local);
2128 D_ASSERT(i >= 0); 2116 D_ASSERT(i >= 0);
2129 if (i == 0) { 2117 if (i == 0) {
2118 if (mdev->state.disk == D_DISKLESS)
2119 /* even internal references gone, safe to destroy */
2120 drbd_ldev_destroy(mdev);
2130 if (mdev->state.disk == D_FAILED) 2121 if (mdev->state.disk == D_FAILED)
2122 /* all application IO references gone. */
2131 drbd_go_diskless(mdev); 2123 drbd_go_diskless(mdev);
2132 wake_up(&mdev->misc_wait); 2124 wake_up(&mdev->misc_wait);
2133 } 2125 }
@@ -2138,6 +2130,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
2138{ 2130{
2139 int io_allowed; 2131 int io_allowed;
2140 2132
2133 /* never get a reference while D_DISKLESS */
2134 if (mdev->state.disk == D_DISKLESS)
2135 return 0;
2136
2141 atomic_inc(&mdev->local_cnt); 2137 atomic_inc(&mdev->local_cnt);
2142 io_allowed = (mdev->state.disk >= mins); 2138 io_allowed = (mdev->state.disk >= mins);
2143 if (!io_allowed) 2139 if (!io_allowed)
@@ -2406,12 +2402,12 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2406{ 2402{
2407 int r; 2403 int r;
2408 2404
2409 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2405 if (test_bit(MD_NO_FUA, &mdev->flags))
2410 return; 2406 return;
2411 2407
2412 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); 2408 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
2413 if (r) { 2409 if (r) {
2414 set_bit(MD_NO_BARRIER, &mdev->flags); 2410 set_bit(MD_NO_FUA, &mdev->flags);
2415 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2411 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2416 } 2412 }
2417} 2413}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 25c7a73c5062..6be5401d0e88 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -835,6 +835,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
835 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) 835 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
836 ns.conn = os.conn; 836 ns.conn = os.conn;
837 837
838 /* we cannot fail (again) if we already detached */
839 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
840 ns.disk = D_DISKLESS;
841
842 /* if we are only D_ATTACHING yet,
843 * we can (and should) go directly to D_DISKLESS. */
844 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
845 ns.disk = D_DISKLESS;
846
838 /* After C_DISCONNECTING only C_STANDALONE may follow */ 847 /* After C_DISCONNECTING only C_STANDALONE may follow */
839 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 848 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
840 ns.conn = os.conn; 849 ns.conn = os.conn;
@@ -1056,7 +1065,15 @@ int __drbd_set_state(struct drbd_conf *mdev,
1056 !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) 1065 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1057 set_bit(DEVICE_DYING, &mdev->flags); 1066 set_bit(DEVICE_DYING, &mdev->flags);
1058 1067
1059 mdev->state.i = ns.i; 1068 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1069 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1070 * drbd_ldev_destroy() won't happen before our corresponding
1071 * after_state_ch works run, where we put_ldev again. */
1072 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1073 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1074 atomic_inc(&mdev->local_cnt);
1075
1076 mdev->state = ns;
1060 wake_up(&mdev->misc_wait); 1077 wake_up(&mdev->misc_wait);
1061 wake_up(&mdev->state_wait); 1078 wake_up(&mdev->state_wait);
1062 1079
@@ -1268,7 +1285,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1268 if (test_bit(NEW_CUR_UUID, &mdev->flags)) { 1285 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1269 drbd_uuid_new_current(mdev); 1286 drbd_uuid_new_current(mdev);
1270 clear_bit(NEW_CUR_UUID, &mdev->flags); 1287 clear_bit(NEW_CUR_UUID, &mdev->flags);
1271 drbd_md_sync(mdev);
1272 } 1288 }
1273 spin_lock_irq(&mdev->req_lock); 1289 spin_lock_irq(&mdev->req_lock);
1274 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); 1290 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
@@ -1365,63 +1381,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1365 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1381 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1366 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1382 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1367 1383
1368 /* first half of local IO error */ 1384 /* first half of local IO error, failure to attach,
1369 if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1385 * or administrative detach */
1370 enum drbd_io_error_p eh = EP_PASS_ON; 1386 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1387 enum drbd_io_error_p eh;
1388 int was_io_error;
1389 /* corresponding get_ldev was in __drbd_set_state, to serialize
1390 * our cleanup here with the transition to D_DISKLESS,
1391 * so it is safe to dereference ldev here. */
1392 eh = mdev->ldev->dc.on_io_error;
1393 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1394
1395 /* current state still has to be D_FAILED,
1396 * there is only one way out: to D_DISKLESS,
1397 * and that may only happen after our put_ldev below. */
1398 if (mdev->state.disk != D_FAILED)
1399 dev_err(DEV,
1400 "ASSERT FAILED: disk is %s during detach\n",
1401 drbd_disk_str(mdev->state.disk));
1371 1402
1372 if (drbd_send_state(mdev)) 1403 if (drbd_send_state(mdev))
1373 dev_warn(DEV, "Notified peer that my disk is broken.\n"); 1404 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1374 else 1405 else
1375 dev_err(DEV, "Sending state for drbd_io_error() failed\n"); 1406 dev_err(DEV, "Sending state for detaching disk failed\n");
1376 1407
1377 drbd_rs_cancel_all(mdev); 1408 drbd_rs_cancel_all(mdev);
1378 1409
1379 if (get_ldev_if_state(mdev, D_FAILED)) { 1410 /* In case we want to get something to stable storage still,
1380 eh = mdev->ldev->dc.on_io_error; 1411 * this may be the last chance.
1381 put_ldev(mdev); 1412 * Following put_ldev may transition to D_DISKLESS. */
1382 } 1413 drbd_md_sync(mdev);
1383 if (eh == EP_CALL_HELPER) 1414 put_ldev(mdev);
1415
1416 if (was_io_error && eh == EP_CALL_HELPER)
1384 drbd_khelper(mdev, "local-io-error"); 1417 drbd_khelper(mdev, "local-io-error");
1385 } 1418 }
1386 1419
1420 /* second half of local IO error, failure to attach,
1421 * or administrative detach,
1422 * after local_cnt references have reached zero again */
1423 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1424 /* We must still be diskless,
1425 * re-attach has to be serialized with this! */
1426 if (mdev->state.disk != D_DISKLESS)
1427 dev_err(DEV,
1428 "ASSERT FAILED: disk is %s while going diskless\n",
1429 drbd_disk_str(mdev->state.disk));
1387 1430
1388 /* second half of local IO error handling, 1431 mdev->rs_total = 0;
1389 * after local_cnt references have reached zero: */ 1432 mdev->rs_failed = 0;
1390 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { 1433 atomic_set(&mdev->rs_pending_cnt, 0);
1391 mdev->rs_total = 0;
1392 mdev->rs_failed = 0;
1393 atomic_set(&mdev->rs_pending_cnt, 0);
1394 }
1395
1396 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1397 /* We must still be diskless,
1398 * re-attach has to be serialized with this! */
1399 if (mdev->state.disk != D_DISKLESS)
1400 dev_err(DEV,
1401 "ASSERT FAILED: disk is %s while going diskless\n",
1402 drbd_disk_str(mdev->state.disk));
1403 1434
1404 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
1405 * will inc/dec it frequently. Since we became D_DISKLESS, no
1406 * one has touched the protected members anymore, though, so we
1407 * are safe to free them here. */
1408 if (drbd_send_state(mdev)) 1435 if (drbd_send_state(mdev))
1409 dev_warn(DEV, "Notified peer that I detached my disk.\n"); 1436 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1410 else 1437 else
1411 dev_err(DEV, "Sending state for detach failed\n"); 1438 dev_err(DEV, "Sending state for being diskless failed\n");
1412 1439 /* corresponding get_ldev in __drbd_set_state
1413 lc_destroy(mdev->resync); 1440 * this may finaly trigger drbd_ldev_destroy. */
1414 mdev->resync = NULL; 1441 put_ldev(mdev);
1415 lc_destroy(mdev->act_log);
1416 mdev->act_log = NULL;
1417 __no_warn(local,
1418 drbd_free_bc(mdev->ldev);
1419 mdev->ldev = NULL;);
1420
1421 if (mdev->md_io_tmpp) {
1422 __free_page(mdev->md_io_tmpp);
1423 mdev->md_io_tmpp = NULL;
1424 }
1425 } 1442 }
1426 1443
1427 /* Disks got bigger while they were detached */ 1444 /* Disks got bigger while they were detached */
@@ -2772,11 +2789,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2772 2789
2773 drbd_set_defaults(mdev); 2790 drbd_set_defaults(mdev);
2774 2791
2775 /* for now, we do NOT yet support it,
2776 * even though we start some framework
2777 * to eventually support barriers */
2778 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2779
2780 atomic_set(&mdev->ap_bio_cnt, 0); 2792 atomic_set(&mdev->ap_bio_cnt, 0);
2781 atomic_set(&mdev->ap_pending_cnt, 0); 2793 atomic_set(&mdev->ap_pending_cnt, 0);
2782 atomic_set(&mdev->rs_pending_cnt, 0); 2794 atomic_set(&mdev->rs_pending_cnt, 0);
@@ -2842,7 +2854,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2842 drbd_thread_init(mdev, &mdev->asender, drbd_asender); 2854 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2843 2855
2844 mdev->agreed_pro_version = PRO_VERSION_MAX; 2856 mdev->agreed_pro_version = PRO_VERSION_MAX;
2845 mdev->write_ordering = WO_bio_barrier; 2857 mdev->write_ordering = WO_bdev_flush;
2846 mdev->resync_wenr = LC_FREE; 2858 mdev->resync_wenr = LC_FREE;
2847} 2859}
2848 2860
@@ -2899,7 +2911,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2899 D_ASSERT(list_empty(&mdev->resync_work.list)); 2911 D_ASSERT(list_empty(&mdev->resync_work.list));
2900 D_ASSERT(list_empty(&mdev->unplug_work.list)); 2912 D_ASSERT(list_empty(&mdev->unplug_work.list));
2901 D_ASSERT(list_empty(&mdev->go_diskless.list)); 2913 D_ASSERT(list_empty(&mdev->go_diskless.list));
2902
2903} 2914}
2904 2915
2905 2916
@@ -3660,6 +3671,8 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3660 3671
3661 get_random_bytes(&val, sizeof(u64)); 3672 get_random_bytes(&val, sizeof(u64));
3662 _drbd_uuid_set(mdev, UI_CURRENT, val); 3673 _drbd_uuid_set(mdev, UI_CURRENT, val);
3674 /* get it to stable storage _now_ */
3675 drbd_md_sync(mdev);
3663} 3676}
3664 3677
3665void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) 3678void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
@@ -3756,19 +3769,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3756 return 1; 3769 return 1;
3757} 3770}
3758 3771
3772void drbd_ldev_destroy(struct drbd_conf *mdev)
3773{
3774 lc_destroy(mdev->resync);
3775 mdev->resync = NULL;
3776 lc_destroy(mdev->act_log);
3777 mdev->act_log = NULL;
3778 __no_warn(local,
3779 drbd_free_bc(mdev->ldev);
3780 mdev->ldev = NULL;);
3781
3782 if (mdev->md_io_tmpp) {
3783 __free_page(mdev->md_io_tmpp);
3784 mdev->md_io_tmpp = NULL;
3785 }
3786 clear_bit(GO_DISKLESS, &mdev->flags);
3787}
3788
3759static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3789static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3760{ 3790{
3761 D_ASSERT(mdev->state.disk == D_FAILED); 3791 D_ASSERT(mdev->state.disk == D_FAILED);
3762 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will 3792 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3763 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch 3793 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3764 * the protected members anymore, though, so in the after_state_ch work 3794 * the protected members anymore, though, so once put_ldev reaches zero
3765 * it will be safe to free them. */ 3795 * again, it will be safe to free them. */
3766 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3796 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3767 /* We need to wait for return of references checked out while we still
3768 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
3769 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3770
3771 clear_bit(GO_DISKLESS, &mdev->flags);
3772 return 1; 3797 return 1;
3773} 3798}
3774 3799
@@ -3777,9 +3802,6 @@ void drbd_go_diskless(struct drbd_conf *mdev)
3777 D_ASSERT(mdev->state.disk == D_FAILED); 3802 D_ASSERT(mdev->state.disk == D_FAILED);
3778 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) 3803 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3779 drbd_queue_work(&mdev->data.work, &mdev->go_diskless); 3804 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3780 /* don't drbd_queue_work_front,
3781 * we need to serialize with the after_state_ch work
3782 * of the -> D_FAILED transition. */
3783} 3805}
3784 3806
3785/** 3807/**
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 87925e97e613..29e5c70e4e26 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -870,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
870 retcode = ERR_DISK_CONFIGURED; 870 retcode = ERR_DISK_CONFIGURED;
871 goto fail; 871 goto fail;
872 } 872 }
873 /* It may just now have detached because of IO error. Make sure
874 * drbd_ldev_destroy is done already, we may end up here very fast,
875 * e.g. if someone calls attach from the on-io-error handler,
876 * to realize a "hot spare" feature (not that I'd recommend that) */
877 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
873 878
874 /* allocation not in the IO path, cqueue thread context */ 879 /* allocation not in the IO path, cqueue thread context */
875 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); 880 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -1098,9 +1103,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1098 /* Reset the "barriers don't work" bits here, then force meta data to 1103 /* Reset the "barriers don't work" bits here, then force meta data to
1099 * be written, to ensure we determine if barriers are supported. */ 1104 * be written, to ensure we determine if barriers are supported. */
1100 if (nbc->dc.no_md_flush) 1105 if (nbc->dc.no_md_flush)
1101 set_bit(MD_NO_BARRIER, &mdev->flags); 1106 set_bit(MD_NO_FUA, &mdev->flags);
1102 else 1107 else
1103 clear_bit(MD_NO_BARRIER, &mdev->flags); 1108 clear_bit(MD_NO_FUA, &mdev->flags);
1104 1109
1105 /* Point of no return reached. 1110 /* Point of no return reached.
1106 * Devices and memory are no longer released by error cleanup below. 1111 * Devices and memory are no longer released by error cleanup below.
@@ -1112,8 +1117,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1112 nbc = NULL; 1117 nbc = NULL;
1113 resync_lru = NULL; 1118 resync_lru = NULL;
1114 1119
1115 mdev->write_ordering = WO_bio_barrier; 1120 mdev->write_ordering = WO_bdev_flush;
1116 drbd_bump_write_ordering(mdev, WO_bio_barrier); 1121 drbd_bump_write_ordering(mdev, WO_bdev_flush);
1117 1122
1118 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) 1123 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1119 set_bit(CRASHED_PRIMARY, &mdev->flags); 1124 set_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1262,7 +1267,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1262 force_diskless_dec: 1267 force_diskless_dec:
1263 put_ldev(mdev); 1268 put_ldev(mdev);
1264 force_diskless: 1269 force_diskless:
1265 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 1270 drbd_force_state(mdev, NS(disk, D_FAILED));
1266 drbd_md_sync(mdev); 1271 drbd_md_sync(mdev);
1267 release_bdev2_fail: 1272 release_bdev2_fail:
1268 if (nbc) 1273 if (nbc)
@@ -1285,10 +1290,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1285 return 0; 1290 return 0;
1286} 1291}
1287 1292
1293/* Detaching the disk is a process in multiple stages. First we need to lock
1294 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1295 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1296 * internal references as well.
1297 * Only then we have finally detached. */
1288static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1298static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1289 struct drbd_nl_cfg_reply *reply) 1299 struct drbd_nl_cfg_reply *reply)
1290{ 1300{
1301 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1291 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1302 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1303 if (mdev->state.disk == D_DISKLESS)
1304 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1305 drbd_resume_io(mdev);
1292 return 0; 1306 return 0;
1293} 1307}
1294 1308
@@ -1953,7 +1967,6 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1953 if (test_bit(NEW_CUR_UUID, &mdev->flags)) { 1967 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1954 drbd_uuid_new_current(mdev); 1968 drbd_uuid_new_current(mdev);
1955 clear_bit(NEW_CUR_UUID, &mdev->flags); 1969 clear_bit(NEW_CUR_UUID, &mdev->flags);
1956 drbd_md_sync(mdev);
1957 } 1970 }
1958 drbd_suspend_io(mdev); 1971 drbd_suspend_io(mdev);
1959 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); 1972 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index ad325c5d0ce1..7e6ac307e2de 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -158,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
158 [WO_none] = 'n', 158 [WO_none] = 'n',
159 [WO_drain_io] = 'd', 159 [WO_drain_io] = 'd',
160 [WO_bdev_flush] = 'f', 160 [WO_bdev_flush] = 'f',
161 [WO_bio_barrier] = 'b',
162 }; 161 };
163 162
164 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 163 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index efd6169acf2f..24487d4fb202 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -36,7 +36,6 @@
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/smp_lock.h>
40#include <linux/pkt_sched.h> 39#include <linux/pkt_sched.h>
41#define __KERNEL_SYSCALLS__ 40#define __KERNEL_SYSCALLS__
42#include <linux/unistd.h> 41#include <linux/unistd.h>
@@ -49,11 +48,6 @@
49 48
50#include "drbd_vli.h" 49#include "drbd_vli.h"
51 50
52struct flush_work {
53 struct drbd_work w;
54 struct drbd_epoch *epoch;
55};
56
57enum finish_epoch { 51enum finish_epoch {
58 FE_STILL_LIVE, 52 FE_STILL_LIVE,
59 FE_DESTROYED, 53 FE_DESTROYED,
@@ -66,16 +60,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
66static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); 60static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
67static int e_end_block(struct drbd_conf *, struct drbd_work *, int); 61static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
68 62
69static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
70{
71 struct drbd_epoch *prev;
72 spin_lock(&mdev->epoch_lock);
73 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
74 if (prev == epoch || prev == mdev->current_epoch)
75 prev = NULL;
76 spin_unlock(&mdev->epoch_lock);
77 return prev;
78}
79 63
80#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 64#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
81 65
@@ -981,7 +965,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
981 return TRUE; 965 return TRUE;
982} 966}
983 967
984static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) 968static void drbd_flush(struct drbd_conf *mdev)
985{ 969{
986 int rv; 970 int rv;
987 971
@@ -997,24 +981,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
997 } 981 }
998 put_ldev(mdev); 982 put_ldev(mdev);
999 } 983 }
1000
1001 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1002}
1003
1004static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1005{
1006 struct flush_work *fw = (struct flush_work *)w;
1007 struct drbd_epoch *epoch = fw->epoch;
1008
1009 kfree(w);
1010
1011 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
1012 drbd_flush_after_epoch(mdev, epoch);
1013
1014 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1015 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1016
1017 return 1;
1018} 984}
1019 985
1020/** 986/**
@@ -1027,15 +993,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1027 struct drbd_epoch *epoch, 993 struct drbd_epoch *epoch,
1028 enum epoch_event ev) 994 enum epoch_event ev)
1029{ 995{
1030 int finish, epoch_size; 996 int epoch_size;
1031 struct drbd_epoch *next_epoch; 997 struct drbd_epoch *next_epoch;
1032 int schedule_flush = 0;
1033 enum finish_epoch rv = FE_STILL_LIVE; 998 enum finish_epoch rv = FE_STILL_LIVE;
1034 999
1035 spin_lock(&mdev->epoch_lock); 1000 spin_lock(&mdev->epoch_lock);
1036 do { 1001 do {
1037 next_epoch = NULL; 1002 next_epoch = NULL;
1038 finish = 0;
1039 1003
1040 epoch_size = atomic_read(&epoch->epoch_size); 1004 epoch_size = atomic_read(&epoch->epoch_size);
1041 1005
@@ -1045,16 +1009,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1045 break; 1009 break;
1046 case EV_GOT_BARRIER_NR: 1010 case EV_GOT_BARRIER_NR:
1047 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1011 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1048
1049 /* Special case: If we just switched from WO_bio_barrier to
1050 WO_bdev_flush we should not finish the current epoch */
1051 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1052 mdev->write_ordering != WO_bio_barrier &&
1053 epoch == mdev->current_epoch)
1054 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1055 break;
1056 case EV_BARRIER_DONE:
1057 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1058 break; 1012 break;
1059 case EV_BECAME_LAST: 1013 case EV_BECAME_LAST:
1060 /* nothing to do*/ 1014 /* nothing to do*/
@@ -1063,23 +1017,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1063 1017
1064 if (epoch_size != 0 && 1018 if (epoch_size != 0 &&
1065 atomic_read(&epoch->active) == 0 && 1019 atomic_read(&epoch->active) == 0 &&
1066 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && 1020 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
1067 epoch->list.prev == &mdev->current_epoch->list &&
1068 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1069 /* Nearly all conditions are met to finish that epoch... */
1070 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1071 mdev->write_ordering == WO_none ||
1072 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1073 ev & EV_CLEANUP) {
1074 finish = 1;
1075 set_bit(DE_IS_FINISHING, &epoch->flags);
1076 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1077 mdev->write_ordering == WO_bio_barrier) {
1078 atomic_inc(&epoch->active);
1079 schedule_flush = 1;
1080 }
1081 }
1082 if (finish) {
1083 if (!(ev & EV_CLEANUP)) { 1021 if (!(ev & EV_CLEANUP)) {
1084 spin_unlock(&mdev->epoch_lock); 1022 spin_unlock(&mdev->epoch_lock);
1085 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); 1023 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1102,6 +1040,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1102 /* atomic_set(&epoch->active, 0); is already zero */ 1040 /* atomic_set(&epoch->active, 0); is already zero */
1103 if (rv == FE_STILL_LIVE) 1041 if (rv == FE_STILL_LIVE)
1104 rv = FE_RECYCLED; 1042 rv = FE_RECYCLED;
1043 wake_up(&mdev->ee_wait);
1105 } 1044 }
1106 } 1045 }
1107 1046
@@ -1113,22 +1052,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1113 1052
1114 spin_unlock(&mdev->epoch_lock); 1053 spin_unlock(&mdev->epoch_lock);
1115 1054
1116 if (schedule_flush) {
1117 struct flush_work *fw;
1118 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1119 if (fw) {
1120 fw->w.cb = w_flush;
1121 fw->epoch = epoch;
1122 drbd_queue_work(&mdev->data.work, &fw->w);
1123 } else {
1124 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1125 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1126 /* That is not a recursion, only one level */
1127 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1128 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1129 }
1130 }
1131
1132 return rv; 1055 return rv;
1133} 1056}
1134 1057
@@ -1144,19 +1067,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1144 [WO_none] = "none", 1067 [WO_none] = "none",
1145 [WO_drain_io] = "drain", 1068 [WO_drain_io] = "drain",
1146 [WO_bdev_flush] = "flush", 1069 [WO_bdev_flush] = "flush",
1147 [WO_bio_barrier] = "barrier",
1148 }; 1070 };
1149 1071
1150 pwo = mdev->write_ordering; 1072 pwo = mdev->write_ordering;
1151 wo = min(pwo, wo); 1073 wo = min(pwo, wo);
1152 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1153 wo = WO_bdev_flush;
1154 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) 1074 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1155 wo = WO_drain_io; 1075 wo = WO_drain_io;
1156 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) 1076 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1157 wo = WO_none; 1077 wo = WO_none;
1158 mdev->write_ordering = wo; 1078 mdev->write_ordering = wo;
1159 if (pwo != mdev->write_ordering || wo == WO_bio_barrier) 1079 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
1160 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); 1080 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1161} 1081}
1162 1082
@@ -1192,7 +1112,7 @@ next_bio:
1192 bio->bi_sector = sector; 1112 bio->bi_sector = sector;
1193 bio->bi_bdev = mdev->ldev->backing_bdev; 1113 bio->bi_bdev = mdev->ldev->backing_bdev;
1194 /* we special case some flags in the multi-bio case, see below 1114 /* we special case some flags in the multi-bio case, see below
1195 * (REQ_UNPLUG, REQ_HARDBARRIER) */ 1115 * (REQ_UNPLUG) */
1196 bio->bi_rw = rw; 1116 bio->bi_rw = rw;
1197 bio->bi_private = e; 1117 bio->bi_private = e;
1198 bio->bi_end_io = drbd_endio_sec; 1118 bio->bi_end_io = drbd_endio_sec;
@@ -1226,11 +1146,6 @@ next_bio:
1226 bio->bi_rw &= ~REQ_UNPLUG; 1146 bio->bi_rw &= ~REQ_UNPLUG;
1227 1147
1228 drbd_generic_make_request(mdev, fault_type, bio); 1148 drbd_generic_make_request(mdev, fault_type, bio);
1229
1230 /* strip off REQ_HARDBARRIER,
1231 * unless it is the first or last bio */
1232 if (bios && bios->bi_next)
1233 bios->bi_rw &= ~REQ_HARDBARRIER;
1234 } while (bios); 1149 } while (bios);
1235 maybe_kick_lo(mdev); 1150 maybe_kick_lo(mdev);
1236 return 0; 1151 return 0;
@@ -1244,45 +1159,9 @@ fail:
1244 return -ENOMEM; 1159 return -ENOMEM;
1245} 1160}
1246 1161
1247/**
1248 * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1249 * @mdev: DRBD device.
1250 * @w: work object.
1251 * @cancel: The connection will be closed anyways (unused in this callback)
1252 */
1253int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1254{
1255 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1256 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1257 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1258 so that we can finish that epoch in drbd_may_finish_epoch().
1259 That is necessary if we already have a long chain of Epochs, before
1260 we realize that REQ_HARDBARRIER is actually not supported */
1261
1262 /* As long as the -ENOTSUPP on the barrier is reported immediately
1263 that will never trigger. If it is reported late, we will just
1264 print that warning and continue correctly for all future requests
1265 with WO_bdev_flush */
1266 if (previous_epoch(mdev, e->epoch))
1267 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1268
1269 /* we still have a local reference,
1270 * get_ldev was done in receive_Data. */
1271
1272 e->w.cb = e_end_block;
1273 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1274 /* drbd_submit_ee fails for one reason only:
1275 * if was not able to allocate sufficient bios.
1276 * requeue, try again later. */
1277 e->w.cb = w_e_reissue;
1278 drbd_queue_work(&mdev->data.work, &e->w);
1279 }
1280 return 1;
1281}
1282
1283static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 1162static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1284{ 1163{
1285 int rv, issue_flush; 1164 int rv;
1286 struct p_barrier *p = &mdev->data.rbuf.barrier; 1165 struct p_barrier *p = &mdev->data.rbuf.barrier;
1287 struct drbd_epoch *epoch; 1166 struct drbd_epoch *epoch;
1288 1167
@@ -1300,44 +1179,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1300 * Therefore we must send the barrier_ack after the barrier request was 1179 * Therefore we must send the barrier_ack after the barrier request was
1301 * completed. */ 1180 * completed. */
1302 switch (mdev->write_ordering) { 1181 switch (mdev->write_ordering) {
1303 case WO_bio_barrier:
1304 case WO_none: 1182 case WO_none:
1305 if (rv == FE_RECYCLED) 1183 if (rv == FE_RECYCLED)
1306 return TRUE; 1184 return TRUE;
1307 break; 1185
1186 /* receiver context, in the writeout path of the other node.
1187 * avoid potential distributed deadlock */
1188 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1189 if (epoch)
1190 break;
1191 else
1192 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1193 /* Fall through */
1308 1194
1309 case WO_bdev_flush: 1195 case WO_bdev_flush:
1310 case WO_drain_io: 1196 case WO_drain_io:
1311 if (rv == FE_STILL_LIVE) {
1312 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1313 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1314 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1315 }
1316 if (rv == FE_RECYCLED)
1317 return TRUE;
1318
1319 /* The asender will send all the ACKs and barrier ACKs out, since
1320 all EEs moved from the active_ee to the done_ee. We need to
1321 provide a new epoch object for the EEs that come in soon */
1322 break;
1323 }
1324
1325 /* receiver context, in the writeout path of the other node.
1326 * avoid potential distributed deadlock */
1327 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1328 if (!epoch) {
1329 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1330 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1331 drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 1197 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1332 if (issue_flush) { 1198 drbd_flush(mdev);
1333 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); 1199
1334 if (rv == FE_RECYCLED) 1200 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1335 return TRUE; 1201 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1202 if (epoch)
1203 break;
1336 } 1204 }
1337 1205
1338 drbd_wait_ee_list_empty(mdev, &mdev->done_ee); 1206 epoch = mdev->current_epoch;
1207 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1208
1209 D_ASSERT(atomic_read(&epoch->active) == 0);
1210 D_ASSERT(epoch->flags == 0);
1339 1211
1340 return TRUE; 1212 return TRUE;
1213 default:
1214 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1215 return FALSE;
1341 } 1216 }
1342 1217
1343 epoch->flags = 0; 1218 epoch->flags = 0;
@@ -1652,15 +1527,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1652{ 1527{
1653 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1528 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1654 sector_t sector = e->sector; 1529 sector_t sector = e->sector;
1655 struct drbd_epoch *epoch;
1656 int ok = 1, pcmd; 1530 int ok = 1, pcmd;
1657 1531
1658 if (e->flags & EE_IS_BARRIER) {
1659 epoch = previous_epoch(mdev, e->epoch);
1660 if (epoch)
1661 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1662 }
1663
1664 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1532 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1665 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1533 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1666 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1534 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1817,27 +1685,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1817 e->epoch = mdev->current_epoch; 1685 e->epoch = mdev->current_epoch;
1818 atomic_inc(&e->epoch->epoch_size); 1686 atomic_inc(&e->epoch->epoch_size);
1819 atomic_inc(&e->epoch->active); 1687 atomic_inc(&e->epoch->active);
1820
1821 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1822 struct drbd_epoch *epoch;
1823 /* Issue a barrier if we start a new epoch, and the previous epoch
1824 was not a epoch containing a single request which already was
1825 a Barrier. */
1826 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1827 if (epoch == e->epoch) {
1828 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1829 rw |= REQ_HARDBARRIER;
1830 e->flags |= EE_IS_BARRIER;
1831 } else {
1832 if (atomic_read(&epoch->epoch_size) > 1 ||
1833 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1834 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1835 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1836 rw |= REQ_HARDBARRIER;
1837 e->flags |= EE_IS_BARRIER;
1838 }
1839 }
1840 }
1841 spin_unlock(&mdev->epoch_lock); 1688 spin_unlock(&mdev->epoch_lock);
1842 1689
1843 dp_flags = be32_to_cpu(p->dp_flags); 1690 dp_flags = be32_to_cpu(p->dp_flags);
@@ -1995,10 +1842,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1995 break; 1842 break;
1996 } 1843 }
1997 1844
1998 if (mdev->state.pdsk == D_DISKLESS) { 1845 if (mdev->state.pdsk < D_INCONSISTENT) {
1999 /* In case we have the only disk of the cluster, */ 1846 /* In case we have the only disk of the cluster, */
2000 drbd_set_out_of_sync(mdev, e->sector, e->size); 1847 drbd_set_out_of_sync(mdev, e->sector, e->size);
2001 e->flags |= EE_CALL_AL_COMPLETE_IO; 1848 e->flags |= EE_CALL_AL_COMPLETE_IO;
1849 e->flags &= ~EE_MAY_SET_IN_SYNC;
2002 drbd_al_begin_io(mdev, e->sector); 1850 drbd_al_begin_io(mdev, e->sector);
2003 } 1851 }
2004 1852
@@ -3362,7 +3210,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3362 if (ns.conn == C_MASK) { 3210 if (ns.conn == C_MASK) {
3363 ns.conn = C_CONNECTED; 3211 ns.conn = C_CONNECTED;
3364 if (mdev->state.disk == D_NEGOTIATING) { 3212 if (mdev->state.disk == D_NEGOTIATING) {
3365 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3213 drbd_force_state(mdev, NS(disk, D_FAILED));
3366 } else if (peer_state.disk == D_NEGOTIATING) { 3214 } else if (peer_state.disk == D_NEGOTIATING) {
3367 dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); 3215 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3368 peer_state.disk = D_DISKLESS; 3216 peer_state.disk = D_DISKLESS;
@@ -3779,17 +3627,19 @@ static void drbdd(struct drbd_conf *mdev)
3779 } 3627 }
3780 3628
3781 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header); 3629 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
3782 rv = drbd_recv(mdev, &header->h80.payload, shs);
3783 if (unlikely(rv != shs)) {
3784 dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
3785 goto err_out;
3786 }
3787
3788 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) { 3630 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3789 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size); 3631 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3790 goto err_out; 3632 goto err_out;
3791 } 3633 }
3792 3634
3635 if (shs) {
3636 rv = drbd_recv(mdev, &header->h80.payload, shs);
3637 if (unlikely(rv != shs)) {
3638 dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
3639 goto err_out;
3640 }
3641 }
3642
3793 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs); 3643 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3794 3644
3795 if (unlikely(!rv)) { 3645 if (unlikely(!rv)) {
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9e91a2545fc8..11a75d32a2e2 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -258,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
258 if (!hlist_unhashed(&req->colision)) 258 if (!hlist_unhashed(&req->colision))
259 hlist_del(&req->colision); 259 hlist_del(&req->colision);
260 else 260 else
261 D_ASSERT((s & RQ_NET_MASK) == 0); 261 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
262 262
263 /* for writes we need to do some extra housekeeping */ 263 /* for writes we need to do some extra housekeeping */
264 if (rw == WRITE) 264 if (rw == WRITE)
@@ -813,7 +813,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
813 mdev->state.conn >= C_CONNECTED)); 813 mdev->state.conn >= C_CONNECTED));
814 814
815 if (!(local || remote) && !is_susp(mdev->state)) { 815 if (!(local || remote) && !is_susp(mdev->state)) {
816 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 816 if (__ratelimit(&drbd_ratelimit_state))
817 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
817 goto fail_free_complete; 818 goto fail_free_complete;
818 } 819 }
819 820
@@ -942,12 +943,21 @@ allocate_barrier:
942 if (local) { 943 if (local) {
943 req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 944 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
944 945
945 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR 946 /* State may have changed since we grabbed our reference on the
946 : rw == READ ? DRBD_FAULT_DT_RD 947 * mdev->ldev member. Double check, and short-circuit to endio.
947 : DRBD_FAULT_DT_RA)) 948 * In case the last activity log transaction failed to get on
949 * stable storage, and this is a WRITE, we may not even submit
950 * this bio. */
951 if (get_ldev(mdev)) {
952 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
953 : rw == READ ? DRBD_FAULT_DT_RD
954 : DRBD_FAULT_DT_RA))
955 bio_endio(req->private_bio, -EIO);
956 else
957 generic_make_request(req->private_bio);
958 put_ldev(mdev);
959 } else
948 bio_endio(req->private_bio, -EIO); 960 bio_endio(req->private_bio, -EIO);
949 else
950 generic_make_request(req->private_bio);
951 } 961 }
952 962
953 /* we need to plug ALWAYS since we possibly need to kick lo_dev. 963 /* we need to plug ALWAYS since we possibly need to kick lo_dev.
@@ -1022,20 +1032,6 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1022 return 0; 1032 return 0;
1023 } 1033 }
1024 1034
1025 /* Reject barrier requests if we know the underlying device does
1026 * not support them.
1027 * XXX: Need to get this info from peer as well some how so we
1028 * XXX: reject if EITHER side/data/metadata area does not support them.
1029 *
1030 * because of those XXX, this is not yet enabled,
1031 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1032 */
1033 if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) {
1034 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1035 bio_endio(bio, -EOPNOTSUPP);
1036 return 0;
1037 }
1038
1039 /* 1035 /*
1040 * what we "blindly" assume: 1036 * what we "blindly" assume:
1041 */ 1037 */
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 181ea0364822..ab2bd09d54b4 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -339,7 +339,8 @@ static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
339} 339}
340 340
341/* completion of master bio is outside of spinlock. 341/* completion of master bio is outside of spinlock.
342 * If you need it irqsave, do it your self! */ 342 * If you need it irqsave, do it your self!
343 * Which means: don't use from bio endio callback. */
343static inline int req_mod(struct drbd_request *req, 344static inline int req_mod(struct drbd_request *req,
344 enum drbd_req_event what) 345 enum drbd_req_event what)
345{ 346{
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 108d58015cd1..34f224b018b3 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -26,7 +26,6 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/drbd.h> 27#include <linux/drbd.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/smp_lock.h>
30#include <linux/wait.h> 29#include <linux/wait.h>
31#include <linux/mm.h> 30#include <linux/mm.h>
32#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
@@ -102,12 +101,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
102 put_ldev(mdev); 101 put_ldev(mdev);
103} 102}
104 103
105static int is_failed_barrier(int ee_flags)
106{
107 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
108 == (EE_IS_BARRIER|EE_WAS_ERROR);
109}
110
111/* writes on behalf of the partner, or resync writes, 104/* writes on behalf of the partner, or resync writes,
112 * "submitted" by the receiver, final stage. */ 105 * "submitted" by the receiver, final stage. */
113static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) 106static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -119,21 +112,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
119 int is_syncer_req; 112 int is_syncer_req;
120 int do_al_complete_io; 113 int do_al_complete_io;
121 114
122 /* if this is a failed barrier request, disable use of barriers,
123 * and schedule for resubmission */
124 if (is_failed_barrier(e->flags)) {
125 drbd_bump_write_ordering(mdev, WO_bdev_flush);
126 spin_lock_irqsave(&mdev->req_lock, flags);
127 list_del(&e->w.list);
128 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
129 e->w.cb = w_e_reissue;
130 /* put_ldev actually happens below, once we come here again. */
131 __release(local);
132 spin_unlock_irqrestore(&mdev->req_lock, flags);
133 drbd_queue_work(&mdev->data.work, &e->w);
134 return;
135 }
136
137 D_ASSERT(e->block_id != ID_VACANT); 115 D_ASSERT(e->block_id != ID_VACANT);
138 116
139 /* after we moved e to done_ee, 117 /* after we moved e to done_ee,
@@ -215,8 +193,10 @@ void drbd_endio_sec(struct bio *bio, int error)
215 */ 193 */
216void drbd_endio_pri(struct bio *bio, int error) 194void drbd_endio_pri(struct bio *bio, int error)
217{ 195{
196 unsigned long flags;
218 struct drbd_request *req = bio->bi_private; 197 struct drbd_request *req = bio->bi_private;
219 struct drbd_conf *mdev = req->mdev; 198 struct drbd_conf *mdev = req->mdev;
199 struct bio_and_error m;
220 enum drbd_req_event what; 200 enum drbd_req_event what;
221 int uptodate = bio_flagged(bio, BIO_UPTODATE); 201 int uptodate = bio_flagged(bio, BIO_UPTODATE);
222 202
@@ -242,7 +222,13 @@ void drbd_endio_pri(struct bio *bio, int error)
242 bio_put(req->private_bio); 222 bio_put(req->private_bio);
243 req->private_bio = ERR_PTR(error); 223 req->private_bio = ERR_PTR(error);
244 224
245 req_mod(req, what); 225 /* not req_mod(), we need irqsave here! */
226 spin_lock_irqsave(&mdev->req_lock, flags);
227 __req_mod(req, what, &m);
228 spin_unlock_irqrestore(&mdev->req_lock, flags);
229
230 if (m.bio)
231 complete_master_bio(mdev, &m);
246} 232}
247 233
248int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 234int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -925,7 +911,7 @@ out:
925 drbd_md_sync(mdev); 911 drbd_md_sync(mdev);
926 912
927 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { 913 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
928 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); 914 dev_info(DEV, "Writing the whole bitmap\n");
929 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); 915 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
930 } 916 }
931 917
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 767107cce982..3951020e494a 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4363,9 +4363,9 @@ out_unreg_blkdev:
4363out_put_disk: 4363out_put_disk:
4364 while (dr--) { 4364 while (dr--) {
4365 del_timer(&motor_off_timer[dr]); 4365 del_timer(&motor_off_timer[dr]);
4366 put_disk(disks[dr]);
4367 if (disks[dr]->queue) 4366 if (disks[dr]->queue)
4368 blk_cleanup_queue(disks[dr]->queue); 4367 blk_cleanup_queue(disks[dr]->queue);
4368 put_disk(disks[dr]);
4369 } 4369 }
4370 return err; 4370 return err;
4371} 4371}
@@ -4573,8 +4573,8 @@ static void __exit floppy_module_exit(void)
4573 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); 4573 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
4574 platform_device_unregister(&floppy_device[drive]); 4574 platform_device_unregister(&floppy_device[drive]);
4575 } 4575 }
4576 put_disk(disks[drive]);
4577 blk_cleanup_queue(disks[drive]->queue); 4576 blk_cleanup_queue(disks[drive]->queue);
4577 put_disk(disks[drive]);
4578 } 4578 }
4579 4579
4580 del_timer_sync(&fd_timeout); 4580 del_timer_sync(&fd_timeout);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1e5284ef65fa..7ea0bea2f7e3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -481,12 +481,6 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
481 if (bio_rw(bio) == WRITE) { 481 if (bio_rw(bio) == WRITE) {
482 struct file *file = lo->lo_backing_file; 482 struct file *file = lo->lo_backing_file;
483 483
484 /* REQ_HARDBARRIER is deprecated */
485 if (bio->bi_rw & REQ_HARDBARRIER) {
486 ret = -EOPNOTSUPP;
487 goto out;
488 }
489
490 if (bio->bi_rw & REQ_FLUSH) { 484 if (bio->bi_rw & REQ_FLUSH) {
491 ret = vfs_fsync(file, 0); 485 ret = vfs_fsync(file, 0);
492 if (unlikely(ret && ret != -EINVAL)) { 486 if (unlikely(ret && ret != -EINVAL)) {
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6ec9d53806c5..008d4a00b50d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -21,80 +21,9 @@
21 21
22 22
23 23
24 Instructions for use 24 For usage instructions, please refer to:
25 --------------------
26 25
27 1) Map a Linux block device to an existing rbd image. 26 Documentation/ABI/testing/sysfs-bus-rbd
28
29 Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
30
31 $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
32
33 The snapshot name can be "-" or omitted to map the image read/write.
34
35 2) List all active blkdev<->object mappings.
36
37 In this example, we have performed step #1 twice, creating two blkdevs,
38 mapped to two separate rados objects in the rados rbd pool
39
40 $ cat /sys/class/rbd/list
41 #id major client_name pool name snap KB
42 0 254 client4143 rbd foo - 1024000
43
44 The columns, in order, are:
45 - blkdev unique id
46 - blkdev assigned major
47 - rados client id
48 - rados pool name
49 - rados block device name
50 - mapped snapshot ("-" if none)
51 - device size in KB
52
53
54 3) Create a snapshot.
55
56 Usage: <blkdev id> <snapname>
57
58 $ echo "0 mysnap" > /sys/class/rbd/snap_create
59
60
61 4) Listing a snapshot.
62
63 $ cat /sys/class/rbd/snaps_list
64 #id snap KB
65 0 - 1024000 (*)
66 0 foo 1024000
67
68 The columns, in order, are:
69 - blkdev unique id
70 - snapshot name, '-' means none (active read/write version)
71 - size of device at time of snapshot
72 - the (*) indicates this is the active version
73
74 5) Rollback to snapshot.
75
76 Usage: <blkdev id> <snapname>
77
78 $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
79
80
81 6) Mapping an image using snapshot.
82
83 A snapshot mapping is read-only. This is being done by passing
84 snap=<snapname> to the options when adding a device.
85
86 $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
87
88
89 7) Remove an active blkdev<->rbd image mapping.
90
91 In this example, we remove the mapping with blkdev unique id 1.
92
93 $ echo 1 > /sys/class/rbd/remove
94
95
96 NOTE: The actual creation and deletion of rados objects is outside the scope
97 of this driver.
98 27
99 */ 28 */
100 29
@@ -163,6 +92,14 @@ struct rbd_request {
163 u64 len; 92 u64 len;
164}; 93};
165 94
95struct rbd_snap {
96 struct device dev;
97 const char *name;
98 size_t size;
99 struct list_head node;
100 u64 id;
101};
102
166/* 103/*
167 * a single device 104 * a single device
168 */ 105 */
@@ -193,21 +130,60 @@ struct rbd_device {
193 int read_only; 130 int read_only;
194 131
195 struct list_head node; 132 struct list_head node;
133
134 /* list of snapshots */
135 struct list_head snaps;
136
137 /* sysfs related */
138 struct device dev;
139};
140
141static struct bus_type rbd_bus_type = {
142 .name = "rbd",
196}; 143};
197 144
198static spinlock_t node_lock; /* protects client get/put */ 145static spinlock_t node_lock; /* protects client get/put */
199 146
200static struct class *class_rbd; /* /sys/class/rbd */
201static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 147static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
202static LIST_HEAD(rbd_dev_list); /* devices */ 148static LIST_HEAD(rbd_dev_list); /* devices */
203static LIST_HEAD(rbd_client_list); /* clients */ 149static LIST_HEAD(rbd_client_list); /* clients */
204 150
151static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
152static void rbd_dev_release(struct device *dev);
153static ssize_t rbd_snap_rollback(struct device *dev,
154 struct device_attribute *attr,
155 const char *buf,
156 size_t size);
157static ssize_t rbd_snap_add(struct device *dev,
158 struct device_attribute *attr,
159 const char *buf,
160 size_t count);
161static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
162 struct rbd_snap *snap);;
163
164
165static struct rbd_device *dev_to_rbd(struct device *dev)
166{
167 return container_of(dev, struct rbd_device, dev);
168}
169
170static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
171{
172 return get_device(&rbd_dev->dev);
173}
174
175static void rbd_put_dev(struct rbd_device *rbd_dev)
176{
177 put_device(&rbd_dev->dev);
178}
205 179
206static int rbd_open(struct block_device *bdev, fmode_t mode) 180static int rbd_open(struct block_device *bdev, fmode_t mode)
207{ 181{
208 struct gendisk *disk = bdev->bd_disk; 182 struct gendisk *disk = bdev->bd_disk;
209 struct rbd_device *rbd_dev = disk->private_data; 183 struct rbd_device *rbd_dev = disk->private_data;
210 184
185 rbd_get_dev(rbd_dev);
186
211 set_device_ro(bdev, rbd_dev->read_only); 187 set_device_ro(bdev, rbd_dev->read_only);
212 188
213 if ((mode & FMODE_WRITE) && rbd_dev->read_only) 189 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
@@ -216,9 +192,19 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
216 return 0; 192 return 0;
217} 193}
218 194
195static int rbd_release(struct gendisk *disk, fmode_t mode)
196{
197 struct rbd_device *rbd_dev = disk->private_data;
198
199 rbd_put_dev(rbd_dev);
200
201 return 0;
202}
203
219static const struct block_device_operations rbd_bd_ops = { 204static const struct block_device_operations rbd_bd_ops = {
220 .owner = THIS_MODULE, 205 .owner = THIS_MODULE,
221 .open = rbd_open, 206 .open = rbd_open,
207 .release = rbd_release,
222}; 208};
223 209
224/* 210/*
@@ -361,7 +347,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
361 int ret = -ENOMEM; 347 int ret = -ENOMEM;
362 348
363 init_rwsem(&header->snap_rwsem); 349 init_rwsem(&header->snap_rwsem);
364
365 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 350 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
366 header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 351 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
367 snap_count * 352 snap_count *
@@ -1256,10 +1241,20 @@ bad:
1256 return -ERANGE; 1241 return -ERANGE;
1257} 1242}
1258 1243
1244static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1245{
1246 struct rbd_snap *snap;
1247
1248 while (!list_empty(&rbd_dev->snaps)) {
1249 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1250 __rbd_remove_snap_dev(rbd_dev, snap);
1251 }
1252}
1253
1259/* 1254/*
1260 * only read the first part of the ondisk header, without the snaps info 1255 * only read the first part of the ondisk header, without the snaps info
1261 */ 1256 */
1262static int rbd_update_snaps(struct rbd_device *rbd_dev) 1257static int __rbd_update_snaps(struct rbd_device *rbd_dev)
1263{ 1258{
1264 int ret; 1259 int ret;
1265 struct rbd_image_header h; 1260 struct rbd_image_header h;
@@ -1280,12 +1275,15 @@ static int rbd_update_snaps(struct rbd_device *rbd_dev)
1280 rbd_dev->header.total_snaps = h.total_snaps; 1275 rbd_dev->header.total_snaps = h.total_snaps;
1281 rbd_dev->header.snapc = h.snapc; 1276 rbd_dev->header.snapc = h.snapc;
1282 rbd_dev->header.snap_names = h.snap_names; 1277 rbd_dev->header.snap_names = h.snap_names;
1278 rbd_dev->header.snap_names_len = h.snap_names_len;
1283 rbd_dev->header.snap_sizes = h.snap_sizes; 1279 rbd_dev->header.snap_sizes = h.snap_sizes;
1284 rbd_dev->header.snapc->seq = snap_seq; 1280 rbd_dev->header.snapc->seq = snap_seq;
1285 1281
1282 ret = __rbd_init_snaps_header(rbd_dev);
1283
1286 up_write(&rbd_dev->header.snap_rwsem); 1284 up_write(&rbd_dev->header.snap_rwsem);
1287 1285
1288 return 0; 1286 return ret;
1289} 1287}
1290 1288
1291static int rbd_init_disk(struct rbd_device *rbd_dev) 1289static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -1300,6 +1298,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1300 if (rc) 1298 if (rc)
1301 return rc; 1299 return rc;
1302 1300
1301 /* no need to lock here, as rbd_dev is not registered yet */
1302 rc = __rbd_init_snaps_header(rbd_dev);
1303 if (rc)
1304 return rc;
1305
1303 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); 1306 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1304 if (rc) 1307 if (rc)
1305 return rc; 1308 return rc;
@@ -1343,54 +1346,360 @@ out:
1343 return rc; 1346 return rc;
1344} 1347}
1345 1348
1346/******************************************************************** 1349/*
1347 * /sys/class/rbd/ 1350 sysfs
1348 * add map rados objects to blkdev 1351*/
1349 * remove unmap rados objects 1352
1350 * list show mappings 1353static ssize_t rbd_size_show(struct device *dev,
1351 *******************************************************************/ 1354 struct device_attribute *attr, char *buf)
1355{
1356 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1357
1358 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1359}
1360
1361static ssize_t rbd_major_show(struct device *dev,
1362 struct device_attribute *attr, char *buf)
1363{
1364 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1352 1365
1353static void class_rbd_release(struct class *cls) 1366 return sprintf(buf, "%d\n", rbd_dev->major);
1367}
1368
1369static ssize_t rbd_client_id_show(struct device *dev,
1370 struct device_attribute *attr, char *buf)
1354{ 1371{
1355 kfree(cls); 1372 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1373
1374 return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
1356} 1375}
1357 1376
1358static ssize_t class_rbd_list(struct class *c, 1377static ssize_t rbd_pool_show(struct device *dev,
1359 struct class_attribute *attr, 1378 struct device_attribute *attr, char *buf)
1360 char *data)
1361{ 1379{
1362 int n = 0; 1380 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1363 struct list_head *tmp; 1381
1364 int max = PAGE_SIZE; 1382 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1383}
1384
1385static ssize_t rbd_name_show(struct device *dev,
1386 struct device_attribute *attr, char *buf)
1387{
1388 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1389
1390 return sprintf(buf, "%s\n", rbd_dev->obj);
1391}
1392
1393static ssize_t rbd_snap_show(struct device *dev,
1394 struct device_attribute *attr,
1395 char *buf)
1396{
1397 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1398
1399 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1400}
1401
1402static ssize_t rbd_image_refresh(struct device *dev,
1403 struct device_attribute *attr,
1404 const char *buf,
1405 size_t size)
1406{
1407 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1408 int rc;
1409 int ret = size;
1365 1410
1366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1411 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1367 1412
1368 n += snprintf(data, max, 1413 rc = __rbd_update_snaps(rbd_dev);
1369 "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n"); 1414 if (rc < 0)
1415 ret = rc;
1370 1416
1371 list_for_each(tmp, &rbd_dev_list) { 1417 mutex_unlock(&ctl_mutex);
1372 struct rbd_device *rbd_dev; 1418 return ret;
1419}
1373 1420
1374 rbd_dev = list_entry(tmp, struct rbd_device, node); 1421static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1375 n += snprintf(data+n, max-n, 1422static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1376 "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n", 1423static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1377 rbd_dev->id, 1424static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1378 rbd_dev->major, 1425static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1379 ceph_client_id(rbd_dev->client), 1426static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1380 rbd_dev->pool_name, 1427static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1381 rbd_dev->obj, rbd_dev->snap_name, 1428static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1382 rbd_dev->header.image_size >> 10); 1429static DEVICE_ATTR(rollback_snap, S_IWUSR, NULL, rbd_snap_rollback);
1383 if (n == max) 1430
1431static struct attribute *rbd_attrs[] = {
1432 &dev_attr_size.attr,
1433 &dev_attr_major.attr,
1434 &dev_attr_client_id.attr,
1435 &dev_attr_pool.attr,
1436 &dev_attr_name.attr,
1437 &dev_attr_current_snap.attr,
1438 &dev_attr_refresh.attr,
1439 &dev_attr_create_snap.attr,
1440 &dev_attr_rollback_snap.attr,
1441 NULL
1442};
1443
1444static struct attribute_group rbd_attr_group = {
1445 .attrs = rbd_attrs,
1446};
1447
1448static const struct attribute_group *rbd_attr_groups[] = {
1449 &rbd_attr_group,
1450 NULL
1451};
1452
1453static void rbd_sysfs_dev_release(struct device *dev)
1454{
1455}
1456
1457static struct device_type rbd_device_type = {
1458 .name = "rbd",
1459 .groups = rbd_attr_groups,
1460 .release = rbd_sysfs_dev_release,
1461};
1462
1463
1464/*
1465 sysfs - snapshots
1466*/
1467
1468static ssize_t rbd_snap_size_show(struct device *dev,
1469 struct device_attribute *attr,
1470 char *buf)
1471{
1472 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1473
1474 return sprintf(buf, "%lld\n", (long long)snap->size);
1475}
1476
1477static ssize_t rbd_snap_id_show(struct device *dev,
1478 struct device_attribute *attr,
1479 char *buf)
1480{
1481 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1482
1483 return sprintf(buf, "%lld\n", (long long)snap->id);
1484}
1485
1486static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1487static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1488
1489static struct attribute *rbd_snap_attrs[] = {
1490 &dev_attr_snap_size.attr,
1491 &dev_attr_snap_id.attr,
1492 NULL,
1493};
1494
1495static struct attribute_group rbd_snap_attr_group = {
1496 .attrs = rbd_snap_attrs,
1497};
1498
1499static void rbd_snap_dev_release(struct device *dev)
1500{
1501 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1502 kfree(snap->name);
1503 kfree(snap);
1504}
1505
1506static const struct attribute_group *rbd_snap_attr_groups[] = {
1507 &rbd_snap_attr_group,
1508 NULL
1509};
1510
1511static struct device_type rbd_snap_device_type = {
1512 .groups = rbd_snap_attr_groups,
1513 .release = rbd_snap_dev_release,
1514};
1515
1516static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1517 struct rbd_snap *snap)
1518{
1519 list_del(&snap->node);
1520 device_unregister(&snap->dev);
1521}
1522
1523static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1524 struct rbd_snap *snap,
1525 struct device *parent)
1526{
1527 struct device *dev = &snap->dev;
1528 int ret;
1529
1530 dev->type = &rbd_snap_device_type;
1531 dev->parent = parent;
1532 dev->release = rbd_snap_dev_release;
1533 dev_set_name(dev, "snap_%s", snap->name);
1534 ret = device_register(dev);
1535
1536 return ret;
1537}
1538
1539static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1540 int i, const char *name,
1541 struct rbd_snap **snapp)
1542{
1543 int ret;
1544 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1545 if (!snap)
1546 return -ENOMEM;
1547 snap->name = kstrdup(name, GFP_KERNEL);
1548 snap->size = rbd_dev->header.snap_sizes[i];
1549 snap->id = rbd_dev->header.snapc->snaps[i];
1550 if (device_is_registered(&rbd_dev->dev)) {
1551 ret = rbd_register_snap_dev(rbd_dev, snap,
1552 &rbd_dev->dev);
1553 if (ret < 0)
1554 goto err;
1555 }
1556 *snapp = snap;
1557 return 0;
1558err:
1559 kfree(snap->name);
1560 kfree(snap);
1561 return ret;
1562}
1563
1564/*
1565 * search for the previous snap in a null delimited string list
1566 */
1567const char *rbd_prev_snap_name(const char *name, const char *start)
1568{
1569 if (name < start + 2)
1570 return NULL;
1571
1572 name -= 2;
1573 while (*name) {
1574 if (name == start)
1575 return start;
1576 name--;
1577 }
1578 return name + 1;
1579}
1580
1581/*
1582 * compare the old list of snapshots that we have to what's in the header
1583 * and update it accordingly. Note that the header holds the snapshots
1584 * in a reverse order (from newest to oldest) and we need to go from
1585 * older to new so that we don't get a duplicate snap name when
1586 * doing the process (e.g., removed snapshot and recreated a new
1587 * one with the same name.
1588 */
1589static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
1590{
1591 const char *name, *first_name;
1592 int i = rbd_dev->header.total_snaps;
1593 struct rbd_snap *snap, *old_snap = NULL;
1594 int ret;
1595 struct list_head *p, *n;
1596
1597 first_name = rbd_dev->header.snap_names;
1598 name = first_name + rbd_dev->header.snap_names_len;
1599
1600 list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
1601 u64 cur_id;
1602
1603 old_snap = list_entry(p, struct rbd_snap, node);
1604
1605 if (i)
1606 cur_id = rbd_dev->header.snapc->snaps[i - 1];
1607
1608 if (!i || old_snap->id < cur_id) {
1609 /* old_snap->id was skipped, thus was removed */
1610 __rbd_remove_snap_dev(rbd_dev, old_snap);
1611 continue;
1612 }
1613 if (old_snap->id == cur_id) {
1614 /* we have this snapshot already */
1615 i--;
1616 name = rbd_prev_snap_name(name, first_name);
1617 continue;
1618 }
1619 for (; i > 0;
1620 i--, name = rbd_prev_snap_name(name, first_name)) {
1621 if (!name) {
1622 WARN_ON(1);
1623 return -EINVAL;
1624 }
1625 cur_id = rbd_dev->header.snapc->snaps[i];
1626 /* snapshot removal? handle it above */
1627 if (cur_id >= old_snap->id)
1628 break;
1629 /* a new snapshot */
1630 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
1631 if (ret < 0)
1632 return ret;
1633
1634 /* note that we add it backward so using n and not p */
1635 list_add(&snap->node, n);
1636 p = &snap->node;
1637 }
1638 }
1639 /* we're done going over the old snap list, just add what's left */
1640 for (; i > 0; i--) {
1641 name = rbd_prev_snap_name(name, first_name);
1642 if (!name) {
1643 WARN_ON(1);
1644 return -EINVAL;
1645 }
1646 ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
1647 if (ret < 0)
1648 return ret;
1649 list_add(&snap->node, &rbd_dev->snaps);
1650 }
1651
1652 return 0;
1653}
1654
1655
1656static void rbd_root_dev_release(struct device *dev)
1657{
1658}
1659
1660static struct device rbd_root_dev = {
1661 .init_name = "rbd",
1662 .release = rbd_root_dev_release,
1663};
1664
1665static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
1666{
1667 int ret = -ENOMEM;
1668 struct device *dev;
1669 struct rbd_snap *snap;
1670
1671 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1672 dev = &rbd_dev->dev;
1673
1674 dev->bus = &rbd_bus_type;
1675 dev->type = &rbd_device_type;
1676 dev->parent = &rbd_root_dev;
1677 dev->release = rbd_dev_release;
1678 dev_set_name(dev, "%d", rbd_dev->id);
1679 ret = device_register(dev);
1680 if (ret < 0)
1681 goto done_free;
1682
1683 list_for_each_entry(snap, &rbd_dev->snaps, node) {
1684 ret = rbd_register_snap_dev(rbd_dev, snap,
1685 &rbd_dev->dev);
1686 if (ret < 0)
1384 break; 1687 break;
1385 } 1688 }
1386 1689
1387 mutex_unlock(&ctl_mutex); 1690 mutex_unlock(&ctl_mutex);
1388 return n; 1691 return 0;
1692done_free:
1693 mutex_unlock(&ctl_mutex);
1694 return ret;
1389} 1695}
1390 1696
1391static ssize_t class_rbd_add(struct class *c, 1697static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
1392 struct class_attribute *attr, 1698{
1393 const char *buf, size_t count) 1699 device_unregister(&rbd_dev->dev);
1700}
1701
1702static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
1394{ 1703{
1395 struct ceph_osd_client *osdc; 1704 struct ceph_osd_client *osdc;
1396 struct rbd_device *rbd_dev; 1705 struct rbd_device *rbd_dev;
@@ -1419,6 +1728,7 @@ static ssize_t class_rbd_add(struct class *c,
1419 /* static rbd_device initialization */ 1728 /* static rbd_device initialization */
1420 spin_lock_init(&rbd_dev->lock); 1729 spin_lock_init(&rbd_dev->lock);
1421 INIT_LIST_HEAD(&rbd_dev->node); 1730 INIT_LIST_HEAD(&rbd_dev->node);
1731 INIT_LIST_HEAD(&rbd_dev->snaps);
1422 1732
1423 /* generate unique id: find highest unique id, add one */ 1733 /* generate unique id: find highest unique id, add one */
1424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1734 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
@@ -1478,6 +1788,9 @@ static ssize_t class_rbd_add(struct class *c,
1478 } 1788 }
1479 rbd_dev->major = irc; 1789 rbd_dev->major = irc;
1480 1790
1791 rc = rbd_bus_add_dev(rbd_dev);
1792 if (rc)
1793 goto err_out_disk;
1481 /* set up and announce blkdev mapping */ 1794 /* set up and announce blkdev mapping */
1482 rc = rbd_init_disk(rbd_dev); 1795 rc = rbd_init_disk(rbd_dev);
1483 if (rc) 1796 if (rc)
@@ -1487,6 +1800,8 @@ static ssize_t class_rbd_add(struct class *c,
1487 1800
1488err_out_blkdev: 1801err_out_blkdev:
1489 unregister_blkdev(rbd_dev->major, rbd_dev->name); 1802 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1803err_out_disk:
1804 rbd_free_disk(rbd_dev);
1490err_out_client: 1805err_out_client:
1491 rbd_put_client(rbd_dev); 1806 rbd_put_client(rbd_dev);
1492 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1807 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
@@ -1518,35 +1833,10 @@ static struct rbd_device *__rbd_get_dev(unsigned long id)
1518 return NULL; 1833 return NULL;
1519} 1834}
1520 1835
1521static ssize_t class_rbd_remove(struct class *c, 1836static void rbd_dev_release(struct device *dev)
1522 struct class_attribute *attr,
1523 const char *buf,
1524 size_t count)
1525{ 1837{
1526 struct rbd_device *rbd_dev = NULL; 1838 struct rbd_device *rbd_dev =
1527 int target_id, rc; 1839 container_of(dev, struct rbd_device, dev);
1528 unsigned long ul;
1529
1530 rc = strict_strtoul(buf, 10, &ul);
1531 if (rc)
1532 return rc;
1533
1534 /* convert to int; abort if we lost anything in the conversion */
1535 target_id = (int) ul;
1536 if (target_id != ul)
1537 return -EINVAL;
1538
1539 /* remove object from list immediately */
1540 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1541
1542 rbd_dev = __rbd_get_dev(target_id);
1543 if (rbd_dev)
1544 list_del_init(&rbd_dev->node);
1545
1546 mutex_unlock(&ctl_mutex);
1547
1548 if (!rbd_dev)
1549 return -ENOENT;
1550 1840
1551 rbd_put_client(rbd_dev); 1841 rbd_put_client(rbd_dev);
1552 1842
@@ -1557,67 +1847,11 @@ static ssize_t class_rbd_remove(struct class *c,
1557 1847
1558 /* release module ref */ 1848 /* release module ref */
1559 module_put(THIS_MODULE); 1849 module_put(THIS_MODULE);
1560
1561 return count;
1562} 1850}
1563 1851
1564static ssize_t class_rbd_snaps_list(struct class *c, 1852static ssize_t rbd_remove(struct bus_type *bus,
1565 struct class_attribute *attr, 1853 const char *buf,
1566 char *data) 1854 size_t count)
1567{
1568 struct rbd_device *rbd_dev = NULL;
1569 struct list_head *tmp;
1570 struct rbd_image_header *header;
1571 int i, n = 0, max = PAGE_SIZE;
1572 int ret;
1573
1574 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1575
1576 n += snprintf(data, max, "#id\tsnap\tKB\n");
1577
1578 list_for_each(tmp, &rbd_dev_list) {
1579 char *names, *p;
1580 struct ceph_snap_context *snapc;
1581
1582 rbd_dev = list_entry(tmp, struct rbd_device, node);
1583 header = &rbd_dev->header;
1584
1585 down_read(&header->snap_rwsem);
1586
1587 names = header->snap_names;
1588 snapc = header->snapc;
1589
1590 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1591 rbd_dev->id, RBD_SNAP_HEAD_NAME,
1592 header->image_size >> 10,
1593 (!rbd_dev->cur_snap ? " (*)" : ""));
1594 if (n == max)
1595 break;
1596
1597 p = names;
1598 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
1599 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1600 rbd_dev->id, p, header->snap_sizes[i] >> 10,
1601 (rbd_dev->cur_snap &&
1602 (snap_index(header, i) == rbd_dev->cur_snap) ?
1603 " (*)" : ""));
1604 if (n == max)
1605 break;
1606 }
1607
1608 up_read(&header->snap_rwsem);
1609 }
1610
1611
1612 ret = n;
1613 mutex_unlock(&ctl_mutex);
1614 return ret;
1615}
1616
1617static ssize_t class_rbd_snaps_refresh(struct class *c,
1618 struct class_attribute *attr,
1619 const char *buf,
1620 size_t count)
1621{ 1855{
1622 struct rbd_device *rbd_dev = NULL; 1856 struct rbd_device *rbd_dev = NULL;
1623 int target_id, rc; 1857 int target_id, rc;
@@ -1641,95 +1875,70 @@ static ssize_t class_rbd_snaps_refresh(struct class *c,
1641 goto done; 1875 goto done;
1642 } 1876 }
1643 1877
1644 rc = rbd_update_snaps(rbd_dev); 1878 list_del_init(&rbd_dev->node);
1645 if (rc < 0) 1879
1646 ret = rc; 1880 __rbd_remove_all_snaps(rbd_dev);
1881 rbd_bus_del_dev(rbd_dev);
1647 1882
1648done: 1883done:
1649 mutex_unlock(&ctl_mutex); 1884 mutex_unlock(&ctl_mutex);
1650 return ret; 1885 return ret;
1651} 1886}
1652 1887
1653static ssize_t class_rbd_snap_create(struct class *c, 1888static ssize_t rbd_snap_add(struct device *dev,
1654 struct class_attribute *attr, 1889 struct device_attribute *attr,
1655 const char *buf, 1890 const char *buf,
1656 size_t count) 1891 size_t count)
1657{ 1892{
1658 struct rbd_device *rbd_dev = NULL; 1893 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1659 int target_id, ret; 1894 int ret;
1660 char *name; 1895 char *name = kmalloc(count + 1, GFP_KERNEL);
1661
1662 name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
1663 if (!name) 1896 if (!name)
1664 return -ENOMEM; 1897 return -ENOMEM;
1665 1898
1666 /* parse snaps add command */ 1899 snprintf(name, count, "%s", buf);
1667 if (sscanf(buf, "%d "
1668 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1669 &target_id,
1670 name) != 2) {
1671 ret = -EINVAL;
1672 goto done;
1673 }
1674 1900
1675 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1901 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1676 1902
1677 rbd_dev = __rbd_get_dev(target_id);
1678 if (!rbd_dev) {
1679 ret = -ENOENT;
1680 goto done_unlock;
1681 }
1682
1683 ret = rbd_header_add_snap(rbd_dev, 1903 ret = rbd_header_add_snap(rbd_dev,
1684 name, GFP_KERNEL); 1904 name, GFP_KERNEL);
1685 if (ret < 0) 1905 if (ret < 0)
1686 goto done_unlock; 1906 goto done_unlock;
1687 1907
1688 ret = rbd_update_snaps(rbd_dev); 1908 ret = __rbd_update_snaps(rbd_dev);
1689 if (ret < 0) 1909 if (ret < 0)
1690 goto done_unlock; 1910 goto done_unlock;
1691 1911
1692 ret = count; 1912 ret = count;
1693done_unlock: 1913done_unlock:
1694 mutex_unlock(&ctl_mutex); 1914 mutex_unlock(&ctl_mutex);
1695done:
1696 kfree(name); 1915 kfree(name);
1697 return ret; 1916 return ret;
1698} 1917}
1699 1918
1700static ssize_t class_rbd_rollback(struct class *c, 1919static ssize_t rbd_snap_rollback(struct device *dev,
1701 struct class_attribute *attr, 1920 struct device_attribute *attr,
1702 const char *buf, 1921 const char *buf,
1703 size_t count) 1922 size_t count)
1704{ 1923{
1705 struct rbd_device *rbd_dev = NULL; 1924 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1706 int target_id, ret; 1925 int ret;
1707 u64 snapid; 1926 u64 snapid;
1708 char snap_name[RBD_MAX_SNAP_NAME_LEN];
1709 u64 cur_ofs; 1927 u64 cur_ofs;
1710 char *seg_name; 1928 char *seg_name = NULL;
1929 char *snap_name = kmalloc(count + 1, GFP_KERNEL);
1930 ret = -ENOMEM;
1931 if (!snap_name)
1932 return ret;
1711 1933
1712 /* parse snaps add command */ 1934 /* parse snaps add command */
1713 if (sscanf(buf, "%d " 1935 snprintf(snap_name, count, "%s", buf);
1714 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1715 &target_id,
1716 snap_name) != 2) {
1717 return -EINVAL;
1718 }
1719
1720 ret = -ENOMEM;
1721 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1936 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1722 if (!seg_name) 1937 if (!seg_name)
1723 return ret; 1938 goto done;
1724 1939
1725 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1940 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1726 1941
1727 rbd_dev = __rbd_get_dev(target_id);
1728 if (!rbd_dev) {
1729 ret = -ENOENT;
1730 goto done_unlock;
1731 }
1732
1733 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL); 1942 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
1734 if (ret < 0) 1943 if (ret < 0)
1735 goto done_unlock; 1944 goto done_unlock;
@@ -1750,7 +1959,7 @@ static ssize_t class_rbd_rollback(struct class *c,
1750 seg_name, ret); 1959 seg_name, ret);
1751 } 1960 }
1752 1961
1753 ret = rbd_update_snaps(rbd_dev); 1962 ret = __rbd_update_snaps(rbd_dev);
1754 if (ret < 0) 1963 if (ret < 0)
1755 goto done_unlock; 1964 goto done_unlock;
1756 1965
@@ -1758,57 +1967,42 @@ static ssize_t class_rbd_rollback(struct class *c,
1758 1967
1759done_unlock: 1968done_unlock:
1760 mutex_unlock(&ctl_mutex); 1969 mutex_unlock(&ctl_mutex);
1970done:
1761 kfree(seg_name); 1971 kfree(seg_name);
1972 kfree(snap_name);
1762 1973
1763 return ret; 1974 return ret;
1764} 1975}
1765 1976
1766static struct class_attribute class_rbd_attrs[] = { 1977static struct bus_attribute rbd_bus_attrs[] = {
1767 __ATTR(add, 0200, NULL, class_rbd_add), 1978 __ATTR(add, S_IWUSR, NULL, rbd_add),
1768 __ATTR(remove, 0200, NULL, class_rbd_remove), 1979 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
1769 __ATTR(list, 0444, class_rbd_list, NULL),
1770 __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
1771 __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
1772 __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
1773 __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
1774 __ATTR_NULL 1980 __ATTR_NULL
1775}; 1981};
1776 1982
1777/* 1983/*
1778 * create control files in sysfs 1984 * create control files in sysfs
1779 * /sys/class/rbd/... 1985 * /sys/bus/rbd/...
1780 */ 1986 */
1781static int rbd_sysfs_init(void) 1987static int rbd_sysfs_init(void)
1782{ 1988{
1783 int ret = -ENOMEM; 1989 int ret;
1784 1990
1785 class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL); 1991 rbd_bus_type.bus_attrs = rbd_bus_attrs;
1786 if (!class_rbd)
1787 goto out;
1788 1992
1789 class_rbd->name = DRV_NAME; 1993 ret = bus_register(&rbd_bus_type);
1790 class_rbd->owner = THIS_MODULE; 1994 if (ret < 0)
1791 class_rbd->class_release = class_rbd_release; 1995 return ret;
1792 class_rbd->class_attrs = class_rbd_attrs;
1793 1996
1794 ret = class_register(class_rbd); 1997 ret = device_register(&rbd_root_dev);
1795 if (ret)
1796 goto out_class;
1797 return 0;
1798 1998
1799out_class:
1800 kfree(class_rbd);
1801 class_rbd = NULL;
1802 pr_err(DRV_NAME ": failed to create class rbd\n");
1803out:
1804 return ret; 1999 return ret;
1805} 2000}
1806 2001
1807static void rbd_sysfs_cleanup(void) 2002static void rbd_sysfs_cleanup(void)
1808{ 2003{
1809 if (class_rbd) 2004 device_unregister(&rbd_root_dev);
1810 class_destroy(class_rbd); 2005 bus_unregister(&rbd_bus_type);
1811 class_rbd = NULL;
1812} 2006}
1813 2007
1814int __init rbd_init(void) 2008int __init rbd_init(void)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 06e2812ba124..657873e4328d 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -65,14 +65,14 @@ enum blkif_state {
65 65
66struct blk_shadow { 66struct blk_shadow {
67 struct blkif_request req; 67 struct blkif_request req;
68 unsigned long request; 68 struct request *request;
69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
70}; 70};
71 71
72static DEFINE_MUTEX(blkfront_mutex); 72static DEFINE_MUTEX(blkfront_mutex);
73static const struct block_device_operations xlvbd_block_fops; 73static const struct block_device_operations xlvbd_block_fops;
74 74
75#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) 75#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
76 76
77/* 77/*
78 * We have one of these per vbd, whether ide, scsi or 'other'. They 78 * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -136,7 +136,7 @@ static void add_id_to_freelist(struct blkfront_info *info,
136 unsigned long id) 136 unsigned long id)
137{ 137{
138 info->shadow[id].req.id = info->shadow_free; 138 info->shadow[id].req.id = info->shadow_free;
139 info->shadow[id].request = 0; 139 info->shadow[id].request = NULL;
140 info->shadow_free = id; 140 info->shadow_free = id;
141} 141}
142 142
@@ -245,14 +245,11 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
245} 245}
246 246
247/* 247/*
248 * blkif_queue_request 248 * Generate a Xen blkfront IO request from a blk layer request. Reads
249 * and writes are handled as expected. Since we lack a loose flush
250 * request, we map flushes into a full ordered barrier.
249 * 251 *
250 * request block io 252 * @req: a request struct
251 *
252 * id: for guest use only.
253 * operation: BLKIF_OP_{READ,WRITE,PROBE}
254 * buffer: buffer to read/write into. this should be a
255 * virtual address in the guest os.
256 */ 253 */
257static int blkif_queue_request(struct request *req) 254static int blkif_queue_request(struct request *req)
258{ 255{
@@ -281,7 +278,7 @@ static int blkif_queue_request(struct request *req)
281 /* Fill out a communications ring structure. */ 278 /* Fill out a communications ring structure. */
282 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 279 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
283 id = get_id_from_freelist(info); 280 id = get_id_from_freelist(info);
284 info->shadow[id].request = (unsigned long)req; 281 info->shadow[id].request = req;
285 282
286 ring_req->id = id; 283 ring_req->id = id;
287 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); 284 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
@@ -289,8 +286,18 @@ static int blkif_queue_request(struct request *req)
289 286
290 ring_req->operation = rq_data_dir(req) ? 287 ring_req->operation = rq_data_dir(req) ?
291 BLKIF_OP_WRITE : BLKIF_OP_READ; 288 BLKIF_OP_WRITE : BLKIF_OP_READ;
292 if (req->cmd_flags & REQ_HARDBARRIER) 289
290 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
291 /*
292 * Ideally we could just do an unordered
293 * flush-to-disk, but all we have is a full write
294 * barrier at the moment. However, a barrier write is
295 * a superset of FUA, so we can implement it the same
296 * way. (It's also a FLUSH+FUA, since it is
297 * guaranteed ordered WRT previous writes.)
298 */
293 ring_req->operation = BLKIF_OP_WRITE_BARRIER; 299 ring_req->operation = BLKIF_OP_WRITE_BARRIER;
300 }
294 301
295 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 302 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
296 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); 303 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
@@ -636,7 +643,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
636 643
637 bret = RING_GET_RESPONSE(&info->ring, i); 644 bret = RING_GET_RESPONSE(&info->ring, i);
638 id = bret->id; 645 id = bret->id;
639 req = (struct request *)info->shadow[id].request; 646 req = info->shadow[id].request;
640 647
641 blkif_completion(&info->shadow[id]); 648 blkif_completion(&info->shadow[id]);
642 649
@@ -649,6 +656,16 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
649 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 656 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
650 info->gd->disk_name); 657 info->gd->disk_name);
651 error = -EOPNOTSUPP; 658 error = -EOPNOTSUPP;
659 }
660 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
661 info->shadow[id].req.nr_segments == 0)) {
662 printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n",
663 info->gd->disk_name);
664 error = -EOPNOTSUPP;
665 }
666 if (unlikely(error)) {
667 if (error == -EOPNOTSUPP)
668 error = 0;
652 info->feature_flush = 0; 669 info->feature_flush = 0;
653 xlvbd_flush(info); 670 xlvbd_flush(info);
654 } 671 }
@@ -901,7 +918,7 @@ static int blkif_recover(struct blkfront_info *info)
901 /* Stage 3: Find pending requests and requeue them. */ 918 /* Stage 3: Find pending requests and requeue them. */
902 for (i = 0; i < BLK_RING_SIZE; i++) { 919 for (i = 0; i < BLK_RING_SIZE; i++) {
903 /* Not in use? */ 920 /* Not in use? */
904 if (copy[i].request == 0) 921 if (!copy[i].request)
905 continue; 922 continue;
906 923
907 /* Grab a request slot and copy shadow state into it. */ 924 /* Grab a request slot and copy shadow state into it. */
@@ -918,9 +935,7 @@ static int blkif_recover(struct blkfront_info *info)
918 req->seg[j].gref, 935 req->seg[j].gref,
919 info->xbdev->otherend_id, 936 info->xbdev->otherend_id,
920 pfn_to_mfn(info->shadow[req->id].frame[j]), 937 pfn_to_mfn(info->shadow[req->id].frame[j]),
921 rq_data_dir( 938 rq_data_dir(info->shadow[req->id].request));
922 (struct request *)
923 info->shadow[req->id].request));
924 info->shadow[req->id].req = *req; 939 info->shadow[req->id].req = *req;
925 940
926 info->ring.req_prod_pvt++; 941 info->ring.req_prod_pvt++;
@@ -1069,14 +1084,8 @@ static void blkfront_connect(struct blkfront_info *info)
1069 */ 1084 */
1070 info->feature_flush = 0; 1085 info->feature_flush = 0;
1071 1086
1072 /*
1073 * The driver doesn't properly handled empty flushes, so
1074 * lets disable barrier support for now.
1075 */
1076#if 0
1077 if (!err && barrier) 1087 if (!err && barrier)
1078 info->feature_flush = REQ_FLUSH; 1088 info->feature_flush = REQ_FLUSH | REQ_FUA;
1079#endif
1080 1089
1081 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1090 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1082 if (err) { 1091 if (err) {