about summary refs log tree commit diff stats
path: root/drivers/block
diff options
context:
space:
mode:
author Jens Axboe <jaxboe@fusionio.com> 2010-11-10 08:51:27 -0500
committer Jens Axboe <jaxboe@fusionio.com> 2010-11-10 08:51:27 -0500
commit 00e375e7e962f938f6b3c93e4cd097a5e26cc788 (patch)
tree 9aacf8566c688273af830fc4e50186810068b028 /drivers/block
parent f85acd81aa623e3dcf268c90e5cd8ecf36830984 (diff)
parent 4205df34003eec4371020872cdfa228ffae5bd6a (diff)
Merge branch 'for-2.6.37/drivers' into for-linus
Conflicts:
	drivers/block/cciss.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/cciss.c 128
-rw-r--r-- drivers/block/cciss.h 4
-rw-r--r-- drivers/block/drbd/drbd_actlog.c 42
-rw-r--r-- drivers/block/drbd/drbd_int.h 50
-rw-r--r-- drivers/block/drbd/drbd_main.c 148
-rw-r--r-- drivers/block/drbd/drbd_nl.c 25
-rw-r--r-- drivers/block/drbd/drbd_proc.c 1
-rw-r--r-- drivers/block/drbd/drbd_receiver.c 217
-rw-r--r-- drivers/block/drbd/drbd_req.c 38
-rw-r--r-- drivers/block/drbd/drbd_worker.c 23
10 files changed, 264 insertions, 412 deletions
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 2cdbc247d0ac..a67d0a611a8a 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -113,6 +113,8 @@ static struct board_type products[] = {
113 {0x409D0E11, "Smart Array 6400 EM", &SA5_access}, 113 {0x409D0E11, "Smart Array 6400 EM", &SA5_access},
114 {0x40910E11, "Smart Array 6i", &SA5_access}, 114 {0x40910E11, "Smart Array 6i", &SA5_access},
115 {0x3225103C, "Smart Array P600", &SA5_access}, 115 {0x3225103C, "Smart Array P600", &SA5_access},
116 {0x3223103C, "Smart Array P800", &SA5_access},
117 {0x3234103C, "Smart Array P400", &SA5_access},
116 {0x3235103C, "Smart Array P400i", &SA5_access}, 118 {0x3235103C, "Smart Array P400i", &SA5_access},
117 {0x3211103C, "Smart Array E200i", &SA5_access}, 119 {0x3211103C, "Smart Array E200i", &SA5_access},
118 {0x3212103C, "Smart Array E200", &SA5_access}, 120 {0x3212103C, "Smart Array E200", &SA5_access},
@@ -3753,7 +3755,7 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
3753 for (i = 0; i < MAX_CONFIG_WAIT; i++) { 3755 for (i = 0; i < MAX_CONFIG_WAIT; i++) {
3754 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) 3756 if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
3755 break; 3757 break;
3756 msleep(10); 3758 usleep_range(10000, 20000);
3757 } 3759 }
3758} 3760}
3759 3761
@@ -3937,10 +3939,9 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
3937 *board_id = ((subsystem_device_id << 16) & 0xffff0000) | 3939 *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
3938 subsystem_vendor_id; 3940 subsystem_vendor_id;
3939 3941
3940 for (i = 0; i < ARRAY_SIZE(products); i++) { 3942 for (i = 0; i < ARRAY_SIZE(products); i++)
3941 if (*board_id == products[i].board_id) 3943 if (*board_id == products[i].board_id)
3942 return i; 3944 return i;
3943 }
3944 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", 3945 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
3945 *board_id); 3946 *board_id);
3946 return -ENODEV; 3947 return -ENODEV;
@@ -3971,18 +3972,31 @@ static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
3971 return -ENODEV; 3972 return -ENODEV;
3972} 3973}
3973 3974
3974static int __devinit cciss_wait_for_board_ready(ctlr_info_t *h) 3975static int __devinit cciss_wait_for_board_state(struct pci_dev *pdev,
3976 void __iomem *vaddr, int wait_for_ready)
3977#define BOARD_READY 1
3978#define BOARD_NOT_READY 0
3975{ 3979{
3976 int i; 3980 int i, iterations;
3977 u32 scratchpad; 3981 u32 scratchpad;
3978 3982
3979 for (i = 0; i < CCISS_BOARD_READY_ITERATIONS; i++) { 3983 if (wait_for_ready)
3980 scratchpad = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); 3984 iterations = CCISS_BOARD_READY_ITERATIONS;
3981 if (scratchpad == CCISS_FIRMWARE_READY) 3985 else
3982 return 0; 3986 iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
3987
3988 for (i = 0; i < iterations; i++) {
3989 scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
3990 if (wait_for_ready) {
3991 if (scratchpad == CCISS_FIRMWARE_READY)
3992 return 0;
3993 } else {
3994 if (scratchpad != CCISS_FIRMWARE_READY)
3995 return 0;
3996 }
3983 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); 3997 msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
3984 } 3998 }
3985 dev_warn(&h->pdev->dev, "board not ready, timed out.\n"); 3999 dev_warn(&pdev->dev, "board not ready, timed out.\n");
3986 return -ENODEV; 4000 return -ENODEV;
3987} 4001}
3988 4002
@@ -4031,6 +4045,11 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h)
4031static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) 4045static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
4032{ 4046{
4033 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands)); 4047 h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
4048
4049 /* Limit commands in memory limited kdump scenario. */
4050 if (reset_devices && h->max_commands > 32)
4051 h->max_commands = 32;
4052
4034 if (h->max_commands < 16) { 4053 if (h->max_commands < 16) {
4035 dev_warn(&h->pdev->dev, "Controller reports " 4054 dev_warn(&h->pdev->dev, "Controller reports "
4036 "max supported commands of %d, an obvious lie. " 4055 "max supported commands of %d, an obvious lie. "
@@ -4148,7 +4167,7 @@ static int __devinit cciss_pci_init(ctlr_info_t *h)
4148 err = -ENOMEM; 4167 err = -ENOMEM;
4149 goto err_out_free_res; 4168 goto err_out_free_res;
4150 } 4169 }
4151 err = cciss_wait_for_board_ready(h); 4170 err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
4152 if (err) 4171 if (err)
4153 goto err_out_free_res; 4172 goto err_out_free_res;
4154 err = cciss_find_cfgtables(h); 4173 err = cciss_find_cfgtables(h);
@@ -4313,36 +4332,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
4313#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) 4332#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
4314#define cciss_noop(p) cciss_message(p, 3, 0) 4333#define cciss_noop(p) cciss_message(p, 3, 0)
4315 4334
4316static __devinit int cciss_reset_msi(struct pci_dev *pdev)
4317{
4318/* the #defines are stolen from drivers/pci/msi.h. */
4319#define msi_control_reg(base) (base + PCI_MSI_FLAGS)
4320#define PCI_MSIX_FLAGS_ENABLE (1 << 15)
4321
4322 int pos;
4323 u16 control = 0;
4324
4325 pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
4326 if (pos) {
4327 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4328 if (control & PCI_MSI_FLAGS_ENABLE) {
4329 dev_info(&pdev->dev, "resetting MSI\n");
4330 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE);
4331 }
4332 }
4333
4334 pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
4335 if (pos) {
4336 pci_read_config_word(pdev, msi_control_reg(pos), &control);
4337 if (control & PCI_MSIX_FLAGS_ENABLE) {
4338 dev_info(&pdev->dev, "resetting MSI-X\n");
4339 pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE);
4340 }
4341 }
4342
4343 return 0;
4344}
4345
4346static int cciss_controller_hard_reset(struct pci_dev *pdev, 4335static int cciss_controller_hard_reset(struct pci_dev *pdev,
4347 void * __iomem vaddr, bool use_doorbell) 4336 void * __iomem vaddr, bool use_doorbell)
4348{ 4337{
@@ -4397,17 +4386,17 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev,
4397 * states or using the doorbell register. */ 4386 * states or using the doorbell register. */
4398static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) 4387static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4399{ 4388{
4400 u16 saved_config_space[32];
4401 u64 cfg_offset; 4389 u64 cfg_offset;
4402 u32 cfg_base_addr; 4390 u32 cfg_base_addr;
4403 u64 cfg_base_addr_index; 4391 u64 cfg_base_addr_index;
4404 void __iomem *vaddr; 4392 void __iomem *vaddr;
4405 unsigned long paddr; 4393 unsigned long paddr;
4406 u32 misc_fw_support, active_transport; 4394 u32 misc_fw_support, active_transport;
4407 int rc, i; 4395 int rc;
4408 CfgTable_struct __iomem *cfgtable; 4396 CfgTable_struct __iomem *cfgtable;
4409 bool use_doorbell; 4397 bool use_doorbell;
4410 u32 board_id; 4398 u32 board_id;
4399 u16 command_register;
4411 4400
4412 /* For controllers as old a the p600, this is very nearly 4401 /* For controllers as old a the p600, this is very nearly
4413 * the same thing as 4402 * the same thing as
@@ -4417,14 +4406,6 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4417 * pci_set_power_state(pci_dev, PCI_D0); 4406 * pci_set_power_state(pci_dev, PCI_D0);
4418 * pci_restore_state(pci_dev); 4407 * pci_restore_state(pci_dev);
4419 * 4408 *
4420 * but we can't use these nice canned kernel routines on
4421 * kexec, because they also check the MSI/MSI-X state in PCI
4422 * configuration space and do the wrong thing when it is
4423 * set/cleared. Also, the pci_save/restore_state functions
4424 * violate the ordering requirements for restoring the
4425 * configuration space from the CCISS document (see the
4426 * comment below). So we roll our own ....
4427 *
4428 * For controllers newer than the P600, the pci power state 4409 * For controllers newer than the P600, the pci power state
4429 * method of resetting doesn't work so we have another way 4410 * method of resetting doesn't work so we have another way
4430 * using the doorbell register. 4411 * using the doorbell register.
@@ -4443,8 +4424,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4443 return -ENODEV; 4424 return -ENODEV;
4444 } 4425 }
4445 4426
4446 for (i = 0; i < 32; i++) 4427 /* Save the PCI command register */
4447 pci_read_config_word(pdev, 2*i, &saved_config_space[i]); 4428 pci_read_config_word(pdev, 4, &command_register);
4429 /* Turn the board off. This is so that later pci_restore_state()
4430 * won't turn the board on before the rest of config space is ready.
4431 */
4432 pci_disable_device(pdev);
4433 pci_save_state(pdev);
4448 4434
4449 /* find the first memory BAR, so we can find the cfg table */ 4435 /* find the first memory BAR, so we can find the cfg table */
4450 rc = cciss_pci_find_memory_BAR(pdev, &paddr); 4436 rc = cciss_pci_find_memory_BAR(pdev, &paddr);
@@ -4479,26 +4465,32 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
4479 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); 4465 rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
4480 if (rc) 4466 if (rc)
4481 goto unmap_cfgtable; 4467 goto unmap_cfgtable;
4482 4468 pci_restore_state(pdev);
4483 /* Restore the PCI configuration space. The Open CISS 4469 rc = pci_enable_device(pdev);
4484 * Specification says, "Restore the PCI Configuration 4470 if (rc) {
4485 * Registers, offsets 00h through 60h. It is important to 4471 dev_warn(&pdev->dev, "failed to enable device.\n");
4486 * restore the command register, 16-bits at offset 04h, 4472 goto unmap_cfgtable;
4487 * last. Do not restore the configuration status register,
4488 * 16-bits at offset 06h." Note that the offset is 2*i.
4489 */
4490 for (i = 0; i < 32; i++) {
4491 if (i == 2 || i == 3)
4492 continue;
4493 pci_write_config_word(pdev, 2*i, saved_config_space[i]);
4494 } 4473 }
4495 wmb(); 4474 pci_write_config_word(pdev, 4, command_register);
4496 pci_write_config_word(pdev, 4, saved_config_space[2]);
4497 4475
4498 /* Some devices (notably the HP Smart Array 5i Controller) 4476 /* Some devices (notably the HP Smart Array 5i Controller)
4499 need a little pause here */ 4477 need a little pause here */
4500 msleep(CCISS_POST_RESET_PAUSE_MSECS); 4478 msleep(CCISS_POST_RESET_PAUSE_MSECS);
4501 4479
4480 /* Wait for board to become not ready, then ready. */
4481 dev_info(&pdev->dev, "Waiting for board to become ready.\n");
4482 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
4483 if (rc) /* Don't bail, might be E500, etc. which can't be reset */
4484 dev_warn(&pdev->dev,
4485 "failed waiting for board to become not ready\n");
4486 rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
4487 if (rc) {
4488 dev_warn(&pdev->dev,
4489 "failed waiting for board to become ready\n");
4490 goto unmap_cfgtable;
4491 }
4492 dev_info(&pdev->dev, "board ready.\n");
4493
4502 /* Controller should be in simple mode at this point. If it's not, 4494 /* Controller should be in simple mode at this point. If it's not,
4503 * It means we're on one of those controllers which doesn't support 4495 * It means we're on one of those controllers which doesn't support
4504 * the doorbell reset method and on which the PCI power management reset 4496 * the doorbell reset method and on which the PCI power management reset
@@ -4539,8 +4531,6 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
4539 return 0; /* just try to do the kdump anyhow. */ 4531 return 0; /* just try to do the kdump anyhow. */
4540 if (rc) 4532 if (rc)
4541 return -ENODEV; 4533 return -ENODEV;
4542 if (cciss_reset_msi(pdev))
4543 return -ENODEV;
4544 4534
4545 /* Now try to get the controller to respond to a no-op */ 4535 /* Now try to get the controller to respond to a no-op */
4546 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { 4536 for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index ae340ffc8f81..4b8933d778f1 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -200,10 +200,14 @@ struct ctlr_info
200 * the above. 200 * the above.
201 */ 201 */
202#define CCISS_BOARD_READY_WAIT_SECS (120) 202#define CCISS_BOARD_READY_WAIT_SECS (120)
203#define CCISS_BOARD_NOT_READY_WAIT_SECS (10)
203#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) 204#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
204#define CCISS_BOARD_READY_ITERATIONS \ 205#define CCISS_BOARD_READY_ITERATIONS \
205 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ 206 ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
206 CCISS_BOARD_READY_POLL_INTERVAL_MSECS) 207 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
208#define CCISS_BOARD_NOT_READY_ITERATIONS \
209 ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
210 CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
207#define CCISS_POST_RESET_PAUSE_MSECS (3000) 211#define CCISS_POST_RESET_PAUSE_MSECS (3000)
208#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) 212#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000)
209#define CCISS_POST_RESET_NOOP_RETRIES (12) 213#define CCISS_POST_RESET_NOOP_RETRIES (12)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ac04ef97eac2..ba95cba192be 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -78,11 +78,10 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
78 init_completion(&md_io.event); 78 init_completion(&md_io.event);
79 md_io.error = 0; 79 md_io.error = 0;
80 80
81 if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) 81 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
82 rw |= REQ_HARDBARRIER; 82 rw |= REQ_FUA;
83 rw |= REQ_UNPLUG | REQ_SYNC; 83 rw |= REQ_UNPLUG | REQ_SYNC;
84 84
85 retry:
86 bio = bio_alloc(GFP_NOIO, 1); 85 bio = bio_alloc(GFP_NOIO, 1);
87 bio->bi_bdev = bdev->md_bdev; 86 bio->bi_bdev = bdev->md_bdev;
88 bio->bi_sector = sector; 87 bio->bi_sector = sector;
@@ -100,17 +99,6 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
100 wait_for_completion(&md_io.event); 99 wait_for_completion(&md_io.event);
101 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; 100 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
102 101
103 /* check for unsupported barrier op.
104 * would rather check on EOPNOTSUPP, but that is not reliable.
105 * don't try again for ANY return value != 0 */
106 if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
107 /* Try again with no barrier */
108 dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
109 set_bit(MD_NO_BARRIER, &mdev->flags);
110 rw &= ~REQ_HARDBARRIER;
111 bio_put(bio);
112 goto retry;
113 }
114 out: 102 out:
115 bio_put(bio); 103 bio_put(bio);
116 return ok; 104 return ok;
@@ -284,18 +272,32 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
284 u32 xor_sum = 0; 272 u32 xor_sum = 0;
285 273
286 if (!get_ldev(mdev)) { 274 if (!get_ldev(mdev)) {
287 dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); 275 dev_err(DEV,
276 "disk is %s, cannot start al transaction (-%d +%d)\n",
277 drbd_disk_str(mdev->state.disk), evicted, new_enr);
288 complete(&((struct update_al_work *)w)->event); 278 complete(&((struct update_al_work *)w)->event);
289 return 1; 279 return 1;
290 } 280 }
291 /* do we have to do a bitmap write, first? 281 /* do we have to do a bitmap write, first?
292 * TODO reduce maximum latency: 282 * TODO reduce maximum latency:
293 * submit both bios, then wait for both, 283 * submit both bios, then wait for both,
294 * instead of doing two synchronous sector writes. */ 284 * instead of doing two synchronous sector writes.
285 * For now, we must not write the transaction,
286 * if we cannot write out the bitmap of the evicted extent. */
295 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) 287 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
296 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); 288 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
297 289
298 mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ 290 /* The bitmap write may have failed, causing a state change. */
291 if (mdev->state.disk < D_INCONSISTENT) {
292 dev_err(DEV,
293 "disk is %s, cannot write al transaction (-%d +%d)\n",
294 drbd_disk_str(mdev->state.disk), evicted, new_enr);
295 complete(&((struct update_al_work *)w)->event);
296 put_ldev(mdev);
297 return 1;
298 }
299
300 mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
299 buffer = (struct al_transaction *)page_address(mdev->md_io_page); 301 buffer = (struct al_transaction *)page_address(mdev->md_io_page);
300 302
301 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); 303 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
@@ -739,7 +741,7 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
739 unsigned int enr; 741 unsigned int enr;
740 unsigned long add = 0; 742 unsigned long add = 0;
741 char ppb[10]; 743 char ppb[10];
742 int i; 744 int i, tmp;
743 745
744 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 746 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
745 747
@@ -747,7 +749,9 @@ void drbd_al_apply_to_bm(struct drbd_conf *mdev)
747 enr = lc_element_by_index(mdev->act_log, i)->lc_number; 749 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
748 if (enr == LC_FREE) 750 if (enr == LC_FREE)
749 continue; 751 continue;
750 add += drbd_bm_ALe_set_all(mdev, enr); 752 tmp = drbd_bm_ALe_set_all(mdev, enr);
753 dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
754 add += tmp;
751 } 755 }
752 756
753 lc_unlock(mdev->act_log); 757 lc_unlock(mdev->act_log);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2637f499f77f..1ea1a34e78b2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -114,11 +114,11 @@ struct drbd_conf;
114#define D_ASSERT(exp) if (!(exp)) \ 114#define D_ASSERT(exp) if (!(exp)) \
115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) 115 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
116 116
117#define ERR_IF(exp) if (({ \ 117#define ERR_IF(exp) if (({ \
118 int _b = (exp) != 0; \ 118 int _b = (exp) != 0; \
119 if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ 119 if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
120 __func__, #exp, __FILE__, __LINE__); \ 120 __func__, #exp, __FILE__, __LINE__); \
121 _b; \ 121 _b; \
122 })) 122 }))
123 123
124/* Defines to control fault insertion */ 124/* Defines to control fault insertion */
@@ -749,17 +749,12 @@ struct drbd_epoch {
749 749
750/* drbd_epoch flag bits */ 750/* drbd_epoch flag bits */
751enum { 751enum {
752 DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
753 DE_BARRIER_IN_NEXT_EPOCH_DONE,
754 DE_CONTAINS_A_BARRIER,
755 DE_HAVE_BARRIER_NUMBER, 752 DE_HAVE_BARRIER_NUMBER,
756 DE_IS_FINISHING,
757}; 753};
758 754
759enum epoch_event { 755enum epoch_event {
760 EV_PUT, 756 EV_PUT,
761 EV_GOT_BARRIER_NR, 757 EV_GOT_BARRIER_NR,
762 EV_BARRIER_DONE,
763 EV_BECAME_LAST, 758 EV_BECAME_LAST,
764 EV_CLEANUP = 32, /* used as flag */ 759 EV_CLEANUP = 32, /* used as flag */
765}; 760};
@@ -801,11 +796,6 @@ enum {
801 __EE_CALL_AL_COMPLETE_IO, 796 __EE_CALL_AL_COMPLETE_IO,
802 __EE_MAY_SET_IN_SYNC, 797 __EE_MAY_SET_IN_SYNC,
803 798
804 /* This epoch entry closes an epoch using a barrier.
805 * On sucessful completion, the epoch is released,
806 * and the P_BARRIER_ACK send. */
807 __EE_IS_BARRIER,
808
809 /* In case a barrier failed, 799 /* In case a barrier failed,
810 * we need to resubmit without the barrier flag. */ 800 * we need to resubmit without the barrier flag. */
811 __EE_RESUBMITTED, 801 __EE_RESUBMITTED,
@@ -820,7 +810,6 @@ enum {
820}; 810};
821#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 811#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
822#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 812#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
823#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
824#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 813#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
825#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 814#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
826#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) 815#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
@@ -843,16 +832,15 @@ enum {
843 * Gets cleared when the state.conn 832 * Gets cleared when the state.conn
844 * goes into C_CONNECTED state. */ 833 * goes into C_CONNECTED state. */
845 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ 834 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
846 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
847 CONSIDER_RESYNC, 835 CONSIDER_RESYNC,
848 836
849 MD_NO_BARRIER, /* meta data device does not support barriers, 837 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
850 so don't even try */
851 SUSPEND_IO, /* suspend application io */ 838 SUSPEND_IO, /* suspend application io */
852 BITMAP_IO, /* suspend application io; 839 BITMAP_IO, /* suspend application io;
853 once no more io in flight, start bitmap io */ 840 once no more io in flight, start bitmap io */
854 BITMAP_IO_QUEUED, /* Started bitmap IO */ 841 BITMAP_IO_QUEUED, /* Started bitmap IO */
855 GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ 842 GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
843 WAS_IO_ERROR, /* Local disk failed returned IO error */
856 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ 844 RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
857 NET_CONGESTED, /* The data socket is congested */ 845 NET_CONGESTED, /* The data socket is congested */
858 846
@@ -947,7 +935,6 @@ enum write_ordering_e {
947 WO_none, 935 WO_none,
948 WO_drain_io, 936 WO_drain_io,
949 WO_bdev_flush, 937 WO_bdev_flush,
950 WO_bio_barrier
951}; 938};
952 939
953struct fifo_buffer { 940struct fifo_buffer {
@@ -1281,6 +1268,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1281extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1268extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1282extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); 1269extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1283extern void drbd_go_diskless(struct drbd_conf *mdev); 1270extern void drbd_go_diskless(struct drbd_conf *mdev);
1271extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1284 1272
1285 1273
1286/* Meta data layout 1274/* Meta data layout
@@ -1798,17 +1786,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
1798 case EP_PASS_ON: 1786 case EP_PASS_ON:
1799 if (!forcedetach) { 1787 if (!forcedetach) {
1800 if (__ratelimit(&drbd_ratelimit_state)) 1788 if (__ratelimit(&drbd_ratelimit_state))
1801 dev_err(DEV, "Local IO failed in %s." 1789 dev_err(DEV, "Local IO failed in %s.\n", where);
1802 "Passing error on...\n", where);
1803 break; 1790 break;
1804 } 1791 }
1805 /* NOTE fall through to detach case if forcedetach set */ 1792 /* NOTE fall through to detach case if forcedetach set */
1806 case EP_DETACH: 1793 case EP_DETACH:
1807 case EP_CALL_HELPER: 1794 case EP_CALL_HELPER:
1795 set_bit(WAS_IO_ERROR, &mdev->flags);
1808 if (mdev->state.disk > D_FAILED) { 1796 if (mdev->state.disk > D_FAILED) {
1809 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); 1797 _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
1810 dev_err(DEV, "Local IO failed in %s." 1798 dev_err(DEV,
1811 "Detaching...\n", where); 1799 "Local IO failed in %s. Detaching...\n", where);
1812 } 1800 }
1813 break; 1801 break;
1814 } 1802 }
@@ -2127,7 +2115,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
2127 __release(local); 2115 __release(local);
2128 D_ASSERT(i >= 0); 2116 D_ASSERT(i >= 0);
2129 if (i == 0) { 2117 if (i == 0) {
2118 if (mdev->state.disk == D_DISKLESS)
2119 /* even internal references gone, safe to destroy */
2120 drbd_ldev_destroy(mdev);
2130 if (mdev->state.disk == D_FAILED) 2121 if (mdev->state.disk == D_FAILED)
2122 /* all application IO references gone. */
2131 drbd_go_diskless(mdev); 2123 drbd_go_diskless(mdev);
2132 wake_up(&mdev->misc_wait); 2124 wake_up(&mdev->misc_wait);
2133 } 2125 }
@@ -2138,6 +2130,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat
2138{ 2130{
2139 int io_allowed; 2131 int io_allowed;
2140 2132
2133 /* never get a reference while D_DISKLESS */
2134 if (mdev->state.disk == D_DISKLESS)
2135 return 0;
2136
2141 atomic_inc(&mdev->local_cnt); 2137 atomic_inc(&mdev->local_cnt);
2142 io_allowed = (mdev->state.disk >= mins); 2138 io_allowed = (mdev->state.disk >= mins);
2143 if (!io_allowed) 2139 if (!io_allowed)
@@ -2406,12 +2402,12 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2406{ 2402{
2407 int r; 2403 int r;
2408 2404
2409 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2405 if (test_bit(MD_NO_FUA, &mdev->flags))
2410 return; 2406 return;
2411 2407
2412 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); 2408 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
2413 if (r) { 2409 if (r) {
2414 set_bit(MD_NO_BARRIER, &mdev->flags); 2410 set_bit(MD_NO_FUA, &mdev->flags);
2415 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2411 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
2416 } 2412 }
2417} 2413}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 25c7a73c5062..6be5401d0e88 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -835,6 +835,15 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
835 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) 835 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
836 ns.conn = os.conn; 836 ns.conn = os.conn;
837 837
838 /* we cannot fail (again) if we already detached */
839 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
840 ns.disk = D_DISKLESS;
841
842 /* if we are only D_ATTACHING yet,
843 * we can (and should) go directly to D_DISKLESS. */
844 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
845 ns.disk = D_DISKLESS;
846
838 /* After C_DISCONNECTING only C_STANDALONE may follow */ 847 /* After C_DISCONNECTING only C_STANDALONE may follow */
839 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 848 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
840 ns.conn = os.conn; 849 ns.conn = os.conn;
@@ -1056,7 +1065,15 @@ int __drbd_set_state(struct drbd_conf *mdev,
1056 !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) 1065 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1057 set_bit(DEVICE_DYING, &mdev->flags); 1066 set_bit(DEVICE_DYING, &mdev->flags);
1058 1067
1059 mdev->state.i = ns.i; 1068 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1069 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1070 * drbd_ldev_destroy() won't happen before our corresponding
1071 * after_state_ch works run, where we put_ldev again. */
1072 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1073 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1074 atomic_inc(&mdev->local_cnt);
1075
1076 mdev->state = ns;
1060 wake_up(&mdev->misc_wait); 1077 wake_up(&mdev->misc_wait);
1061 wake_up(&mdev->state_wait); 1078 wake_up(&mdev->state_wait);
1062 1079
@@ -1268,7 +1285,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1268 if (test_bit(NEW_CUR_UUID, &mdev->flags)) { 1285 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1269 drbd_uuid_new_current(mdev); 1286 drbd_uuid_new_current(mdev);
1270 clear_bit(NEW_CUR_UUID, &mdev->flags); 1287 clear_bit(NEW_CUR_UUID, &mdev->flags);
1271 drbd_md_sync(mdev);
1272 } 1288 }
1273 spin_lock_irq(&mdev->req_lock); 1289 spin_lock_irq(&mdev->req_lock);
1274 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); 1290 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
@@ -1365,63 +1381,64 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1365 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1381 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1366 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1382 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1367 1383
1368 /* first half of local IO error */ 1384 /* first half of local IO error, failure to attach,
1369 if (os.disk > D_FAILED && ns.disk == D_FAILED) { 1385 * or administrative detach */
1370 enum drbd_io_error_p eh = EP_PASS_ON; 1386 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1387 enum drbd_io_error_p eh;
1388 int was_io_error;
1389 /* corresponding get_ldev was in __drbd_set_state, to serialize
1390 * our cleanup here with the transition to D_DISKLESS,
1391 * so it is safe to dereference ldev here. */
1392 eh = mdev->ldev->dc.on_io_error;
1393 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1394
1395 /* current state still has to be D_FAILED,
1396 * there is only one way out: to D_DISKLESS,
1397 * and that may only happen after our put_ldev below. */
1398 if (mdev->state.disk != D_FAILED)
1399 dev_err(DEV,
1400 "ASSERT FAILED: disk is %s during detach\n",
1401 drbd_disk_str(mdev->state.disk));
1371 1402
1372 if (drbd_send_state(mdev)) 1403 if (drbd_send_state(mdev))
1373 dev_warn(DEV, "Notified peer that my disk is broken.\n"); 1404 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1374 else 1405 else
1375 dev_err(DEV, "Sending state for drbd_io_error() failed\n"); 1406 dev_err(DEV, "Sending state for detaching disk failed\n");
1376 1407
1377 drbd_rs_cancel_all(mdev); 1408 drbd_rs_cancel_all(mdev);
1378 1409
1379 if (get_ldev_if_state(mdev, D_FAILED)) { 1410 /* In case we want to get something to stable storage still,
1380 eh = mdev->ldev->dc.on_io_error; 1411 * this may be the last chance.
1381 put_ldev(mdev); 1412 * Following put_ldev may transition to D_DISKLESS. */
1382 } 1413 drbd_md_sync(mdev);
1383 if (eh == EP_CALL_HELPER) 1414 put_ldev(mdev);
1415
1416 if (was_io_error && eh == EP_CALL_HELPER)
1384 drbd_khelper(mdev, "local-io-error"); 1417 drbd_khelper(mdev, "local-io-error");
1385 } 1418 }
1386 1419
1420 /* second half of local IO error, failure to attach,
1421 * or administrative detach,
1422 * after local_cnt references have reached zero again */
1423 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1424 /* We must still be diskless,
1425 * re-attach has to be serialized with this! */
1426 if (mdev->state.disk != D_DISKLESS)
1427 dev_err(DEV,
1428 "ASSERT FAILED: disk is %s while going diskless\n",
1429 drbd_disk_str(mdev->state.disk));
1387 1430
1388 /* second half of local IO error handling, 1431 mdev->rs_total = 0;
1389 * after local_cnt references have reached zero: */ 1432 mdev->rs_failed = 0;
1390 if (os.disk == D_FAILED && ns.disk == D_DISKLESS) { 1433 atomic_set(&mdev->rs_pending_cnt, 0);
1391 mdev->rs_total = 0;
1392 mdev->rs_failed = 0;
1393 atomic_set(&mdev->rs_pending_cnt, 0);
1394 }
1395
1396 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1397 /* We must still be diskless,
1398 * re-attach has to be serialized with this! */
1399 if (mdev->state.disk != D_DISKLESS)
1400 dev_err(DEV,
1401 "ASSERT FAILED: disk is %s while going diskless\n",
1402 drbd_disk_str(mdev->state.disk));
1403 1434
1404 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
1405 * will inc/dec it frequently. Since we became D_DISKLESS, no
1406 * one has touched the protected members anymore, though, so we
1407 * are safe to free them here. */
1408 if (drbd_send_state(mdev)) 1435 if (drbd_send_state(mdev))
1409 dev_warn(DEV, "Notified peer that I detached my disk.\n"); 1436 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1410 else 1437 else
1411 dev_err(DEV, "Sending state for detach failed\n"); 1438 dev_err(DEV, "Sending state for being diskless failed\n");
1412 1439 /* corresponding get_ldev in __drbd_set_state
1413 lc_destroy(mdev->resync); 1440 * this may finally trigger drbd_ldev_destroy. */
1414 mdev->resync = NULL; 1441 put_ldev(mdev);
1415 lc_destroy(mdev->act_log);
1416 mdev->act_log = NULL;
1417 __no_warn(local,
1418 drbd_free_bc(mdev->ldev);
1419 mdev->ldev = NULL;);
1420
1421 if (mdev->md_io_tmpp) {
1422 __free_page(mdev->md_io_tmpp);
1423 mdev->md_io_tmpp = NULL;
1424 }
1425 } 1442 }
1426 1443
1427 /* Disks got bigger while they were detached */ 1444 /* Disks got bigger while they were detached */
@@ -2772,11 +2789,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2772 2789
2773 drbd_set_defaults(mdev); 2790 drbd_set_defaults(mdev);
2774 2791
2775 /* for now, we do NOT yet support it,
2776 * even though we start some framework
2777 * to eventually support barriers */
2778 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2779
2780 atomic_set(&mdev->ap_bio_cnt, 0); 2792 atomic_set(&mdev->ap_bio_cnt, 0);
2781 atomic_set(&mdev->ap_pending_cnt, 0); 2793 atomic_set(&mdev->ap_pending_cnt, 0);
2782 atomic_set(&mdev->rs_pending_cnt, 0); 2794 atomic_set(&mdev->rs_pending_cnt, 0);
@@ -2842,7 +2854,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2842 drbd_thread_init(mdev, &mdev->asender, drbd_asender); 2854 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2843 2855
2844 mdev->agreed_pro_version = PRO_VERSION_MAX; 2856 mdev->agreed_pro_version = PRO_VERSION_MAX;
2845 mdev->write_ordering = WO_bio_barrier; 2857 mdev->write_ordering = WO_bdev_flush;
2846 mdev->resync_wenr = LC_FREE; 2858 mdev->resync_wenr = LC_FREE;
2847} 2859}
2848 2860
@@ -2899,7 +2911,6 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2899 D_ASSERT(list_empty(&mdev->resync_work.list)); 2911 D_ASSERT(list_empty(&mdev->resync_work.list));
2900 D_ASSERT(list_empty(&mdev->unplug_work.list)); 2912 D_ASSERT(list_empty(&mdev->unplug_work.list));
2901 D_ASSERT(list_empty(&mdev->go_diskless.list)); 2913 D_ASSERT(list_empty(&mdev->go_diskless.list));
2902
2903} 2914}
2904 2915
2905 2916
@@ -3660,6 +3671,8 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3660 3671
3661 get_random_bytes(&val, sizeof(u64)); 3672 get_random_bytes(&val, sizeof(u64));
3662 _drbd_uuid_set(mdev, UI_CURRENT, val); 3673 _drbd_uuid_set(mdev, UI_CURRENT, val);
3674 /* get it to stable storage _now_ */
3675 drbd_md_sync(mdev);
3663} 3676}
3664 3677
3665void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) 3678void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
@@ -3756,19 +3769,31 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3756 return 1; 3769 return 1;
3757} 3770}
3758 3771
3772void drbd_ldev_destroy(struct drbd_conf *mdev)
3773{
3774 lc_destroy(mdev->resync);
3775 mdev->resync = NULL;
3776 lc_destroy(mdev->act_log);
3777 mdev->act_log = NULL;
3778 __no_warn(local,
3779 drbd_free_bc(mdev->ldev);
3780 mdev->ldev = NULL;);
3781
3782 if (mdev->md_io_tmpp) {
3783 __free_page(mdev->md_io_tmpp);
3784 mdev->md_io_tmpp = NULL;
3785 }
3786 clear_bit(GO_DISKLESS, &mdev->flags);
3787}
3788
3759static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3789static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3760{ 3790{
3761 D_ASSERT(mdev->state.disk == D_FAILED); 3791 D_ASSERT(mdev->state.disk == D_FAILED);
3762 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will 3792 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3763 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch 3793 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3764 * the protected members anymore, though, so in the after_state_ch work 3794 * the protected members anymore, though, so once put_ldev reaches zero
3765 * it will be safe to free them. */ 3795 * again, it will be safe to free them. */
3766 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3796 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3767 /* We need to wait for return of references checked out while we still
3768 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
3769 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
3770
3771 clear_bit(GO_DISKLESS, &mdev->flags);
3772 return 1; 3797 return 1;
3773} 3798}
3774 3799
@@ -3777,9 +3802,6 @@ void drbd_go_diskless(struct drbd_conf *mdev)
3777 D_ASSERT(mdev->state.disk == D_FAILED); 3802 D_ASSERT(mdev->state.disk == D_FAILED);
3778 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) 3803 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3779 drbd_queue_work(&mdev->data.work, &mdev->go_diskless); 3804 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3780 /* don't drbd_queue_work_front,
3781 * we need to serialize with the after_state_ch work
3782 * of the -> D_FAILED transition. */
3783} 3805}
3784 3806
3785/** 3807/**
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 87925e97e613..29e5c70e4e26 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -870,6 +870,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
870 retcode = ERR_DISK_CONFIGURED; 870 retcode = ERR_DISK_CONFIGURED;
871 goto fail; 871 goto fail;
872 } 872 }
873 /* It may just now have detached because of IO error. Make sure
874 * drbd_ldev_destroy is done already, we may end up here very fast,
875 * e.g. if someone calls attach from the on-io-error handler,
876 * to realize a "hot spare" feature (not that I'd recommend that) */
877 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
873 878
874 /* allocation not in the IO path, cqueue thread context */ 879 /* allocation not in the IO path, cqueue thread context */
875 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); 880 nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
@@ -1098,9 +1103,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1098 /* Reset the "barriers don't work" bits here, then force meta data to 1103 /* Reset the "barriers don't work" bits here, then force meta data to
1099 * be written, to ensure we determine if barriers are supported. */ 1104 * be written, to ensure we determine if barriers are supported. */
1100 if (nbc->dc.no_md_flush) 1105 if (nbc->dc.no_md_flush)
1101 set_bit(MD_NO_BARRIER, &mdev->flags); 1106 set_bit(MD_NO_FUA, &mdev->flags);
1102 else 1107 else
1103 clear_bit(MD_NO_BARRIER, &mdev->flags); 1108 clear_bit(MD_NO_FUA, &mdev->flags);
1104 1109
1105 /* Point of no return reached. 1110 /* Point of no return reached.
1106 * Devices and memory are no longer released by error cleanup below. 1111 * Devices and memory are no longer released by error cleanup below.
@@ -1112,8 +1117,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1112 nbc = NULL; 1117 nbc = NULL;
1113 resync_lru = NULL; 1118 resync_lru = NULL;
1114 1119
1115 mdev->write_ordering = WO_bio_barrier; 1120 mdev->write_ordering = WO_bdev_flush;
1116 drbd_bump_write_ordering(mdev, WO_bio_barrier); 1121 drbd_bump_write_ordering(mdev, WO_bdev_flush);
1117 1122
1118 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) 1123 if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1119 set_bit(CRASHED_PRIMARY, &mdev->flags); 1124 set_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1262,7 +1267,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1262 force_diskless_dec: 1267 force_diskless_dec:
1263 put_ldev(mdev); 1268 put_ldev(mdev);
1264 force_diskless: 1269 force_diskless:
1265 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 1270 drbd_force_state(mdev, NS(disk, D_FAILED));
1266 drbd_md_sync(mdev); 1271 drbd_md_sync(mdev);
1267 release_bdev2_fail: 1272 release_bdev2_fail:
1268 if (nbc) 1273 if (nbc)
@@ -1285,10 +1290,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1285 return 0; 1290 return 0;
1286} 1291}
1287 1292
1293/* Detaching the disk is a process in multiple stages. First we need to lock
1294 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1295 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1296 * internal references as well.
1297 * Only then we have finally detached. */
1288static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1298static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1289 struct drbd_nl_cfg_reply *reply) 1299 struct drbd_nl_cfg_reply *reply)
1290{ 1300{
1301 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1291 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); 1302 reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
1303 if (mdev->state.disk == D_DISKLESS)
1304 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1305 drbd_resume_io(mdev);
1292 return 0; 1306 return 0;
1293} 1307}
1294 1308
@@ -1953,7 +1967,6 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1953 if (test_bit(NEW_CUR_UUID, &mdev->flags)) { 1967 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1954 drbd_uuid_new_current(mdev); 1968 drbd_uuid_new_current(mdev);
1955 clear_bit(NEW_CUR_UUID, &mdev->flags); 1969 clear_bit(NEW_CUR_UUID, &mdev->flags);
1956 drbd_md_sync(mdev);
1957 } 1970 }
1958 drbd_suspend_io(mdev); 1971 drbd_suspend_io(mdev);
1959 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); 1972 reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index ad325c5d0ce1..7e6ac307e2de 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -158,7 +158,6 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
158 [WO_none] = 'n', 158 [WO_none] = 'n',
159 [WO_drain_io] = 'd', 159 [WO_drain_io] = 'd',
160 [WO_bdev_flush] = 'f', 160 [WO_bdev_flush] = 'f',
161 [WO_bio_barrier] = 'b',
162 }; 161 };
163 162
164 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 163 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index efd6169acf2f..d299fe9e78c8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -49,11 +49,6 @@
49 49
50#include "drbd_vli.h" 50#include "drbd_vli.h"
51 51
52struct flush_work {
53 struct drbd_work w;
54 struct drbd_epoch *epoch;
55};
56
57enum finish_epoch { 52enum finish_epoch {
58 FE_STILL_LIVE, 53 FE_STILL_LIVE,
59 FE_DESTROYED, 54 FE_DESTROYED,
@@ -66,16 +61,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
66static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); 61static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
67static int e_end_block(struct drbd_conf *, struct drbd_work *, int); 62static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
68 63
69static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
70{
71 struct drbd_epoch *prev;
72 spin_lock(&mdev->epoch_lock);
73 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
74 if (prev == epoch || prev == mdev->current_epoch)
75 prev = NULL;
76 spin_unlock(&mdev->epoch_lock);
77 return prev;
78}
79 64
80#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 65#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
81 66
@@ -981,7 +966,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
981 return TRUE; 966 return TRUE;
982} 967}
983 968
984static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) 969static void drbd_flush(struct drbd_conf *mdev)
985{ 970{
986 int rv; 971 int rv;
987 972
@@ -997,24 +982,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
997 } 982 }
998 put_ldev(mdev); 983 put_ldev(mdev);
999 } 984 }
1000
1001 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1002}
1003
1004static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1005{
1006 struct flush_work *fw = (struct flush_work *)w;
1007 struct drbd_epoch *epoch = fw->epoch;
1008
1009 kfree(w);
1010
1011 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
1012 drbd_flush_after_epoch(mdev, epoch);
1013
1014 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1015 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1016
1017 return 1;
1018} 985}
1019 986
1020/** 987/**
@@ -1027,15 +994,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1027 struct drbd_epoch *epoch, 994 struct drbd_epoch *epoch,
1028 enum epoch_event ev) 995 enum epoch_event ev)
1029{ 996{
1030 int finish, epoch_size; 997 int epoch_size;
1031 struct drbd_epoch *next_epoch; 998 struct drbd_epoch *next_epoch;
1032 int schedule_flush = 0;
1033 enum finish_epoch rv = FE_STILL_LIVE; 999 enum finish_epoch rv = FE_STILL_LIVE;
1034 1000
1035 spin_lock(&mdev->epoch_lock); 1001 spin_lock(&mdev->epoch_lock);
1036 do { 1002 do {
1037 next_epoch = NULL; 1003 next_epoch = NULL;
1038 finish = 0;
1039 1004
1040 epoch_size = atomic_read(&epoch->epoch_size); 1005 epoch_size = atomic_read(&epoch->epoch_size);
1041 1006
@@ -1045,16 +1010,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1045 break; 1010 break;
1046 case EV_GOT_BARRIER_NR: 1011 case EV_GOT_BARRIER_NR:
1047 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1012 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1048
1049 /* Special case: If we just switched from WO_bio_barrier to
1050 WO_bdev_flush we should not finish the current epoch */
1051 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1052 mdev->write_ordering != WO_bio_barrier &&
1053 epoch == mdev->current_epoch)
1054 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1055 break;
1056 case EV_BARRIER_DONE:
1057 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1058 break; 1013 break;
1059 case EV_BECAME_LAST: 1014 case EV_BECAME_LAST:
1060 /* nothing to do*/ 1015 /* nothing to do*/
@@ -1063,23 +1018,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1063 1018
1064 if (epoch_size != 0 && 1019 if (epoch_size != 0 &&
1065 atomic_read(&epoch->active) == 0 && 1020 atomic_read(&epoch->active) == 0 &&
1066 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && 1021 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
1067 epoch->list.prev == &mdev->current_epoch->list &&
1068 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1069 /* Nearly all conditions are met to finish that epoch... */
1070 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1071 mdev->write_ordering == WO_none ||
1072 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1073 ev & EV_CLEANUP) {
1074 finish = 1;
1075 set_bit(DE_IS_FINISHING, &epoch->flags);
1076 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1077 mdev->write_ordering == WO_bio_barrier) {
1078 atomic_inc(&epoch->active);
1079 schedule_flush = 1;
1080 }
1081 }
1082 if (finish) {
1083 if (!(ev & EV_CLEANUP)) { 1022 if (!(ev & EV_CLEANUP)) {
1084 spin_unlock(&mdev->epoch_lock); 1023 spin_unlock(&mdev->epoch_lock);
1085 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); 1024 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1102,6 +1041,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1102 /* atomic_set(&epoch->active, 0); is already zero */ 1041 /* atomic_set(&epoch->active, 0); is already zero */
1103 if (rv == FE_STILL_LIVE) 1042 if (rv == FE_STILL_LIVE)
1104 rv = FE_RECYCLED; 1043 rv = FE_RECYCLED;
1044 wake_up(&mdev->ee_wait);
1105 } 1045 }
1106 } 1046 }
1107 1047
@@ -1113,22 +1053,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1113 1053
1114 spin_unlock(&mdev->epoch_lock); 1054 spin_unlock(&mdev->epoch_lock);
1115 1055
1116 if (schedule_flush) {
1117 struct flush_work *fw;
1118 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1119 if (fw) {
1120 fw->w.cb = w_flush;
1121 fw->epoch = epoch;
1122 drbd_queue_work(&mdev->data.work, &fw->w);
1123 } else {
1124 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1125 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1126 /* That is not a recursion, only one level */
1127 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1128 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1129 }
1130 }
1131
1132 return rv; 1056 return rv;
1133} 1057}
1134 1058
@@ -1144,19 +1068,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1144 [WO_none] = "none", 1068 [WO_none] = "none",
1145 [WO_drain_io] = "drain", 1069 [WO_drain_io] = "drain",
1146 [WO_bdev_flush] = "flush", 1070 [WO_bdev_flush] = "flush",
1147 [WO_bio_barrier] = "barrier",
1148 }; 1071 };
1149 1072
1150 pwo = mdev->write_ordering; 1073 pwo = mdev->write_ordering;
1151 wo = min(pwo, wo); 1074 wo = min(pwo, wo);
1152 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1153 wo = WO_bdev_flush;
1154 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) 1075 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1155 wo = WO_drain_io; 1076 wo = WO_drain_io;
1156 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) 1077 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1157 wo = WO_none; 1078 wo = WO_none;
1158 mdev->write_ordering = wo; 1079 mdev->write_ordering = wo;
1159 if (pwo != mdev->write_ordering || wo == WO_bio_barrier) 1080 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
1160 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); 1081 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1161} 1082}
1162 1083
@@ -1192,7 +1113,7 @@ next_bio:
1192 bio->bi_sector = sector; 1113 bio->bi_sector = sector;
1193 bio->bi_bdev = mdev->ldev->backing_bdev; 1114 bio->bi_bdev = mdev->ldev->backing_bdev;
1194 /* we special case some flags in the multi-bio case, see below 1115 /* we special case some flags in the multi-bio case, see below
1195 * (REQ_UNPLUG, REQ_HARDBARRIER) */ 1116 * (REQ_UNPLUG) */
1196 bio->bi_rw = rw; 1117 bio->bi_rw = rw;
1197 bio->bi_private = e; 1118 bio->bi_private = e;
1198 bio->bi_end_io = drbd_endio_sec; 1119 bio->bi_end_io = drbd_endio_sec;
@@ -1226,11 +1147,6 @@ next_bio:
1226 bio->bi_rw &= ~REQ_UNPLUG; 1147 bio->bi_rw &= ~REQ_UNPLUG;
1227 1148
1228 drbd_generic_make_request(mdev, fault_type, bio); 1149 drbd_generic_make_request(mdev, fault_type, bio);
1229
1230 /* strip off REQ_HARDBARRIER,
1231 * unless it is the first or last bio */
1232 if (bios && bios->bi_next)
1233 bios->bi_rw &= ~REQ_HARDBARRIER;
1234 } while (bios); 1150 } while (bios);
1235 maybe_kick_lo(mdev); 1151 maybe_kick_lo(mdev);
1236 return 0; 1152 return 0;
@@ -1244,45 +1160,9 @@ fail:
1244 return -ENOMEM; 1160 return -ENOMEM;
1245} 1161}
1246 1162
1247/**
1248 * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1249 * @mdev: DRBD device.
1250 * @w: work object.
1251 * @cancel: The connection will be closed anyways (unused in this callback)
1252 */
1253int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1254{
1255 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1256 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1257 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1258 so that we can finish that epoch in drbd_may_finish_epoch().
1259 That is necessary if we already have a long chain of Epochs, before
1260 we realize that REQ_HARDBARRIER is actually not supported */
1261
1262 /* As long as the -ENOTSUPP on the barrier is reported immediately
1263 that will never trigger. If it is reported late, we will just
1264 print that warning and continue correctly for all future requests
1265 with WO_bdev_flush */
1266 if (previous_epoch(mdev, e->epoch))
1267 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1268
1269 /* we still have a local reference,
1270 * get_ldev was done in receive_Data. */
1271
1272 e->w.cb = e_end_block;
1273 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1274 /* drbd_submit_ee fails for one reason only:
1275 * if was not able to allocate sufficient bios.
1276 * requeue, try again later. */
1277 e->w.cb = w_e_reissue;
1278 drbd_queue_work(&mdev->data.work, &e->w);
1279 }
1280 return 1;
1281}
1282
1283static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 1163static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1284{ 1164{
1285 int rv, issue_flush; 1165 int rv;
1286 struct p_barrier *p = &mdev->data.rbuf.barrier; 1166 struct p_barrier *p = &mdev->data.rbuf.barrier;
1287 struct drbd_epoch *epoch; 1167 struct drbd_epoch *epoch;
1288 1168
@@ -1300,44 +1180,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1300 * Therefore we must send the barrier_ack after the barrier request was 1180 * Therefore we must send the barrier_ack after the barrier request was
1301 * completed. */ 1181 * completed. */
1302 switch (mdev->write_ordering) { 1182 switch (mdev->write_ordering) {
1303 case WO_bio_barrier:
1304 case WO_none: 1183 case WO_none:
1305 if (rv == FE_RECYCLED) 1184 if (rv == FE_RECYCLED)
1306 return TRUE; 1185 return TRUE;
1307 break; 1186
1187 /* receiver context, in the writeout path of the other node.
1188 * avoid potential distributed deadlock */
1189 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1190 if (epoch)
1191 break;
1192 else
1193 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1194 /* Fall through */
1308 1195
1309 case WO_bdev_flush: 1196 case WO_bdev_flush:
1310 case WO_drain_io: 1197 case WO_drain_io:
1311 if (rv == FE_STILL_LIVE) {
1312 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1313 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1314 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1315 }
1316 if (rv == FE_RECYCLED)
1317 return TRUE;
1318
1319 /* The asender will send all the ACKs and barrier ACKs out, since
1320 all EEs moved from the active_ee to the done_ee. We need to
1321 provide a new epoch object for the EEs that come in soon */
1322 break;
1323 }
1324
1325 /* receiver context, in the writeout path of the other node.
1326 * avoid potential distributed deadlock */
1327 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1328 if (!epoch) {
1329 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1330 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1331 drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 1198 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1332 if (issue_flush) { 1199 drbd_flush(mdev);
1333 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); 1200
1334 if (rv == FE_RECYCLED) 1201 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1335 return TRUE; 1202 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1203 if (epoch)
1204 break;
1336 } 1205 }
1337 1206
1338 drbd_wait_ee_list_empty(mdev, &mdev->done_ee); 1207 epoch = mdev->current_epoch;
1208 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1209
1210 D_ASSERT(atomic_read(&epoch->active) == 0);
1211 D_ASSERT(epoch->flags == 0);
1339 1212
1340 return TRUE; 1213 return TRUE;
1214 default:
1215 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1216 return FALSE;
1341 } 1217 }
1342 1218
1343 epoch->flags = 0; 1219 epoch->flags = 0;
@@ -1652,15 +1528,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1652{ 1528{
1653 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1529 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1654 sector_t sector = e->sector; 1530 sector_t sector = e->sector;
1655 struct drbd_epoch *epoch;
1656 int ok = 1, pcmd; 1531 int ok = 1, pcmd;
1657 1532
1658 if (e->flags & EE_IS_BARRIER) {
1659 epoch = previous_epoch(mdev, e->epoch);
1660 if (epoch)
1661 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1662 }
1663
1664 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1533 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1665 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1534 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1666 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1535 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1817,27 +1686,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1817 e->epoch = mdev->current_epoch; 1686 e->epoch = mdev->current_epoch;
1818 atomic_inc(&e->epoch->epoch_size); 1687 atomic_inc(&e->epoch->epoch_size);
1819 atomic_inc(&e->epoch->active); 1688 atomic_inc(&e->epoch->active);
1820
1821 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1822 struct drbd_epoch *epoch;
1823 /* Issue a barrier if we start a new epoch, and the previous epoch
1824 was not a epoch containing a single request which already was
1825 a Barrier. */
1826 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1827 if (epoch == e->epoch) {
1828 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1829 rw |= REQ_HARDBARRIER;
1830 e->flags |= EE_IS_BARRIER;
1831 } else {
1832 if (atomic_read(&epoch->epoch_size) > 1 ||
1833 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1834 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1835 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1836 rw |= REQ_HARDBARRIER;
1837 e->flags |= EE_IS_BARRIER;
1838 }
1839 }
1840 }
1841 spin_unlock(&mdev->epoch_lock); 1689 spin_unlock(&mdev->epoch_lock);
1842 1690
1843 dp_flags = be32_to_cpu(p->dp_flags); 1691 dp_flags = be32_to_cpu(p->dp_flags);
@@ -1995,10 +1843,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1995 break; 1843 break;
1996 } 1844 }
1997 1845
1998 if (mdev->state.pdsk == D_DISKLESS) { 1846 if (mdev->state.pdsk < D_INCONSISTENT) {
1999 /* In case we have the only disk of the cluster, */ 1847 /* In case we have the only disk of the cluster, */
2000 drbd_set_out_of_sync(mdev, e->sector, e->size); 1848 drbd_set_out_of_sync(mdev, e->sector, e->size);
2001 e->flags |= EE_CALL_AL_COMPLETE_IO; 1849 e->flags |= EE_CALL_AL_COMPLETE_IO;
1850 e->flags &= ~EE_MAY_SET_IN_SYNC;
2002 drbd_al_begin_io(mdev, e->sector); 1851 drbd_al_begin_io(mdev, e->sector);
2003 } 1852 }
2004 1853
@@ -3362,7 +3211,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3362 if (ns.conn == C_MASK) { 3211 if (ns.conn == C_MASK) {
3363 ns.conn = C_CONNECTED; 3212 ns.conn = C_CONNECTED;
3364 if (mdev->state.disk == D_NEGOTIATING) { 3213 if (mdev->state.disk == D_NEGOTIATING) {
3365 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3214 drbd_force_state(mdev, NS(disk, D_FAILED));
3366 } else if (peer_state.disk == D_NEGOTIATING) { 3215 } else if (peer_state.disk == D_NEGOTIATING) {
3367 dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); 3216 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3368 peer_state.disk = D_DISKLESS; 3217 peer_state.disk = D_DISKLESS;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9e91a2545fc8..11a75d32a2e2 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -258,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
258 if (!hlist_unhashed(&req->colision)) 258 if (!hlist_unhashed(&req->colision))
259 hlist_del(&req->colision); 259 hlist_del(&req->colision);
260 else 260 else
261 D_ASSERT((s & RQ_NET_MASK) == 0); 261 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
262 262
263 /* for writes we need to do some extra housekeeping */ 263 /* for writes we need to do some extra housekeeping */
264 if (rw == WRITE) 264 if (rw == WRITE)
@@ -813,7 +813,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
813 mdev->state.conn >= C_CONNECTED)); 813 mdev->state.conn >= C_CONNECTED));
814 814
815 if (!(local || remote) && !is_susp(mdev->state)) { 815 if (!(local || remote) && !is_susp(mdev->state)) {
816 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 816 if (__ratelimit(&drbd_ratelimit_state))
817 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
817 goto fail_free_complete; 818 goto fail_free_complete;
818 } 819 }
819 820
@@ -942,12 +943,21 @@ allocate_barrier:
942 if (local) { 943 if (local) {
943 req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 944 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
944 945
945 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR 946 /* State may have changed since we grabbed our reference on the
946 : rw == READ ? DRBD_FAULT_DT_RD 947 * mdev->ldev member. Double check, and short-circuit to endio.
947 : DRBD_FAULT_DT_RA)) 948 * In case the last activity log transaction failed to get on
949 * stable storage, and this is a WRITE, we may not even submit
950 * this bio. */
951 if (get_ldev(mdev)) {
952 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
953 : rw == READ ? DRBD_FAULT_DT_RD
954 : DRBD_FAULT_DT_RA))
955 bio_endio(req->private_bio, -EIO);
956 else
957 generic_make_request(req->private_bio);
958 put_ldev(mdev);
959 } else
948 bio_endio(req->private_bio, -EIO); 960 bio_endio(req->private_bio, -EIO);
949 else
950 generic_make_request(req->private_bio);
951 } 961 }
952 962
953 /* we need to plug ALWAYS since we possibly need to kick lo_dev. 963 /* we need to plug ALWAYS since we possibly need to kick lo_dev.
@@ -1022,20 +1032,6 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1022 return 0; 1032 return 0;
1023 } 1033 }
1024 1034
1025 /* Reject barrier requests if we know the underlying device does
1026 * not support them.
1027 * XXX: Need to get this info from peer as well some how so we
1028 * XXX: reject if EITHER side/data/metadata area does not support them.
1029 *
1030 * because of those XXX, this is not yet enabled,
1031 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
1032 */
1033 if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) {
1034 /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
1035 bio_endio(bio, -EOPNOTSUPP);
1036 return 0;
1037 }
1038
1039 /* 1035 /*
1040 * what we "blindly" assume: 1036 * what we "blindly" assume:
1041 */ 1037 */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 108d58015cd1..b0551ba7ad0c 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -102,12 +102,6 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
102 put_ldev(mdev); 102 put_ldev(mdev);
103} 103}
104 104
105static int is_failed_barrier(int ee_flags)
106{
107 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
108 == (EE_IS_BARRIER|EE_WAS_ERROR);
109}
110
111/* writes on behalf of the partner, or resync writes, 105/* writes on behalf of the partner, or resync writes,
112 * "submitted" by the receiver, final stage. */ 106 * "submitted" by the receiver, final stage. */
113static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local) 107static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
@@ -119,21 +113,6 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
119 int is_syncer_req; 113 int is_syncer_req;
120 int do_al_complete_io; 114 int do_al_complete_io;
121 115
122 /* if this is a failed barrier request, disable use of barriers,
123 * and schedule for resubmission */
124 if (is_failed_barrier(e->flags)) {
125 drbd_bump_write_ordering(mdev, WO_bdev_flush);
126 spin_lock_irqsave(&mdev->req_lock, flags);
127 list_del(&e->w.list);
128 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
129 e->w.cb = w_e_reissue;
130 /* put_ldev actually happens below, once we come here again. */
131 __release(local);
132 spin_unlock_irqrestore(&mdev->req_lock, flags);
133 drbd_queue_work(&mdev->data.work, &e->w);
134 return;
135 }
136
137 D_ASSERT(e->block_id != ID_VACANT); 116 D_ASSERT(e->block_id != ID_VACANT);
138 117
139 /* after we moved e to done_ee, 118 /* after we moved e to done_ee,
@@ -925,7 +904,7 @@ out:
925 drbd_md_sync(mdev); 904 drbd_md_sync(mdev);
926 905
927 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { 906 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
928 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); 907 dev_info(DEV, "Writing the whole bitmap\n");
929 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); 908 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
930 } 909 }
931 910