path: root/drivers/block
author    Linus Torvalds <torvalds@linux-foundation.org>  2011-03-27 23:02:07 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-03-27 23:02:07 -0400
commit    8d49a77568d1105ff3e64aec484dac059f54824e (patch)
tree      633ee954a3cea97bf136dec933388a2e419e5dac /drivers/block
parent    93567c43eb2a4771b9c590435928f9b3a428e568 (diff)
parent    1ddd5049545e0aa1a0ed19bca4d9c9c3ce1ac8a2 (diff)
Merge branch 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block: (122 commits)
  cciss: fix lost command issue
  drbd: need include for bitops functions declarations
  Revert "cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation"
  cciss: fix missed command status value CMD_UNABORTABLE
  cciss: remove unnecessary casts
  cciss: Mask off error bits of c->busaddr in cmd_special_free when calling pci_free_consistent
  cciss: Inform controller we are using 32-bit tags.
  cciss: hoist tag masking out of loop
  cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation
  cciss: export resettable host attribute
  drbd: drop code present under #ifdef which is relevant to 2.6.28 and below
  drbd: Fixed handling of read errors on a 'VerifyS' node
  drbd: Fixed handling of read errors on a 'VerifyT' node
  drbd: Implemented real timeout checking for request processing time
  drbd: Remove unused function atodb_endio()
  drbd: improve log message if received sector offset exceeds local capacity
  drbd: kill dead code
  drbd: don't BUG_ON, if bio_add_page of a single page to an empty bio fails
  drbd: Removed left over, now wrong comments
  drbd: serialize admin requests for new verify run with pending bitmap io
  ...
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/cciss.c              |  86
-rw-r--r--  drivers/block/cciss.h              |   1
-rw-r--r--  drivers/block/cciss_cmd.h          |   1
-rw-r--r--  drivers/block/cciss_scsi.c         |  13
-rw-r--r--  drivers/block/drbd/drbd_actlog.c   | 335
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c   | 752
-rw-r--r--  drivers/block/drbd/drbd_int.h      | 270
-rw-r--r--  drivers/block/drbd/drbd_main.c     | 673
-rw-r--r--  drivers/block/drbd/drbd_nl.c       | 183
-rw-r--r--  drivers/block/drbd/drbd_proc.c     | 114
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 608
-rw-r--r--  drivers/block/drbd/drbd_req.c      | 169
-rw-r--r--  drivers/block/drbd/drbd_req.h      |  36
-rw-r--r--  drivers/block/drbd/drbd_strings.c  |   6
-rw-r--r--  drivers/block/drbd/drbd_worker.c   | 360
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h |   2
16 files changed, 2214 insertions, 1395 deletions
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 35658f445fca..9bf13988f1a2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -193,7 +193,7 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev,
193 u64 *cfg_offset); 193 u64 *cfg_offset);
194static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, 194static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
195 unsigned long *memory_bar); 195 unsigned long *memory_bar);
196 196static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
197 197
198/* performant mode helper functions */ 198/* performant mode helper functions */
199static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, 199static void calc_bucket_map(int *bucket, int num_buckets, int nsgs,
@@ -231,7 +231,7 @@ static const struct block_device_operations cciss_fops = {
231 */ 231 */
232static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c) 232static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
233{ 233{
234 if (likely(h->transMethod == CFGTBL_Trans_Performant)) 234 if (likely(h->transMethod & CFGTBL_Trans_Performant))
235 c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); 235 c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
236} 236}
237 237
@@ -556,6 +556,44 @@ static void __devinit cciss_procinit(ctlr_info_t *h)
556#define to_hba(n) container_of(n, struct ctlr_info, dev) 556#define to_hba(n) container_of(n, struct ctlr_info, dev)
557#define to_drv(n) container_of(n, drive_info_struct, dev) 557#define to_drv(n) container_of(n, drive_info_struct, dev)
558 558
559/* List of controllers which cannot be reset on kexec with reset_devices */
560static u32 unresettable_controller[] = {
561 0x324a103C, /* Smart Array P712m */
562 0x324b103C, /* SmartArray P711m */
563 0x3223103C, /* Smart Array P800 */
564 0x3234103C, /* Smart Array P400 */
565 0x3235103C, /* Smart Array P400i */
566 0x3211103C, /* Smart Array E200i */
567 0x3212103C, /* Smart Array E200 */
568 0x3213103C, /* Smart Array E200i */
569 0x3214103C, /* Smart Array E200i */
570 0x3215103C, /* Smart Array E200i */
571 0x3237103C, /* Smart Array E500 */
572 0x323D103C, /* Smart Array P700m */
573 0x409C0E11, /* Smart Array 6400 */
574 0x409D0E11, /* Smart Array 6400 EM */
575};
576
577static int ctlr_is_resettable(struct ctlr_info *h)
578{
579 int i;
580
581 for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
582 if (unresettable_controller[i] == h->board_id)
583 return 0;
584 return 1;
585}
586
587static ssize_t host_show_resettable(struct device *dev,
588 struct device_attribute *attr,
589 char *buf)
590{
591 struct ctlr_info *h = to_hba(dev);
592
593 return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h));
594}
595static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
596
559static ssize_t host_store_rescan(struct device *dev, 597static ssize_t host_store_rescan(struct device *dev,
560 struct device_attribute *attr, 598 struct device_attribute *attr,
561 const char *buf, size_t count) 599 const char *buf, size_t count)
@@ -741,6 +779,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
741 779
742static struct attribute *cciss_host_attrs[] = { 780static struct attribute *cciss_host_attrs[] = {
743 &dev_attr_rescan.attr, 781 &dev_attr_rescan.attr,
782 &dev_attr_resettable.attr,
744 NULL 783 NULL
745}; 784};
746 785
@@ -973,8 +1012,8 @@ static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c)
973 temp64.val32.upper = c->ErrDesc.Addr.upper; 1012 temp64.val32.upper = c->ErrDesc.Addr.upper;
974 pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct), 1013 pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
975 c->err_info, (dma_addr_t) temp64.val); 1014 c->err_info, (dma_addr_t) temp64.val);
976 pci_free_consistent(h->pdev, sizeof(CommandList_struct), 1015 pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
977 c, (dma_addr_t) c->busaddr); 1016 (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
978} 1017}
979 1018
980static inline ctlr_info_t *get_host(struct gendisk *disk) 1019static inline ctlr_info_t *get_host(struct gendisk *disk)
@@ -1490,8 +1529,7 @@ static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
1490 return -EINVAL; 1529 return -EINVAL;
1491 if (!capable(CAP_SYS_RAWIO)) 1530 if (!capable(CAP_SYS_RAWIO))
1492 return -EPERM; 1531 return -EPERM;
1493 ioc = (BIG_IOCTL_Command_struct *) 1532 ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
1494 kmalloc(sizeof(*ioc), GFP_KERNEL);
1495 if (!ioc) { 1533 if (!ioc) {
1496 status = -ENOMEM; 1534 status = -ENOMEM;
1497 goto cleanup1; 1535 goto cleanup1;
@@ -2653,6 +2691,10 @@ static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
2653 c->Request.CDB[0]); 2691 c->Request.CDB[0]);
2654 return_status = IO_NEEDS_RETRY; 2692 return_status = IO_NEEDS_RETRY;
2655 break; 2693 break;
2694 case CMD_UNABORTABLE:
2695 dev_warn(&h->pdev->dev, "cmd unabortable\n");
2696 return_status = IO_ERROR;
2697 break;
2656 default: 2698 default:
2657 dev_warn(&h->pdev->dev, "cmd 0x%02x returned " 2699 dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
2658 "unknown status %x\n", c->Request.CDB[0], 2700 "unknown status %x\n", c->Request.CDB[0],
@@ -3103,6 +3145,13 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
3103 (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 3145 (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
3104 DID_PASSTHROUGH : DID_ERROR); 3146 DID_PASSTHROUGH : DID_ERROR);
3105 break; 3147 break;
3148 case CMD_UNABORTABLE:
3149 dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
3150 rq->errors = make_status_bytes(SAM_STAT_GOOD,
3151 cmd->err_info->CommandStatus, DRIVER_OK,
3152 cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
3153 DID_PASSTHROUGH : DID_ERROR);
3154 break;
3106 default: 3155 default:
3107 dev_warn(&h->pdev->dev, "cmd %p returned " 3156 dev_warn(&h->pdev->dev, "cmd %p returned "
3108 "unknown status %x\n", cmd, 3157 "unknown status %x\n", cmd,
@@ -3136,10 +3185,13 @@ static inline u32 cciss_tag_to_index(u32 tag)
3136 return tag >> DIRECT_LOOKUP_SHIFT; 3185 return tag >> DIRECT_LOOKUP_SHIFT;
3137} 3186}
3138 3187
3139static inline u32 cciss_tag_discard_error_bits(u32 tag) 3188static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
3140{ 3189{
3141#define CCISS_ERROR_BITS 0x03 3190#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)
3142 return tag & ~CCISS_ERROR_BITS; 3191#define CCISS_SIMPLE_ERROR_BITS 0x03
3192 if (likely(h->transMethod & CFGTBL_Trans_Performant))
3193 return tag & ~CCISS_PERF_ERROR_BITS;
3194 return tag & ~CCISS_SIMPLE_ERROR_BITS;
3143} 3195}
3144 3196
3145static inline void cciss_mark_tag_indexed(u32 *tag) 3197static inline void cciss_mark_tag_indexed(u32 *tag)
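[Editor's note on the hunk above: the patch widens the tag mask from a fixed two bits to the whole direct-lookup field when the controller runs in performant mode, because the two transports reserve a different number of low bits for status. The standalone sketch below only illustrates that behaviour; the DIRECT_LOOKUP_SHIFT value of 4 and the sample tag are assumptions for the example, not values taken from the driver.]

#include <stdint.h>
#include <stdio.h>

#define DIRECT_LOOKUP_SHIFT  4                                 /* assumed value, illustration only */
#define PERF_ERROR_BITS      ((1u << DIRECT_LOOKUP_SHIFT) - 1) /* performant mode: low 4 bits */
#define SIMPLE_ERROR_BITS    0x03u                             /* simple mode: low 2 bits */

static uint32_t discard_error_bits(int performant, uint32_t tag)
{
	/* mirror of the transport-dependent masking added in the hunk above */
	return tag & ~(performant ? PERF_ERROR_BITS : SIMPLE_ERROR_BITS);
}

int main(void)
{
	uint32_t raw = 0x1230u | 0x0fu;	/* a tag with status bits set in its low nibble */

	printf("simple:     %#x\n", (unsigned)discard_error_bits(0, raw));  /* 0x123c */
	printf("performant: %#x\n", (unsigned)discard_error_bits(1, raw));  /* 0x1230 */
	return 0;
}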
@@ -3359,7 +3411,7 @@ static inline u32 next_command(ctlr_info_t *h)
3359{ 3411{
3360 u32 a; 3412 u32 a;
3361 3413
3362 if (unlikely(h->transMethod != CFGTBL_Trans_Performant)) 3414 if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
3363 return h->access.command_completed(h); 3415 return h->access.command_completed(h);
3364 3416
3365 if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) { 3417 if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
@@ -3394,14 +3446,12 @@ static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag)
3394/* process completion of a non-indexed command */ 3446/* process completion of a non-indexed command */
3395static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) 3447static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
3396{ 3448{
3397 u32 tag;
3398 CommandList_struct *c = NULL; 3449 CommandList_struct *c = NULL;
3399 __u32 busaddr_masked, tag_masked; 3450 __u32 busaddr_masked, tag_masked;
3400 3451
3401 tag = cciss_tag_discard_error_bits(raw_tag); 3452 tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
3402 list_for_each_entry(c, &h->cmpQ, list) { 3453 list_for_each_entry(c, &h->cmpQ, list) {
3403 busaddr_masked = cciss_tag_discard_error_bits(c->busaddr); 3454 busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
3404 tag_masked = cciss_tag_discard_error_bits(tag);
3405 if (busaddr_masked == tag_masked) { 3455 if (busaddr_masked == tag_masked) {
3406 finish_cmd(h, c, raw_tag); 3456 finish_cmd(h, c, raw_tag);
3407 return next_command(h); 3457 return next_command(h);
@@ -3753,7 +3803,8 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
3753 } 3803 }
3754} 3804}
3755 3805
3756static __devinit void cciss_enter_performant_mode(ctlr_info_t *h) 3806static __devinit void cciss_enter_performant_mode(ctlr_info_t *h,
3807 u32 use_short_tags)
3757{ 3808{
3758 /* This is a bit complicated. There are 8 registers on 3809 /* This is a bit complicated. There are 8 registers on
3759 * the controller which we write to to tell it 8 different 3810 * the controller which we write to to tell it 8 different
@@ -3808,7 +3859,7 @@ static __devinit void cciss_enter_performant_mode(ctlr_info_t *h)
3808 writel(0, &h->transtable->RepQCtrAddrHigh32); 3859 writel(0, &h->transtable->RepQCtrAddrHigh32);
3809 writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32); 3860 writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
3810 writel(0, &h->transtable->RepQAddr0High32); 3861 writel(0, &h->transtable->RepQAddr0High32);
3811 writel(CFGTBL_Trans_Performant, 3862 writel(CFGTBL_Trans_Performant | use_short_tags,
3812 &(h->cfgtable->HostWrite.TransportRequest)); 3863 &(h->cfgtable->HostWrite.TransportRequest));
3813 3864
3814 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); 3865 writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
@@ -3855,7 +3906,8 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h)
3855 if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL)) 3906 if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
3856 goto clean_up; 3907 goto clean_up;
3857 3908
3858 cciss_enter_performant_mode(h); 3909 cciss_enter_performant_mode(h,
3910 trans_support & CFGTBL_Trans_use_short_tags);
3859 3911
3860 /* Change the access methods to the performant access methods */ 3912 /* Change the access methods to the performant access methods */
3861 h->access = SA5_performant_access; 3913 h->access = SA5_performant_access;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 579f74918493..554bbd907d14 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -222,6 +222,7 @@ static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c)
222 h->ctlr, c->busaddr); 222 h->ctlr, c->busaddr);
223#endif /* CCISS_DEBUG */ 223#endif /* CCISS_DEBUG */
224 writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET); 224 writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
225 readl(h->vaddr + SA5_REQUEST_PORT_OFFSET);
225 h->commands_outstanding++; 226 h->commands_outstanding++;
226 if ( h->commands_outstanding > h->max_outstanding) 227 if ( h->commands_outstanding > h->max_outstanding)
227 h->max_outstanding = h->commands_outstanding; 228 h->max_outstanding = h->commands_outstanding;
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 35463d2f0ee7..cd441bef031f 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -56,6 +56,7 @@
56 56
57#define CFGTBL_Trans_Simple 0x00000002l 57#define CFGTBL_Trans_Simple 0x00000002l
58#define CFGTBL_Trans_Performant 0x00000004l 58#define CFGTBL_Trans_Performant 0x00000004l
59#define CFGTBL_Trans_use_short_tags 0x20000000l
59 60
60#define CFGTBL_BusType_Ultra2 0x00000001l 61#define CFGTBL_BusType_Ultra2 0x00000001l
61#define CFGTBL_BusType_Ultra3 0x00000002l 62#define CFGTBL_BusType_Ultra3 0x00000002l
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 727d0225b7d0..df793803f5ae 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -824,13 +824,18 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
824 break; 824 break;
825 case CMD_UNSOLICITED_ABORT: 825 case CMD_UNSOLICITED_ABORT:
826 cmd->result = DID_ABORT << 16; 826 cmd->result = DID_ABORT << 16;
827 dev_warn(&h->pdev->dev, "%p aborted do to an " 827 dev_warn(&h->pdev->dev, "%p aborted due to an "
828 "unsolicited abort\n", c); 828 "unsolicited abort\n", c);
829 break; 829 break;
830 case CMD_TIMEOUT: 830 case CMD_TIMEOUT:
831 cmd->result = DID_TIME_OUT << 16; 831 cmd->result = DID_TIME_OUT << 16;
832 dev_warn(&h->pdev->dev, "%p timedout\n", c); 832 dev_warn(&h->pdev->dev, "%p timedout\n", c);
833 break; 833 break;
834 case CMD_UNABORTABLE:
835 cmd->result = DID_ERROR << 16;
836 dev_warn(&h->pdev->dev, "c %p command "
837 "unabortable\n", c);
838 break;
834 default: 839 default:
835 cmd->result = DID_ERROR << 16; 840 cmd->result = DID_ERROR << 16;
836 dev_warn(&h->pdev->dev, 841 dev_warn(&h->pdev->dev,
@@ -1007,11 +1012,15 @@ cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
1007 break; 1012 break;
1008 case CMD_UNSOLICITED_ABORT: 1013 case CMD_UNSOLICITED_ABORT:
1009 dev_warn(&h->pdev->dev, 1014 dev_warn(&h->pdev->dev,
1010 "%p aborted do to an unsolicited abort\n", c); 1015 "%p aborted due to an unsolicited abort\n", c);
1011 break; 1016 break;
1012 case CMD_TIMEOUT: 1017 case CMD_TIMEOUT:
1013 dev_warn(&h->pdev->dev, "%p timedout\n", c); 1018 dev_warn(&h->pdev->dev, "%p timedout\n", c);
1014 break; 1019 break;
1020 case CMD_UNABORTABLE:
1021 dev_warn(&h->pdev->dev,
1022 "%p unabortable\n", c);
1023 break;
1015 default: 1024 default:
1016 dev_warn(&h->pdev->dev, 1025 dev_warn(&h->pdev->dev,
1017 "%p returned unknown status %x\n", 1026 "%p returned unknown status %x\n",
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index aca302492ff2..2a1642bc451d 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,7 +92,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
92 bio->bi_end_io = drbd_md_io_complete; 92 bio->bi_end_io = drbd_md_io_complete;
93 bio->bi_rw = rw; 93 bio->bi_rw = rw;
94 94
95 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 95 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
96 bio_endio(bio, -EIO); 96 bio_endio(bio, -EIO);
97 else 97 else
98 submit_bio(rw, bio); 98 submit_bio(rw, bio);
@@ -176,13 +176,17 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
176 struct lc_element *al_ext; 176 struct lc_element *al_ext;
177 struct lc_element *tmp; 177 struct lc_element *tmp;
178 unsigned long al_flags = 0; 178 unsigned long al_flags = 0;
179 int wake;
179 180
180 spin_lock_irq(&mdev->al_lock); 181 spin_lock_irq(&mdev->al_lock);
181 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 182 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
182 if (unlikely(tmp != NULL)) { 183 if (unlikely(tmp != NULL)) {
183 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 184 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
184 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 185 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
186 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
185 spin_unlock_irq(&mdev->al_lock); 187 spin_unlock_irq(&mdev->al_lock);
188 if (wake)
189 wake_up(&mdev->al_wait);
186 return NULL; 190 return NULL;
187 } 191 }
188 } 192 }
@@ -258,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
258 spin_unlock_irqrestore(&mdev->al_lock, flags); 262 spin_unlock_irqrestore(&mdev->al_lock, flags);
259} 263}
260 264
265#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
266/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
267 * are still coupled, or assume too much about their relation.
268 * Code below will not work if this is violated.
269 * Will be cleaned up with some followup patch.
270 */
271# error FIXME
272#endif
273
274static unsigned int al_extent_to_bm_page(unsigned int al_enr)
275{
276 return al_enr >>
277 /* bit to page */
278 ((PAGE_SHIFT + 3) -
279 /* al extent number to bit */
280 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
281}
282
283static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
284{
285 return rs_enr >>
286 /* bit to page */
287 ((PAGE_SHIFT + 3) -
288 /* al extent number to bit */
289 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
290}
291
261int 292int
262w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) 293w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
263{ 294{
@@ -285,7 +316,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
285 * For now, we must not write the transaction, 316 * For now, we must not write the transaction,
286 * if we cannot write out the bitmap of the evicted extent. */ 317 * if we cannot write out the bitmap of the evicted extent. */
287 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) 318 if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
288 drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); 319 drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
289 320
290 /* The bitmap write may have failed, causing a state change. */ 321 /* The bitmap write may have failed, causing a state change. */
291 if (mdev->state.disk < D_INCONSISTENT) { 322 if (mdev->state.disk < D_INCONSISTENT) {
@@ -334,7 +365,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
334 + mdev->ldev->md.al_offset + mdev->al_tr_pos; 365 + mdev->ldev->md.al_offset + mdev->al_tr_pos;
335 366
336 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) 367 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
337 drbd_chk_io_error(mdev, 1, TRUE); 368 drbd_chk_io_error(mdev, 1, true);
338 369
339 if (++mdev->al_tr_pos > 370 if (++mdev->al_tr_pos >
340 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) 371 div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -511,225 +542,6 @@ cancel:
511 return 1; 542 return 1;
512} 543}
513 544
514static void atodb_endio(struct bio *bio, int error)
515{
516 struct drbd_atodb_wait *wc = bio->bi_private;
517 struct drbd_conf *mdev = wc->mdev;
518 struct page *page;
519 int uptodate = bio_flagged(bio, BIO_UPTODATE);
520
521 /* strange behavior of some lower level drivers...
522 * fail the request by clearing the uptodate flag,
523 * but do not return any error?! */
524 if (!error && !uptodate)
525 error = -EIO;
526
527 drbd_chk_io_error(mdev, error, TRUE);
528 if (error && wc->error == 0)
529 wc->error = error;
530
531 if (atomic_dec_and_test(&wc->count))
532 complete(&wc->io_done);
533
534 page = bio->bi_io_vec[0].bv_page;
535 put_page(page);
536 bio_put(bio);
537 mdev->bm_writ_cnt++;
538 put_ldev(mdev);
539}
540
541/* sector to word */
542#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
543
544/* activity log to on disk bitmap -- prepare bio unless that sector
545 * is already covered by previously prepared bios */
546static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
547 struct bio **bios,
548 unsigned int enr,
549 struct drbd_atodb_wait *wc) __must_hold(local)
550{
551 struct bio *bio;
552 struct page *page;
553 sector_t on_disk_sector;
554 unsigned int page_offset = PAGE_SIZE;
555 int offset;
556 int i = 0;
557 int err = -ENOMEM;
558
559 /* We always write aligned, full 4k blocks,
560 * so we can ignore the logical_block_size (for now) */
561 enr &= ~7U;
562 on_disk_sector = enr + mdev->ldev->md.md_offset
563 + mdev->ldev->md.bm_offset;
564
565 D_ASSERT(!(on_disk_sector & 7U));
566
567 /* Check if that enr is already covered by an already created bio.
568 * Caution, bios[] is not NULL terminated,
569 * but only initialized to all NULL.
570 * For completely scattered activity log,
571 * the last invocation iterates over all bios,
572 * and finds the last NULL entry.
573 */
574 while ((bio = bios[i])) {
575 if (bio->bi_sector == on_disk_sector)
576 return 0;
577 i++;
578 }
579 /* bios[i] == NULL, the next not yet used slot */
580
581 /* GFP_KERNEL, we are not in the write-out path */
582 bio = bio_alloc(GFP_KERNEL, 1);
583 if (bio == NULL)
584 return -ENOMEM;
585
586 if (i > 0) {
587 const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
588 page_offset = prev_bv->bv_offset + prev_bv->bv_len;
589 page = prev_bv->bv_page;
590 }
591 if (page_offset == PAGE_SIZE) {
592 page = alloc_page(__GFP_HIGHMEM);
593 if (page == NULL)
594 goto out_bio_put;
595 page_offset = 0;
596 } else {
597 get_page(page);
598 }
599
600 offset = S2W(enr);
601 drbd_bm_get_lel(mdev, offset,
602 min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
603 kmap(page) + page_offset);
604 kunmap(page);
605
606 bio->bi_private = wc;
607 bio->bi_end_io = atodb_endio;
608 bio->bi_bdev = mdev->ldev->md_bdev;
609 bio->bi_sector = on_disk_sector;
610
611 if (bio_add_page(bio, page, 4096, page_offset) != 4096)
612 goto out_put_page;
613
614 atomic_inc(&wc->count);
615 /* we already know that we may do this...
616 * get_ldev_if_state(mdev,D_ATTACHING);
617 * just get the extra reference, so that the local_cnt reflects
618 * the number of pending IO requests DRBD at its backing device.
619 */
620 atomic_inc(&mdev->local_cnt);
621
622 bios[i] = bio;
623
624 return 0;
625
626out_put_page:
627 err = -EINVAL;
628 put_page(page);
629out_bio_put:
630 bio_put(bio);
631 return err;
632}
633
634/**
635 * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
636 * @mdev: DRBD device.
637 *
638 * Called when we detach (unconfigure) local storage,
639 * or when we go from R_PRIMARY to R_SECONDARY role.
640 */
641void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
642{
643 int i, nr_elements;
644 unsigned int enr;
645 struct bio **bios;
646 struct drbd_atodb_wait wc;
647
648 ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
649 return; /* sorry, I don't have any act_log etc... */
650
651 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
652
653 nr_elements = mdev->act_log->nr_elements;
654
655 /* GFP_KERNEL, we are not in anyone's write-out path */
656 bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
657 if (!bios)
658 goto submit_one_by_one;
659
660 atomic_set(&wc.count, 0);
661 init_completion(&wc.io_done);
662 wc.mdev = mdev;
663 wc.error = 0;
664
665 for (i = 0; i < nr_elements; i++) {
666 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
667 if (enr == LC_FREE)
668 continue;
669 /* next statement also does atomic_inc wc.count and local_cnt */
670 if (atodb_prepare_unless_covered(mdev, bios,
671 enr/AL_EXT_PER_BM_SECT,
672 &wc))
673 goto free_bios_submit_one_by_one;
674 }
675
676 /* unnecessary optimization? */
677 lc_unlock(mdev->act_log);
678 wake_up(&mdev->al_wait);
679
680 /* all prepared, submit them */
681 for (i = 0; i < nr_elements; i++) {
682 if (bios[i] == NULL)
683 break;
684 if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
685 bios[i]->bi_rw = WRITE;
686 bio_endio(bios[i], -EIO);
687 } else {
688 submit_bio(WRITE, bios[i]);
689 }
690 }
691
692 /* always (try to) flush bitmap to stable storage */
693 drbd_md_flush(mdev);
694
695 /* In case we did not submit a single IO do not wait for
696 * them to complete. ( Because we would wait forever here. )
697 *
698 * In case we had IOs and they are already complete, there
699 * is not point in waiting anyways.
700 * Therefore this if () ... */
701 if (atomic_read(&wc.count))
702 wait_for_completion(&wc.io_done);
703
704 put_ldev(mdev);
705
706 kfree(bios);
707 return;
708
709 free_bios_submit_one_by_one:
710 /* free everything by calling the endio callback directly. */
711 for (i = 0; i < nr_elements && bios[i]; i++)
712 bio_endio(bios[i], 0);
713
714 kfree(bios);
715
716 submit_one_by_one:
717 dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
718
719 for (i = 0; i < mdev->act_log->nr_elements; i++) {
720 enr = lc_element_by_index(mdev->act_log, i)->lc_number;
721 if (enr == LC_FREE)
722 continue;
723 /* Really slow: if we have al-extents 16..19 active,
724 * sector 4 will be written four times! Synchronous! */
725 drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
726 }
727
728 lc_unlock(mdev->act_log);
729 wake_up(&mdev->al_wait);
730 put_ldev(mdev);
731}
732
733/** 545/**
734 * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents 546 * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
735 * @mdev: DRBD device. 547 * @mdev: DRBD device.
@@ -809,7 +621,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
809 return 1; 621 return 1;
810 } 622 }
811 623
812 drbd_bm_write_sect(mdev, udw->enr); 624 drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
813 put_ldev(mdev); 625 put_ldev(mdev);
814 626
815 kfree(udw); 627 kfree(udw);
@@ -889,7 +701,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
889 dev_warn(DEV, "Kicking resync_lru element enr=%u " 701 dev_warn(DEV, "Kicking resync_lru element enr=%u "
890 "out with rs_failed=%d\n", 702 "out with rs_failed=%d\n",
891 ext->lce.lc_number, ext->rs_failed); 703 ext->lce.lc_number, ext->rs_failed);
892 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
893 } 704 }
894 ext->rs_left = rs_left; 705 ext->rs_left = rs_left;
895 ext->rs_failed = success ? 0 : count; 706 ext->rs_failed = success ? 0 : count;
@@ -908,7 +719,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
908 drbd_queue_work_front(&mdev->data.work, &udw->w); 719 drbd_queue_work_front(&mdev->data.work, &udw->w);
909 } else { 720 } else {
910 dev_warn(DEV, "Could not kmalloc an udw\n"); 721 dev_warn(DEV, "Could not kmalloc an udw\n");
911 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
912 } 722 }
913 } 723 }
914 } else { 724 } else {
@@ -919,6 +729,22 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
919 } 729 }
920} 730}
921 731
732void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
733{
734 unsigned long now = jiffies;
735 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
736 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
737 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
738 if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
739 mdev->state.conn != C_PAUSED_SYNC_T &&
740 mdev->state.conn != C_PAUSED_SYNC_S) {
741 mdev->rs_mark_time[next] = now;
742 mdev->rs_mark_left[next] = still_to_go;
743 mdev->rs_last_mark = next;
744 }
745 }
746}
747
922/* clear the bit corresponding to the piece of storage in question: 748/* clear the bit corresponding to the piece of storage in question:
923 * size byte of data starting from sector. Only clear a bits of the affected 749 * size byte of data starting from sector. Only clear a bits of the affected
924 * one ore more _aligned_ BM_BLOCK_SIZE blocks. 750 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -936,7 +762,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
936 int wake_up = 0; 762 int wake_up = 0;
937 unsigned long flags; 763 unsigned long flags;
938 764
939 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 765 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
940 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 766 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
941 (unsigned long long)sector, size); 767 (unsigned long long)sector, size);
942 return; 768 return;
@@ -969,21 +795,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
969 */ 795 */
970 count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 796 count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
971 if (count && get_ldev(mdev)) { 797 if (count && get_ldev(mdev)) {
972 unsigned long now = jiffies; 798 drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
973 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
974 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
975 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
976 unsigned long tw = drbd_bm_total_weight(mdev);
977 if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
978 mdev->state.conn != C_PAUSED_SYNC_T &&
979 mdev->state.conn != C_PAUSED_SYNC_S) {
980 mdev->rs_mark_time[next] = now;
981 mdev->rs_mark_left[next] = tw;
982 mdev->rs_last_mark = next;
983 }
984 }
985 spin_lock_irqsave(&mdev->al_lock, flags); 799 spin_lock_irqsave(&mdev->al_lock, flags);
986 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); 800 drbd_try_clear_on_disk_bm(mdev, sector, count, true);
987 spin_unlock_irqrestore(&mdev->al_lock, flags); 801 spin_unlock_irqrestore(&mdev->al_lock, flags);
988 802
989 /* just wake_up unconditional now, various lc_chaged(), 803 /* just wake_up unconditional now, various lc_chaged(),
@@ -998,27 +812,27 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
998/* 812/*
999 * this is intended to set one request worth of data out of sync. 813 * this is intended to set one request worth of data out of sync.
1000 * affects at least 1 bit, 814 * affects at least 1 bit,
1001 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. 815 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
1002 * 816 *
1003 * called by tl_clear and drbd_send_dblock (==drbd_make_request). 817 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
1004 * so this can be _any_ process. 818 * so this can be _any_ process.
1005 */ 819 */
1006void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, 820int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1007 const char *file, const unsigned int line) 821 const char *file, const unsigned int line)
1008{ 822{
1009 unsigned long sbnr, ebnr, lbnr, flags; 823 unsigned long sbnr, ebnr, lbnr, flags;
1010 sector_t esector, nr_sectors; 824 sector_t esector, nr_sectors;
1011 unsigned int enr, count; 825 unsigned int enr, count = 0;
1012 struct lc_element *e; 826 struct lc_element *e;
1013 827
1014 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 828 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
1015 dev_err(DEV, "sector: %llus, size: %d\n", 829 dev_err(DEV, "sector: %llus, size: %d\n",
1016 (unsigned long long)sector, size); 830 (unsigned long long)sector, size);
1017 return; 831 return 0;
1018 } 832 }
1019 833
1020 if (!get_ldev(mdev)) 834 if (!get_ldev(mdev))
1021 return; /* no disk, no metadata, no bitmap to set bits in */ 835 return 0; /* no disk, no metadata, no bitmap to set bits in */
1022 836
1023 nr_sectors = drbd_get_capacity(mdev->this_bdev); 837 nr_sectors = drbd_get_capacity(mdev->this_bdev);
1024 esector = sector + (size >> 9) - 1; 838 esector = sector + (size >> 9) - 1;
@@ -1048,6 +862,8 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
1048 862
1049out: 863out:
1050 put_ldev(mdev); 864 put_ldev(mdev);
865
866 return count;
1051} 867}
1052 868
1053static 869static
@@ -1128,7 +944,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1128 unsigned int enr = BM_SECT_TO_EXT(sector); 944 unsigned int enr = BM_SECT_TO_EXT(sector);
1129 struct bm_extent *bm_ext; 945 struct bm_extent *bm_ext;
1130 int i, sig; 946 int i, sig;
947 int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
948 200 times -> 20 seconds. */
1131 949
950retry:
1132 sig = wait_event_interruptible(mdev->al_wait, 951 sig = wait_event_interruptible(mdev->al_wait,
1133 (bm_ext = _bme_get(mdev, enr))); 952 (bm_ext = _bme_get(mdev, enr)));
1134 if (sig) 953 if (sig)
@@ -1139,16 +958,25 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
1139 958
1140 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 959 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1141 sig = wait_event_interruptible(mdev->al_wait, 960 sig = wait_event_interruptible(mdev->al_wait,
1142 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); 961 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
1143 if (sig) { 962 test_bit(BME_PRIORITY, &bm_ext->flags));
963
964 if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
1144 spin_lock_irq(&mdev->al_lock); 965 spin_lock_irq(&mdev->al_lock);
1145 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 966 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1146 clear_bit(BME_NO_WRITES, &bm_ext->flags); 967 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
1147 mdev->resync_locked--; 968 mdev->resync_locked--;
1148 wake_up(&mdev->al_wait); 969 wake_up(&mdev->al_wait);
1149 } 970 }
1150 spin_unlock_irq(&mdev->al_lock); 971 spin_unlock_irq(&mdev->al_lock);
1151 return -EINTR; 972 if (sig)
973 return -EINTR;
974 if (schedule_timeout_interruptible(HZ/10))
975 return -EINTR;
976 if (sa && --sa == 0)
977 dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec."
978 "Resync stalled?\n");
979 goto retry;
1152 } 980 }
1153 } 981 }
1154 set_bit(BME_LOCKED, &bm_ext->flags); 982 set_bit(BME_LOCKED, &bm_ext->flags);
@@ -1291,8 +1119,7 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
1291 } 1119 }
1292 1120
1293 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 1121 if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
1294 clear_bit(BME_LOCKED, &bm_ext->flags); 1122 bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
1295 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1296 mdev->resync_locked--; 1123 mdev->resync_locked--;
1297 wake_up(&mdev->al_wait); 1124 wake_up(&mdev->al_wait);
1298 } 1125 }
@@ -1383,7 +1210,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1383 sector_t esector, nr_sectors; 1210 sector_t esector, nr_sectors;
1384 int wake_up = 0; 1211 int wake_up = 0;
1385 1212
1386 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1213 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
1387 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1214 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1388 (unsigned long long)sector, size); 1215 (unsigned long long)sector, size);
1389 return; 1216 return;
@@ -1420,7 +1247,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
1420 mdev->rs_failed += count; 1247 mdev->rs_failed += count;
1421 1248
1422 if (get_ldev(mdev)) { 1249 if (get_ldev(mdev)) {
1423 drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); 1250 drbd_try_clear_on_disk_bm(mdev, sector, count, false);
1424 put_ldev(mdev); 1251 put_ldev(mdev);
1425 } 1252 }
1426 1253
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 0645ca829a94..f0ae63d2df65 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -28,18 +28,58 @@
28#include <linux/drbd.h> 28#include <linux/drbd.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
31
32#include <asm-generic/bitops/le.h>
33
31#include "drbd_int.h" 34#include "drbd_int.h"
32 35
36
33/* OPAQUE outside this file! 37/* OPAQUE outside this file!
34 * interface defined in drbd_int.h 38 * interface defined in drbd_int.h
35 39
36 * convention: 40 * convention:
37 * function name drbd_bm_... => used elsewhere, "public". 41 * function name drbd_bm_... => used elsewhere, "public".
38 * function name bm_... => internal to implementation, "private". 42 * function name bm_... => internal to implementation, "private".
43 */
44
45
46/*
47 * LIMITATIONS:
48 * We want to support >= peta byte of backend storage, while for now still using
49 * a granularity of one bit per 4KiB of storage.
50 * 1 << 50 bytes backend storage (1 PiB)
51 * 1 << (50 - 12) bits needed
52 * 38 --> we need u64 to index and count bits
53 * 1 << (38 - 3) bitmap bytes needed
54 * 35 --> we still need u64 to index and count bytes
55 * (that's 32 GiB of bitmap for 1 PiB storage)
56 * 1 << (35 - 2) 32bit longs needed
57 * 33 --> we'd even need u64 to index and count 32bit long words.
58 * 1 << (35 - 3) 64bit longs needed
59 * 32 --> we could get away with a 32bit unsigned int to index and count
60 * 64bit long words, but I rather stay with unsigned long for now.
61 * We probably should neither count nor point to bytes or long words
62 * directly, but either by bitnumber, or by page index and offset.
63 * 1 << (35 - 12)
64 * 22 --> we need that much 4KiB pages of bitmap.
65 * 1 << (22 + 3) --> on a 64bit arch,
66 * we need 32 MiB to store the array of page pointers.
67 *
68 * Because I'm lazy, and because the resulting patch was too large, too ugly
69 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
70 * (1 << 32) bits * 4k storage.
71 *
39 72
40 * Note that since find_first_bit returns int, at the current granularity of 73 * bitmap storage and IO:
41 * the bitmap (4KB per byte), this implementation "only" supports up to 74 * Bitmap is stored little endian on disk, and is kept little endian in
42 * 1<<(32+12) == 16 TB... 75 * core memory. Currently we still hold the full bitmap in core as long
76 * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
77 * seems excessive.
78 *
79 * We plan to reduce the amount of in-core bitmap pages by pageing them in
80 * and out against their on-disk location as necessary, but need to make
81 * sure we don't cause too much meta data IO, and must not deadlock in
82 * tight memory situations. This needs some more work.
43 */ 83 */
44 84
45/* 85/*
@@ -55,13 +95,9 @@
55struct drbd_bitmap { 95struct drbd_bitmap {
56 struct page **bm_pages; 96 struct page **bm_pages;
57 spinlock_t bm_lock; 97 spinlock_t bm_lock;
58 /* WARNING unsigned long bm_*: 98
59 * 32bit number of bit offset is just enough for 512 MB bitmap. 99 /* see LIMITATIONS: above */
60 * it will blow up if we make the bitmap bigger... 100
61 * not that it makes much sense to have a bitmap that large,
62 * rather change the granularity to 16k or 64k or something.
63 * (that implies other problems, however...)
64 */
65 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ 101 unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
66 unsigned long bm_bits; 102 unsigned long bm_bits;
67 size_t bm_words; 103 size_t bm_words;
@@ -69,29 +105,18 @@ struct drbd_bitmap {
69 sector_t bm_dev_capacity; 105 sector_t bm_dev_capacity;
70 struct mutex bm_change; /* serializes resize operations */ 106 struct mutex bm_change; /* serializes resize operations */
71 107
72 atomic_t bm_async_io; 108 wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
73 wait_queue_head_t bm_io_wait;
74 109
75 unsigned long bm_flags; 110 enum bm_flag bm_flags;
76 111
77 /* debugging aid, in case we are still racy somewhere */ 112 /* debugging aid, in case we are still racy somewhere */
78 char *bm_why; 113 char *bm_why;
79 struct task_struct *bm_task; 114 struct task_struct *bm_task;
80}; 115};
81 116
82/* definition of bits in bm_flags */
83#define BM_LOCKED 0
84#define BM_MD_IO_ERROR 1
85#define BM_P_VMALLOCED 2
86
87static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 117static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
88 unsigned long e, int val, const enum km_type km); 118 unsigned long e, int val, const enum km_type km);
89 119
90static int bm_is_locked(struct drbd_bitmap *b)
91{
92 return test_bit(BM_LOCKED, &b->bm_flags);
93}
94
95#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) 120#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
96static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) 121static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
97{ 122{
@@ -108,7 +133,7 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
108 b->bm_task == mdev->worker.task ? "worker" : "?"); 133 b->bm_task == mdev->worker.task ? "worker" : "?");
109} 134}
110 135
111void drbd_bm_lock(struct drbd_conf *mdev, char *why) 136void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
112{ 137{
113 struct drbd_bitmap *b = mdev->bitmap; 138 struct drbd_bitmap *b = mdev->bitmap;
114 int trylock_failed; 139 int trylock_failed;
@@ -131,8 +156,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why)
131 b->bm_task == mdev->worker.task ? "worker" : "?"); 156 b->bm_task == mdev->worker.task ? "worker" : "?");
132 mutex_lock(&b->bm_change); 157 mutex_lock(&b->bm_change);
133 } 158 }
134 if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) 159 if (BM_LOCKED_MASK & b->bm_flags)
135 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); 160 dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
161 b->bm_flags |= flags & BM_LOCKED_MASK;
136 162
137 b->bm_why = why; 163 b->bm_why = why;
138 b->bm_task = current; 164 b->bm_task = current;
@@ -146,31 +172,137 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
146 return; 172 return;
147 } 173 }
148 174
149 if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) 175 if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
150 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); 176 dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
151 177
178 b->bm_flags &= ~BM_LOCKED_MASK;
152 b->bm_why = NULL; 179 b->bm_why = NULL;
153 b->bm_task = NULL; 180 b->bm_task = NULL;
154 mutex_unlock(&b->bm_change); 181 mutex_unlock(&b->bm_change);
155} 182}
156 183
157/* word offset to long pointer */ 184/* we store some "meta" info about our pages in page->private */
158static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) 185/* at a granularity of 4k storage per bitmap bit:
186 * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
187 * 1<<38 bits,
188 * 1<<23 4k bitmap pages.
189 * Use 24 bits as page index, covers 2 peta byte storage
190 * at a granularity of 4k per bit.
191 * Used to report the failed page idx on io error from the endio handlers.
192 */
193#define BM_PAGE_IDX_MASK ((1UL<<24)-1)
194/* this page is currently read in, or written back */
195#define BM_PAGE_IO_LOCK 31
196/* if there has been an IO error for this page */
197#define BM_PAGE_IO_ERROR 30
198/* this is to be able to intelligently skip disk IO,
199 * set if bits have been set since last IO. */
200#define BM_PAGE_NEED_WRITEOUT 29
201/* to mark for lazy writeout once syncer cleared all clearable bits,
202 * we if bits have been cleared since last IO. */
203#define BM_PAGE_LAZY_WRITEOUT 28
204
205/* store_page_idx uses non-atomic assingment. It is only used directly after
206 * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
207 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
208 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
209 * requires it all to be atomic as well. */
210static void bm_store_page_idx(struct page *page, unsigned long idx)
159{ 211{
160 struct page *page; 212 BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
161 unsigned long page_nr; 213 page_private(page) |= idx;
214}
215
216static unsigned long bm_page_to_idx(struct page *page)
217{
218 return page_private(page) & BM_PAGE_IDX_MASK;
219}
220
221/* As is very unlikely that the same page is under IO from more than one
222 * context, we can get away with a bit per page and one wait queue per bitmap.
223 */
224static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
225{
226 struct drbd_bitmap *b = mdev->bitmap;
227 void *addr = &page_private(b->bm_pages[page_nr]);
228 wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
229}
230
231static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
232{
233 struct drbd_bitmap *b = mdev->bitmap;
234 void *addr = &page_private(b->bm_pages[page_nr]);
235 clear_bit(BM_PAGE_IO_LOCK, addr);
236 smp_mb__after_clear_bit();
237 wake_up(&mdev->bitmap->bm_io_wait);
238}
239
240/* set _before_ submit_io, so it may be reset due to being changed
241 * while this page is in flight... will get submitted later again */
242static void bm_set_page_unchanged(struct page *page)
243{
244 /* use cmpxchg? */
245 clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
246 clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
247}
162 248
249static void bm_set_page_need_writeout(struct page *page)
250{
251 set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
252}
253
254static int bm_test_page_unchanged(struct page *page)
255{
256 volatile const unsigned long *addr = &page_private(page);
257 return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
258}
259
260static void bm_set_page_io_err(struct page *page)
261{
262 set_bit(BM_PAGE_IO_ERROR, &page_private(page));
263}
264
265static void bm_clear_page_io_err(struct page *page)
266{
267 clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
268}
269
270static void bm_set_page_lazy_writeout(struct page *page)
271{
272 set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
273}
274
275static int bm_test_page_lazy_writeout(struct page *page)
276{
277 return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
278}
279
280/* on a 32bit box, this would allow for exactly (2<<38) bits. */
281static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
282{
163 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ 283 /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
164 page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); 284 unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
165 BUG_ON(page_nr >= b->bm_number_of_pages); 285 BUG_ON(page_nr >= b->bm_number_of_pages);
166 page = b->bm_pages[page_nr]; 286 return page_nr;
287}
167 288
289static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
290{
291 /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
292 unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
293 BUG_ON(page_nr >= b->bm_number_of_pages);
294 return page_nr;
295}
296
297static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
298{
299 struct page *page = b->bm_pages[idx];
168 return (unsigned long *) kmap_atomic(page, km); 300 return (unsigned long *) kmap_atomic(page, km);
169} 301}
170 302
171static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) 303static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
172{ 304{
173 return __bm_map_paddr(b, offset, KM_IRQ1); 305 return __bm_map_pidx(b, idx, KM_IRQ1);
174} 306}
175 307
176static void __bm_unmap(unsigned long *p_addr, const enum km_type km) 308static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
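[Editor's note on the hunk above: the new defines pack a 24-bit page index plus several status bits (IO lock, IO error, need/lazy writeout) into page_private of each bitmap page, so the endio handlers can report which page failed and the writeout code can skip clean pages. The snippet below is only a standalone sketch of that packing; the sample index and flag choice are made up for illustration.]

#include <stdio.h>

#define BM_PAGE_IDX_MASK        ((1UL << 24) - 1)
#define BM_PAGE_IO_LOCK         31
#define BM_PAGE_IO_ERROR        30
#define BM_PAGE_NEED_WRITEOUT   29
#define BM_PAGE_LAZY_WRITEOUT   28

int main(void)
{
	unsigned long priv = 0;

	priv |= 1234UL & BM_PAGE_IDX_MASK;       /* store the page index (non-atomic, done once) */
	priv |= 1UL << BM_PAGE_NEED_WRITEOUT;    /* mark the page dirty for the next writeout    */

	printf("idx=%lu need_writeout=%d io_lock=%d\n",
	       priv & BM_PAGE_IDX_MASK,
	       !!(priv & (1UL << BM_PAGE_NEED_WRITEOUT)),
	       !!(priv & (1UL << BM_PAGE_IO_LOCK)));
	return 0;
}

[Because set_out_of_sync and the IO completion paths can touch these bits concurrently, the real code uses atomic set_bit/clear_bit on that word; only the initial index store is non-atomic, as the comment in the hunk notes.]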
@@ -202,6 +334,7 @@ static void bm_unmap(unsigned long *p_addr)
202 * to be able to report device specific. 334 * to be able to report device specific.
203 */ 335 */
204 336
337
205static void bm_free_pages(struct page **pages, unsigned long number) 338static void bm_free_pages(struct page **pages, unsigned long number)
206{ 339{
207 unsigned long i; 340 unsigned long i;
@@ -269,6 +402,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
269 bm_vk_free(new_pages, vmalloced); 402 bm_vk_free(new_pages, vmalloced);
270 return NULL; 403 return NULL;
271 } 404 }
405 /* we want to know which page it is
406 * from the endio handlers */
407 bm_store_page_idx(page, i);
272 new_pages[i] = page; 408 new_pages[i] = page;
273 } 409 }
274 } else { 410 } else {
@@ -280,9 +416,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
280 } 416 }
281 417
282 if (vmalloced) 418 if (vmalloced)
283 set_bit(BM_P_VMALLOCED, &b->bm_flags); 419 b->bm_flags |= BM_P_VMALLOCED;
284 else 420 else
285 clear_bit(BM_P_VMALLOCED, &b->bm_flags); 421 b->bm_flags &= ~BM_P_VMALLOCED;
286 422
287 return new_pages; 423 return new_pages;
288} 424}
@@ -319,7 +455,7 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
319{ 455{
320 ERR_IF (!mdev->bitmap) return; 456 ERR_IF (!mdev->bitmap) return;
321 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); 457 bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
322 bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); 458 bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
323 kfree(mdev->bitmap); 459 kfree(mdev->bitmap);
324 mdev->bitmap = NULL; 460 mdev->bitmap = NULL;
325} 461}
@@ -329,22 +465,39 @@ void drbd_bm_cleanup(struct drbd_conf *mdev)
329 * this masks out the remaining bits. 465 * this masks out the remaining bits.
330 * Returns the number of bits cleared. 466 * Returns the number of bits cleared.
331 */ 467 */
468#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
469#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
470#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
332static int bm_clear_surplus(struct drbd_bitmap *b) 471static int bm_clear_surplus(struct drbd_bitmap *b)
333{ 472{
334 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 473 unsigned long mask;
335 size_t w = b->bm_bits >> LN2_BPL;
336 int cleared = 0;
337 unsigned long *p_addr, *bm; 474 unsigned long *p_addr, *bm;
475 int tmp;
476 int cleared = 0;
338 477
339 p_addr = bm_map_paddr(b, w); 478 /* number of bits modulo bits per page */
340 bm = p_addr + MLPP(w); 479 tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
341 if (w < b->bm_words) { 480 /* mask the used bits of the word containing the last bit */
481 mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
482 /* bitmap is always stored little endian,
483 * on disk and in core memory alike */
484 mask = cpu_to_lel(mask);
485
486 p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
487 bm = p_addr + (tmp/BITS_PER_LONG);
488 if (mask) {
489 /* If mask != 0, we are not exactly aligned, so bm now points
490 * to the long containing the last bit.
491 * If mask == 0, bm already points to the word immediately
492 * after the last (long word aligned) bit. */
342 cleared = hweight_long(*bm & ~mask); 493 cleared = hweight_long(*bm & ~mask);
343 *bm &= mask; 494 *bm &= mask;
344 w++; bm++; 495 bm++;
345 } 496 }
346 497
347 if (w < b->bm_words) { 498 if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
499 /* on a 32bit arch, we may need to zero out
500 * a padding long to align with a 64bit remote */
348 cleared += hweight_long(*bm); 501 cleared += hweight_long(*bm);
349 *bm = 0; 502 *bm = 0;
350 } 503 }
@@ -354,66 +507,75 @@ static int bm_clear_surplus(struct drbd_bitmap *b)
354 507
355static void bm_set_surplus(struct drbd_bitmap *b) 508static void bm_set_surplus(struct drbd_bitmap *b)
356{ 509{
357 const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; 510 unsigned long mask;
358 size_t w = b->bm_bits >> LN2_BPL;
359 unsigned long *p_addr, *bm; 511 unsigned long *p_addr, *bm;
360 512 int tmp;
361 p_addr = bm_map_paddr(b, w); 513
362 bm = p_addr + MLPP(w); 514 /* number of bits modulo bits per page */
363 if (w < b->bm_words) { 515 tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
516 /* mask the used bits of the word containing the last bit */
517 mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
518 /* bitmap is always stored little endian,
519 * on disk and in core memory alike */
520 mask = cpu_to_lel(mask);
521
522 p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
523 bm = p_addr + (tmp/BITS_PER_LONG);
524 if (mask) {
525 /* If mask != 0, we are not exactly aligned, so bm now points
526 * to the long containing the last bit.
527 * If mask == 0, bm already points to the word immediately
528 * after the last (long word aligned) bit. */
364 *bm |= ~mask; 529 *bm |= ~mask;
365 bm++; w++; 530 bm++;
366 } 531 }
367 532
368 if (w < b->bm_words) { 533 if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
369 *bm = ~(0UL); 534 /* on a 32bit arch, we may need to zero out
535 * a padding long to align with a 64bit remote */
536 *bm = ~0UL;
370 } 537 }
371 bm_unmap(p_addr); 538 bm_unmap(p_addr);
372} 539}
373 540
374static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) 541/* you better not modify the bitmap while this is running,
542 * or its results will be stale */
543static unsigned long bm_count_bits(struct drbd_bitmap *b)
375{ 544{
376 unsigned long *p_addr, *bm, offset = 0; 545 unsigned long *p_addr;
377 unsigned long bits = 0; 546 unsigned long bits = 0;
378 unsigned long i, do_now; 547 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
379 548 int idx, i, last_word;
380 while (offset < b->bm_words) { 549
381 i = do_now = min_t(size_t, b->bm_words-offset, LWPP); 550 /* all but last page */
382 p_addr = __bm_map_paddr(b, offset, KM_USER0); 551 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
383 bm = p_addr + MLPP(offset); 552 p_addr = __bm_map_pidx(b, idx, KM_USER0);
384 while (i--) { 553 for (i = 0; i < LWPP; i++)
385#ifndef __LITTLE_ENDIAN 554 bits += hweight_long(p_addr[i]);
386 if (swap_endian)
387 *bm = lel_to_cpu(*bm);
388#endif
389 bits += hweight_long(*bm++);
390 }
391 __bm_unmap(p_addr, KM_USER0); 555 __bm_unmap(p_addr, KM_USER0);
392 offset += do_now;
393 cond_resched(); 556 cond_resched();
394 } 557 }
395 558 /* last (or only) page */
559 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
560 p_addr = __bm_map_pidx(b, idx, KM_USER0);
561 for (i = 0; i < last_word; i++)
562 bits += hweight_long(p_addr[i]);
563 p_addr[last_word] &= cpu_to_lel(mask);
564 bits += hweight_long(p_addr[last_word]);
565 /* 32bit arch, may have an unused padding long */
566 if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
567 p_addr[last_word+1] = 0;
568 __bm_unmap(p_addr, KM_USER0);
396 return bits; 569 return bits;
397} 570}
398 571
399static unsigned long bm_count_bits(struct drbd_bitmap *b)
400{
401 return __bm_count_bits(b, 0);
402}
403
404static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
405{
406 return __bm_count_bits(b, 1);
407}
408
409/* offset and len in long words.*/ 572/* offset and len in long words.*/
410static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) 573static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
411{ 574{
412 unsigned long *p_addr, *bm; 575 unsigned long *p_addr, *bm;
576 unsigned int idx;
413 size_t do_now, end; 577 size_t do_now, end;
414 578
415#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
416
417 end = offset + len; 579 end = offset + len;
418 580
419 if (end > b->bm_words) { 581 if (end > b->bm_words) {
@@ -423,15 +585,16 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
423 585
424 while (offset < end) { 586 while (offset < end) {
425 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; 587 do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
426 p_addr = bm_map_paddr(b, offset); 588 idx = bm_word_to_page_idx(b, offset);
589 p_addr = bm_map_pidx(b, idx);
427 bm = p_addr + MLPP(offset); 590 bm = p_addr + MLPP(offset);
428 if (bm+do_now > p_addr + LWPP) { 591 if (bm+do_now > p_addr + LWPP) {
429 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", 592 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
430 p_addr, bm, (int)do_now); 593 p_addr, bm, (int)do_now);
431 break; /* breaks to after catch_oob_access_end() only! */ 594 } else
432 } 595 memset(bm, c, do_now * sizeof(long));
433 memset(bm, c, do_now * sizeof(long));
434 bm_unmap(p_addr); 596 bm_unmap(p_addr);
597 bm_set_page_need_writeout(b->bm_pages[idx]);
435 offset += do_now; 598 offset += do_now;
436 } 599 }
437} 600}
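bm_memset() above chops the word range so that a single memset never crosses a mapped page: do_now is capped at the next multiple of LWPP (longs per page). A small sketch of that chunking, under the assumption of 4k pages (not DRBD code):

#include <stddef.h>

#define LWPP (4096 / sizeof(unsigned long))	/* longs per page, assuming 4k pages */

/* length of the next chunk of [offset, end) that stays within one page */
static size_t chunk_len(size_t offset, size_t end)
{
	size_t page_end = (offset / LWPP + 1) * LWPP;	/* same as ALIGN(offset + 1, LWPP) */

	return (page_end < end ? page_end : end) - offset;
}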
@@ -447,7 +610,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
447int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) 610int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
448{ 611{
449 struct drbd_bitmap *b = mdev->bitmap; 612 struct drbd_bitmap *b = mdev->bitmap;
450 unsigned long bits, words, owords, obits, *p_addr, *bm; 613 unsigned long bits, words, owords, obits;
451 unsigned long want, have, onpages; /* number of pages */ 614 unsigned long want, have, onpages; /* number of pages */
452 struct page **npages, **opages = NULL; 615 struct page **npages, **opages = NULL;
453 int err = 0, growing; 616 int err = 0, growing;
@@ -455,7 +618,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
455 618
456 ERR_IF(!b) return -ENOMEM; 619 ERR_IF(!b) return -ENOMEM;
457 620
458 drbd_bm_lock(mdev, "resize"); 621 drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
459 622
460 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", 623 dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
461 (unsigned long long)capacity); 624 (unsigned long long)capacity);
@@ -463,7 +626,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
463 if (capacity == b->bm_dev_capacity) 626 if (capacity == b->bm_dev_capacity)
464 goto out; 627 goto out;
465 628
466 opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); 629 opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
467 630
468 if (capacity == 0) { 631 if (capacity == 0) {
469 spin_lock_irq(&b->bm_lock); 632 spin_lock_irq(&b->bm_lock);
@@ -491,18 +654,23 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
491 words = ALIGN(bits, 64) >> LN2_BPL; 654 words = ALIGN(bits, 64) >> LN2_BPL;
492 655
493 if (get_ldev(mdev)) { 656 if (get_ldev(mdev)) {
494 D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); 657 u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
495 put_ldev(mdev); 658 put_ldev(mdev);
659 if (bits > bits_on_disk) {
660 dev_info(DEV, "bits = %lu\n", bits);
661 dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
662 err = -ENOSPC;
663 goto out;
664 }
496 } 665 }
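The capacity check above shifts the available meta-data sectors left by 12 to obtain bits on disk: one 512-byte sector stores 512 * 8 = 4096 = 2^12 bitmap bits. A one-line sanity check of that conversion (illustration only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t bm_sectors = 8;			/* example: 8 on-disk bitmap sectors */
	uint64_t bits_on_disk = bm_sectors << 12;	/* 512 bytes * 8 bits per sector */

	assert(bits_on_disk == bm_sectors * 512 * 8);
	return 0;
}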
497 666
498 /* one extra long to catch off by one errors */ 667 want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
499 want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
500 have = b->bm_number_of_pages; 668 have = b->bm_number_of_pages;
501 if (want == have) { 669 if (want == have) {
502 D_ASSERT(b->bm_pages != NULL); 670 D_ASSERT(b->bm_pages != NULL);
503 npages = b->bm_pages; 671 npages = b->bm_pages;
504 } else { 672 } else {
505 if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) 673 if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
506 npages = NULL; 674 npages = NULL;
507 else 675 else
508 npages = bm_realloc_pages(b, want); 676 npages = bm_realloc_pages(b, want);
@@ -542,11 +710,6 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
542 bm_free_pages(opages + want, have - want); 710 bm_free_pages(opages + want, have - want);
543 } 711 }
544 712
545 p_addr = bm_map_paddr(b, words);
546 bm = p_addr + MLPP(words);
547 *bm = DRBD_MAGIC;
548 bm_unmap(p_addr);
549
550 (void)bm_clear_surplus(b); 713 (void)bm_clear_surplus(b);
551 714
552 spin_unlock_irq(&b->bm_lock); 715 spin_unlock_irq(&b->bm_lock);
@@ -554,7 +717,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
554 bm_vk_free(opages, opages_vmalloced); 717 bm_vk_free(opages, opages_vmalloced);
555 if (!growing) 718 if (!growing)
556 b->bm_set = bm_count_bits(b); 719 b->bm_set = bm_count_bits(b);
557 dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); 720 dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
558 721
559 out: 722 out:
560 drbd_bm_unlock(mdev); 723 drbd_bm_unlock(mdev);
@@ -624,6 +787,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
624 struct drbd_bitmap *b = mdev->bitmap; 787 struct drbd_bitmap *b = mdev->bitmap;
625 unsigned long *p_addr, *bm; 788 unsigned long *p_addr, *bm;
626 unsigned long word, bits; 789 unsigned long word, bits;
790 unsigned int idx;
627 size_t end, do_now; 791 size_t end, do_now;
628 792
629 end = offset + number; 793 end = offset + number;
@@ -638,16 +802,18 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
638 spin_lock_irq(&b->bm_lock); 802 spin_lock_irq(&b->bm_lock);
639 while (offset < end) { 803 while (offset < end) {
640 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 804 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
641 p_addr = bm_map_paddr(b, offset); 805 idx = bm_word_to_page_idx(b, offset);
806 p_addr = bm_map_pidx(b, idx);
642 bm = p_addr + MLPP(offset); 807 bm = p_addr + MLPP(offset);
643 offset += do_now; 808 offset += do_now;
644 while (do_now--) { 809 while (do_now--) {
645 bits = hweight_long(*bm); 810 bits = hweight_long(*bm);
646 word = *bm | lel_to_cpu(*buffer++); 811 word = *bm | *buffer++;
647 *bm++ = word; 812 *bm++ = word;
648 b->bm_set += hweight_long(word) - bits; 813 b->bm_set += hweight_long(word) - bits;
649 } 814 }
650 bm_unmap(p_addr); 815 bm_unmap(p_addr);
816 bm_set_page_need_writeout(b->bm_pages[idx]);
651 } 817 }
652 /* with 32bit <-> 64bit cross-platform connect 818 /* with 32bit <-> 64bit cross-platform connect
653 * this is only correct for current usage, 819 * this is only correct for current usage,
@@ -656,7 +822,6 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
656 */ 822 */
657 if (end == b->bm_words) 823 if (end == b->bm_words)
658 b->bm_set -= bm_clear_surplus(b); 824 b->bm_set -= bm_clear_surplus(b);
659
660 spin_unlock_irq(&b->bm_lock); 825 spin_unlock_irq(&b->bm_lock);
661} 826}
662 827
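For the merge loop above: each received word is ORed into the in-core word, and bm_set is advanced by the difference in population count, so bits that were already set are not counted twice. A user-space sketch of that accounting (illustration; the words are assumed to be in host byte order already, as in the new code):

#include <stddef.h>

/* OR 'n' received words into 'bm'; return how many bits became newly set */
static long merge_words(unsigned long *bm, const unsigned long *recv, size_t n)
{
	long newly_set = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		int before = __builtin_popcountl(bm[i]);

		bm[i] |= recv[i];
		newly_set += __builtin_popcountl(bm[i]) - before;
	}
	return newly_set;	/* the caller adds this to its set-bit counter */
}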
@@ -686,11 +851,11 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
686 else { 851 else {
687 while (offset < end) { 852 while (offset < end) {
688 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; 853 do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
689 p_addr = bm_map_paddr(b, offset); 854 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
690 bm = p_addr + MLPP(offset); 855 bm = p_addr + MLPP(offset);
691 offset += do_now; 856 offset += do_now;
692 while (do_now--) 857 while (do_now--)
693 *buffer++ = cpu_to_lel(*bm++); 858 *buffer++ = *bm++;
694 bm_unmap(p_addr); 859 bm_unmap(p_addr);
695 } 860 }
696 } 861 }
@@ -724,9 +889,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
724 spin_unlock_irq(&b->bm_lock); 889 spin_unlock_irq(&b->bm_lock);
725} 890}
726 891
892struct bm_aio_ctx {
893 struct drbd_conf *mdev;
894 atomic_t in_flight;
895 struct completion done;
896 unsigned flags;
897#define BM_AIO_COPY_PAGES 1
898 int error;
899};
900
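The in_flight/done pair above is the usual submit-and-wait pattern: the counter starts at 1 so no completion can finish the context before all bios are submitted, each submission increments it, each completion decrements it, and the submitter's own final decrement decides whether it still has to wait. A condensed user-space sketch of that counting discipline (C11 atomics as an illustration, not kernel code; the busy-wait stands in for wait_for_completion()):

#include <stdatomic.h>
#include <stdbool.h>

struct aio_ctx {
	atomic_int in_flight;	/* initialized to 1: the submitter's own reference */
	atomic_bool done;	/* stands in for the kernel completion */
};

/* per submitted request, before handing it to the I/O layer */
static void ctx_get(struct aio_ctx *ctx)
{
	atomic_fetch_add(&ctx->in_flight, 1);
}

/* completion path: the last reference signals the waiter */
static void ctx_put(struct aio_ctx *ctx)
{
	if (atomic_fetch_sub(&ctx->in_flight, 1) == 1)
		atomic_store(&ctx->done, true);
}

/* submitter: drop the initial reference, then wait only if requests remain */
static void ctx_submit_done_and_wait(struct aio_ctx *ctx)
{
	if (atomic_fetch_sub(&ctx->in_flight, 1) != 1)
		while (!atomic_load(&ctx->done))
			;	/* the kernel sleeps here instead of spinning */
}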
901/* bv_page may be a copy, or may be the original */
727static void bm_async_io_complete(struct bio *bio, int error) 902static void bm_async_io_complete(struct bio *bio, int error)
728{ 903{
729 struct drbd_bitmap *b = bio->bi_private; 904 struct bm_aio_ctx *ctx = bio->bi_private;
905 struct drbd_conf *mdev = ctx->mdev;
906 struct drbd_bitmap *b = mdev->bitmap;
907 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
730 int uptodate = bio_flagged(bio, BIO_UPTODATE); 908 int uptodate = bio_flagged(bio, BIO_UPTODATE);
731 909
732 910
@@ -737,38 +915,83 @@ static void bm_async_io_complete(struct bio *bio, int error)
737 if (!error && !uptodate) 915 if (!error && !uptodate)
738 error = -EIO; 916 error = -EIO;
739 917
918 if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
919 !bm_test_page_unchanged(b->bm_pages[idx]))
920 dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
921
740 if (error) { 922 if (error) {
741 /* doh. what now? 923 /* ctx error will hold the completed-last non-zero error code,
742 * for now, set all bits, and flag MD_IO_ERROR */ 924 * in case error codes differ. */
743 __set_bit(BM_MD_IO_ERROR, &b->bm_flags); 925 ctx->error = error;
926 bm_set_page_io_err(b->bm_pages[idx]);
927 /* Not identical to on disk version of it.
928 * Is BM_PAGE_IO_ERROR enough? */
929 if (__ratelimit(&drbd_ratelimit_state))
930 dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
931 error, idx);
932 } else {
933 bm_clear_page_io_err(b->bm_pages[idx]);
934 dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
744 } 935 }
745 if (atomic_dec_and_test(&b->bm_async_io)) 936
746 wake_up(&b->bm_io_wait); 937 bm_page_unlock_io(mdev, idx);
938
939 /* FIXME give back to page pool */
940 if (ctx->flags & BM_AIO_COPY_PAGES)
941 put_page(bio->bi_io_vec[0].bv_page);
747 942
748 bio_put(bio); 943 bio_put(bio);
944
945 if (atomic_dec_and_test(&ctx->in_flight))
946 complete(&ctx->done);
749} 947}
750 948
751static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) 949static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
752{ 950{
753 /* we are process context. we always get a bio */ 951 /* we are process context. we always get a bio */
754 struct bio *bio = bio_alloc(GFP_KERNEL, 1); 952 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
953 struct drbd_conf *mdev = ctx->mdev;
954 struct drbd_bitmap *b = mdev->bitmap;
955 struct page *page;
755 unsigned int len; 956 unsigned int len;
957
756 sector_t on_disk_sector = 958 sector_t on_disk_sector =
757 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; 959 mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
758 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); 960 on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
759 961
760 /* this might happen with very small 962 /* this might happen with very small
761 * flexible external meta data device */ 963 * flexible external meta data device,
964 * or with PAGE_SIZE > 4k */
762 len = min_t(unsigned int, PAGE_SIZE, 965 len = min_t(unsigned int, PAGE_SIZE,
763 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); 966 (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
764 967
968 /* serialize IO on this page */
969 bm_page_lock_io(mdev, page_nr);
970 /* before memcpy and submit,
971 * so it can be redirtied any time */
972 bm_set_page_unchanged(b->bm_pages[page_nr]);
973
974 if (ctx->flags & BM_AIO_COPY_PAGES) {
975 /* FIXME alloc_page is good enough for now, but actually needs
976 * to use pre-allocated page pool */
977 void *src, *dest;
978 page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
979 dest = kmap_atomic(page, KM_USER0);
980 src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
981 memcpy(dest, src, PAGE_SIZE);
982 kunmap_atomic(src, KM_USER1);
983 kunmap_atomic(dest, KM_USER0);
984 bm_store_page_idx(page, page_nr);
985 } else
986 page = b->bm_pages[page_nr];
987
765 bio->bi_bdev = mdev->ldev->md_bdev; 988 bio->bi_bdev = mdev->ldev->md_bdev;
766 bio->bi_sector = on_disk_sector; 989 bio->bi_sector = on_disk_sector;
767 bio_add_page(bio, b->bm_pages[page_nr], len, 0); 990 bio_add_page(bio, page, len, 0);
768 bio->bi_private = b; 991 bio->bi_private = ctx;
769 bio->bi_end_io = bm_async_io_complete; 992 bio->bi_end_io = bm_async_io_complete;
770 993
771 if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { 994 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
772 bio->bi_rw |= rw; 995 bio->bi_rw |= rw;
773 bio_endio(bio, -EIO); 996 bio_endio(bio, -EIO);
774 } else { 997 } else {
@@ -776,87 +999,84 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
776 } 999 }
777} 1000}
778 1001
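On the BM_AIO_COPY_PAGES branch above, the page content is snapshotted before submission so the live bitmap page may be redirtied while the write is in flight; the copy, not the original, is what goes to disk. A tiny user-space sketch of the idea (malloc/memcpy stand in for the page allocation and the kmap_atomic copy):

#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096	/* assumed page size for the sketch */

/* return a private snapshot of 'live' to hand to the I/O layer, or NULL */
static void *snapshot_for_io(const void *live)
{
	void *copy = malloc(PAGE_SZ);

	if (copy)
		memcpy(copy, live, PAGE_SZ);
	/* the caller submits 'copy'; 'live' may legitimately change again
	 * right away, which is the point of copying before submission */
	return copy;
}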
779# if defined(__LITTLE_ENDIAN)
780 /* nothing to do, on disk == in memory */
781# define bm_cpu_to_lel(x) ((void)0)
782# else
783static void bm_cpu_to_lel(struct drbd_bitmap *b)
784{
785 /* need to cpu_to_lel all the pages ...
786 * this may be optimized by using
787 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
788 * the following is still not optimal, but better than nothing */
789 unsigned int i;
790 unsigned long *p_addr, *bm;
791 if (b->bm_set == 0) {
792 /* no page at all; avoid swap if all is 0 */
793 i = b->bm_number_of_pages;
794 } else if (b->bm_set == b->bm_bits) {
795 /* only the last page */
796 i = b->bm_number_of_pages - 1;
797 } else {
798 /* all pages */
799 i = 0;
800 }
801 for (; i < b->bm_number_of_pages; i++) {
802 p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
803 for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
804 *bm = cpu_to_lel(*bm);
805 kunmap_atomic(p_addr, KM_USER0);
806 }
807}
808# endif
809/* lel_to_cpu == cpu_to_lel */
810# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
811
812/* 1002/*
813 * bm_rw: read/write the whole bitmap from/to its on disk location. 1003 * bm_rw: read/write the whole bitmap from/to its on disk location.
814 */ 1004 */
815static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) 1005static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
816{ 1006{
1007 struct bm_aio_ctx ctx = {
1008 .mdev = mdev,
1009 .in_flight = ATOMIC_INIT(1),
1010 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1011 .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
1012 };
817 struct drbd_bitmap *b = mdev->bitmap; 1013 struct drbd_bitmap *b = mdev->bitmap;
818 /* sector_t sector; */ 1014 int num_pages, i, count = 0;
819 int bm_words, num_pages, i;
820 unsigned long now; 1015 unsigned long now;
821 char ppb[10]; 1016 char ppb[10];
822 int err = 0; 1017 int err = 0;
823 1018
824 WARN_ON(!bm_is_locked(b)); 1019 /*
825 1020 * We are protected against bitmap disappearing/resizing by holding an
826 /* no spinlock here, the drbd_bm_lock should be enough! */ 1021 * ldev reference (caller must have called get_ldev()).
827 1022 * For read/write, we are protected against changes to the bitmap by
828 bm_words = drbd_bm_words(mdev); 1023 * the bitmap lock (see drbd_bitmap_io).
829 num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; 1024 * For lazy writeout, we don't care for ongoing changes to the bitmap,
1025 * as we submit copies of pages anyways.
1026 */
1027 if (!ctx.flags)
1028 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
830 1029
831 /* on disk bitmap is little endian */ 1030 num_pages = b->bm_number_of_pages;
832 if (rw == WRITE)
833 bm_cpu_to_lel(b);
834 1031
835 now = jiffies; 1032 now = jiffies;
836 atomic_set(&b->bm_async_io, num_pages);
837 __clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
838 1033
839 /* let the layers below us try to merge these bios... */ 1034 /* let the layers below us try to merge these bios... */
840 for (i = 0; i < num_pages; i++) 1035 for (i = 0; i < num_pages; i++) {
841 bm_page_io_async(mdev, b, i, rw); 1036 /* ignore completely unchanged pages */
1037 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1038 break;
1039 if (rw & WRITE) {
1040 if (bm_test_page_unchanged(b->bm_pages[i])) {
1041 dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
1042 continue;
1043 }
1044 /* during lazy writeout,
1045 * ignore those pages not marked for lazy writeout. */
1046 if (lazy_writeout_upper_idx &&
1047 !bm_test_page_lazy_writeout(b->bm_pages[i])) {
1048 dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
1049 continue;
1050 }
1051 }
1052 atomic_inc(&ctx.in_flight);
1053 bm_page_io_async(&ctx, i, rw);
1054 ++count;
1055 cond_resched();
1056 }
842 1057
843 wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); 1058 /*
1059 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
1060 * will not complete() early, and decrement / test it here. If there
1061 * are still some bios in flight, we need to wait for them here.
1062 */
1063 if (!atomic_dec_and_test(&ctx.in_flight))
1064 wait_for_completion(&ctx.done);
1065 dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
1066 rw == WRITE ? "WRITE" : "READ",
1067 count, jiffies - now);
844 1068
845 if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { 1069 if (ctx.error) {
846 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); 1070 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
847 drbd_chk_io_error(mdev, 1, TRUE); 1071 drbd_chk_io_error(mdev, 1, true);
848 err = -EIO; 1072 err = -EIO; /* ctx.error ? */
849 } 1073 }
850 1074
851 now = jiffies; 1075 now = jiffies;
852 if (rw == WRITE) { 1076 if (rw == WRITE) {
853 /* swap back endianness */
854 bm_lel_to_cpu(b);
855 /* flush bitmap to stable storage */
856 drbd_md_flush(mdev); 1077 drbd_md_flush(mdev);
857 } else /* rw == READ */ { 1078 } else /* rw == READ */ {
858 /* just read, if necessary adjust endianness */ 1079 b->bm_set = bm_count_bits(b);
859 b->bm_set = bm_count_bits_swap_endian(b);
860 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", 1080 dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
861 jiffies - now); 1081 jiffies - now);
862 } 1082 }
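The write-side submit loop above makes a per-page decision: completely unchanged pages are skipped, and during a lazy write-out only pages at an index below upper_idx that are actually marked for lazy write-out get submitted. A condensed sketch of that predicate (the two booleans stand in for the bm_test_page_unchanged / bm_test_page_lazy_writeout accessors; not DRBD code):

#include <stdbool.h>

struct page_state {
	bool unchanged;		/* not touched since the last write-out */
	bool lazy_writeout;	/* explicitly marked for lazy write-out */
};

/* should page 'idx' be submitted for WRITE in this run? */
static bool should_write(const struct page_state *p,
			 unsigned int idx, unsigned int lazy_upper_idx)
{
	if (lazy_upper_idx && idx >= lazy_upper_idx)
		return false;		/* the loop stops at the upper index */
	if (p->unchanged)
		return false;		/* nothing new on this page */
	if (lazy_upper_idx && !p->lazy_writeout)
		return false;		/* lazy run: only marked pages */
	return true;
}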
@@ -874,112 +1094,128 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
874 */ 1094 */
875int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) 1095int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
876{ 1096{
877 return bm_rw(mdev, READ); 1097 return bm_rw(mdev, READ, 0);
878} 1098}
879 1099
880/** 1100/**
881 * drbd_bm_write() - Write the whole bitmap to its on disk location. 1101 * drbd_bm_write() - Write the whole bitmap to its on disk location.
882 * @mdev: DRBD device. 1102 * @mdev: DRBD device.
1103 *
1104 * Will only write pages that have changed since last IO.
883 */ 1105 */
884int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) 1106int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
885{ 1107{
886 return bm_rw(mdev, WRITE); 1108 return bm_rw(mdev, WRITE, 0);
887} 1109}
888 1110
889/** 1111/**
890 * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap 1112 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
891 * @mdev: DRBD device. 1113 * @mdev: DRBD device.
892 * @enr: Extent number in the resync lru (happens to be sector offset) 1114 * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
893 *
894 * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
895 * by a single sector write. Therefore enr == sector offset from the
896 * start of the bitmap.
897 */ 1115 */
898int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) 1116int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
899{ 1117{
900 sector_t on_disk_sector = enr + mdev->ldev->md.md_offset 1118 return bm_rw(mdev, WRITE, upper_idx);
901 + mdev->ldev->md.bm_offset; 1119}
902 int bm_words, num_words, offset; 1120
903 int err = 0;
904 1121
905 mutex_lock(&mdev->md_io_mutex); 1122/**
906 bm_words = drbd_bm_words(mdev); 1123 * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
907 offset = S2W(enr); /* word offset into bitmap */ 1124 * @mdev: DRBD device.
908 num_words = min(S2W(1), bm_words - offset); 1125 * @idx: bitmap page index
909 if (num_words < S2W(1)) 1126 *
910 memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); 1127 * We don't want to special case on logical_block_size of the backend device,
911 drbd_bm_get_lel(mdev, offset, num_words, 1128 * so we submit PAGE_SIZE aligned pieces.
912 page_address(mdev->md_io_page)); 1129 * Note that on "most" systems, PAGE_SIZE is 4k.
913 if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { 1130 *
914 int i; 1131 * In case this becomes an issue on systems with larger PAGE_SIZE,
915 err = -EIO; 1132 * we may want to change this again to write 4k aligned 4k pieces.
916 dev_err(DEV, "IO ERROR writing bitmap sector %lu " 1133 */
917 "(meta-disk sector %llus)\n", 1134int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
918 enr, (unsigned long long)on_disk_sector); 1135{
919 drbd_chk_io_error(mdev, 1, TRUE); 1136 struct bm_aio_ctx ctx = {
920 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) 1137 .mdev = mdev,
921 drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); 1138 .in_flight = ATOMIC_INIT(1),
1139 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1140 .flags = BM_AIO_COPY_PAGES,
1141 };
1142
1143 if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
1144 dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
1145 return 0;
922 } 1146 }
1147
1148 bm_page_io_async(&ctx, idx, WRITE_SYNC);
1149 wait_for_completion(&ctx.done);
1150
1151 if (ctx.error)
1152 drbd_chk_io_error(mdev, 1, true);
1153 /* that should force detach, so the in memory bitmap will be
1154 * gone in a moment as well. */
1155
923 mdev->bm_writ_cnt++; 1156 mdev->bm_writ_cnt++;
924 mutex_unlock(&mdev->md_io_mutex); 1157 return ctx.error;
925 return err;
926} 1158}
927 1159
928/* NOTE 1160/* NOTE
929 * find_first_bit returns int, we return unsigned long. 1161 * find_first_bit returns int, we return unsigned long.
930 * should not make much difference anyways, but ... 1162 * For this to work on 32bit arch with bitnumbers > (1<<32),
1163 * we'd need to return u64, and get a whole lot of other places
1164 * fixed where we still use unsigned long.
931 * 1165 *
932 * this returns a bit number, NOT a sector! 1166 * this returns a bit number, NOT a sector!
933 */ 1167 */
934#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
935static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, 1168static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
936 const int find_zero_bit, const enum km_type km) 1169 const int find_zero_bit, const enum km_type km)
937{ 1170{
938 struct drbd_bitmap *b = mdev->bitmap; 1171 struct drbd_bitmap *b = mdev->bitmap;
939 unsigned long i = -1UL;
940 unsigned long *p_addr; 1172 unsigned long *p_addr;
941 unsigned long bit_offset; /* bit offset of the mapped page. */ 1173 unsigned long bit_offset;
1174 unsigned i;
1175
942 1176
943 if (bm_fo > b->bm_bits) { 1177 if (bm_fo > b->bm_bits) {
944 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); 1178 dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
1179 bm_fo = DRBD_END_OF_BITMAP;
945 } else { 1180 } else {
946 while (bm_fo < b->bm_bits) { 1181 while (bm_fo < b->bm_bits) {
947 unsigned long offset; 1182 /* bit offset of the first bit in the page */
948 bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ 1183 bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
949 offset = bit_offset >> LN2_BPL; /* word offset of the page */ 1184 p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
950 p_addr = __bm_map_paddr(b, offset, km);
951 1185
952 if (find_zero_bit) 1186 if (find_zero_bit)
953 i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 1187 i = generic_find_next_zero_le_bit(p_addr,
1188 PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
954 else 1189 else
955 i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); 1190 i = generic_find_next_le_bit(p_addr,
1191 PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
956 1192
957 __bm_unmap(p_addr, km); 1193 __bm_unmap(p_addr, km);
958 if (i < PAGE_SIZE*8) { 1194 if (i < PAGE_SIZE*8) {
959 i = bit_offset + i; 1195 bm_fo = bit_offset + i;
960 if (i >= b->bm_bits) 1196 if (bm_fo >= b->bm_bits)
961 break; 1197 break;
962 goto found; 1198 goto found;
963 } 1199 }
964 bm_fo = bit_offset + PAGE_SIZE*8; 1200 bm_fo = bit_offset + PAGE_SIZE*8;
965 } 1201 }
966 i = -1UL; 1202 bm_fo = DRBD_END_OF_BITMAP;
967 } 1203 }
968 found: 1204 found:
969 return i; 1205 return bm_fo;
970} 1206}
971 1207
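The search above proceeds one page at a time: it scans within the page that holds bm_fo and, on a miss, restarts at the first bit of the next page until the end of the bitmap, returning DRBD_END_OF_BITMAP when nothing is found. A flat-array sketch of the same loop shape (byte-wise little-endian bit order stands in for the generic little-endian bitops; 4k pages assumed):

#define BITS_PER_PAGE (4096UL * 8)
#define END_OF_BITMAP (~0UL)

/* next set bit at or after 'from' in a bitmap of 'nbits' bits, page by page */
static unsigned long find_next_set(const unsigned char *bitmap,
				   unsigned long nbits, unsigned long from)
{
	while (from < nbits) {
		unsigned long end = (from & ~(BITS_PER_PAGE - 1)) + BITS_PER_PAGE;
		unsigned long bit;

		if (end > nbits)
			end = nbits;
		for (bit = from; bit < end; bit++)
			if (bitmap[bit / 8] & (1u << (bit % 8)))
				return bit;	/* hit within this page */
		from = end;			/* miss: continue on the next page */
	}
	return END_OF_BITMAP;
}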
972static unsigned long bm_find_next(struct drbd_conf *mdev, 1208static unsigned long bm_find_next(struct drbd_conf *mdev,
973 unsigned long bm_fo, const int find_zero_bit) 1209 unsigned long bm_fo, const int find_zero_bit)
974{ 1210{
975 struct drbd_bitmap *b = mdev->bitmap; 1211 struct drbd_bitmap *b = mdev->bitmap;
976 unsigned long i = -1UL; 1212 unsigned long i = DRBD_END_OF_BITMAP;
977 1213
978 ERR_IF(!b) return i; 1214 ERR_IF(!b) return i;
979 ERR_IF(!b->bm_pages) return i; 1215 ERR_IF(!b->bm_pages) return i;
980 1216
981 spin_lock_irq(&b->bm_lock); 1217 spin_lock_irq(&b->bm_lock);
982 if (bm_is_locked(b)) 1218 if (BM_DONT_TEST & b->bm_flags)
983 bm_print_lock_info(mdev); 1219 bm_print_lock_info(mdev);
984 1220
985 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); 1221 i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
@@ -1005,13 +1241,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
1005 * you must take drbd_bm_lock() first */ 1241 * you must take drbd_bm_lock() first */
1006unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) 1242unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1007{ 1243{
1008 /* WARN_ON(!bm_is_locked(mdev)); */ 1244 /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1009 return __bm_find_next(mdev, bm_fo, 0, KM_USER1); 1245 return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1010} 1246}
1011 1247
1012unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) 1248unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1013{ 1249{
1014 /* WARN_ON(!bm_is_locked(mdev)); */ 1250 /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1015 return __bm_find_next(mdev, bm_fo, 1, KM_USER1); 1251 return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1016} 1252}
1017 1253
@@ -1027,8 +1263,9 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1027 struct drbd_bitmap *b = mdev->bitmap; 1263 struct drbd_bitmap *b = mdev->bitmap;
1028 unsigned long *p_addr = NULL; 1264 unsigned long *p_addr = NULL;
1029 unsigned long bitnr; 1265 unsigned long bitnr;
1030 unsigned long last_page_nr = -1UL; 1266 unsigned int last_page_nr = -1U;
1031 int c = 0; 1267 int c = 0;
1268 int changed_total = 0;
1032 1269
1033 if (e >= b->bm_bits) { 1270 if (e >= b->bm_bits) {
1034 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", 1271 dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1036,23 +1273,33 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1036 e = b->bm_bits ? b->bm_bits -1 : 0; 1273 e = b->bm_bits ? b->bm_bits -1 : 0;
1037 } 1274 }
1038 for (bitnr = s; bitnr <= e; bitnr++) { 1275 for (bitnr = s; bitnr <= e; bitnr++) {
1039 unsigned long offset = bitnr>>LN2_BPL; 1276 unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
1040 unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
1041 if (page_nr != last_page_nr) { 1277 if (page_nr != last_page_nr) {
1042 if (p_addr) 1278 if (p_addr)
1043 __bm_unmap(p_addr, km); 1279 __bm_unmap(p_addr, km);
1044 p_addr = __bm_map_paddr(b, offset, km); 1280 if (c < 0)
1281 bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1282 else if (c > 0)
1283 bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1284 changed_total += c;
1285 c = 0;
1286 p_addr = __bm_map_pidx(b, page_nr, km);
1045 last_page_nr = page_nr; 1287 last_page_nr = page_nr;
1046 } 1288 }
1047 if (val) 1289 if (val)
1048 c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); 1290 c += (0 == generic___test_and_set_le_bit(bitnr & BITS_PER_PAGE_MASK, p_addr));
1049 else 1291 else
1050 c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); 1292 c -= (0 != generic___test_and_clear_le_bit(bitnr & BITS_PER_PAGE_MASK, p_addr));
1051 } 1293 }
1052 if (p_addr) 1294 if (p_addr)
1053 __bm_unmap(p_addr, km); 1295 __bm_unmap(p_addr, km);
1054 b->bm_set += c; 1296 if (c < 0)
1055 return c; 1297 bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1298 else if (c > 0)
1299 bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1300 changed_total += c;
1301 b->bm_set += changed_total;
1302 return changed_total;
1056} 1303}
1057 1304
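The per-page counter c above also decides how a page is flagged once the loop leaves it: a net clear only needs a lazy write-out, while a net set must be marked as needing write-out promptly. In isolation the decision looks like this (illustration; the enum and helper are invented for the sketch):

/* how to flag a bitmap page after 'net_change' bits changed on it */
enum page_mark { MARK_NONE, MARK_LAZY_WRITEOUT, MARK_NEED_WRITEOUT };

static enum page_mark mark_for(int net_change)
{
	if (net_change < 0)
		return MARK_LAZY_WRITEOUT;	/* only cleared bits: lazy is enough */
	if (net_change > 0)
		return MARK_NEED_WRITEOUT;	/* newly set bits: write out soon */
	return MARK_NONE;			/* page effectively unchanged */
}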
1058/* returns number of bits actually changed. 1305/* returns number of bits actually changed.
@@ -1070,7 +1317,7 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1070 ERR_IF(!b->bm_pages) return 0; 1317 ERR_IF(!b->bm_pages) return 0;
1071 1318
1072 spin_lock_irqsave(&b->bm_lock, flags); 1319 spin_lock_irqsave(&b->bm_lock, flags);
1073 if (bm_is_locked(b)) 1320 if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
1074 bm_print_lock_info(mdev); 1321 bm_print_lock_info(mdev);
1075 1322
1076 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); 1323 c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
@@ -1187,12 +1434,11 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1187 ERR_IF(!b->bm_pages) return 0; 1434 ERR_IF(!b->bm_pages) return 0;
1188 1435
1189 spin_lock_irqsave(&b->bm_lock, flags); 1436 spin_lock_irqsave(&b->bm_lock, flags);
1190 if (bm_is_locked(b)) 1437 if (BM_DONT_TEST & b->bm_flags)
1191 bm_print_lock_info(mdev); 1438 bm_print_lock_info(mdev);
1192 if (bitnr < b->bm_bits) { 1439 if (bitnr < b->bm_bits) {
1193 unsigned long offset = bitnr>>LN2_BPL; 1440 p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
1194 p_addr = bm_map_paddr(b, offset); 1441 i = generic_test_le_bit(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
1195 i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
1196 bm_unmap(p_addr); 1442 bm_unmap(p_addr);
1197 } else if (bitnr == b->bm_bits) { 1443 } else if (bitnr == b->bm_bits) {
1198 i = -1; 1444 i = -1;
@@ -1210,10 +1456,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1210{ 1456{
1211 unsigned long flags; 1457 unsigned long flags;
1212 struct drbd_bitmap *b = mdev->bitmap; 1458 struct drbd_bitmap *b = mdev->bitmap;
1213 unsigned long *p_addr = NULL, page_nr = -1; 1459 unsigned long *p_addr = NULL;
1214 unsigned long bitnr; 1460 unsigned long bitnr;
1461 unsigned int page_nr = -1U;
1215 int c = 0; 1462 int c = 0;
1216 size_t w;
1217 1463
1218 /* If this is called without a bitmap, that is a bug. But just to be 1464 /* If this is called without a bitmap, that is a bug. But just to be
1219 * robust in case we screwed up elsewhere, in that case pretend there 1465 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1223,20 +1469,20 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
1223 ERR_IF(!b->bm_pages) return 1; 1469 ERR_IF(!b->bm_pages) return 1;
1224 1470
1225 spin_lock_irqsave(&b->bm_lock, flags); 1471 spin_lock_irqsave(&b->bm_lock, flags);
1226 if (bm_is_locked(b)) 1472 if (BM_DONT_TEST & b->bm_flags)
1227 bm_print_lock_info(mdev); 1473 bm_print_lock_info(mdev);
1228 for (bitnr = s; bitnr <= e; bitnr++) { 1474 for (bitnr = s; bitnr <= e; bitnr++) {
1229 w = bitnr >> LN2_BPL; 1475 unsigned int idx = bm_bit_to_page_idx(b, bitnr);
1230 if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { 1476 if (page_nr != idx) {
1231 page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); 1477 page_nr = idx;
1232 if (p_addr) 1478 if (p_addr)
1233 bm_unmap(p_addr); 1479 bm_unmap(p_addr);
1234 p_addr = bm_map_paddr(b, w); 1480 p_addr = bm_map_pidx(b, idx);
1235 } 1481 }
1236 ERR_IF (bitnr >= b->bm_bits) { 1482 ERR_IF (bitnr >= b->bm_bits) {
1237 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); 1483 dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1238 } else { 1484 } else {
1239 c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); 1485 c += (0 != generic_test_le_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1240 } 1486 }
1241 } 1487 }
1242 if (p_addr) 1488 if (p_addr)
@@ -1271,7 +1517,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1271 ERR_IF(!b->bm_pages) return 0; 1517 ERR_IF(!b->bm_pages) return 0;
1272 1518
1273 spin_lock_irqsave(&b->bm_lock, flags); 1519 spin_lock_irqsave(&b->bm_lock, flags);
1274 if (bm_is_locked(b)) 1520 if (BM_DONT_TEST & b->bm_flags)
1275 bm_print_lock_info(mdev); 1521 bm_print_lock_info(mdev);
1276 1522
1277 s = S2W(enr); 1523 s = S2W(enr);
@@ -1279,7 +1525,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1279 count = 0; 1525 count = 0;
1280 if (s < b->bm_words) { 1526 if (s < b->bm_words) {
1281 int n = e-s; 1527 int n = e-s;
1282 p_addr = bm_map_paddr(b, s); 1528 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1283 bm = p_addr + MLPP(s); 1529 bm = p_addr + MLPP(s);
1284 while (n--) 1530 while (n--)
1285 count += hweight_long(*bm++); 1531 count += hweight_long(*bm++);
@@ -1291,18 +1537,20 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1291 return count; 1537 return count;
1292} 1538}
1293 1539
1294/* set all bits covered by the AL-extent al_enr */ 1540/* Set all bits covered by the AL-extent al_enr.
1541 * Returns number of bits changed. */
1295unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) 1542unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1296{ 1543{
1297 struct drbd_bitmap *b = mdev->bitmap; 1544 struct drbd_bitmap *b = mdev->bitmap;
1298 unsigned long *p_addr, *bm; 1545 unsigned long *p_addr, *bm;
1299 unsigned long weight; 1546 unsigned long weight;
1300 int count, s, e, i, do_now; 1547 unsigned long s, e;
1548 int count, i, do_now;
1301 ERR_IF(!b) return 0; 1549 ERR_IF(!b) return 0;
1302 ERR_IF(!b->bm_pages) return 0; 1550 ERR_IF(!b->bm_pages) return 0;
1303 1551
1304 spin_lock_irq(&b->bm_lock); 1552 spin_lock_irq(&b->bm_lock);
1305 if (bm_is_locked(b)) 1553 if (BM_DONT_SET & b->bm_flags)
1306 bm_print_lock_info(mdev); 1554 bm_print_lock_info(mdev);
1307 weight = b->bm_set; 1555 weight = b->bm_set;
1308 1556
@@ -1314,7 +1562,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1314 count = 0; 1562 count = 0;
1315 if (s < b->bm_words) { 1563 if (s < b->bm_words) {
1316 i = do_now = e-s; 1564 i = do_now = e-s;
1317 p_addr = bm_map_paddr(b, s); 1565 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1318 bm = p_addr + MLPP(s); 1566 bm = p_addr + MLPP(s);
1319 while (i--) { 1567 while (i--) {
1320 count += hweight_long(*bm); 1568 count += hweight_long(*bm);
@@ -1326,7 +1574,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1326 if (e == b->bm_words) 1574 if (e == b->bm_words)
1327 b->bm_set -= bm_clear_surplus(b); 1575 b->bm_set -= bm_clear_surplus(b);
1328 } else { 1576 } else {
1329 dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); 1577 dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
1330 } 1578 }
1331 weight = b->bm_set - weight; 1579 weight = b->bm_set - weight;
1332 spin_unlock_irq(&b->bm_lock); 1580 spin_unlock_irq(&b->bm_lock);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b0bd27dfc1e8..81030d8d654b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -72,13 +72,6 @@ extern int fault_devs;
72extern char usermode_helper[]; 72extern char usermode_helper[];
73 73
74 74
75#ifndef TRUE
76#define TRUE 1
77#endif
78#ifndef FALSE
79#define FALSE 0
80#endif
81
82/* I don't remember why XCPU ... 75/* I don't remember why XCPU ...
83 * This is used to wake the asender, 76 * This is used to wake the asender,
84 * and to interrupt sending the sending task 77 * and to interrupt sending the sending task
@@ -104,6 +97,7 @@ extern char usermode_helper[];
104#define ID_SYNCER (-1ULL) 97#define ID_SYNCER (-1ULL)
105#define ID_VACANT 0 98#define ID_VACANT 0
106#define is_syncer_block_id(id) ((id) == ID_SYNCER) 99#define is_syncer_block_id(id) ((id) == ID_SYNCER)
100#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
107 101
108struct drbd_conf; 102struct drbd_conf;
109 103
@@ -137,20 +131,19 @@ enum {
137 DRBD_FAULT_MAX, 131 DRBD_FAULT_MAX,
138}; 132};
139 133
140#ifdef CONFIG_DRBD_FAULT_INJECTION
141extern unsigned int 134extern unsigned int
142_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); 135_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
136
143static inline int 137static inline int
144drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { 138drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
139#ifdef CONFIG_DRBD_FAULT_INJECTION
145 return fault_rate && 140 return fault_rate &&
146 (enable_faults & (1<<type)) && 141 (enable_faults & (1<<type)) &&
147 _drbd_insert_fault(mdev, type); 142 _drbd_insert_fault(mdev, type);
148}
149#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
150
151#else 143#else
152#define FAULT_ACTIVE(_m, _t) (0) 144 return 0;
153#endif 145#endif
146}
154 147
155/* integer division, round _UP_ to the next integer */ 148/* integer division, round _UP_ to the next integer */
156#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0)) 149#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
@@ -212,8 +205,10 @@ enum drbd_packets {
212 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ 205 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
213 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ 206 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
214 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ 207 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
208 P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
209 P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
215 210
216 P_MAX_CMD = 0x28, 211 P_MAX_CMD = 0x2A,
217 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 212 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
218 P_MAX_OPT_CMD = 0x101, 213 P_MAX_OPT_CMD = 0x101,
219 214
@@ -269,6 +264,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
269 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", 264 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
270 [P_COMPRESSED_BITMAP] = "CBitmap", 265 [P_COMPRESSED_BITMAP] = "CBitmap",
271 [P_DELAY_PROBE] = "DelayProbe", 266 [P_DELAY_PROBE] = "DelayProbe",
267 [P_OUT_OF_SYNC] = "OutOfSync",
272 [P_MAX_CMD] = NULL, 268 [P_MAX_CMD] = NULL,
273 }; 269 };
274 270
@@ -512,7 +508,7 @@ struct p_sizes {
512 u64 d_size; /* size of disk */ 508 u64 d_size; /* size of disk */
513 u64 u_size; /* user requested size */ 509 u64 u_size; /* user requested size */
514 u64 c_size; /* current exported size */ 510 u64 c_size; /* current exported size */
515 u32 max_segment_size; /* Maximal size of a BIO */ 511 u32 max_bio_size; /* Maximal size of a BIO */
516 u16 queue_order_type; /* not yet implemented in DRBD*/ 512 u16 queue_order_type; /* not yet implemented in DRBD*/
517 u16 dds_flags; /* use enum dds_flags here. */ 513 u16 dds_flags; /* use enum dds_flags here. */
518} __packed; 514} __packed;
@@ -550,6 +546,13 @@ struct p_discard {
550 u32 pad; 546 u32 pad;
551} __packed; 547} __packed;
552 548
549struct p_block_desc {
550 struct p_header80 head;
551 u64 sector;
552 u32 blksize;
553 u32 pad; /* to multiple of 8 Byte */
554} __packed;
555
553/* Valid values for the encoding field. 556/* Valid values for the encoding field.
554 * Bump proto version when changing this. */ 557 * Bump proto version when changing this. */
555enum drbd_bitmap_code { 558enum drbd_bitmap_code {
@@ -647,6 +650,7 @@ union p_polymorph {
647 struct p_block_req block_req; 650 struct p_block_req block_req;
648 struct p_delay_probe93 delay_probe93; 651 struct p_delay_probe93 delay_probe93;
649 struct p_rs_uuid rs_uuid; 652 struct p_rs_uuid rs_uuid;
653 struct p_block_desc block_desc;
650} __packed; 654} __packed;
651 655
652/**********************************************************************/ 656/**********************************************************************/
@@ -677,13 +681,6 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
677 return thi->t_state; 681 return thi->t_state;
678} 682}
679 683
680
681/*
682 * Having this as the first member of a struct provides sort of "inheritance".
683 * "derived" structs can be "drbd_queue_work()"ed.
684 * The callback should know and cast back to the descendant struct.
685 * drbd_request and drbd_epoch_entry are descendants of drbd_work.
686 */
687struct drbd_work; 684struct drbd_work;
688typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); 685typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
689struct drbd_work { 686struct drbd_work {
@@ -712,9 +709,6 @@ struct drbd_request {
712 * starting a new epoch... 709 * starting a new epoch...
713 */ 710 */
714 711
715 /* up to here, the struct layout is identical to drbd_epoch_entry;
716 * we might be able to use that to our advantage... */
717
718 struct list_head tl_requests; /* ring list in the transfer log */ 712 struct list_head tl_requests; /* ring list in the transfer log */
719 struct bio *master_bio; /* master bio pointer */ 713 struct bio *master_bio; /* master bio pointer */
720 unsigned long rq_state; /* see comments above _req_mod() */ 714 unsigned long rq_state; /* see comments above _req_mod() */
@@ -831,7 +825,7 @@ enum {
831 CRASHED_PRIMARY, /* This node was a crashed primary. 825 CRASHED_PRIMARY, /* This node was a crashed primary.
832 * Gets cleared when the state.conn 826 * Gets cleared when the state.conn
833 * goes into C_CONNECTED state. */ 827 * goes into C_CONNECTED state. */
834 WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ 828 NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
835 CONSIDER_RESYNC, 829 CONSIDER_RESYNC,
836 830
837 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 831 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
@@ -856,10 +850,37 @@ enum {
856 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */ 850 GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
857 NEW_CUR_UUID, /* Create new current UUID when thawing IO */ 851 NEW_CUR_UUID, /* Create new current UUID when thawing IO */
858 AL_SUSPENDED, /* Activity logging is currently suspended. */ 852 AL_SUSPENDED, /* Activity logging is currently suspended. */
853 AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
859}; 854};
860 855
861struct drbd_bitmap; /* opaque for drbd_conf */ 856struct drbd_bitmap; /* opaque for drbd_conf */
862 857
858/* definition of bits in bm_flags to be used in drbd_bm_lock
859 * and drbd_bitmap_io and friends. */
860enum bm_flag {
861 /* do we need to kfree, or vfree bm_pages? */
862 BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
863
864 /* currently locked for bulk operation */
865 BM_LOCKED_MASK = 0x7,
866
867 /* in detail, that is: */
868 BM_DONT_CLEAR = 0x1,
869 BM_DONT_SET = 0x2,
870 BM_DONT_TEST = 0x4,
871
872 /* (test bit, count bit) allowed (common case) */
873 BM_LOCKED_TEST_ALLOWED = 0x3,
874
875 /* testing bits, as well as setting new bits allowed, but clearing bits
876 * would be unexpected. Used during bitmap receive. Setting new bits
877 * requires sending of "out-of-sync" information, though. */
878 BM_LOCKED_SET_ALLOWED = 0x1,
879
880 /* clear is not expected while bitmap is locked for bulk operation */
881};
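These flags are handed to drbd_bm_lock() (see the changed prototype further below) to state which bitmap operations remain legal during the bulk operation. A hypothetical call site, sketched from the comments above; the function name and error handling are invented, the snippet is not standalone compilable, and it assumes the caller already holds an ldev reference:

/* hypothetical: bulk read-out where testing/counting bits stays allowed,
 * but setting or clearing bits while locked would be a bug */
static int example_bulk_readout(struct drbd_conf *mdev)
{
	int err;

	drbd_bm_lock(mdev, "example bulk readout", BM_LOCKED_TEST_ALLOWED);
	err = drbd_bm_read(mdev);	/* the bulk operation itself */
	drbd_bm_unlock(mdev);
	return err;
}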
882
883
863/* TODO sort members for performance 884/* TODO sort members for performance
864 * MAYBE group them further */ 885 * MAYBE group them further */
865 886
@@ -925,6 +946,7 @@ struct drbd_md_io {
925struct bm_io_work { 946struct bm_io_work {
926 struct drbd_work w; 947 struct drbd_work w;
927 char *why; 948 char *why;
949 enum bm_flag flags;
928 int (*io_fn)(struct drbd_conf *mdev); 950 int (*io_fn)(struct drbd_conf *mdev);
929 void (*done)(struct drbd_conf *mdev, int rv); 951 void (*done)(struct drbd_conf *mdev, int rv);
930}; 952};
@@ -963,9 +985,12 @@ struct drbd_conf {
963 struct drbd_work resync_work, 985 struct drbd_work resync_work,
964 unplug_work, 986 unplug_work,
965 go_diskless, 987 go_diskless,
966 md_sync_work; 988 md_sync_work,
989 start_resync_work;
967 struct timer_list resync_timer; 990 struct timer_list resync_timer;
968 struct timer_list md_sync_timer; 991 struct timer_list md_sync_timer;
992 struct timer_list start_resync_timer;
993 struct timer_list request_timer;
969#ifdef DRBD_DEBUG_MD_SYNC 994#ifdef DRBD_DEBUG_MD_SYNC
970 struct { 995 struct {
971 unsigned int line; 996 unsigned int line;
@@ -1000,9 +1025,9 @@ struct drbd_conf {
1000 struct hlist_head *tl_hash; 1025 struct hlist_head *tl_hash;
1001 unsigned int tl_hash_s; 1026 unsigned int tl_hash_s;
1002 1027
1003 /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ 1028 /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
1004 unsigned long rs_total; 1029 unsigned long rs_total;
1005 /* number of sync IOs that failed in this run */ 1030 /* number of resync blocks that failed in this run */
1006 unsigned long rs_failed; 1031 unsigned long rs_failed;
1007 /* Syncer's start time [unit jiffies] */ 1032 /* Syncer's start time [unit jiffies] */
1008 unsigned long rs_start; 1033 unsigned long rs_start;
@@ -1102,6 +1127,7 @@ struct drbd_conf {
1102 struct fifo_buffer rs_plan_s; /* correction values of resync planer */ 1127 struct fifo_buffer rs_plan_s; /* correction values of resync planer */
1103 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ 1128 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
1104 int rs_planed; /* resync sectors already planed */ 1129 int rs_planed; /* resync sectors already planed */
1130 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1105}; 1131};
1106 1132
1107static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1133static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1163,14 +1189,19 @@ enum dds_flags {
1163}; 1189};
1164 1190
1165extern void drbd_init_set_defaults(struct drbd_conf *mdev); 1191extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1166extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 1192extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
1167 union drbd_state mask, union drbd_state val); 1193 enum chg_state_flags f,
1194 union drbd_state mask,
1195 union drbd_state val);
1168extern void drbd_force_state(struct drbd_conf *, union drbd_state, 1196extern void drbd_force_state(struct drbd_conf *, union drbd_state,
1169 union drbd_state); 1197 union drbd_state);
1170extern int _drbd_request_state(struct drbd_conf *, union drbd_state, 1198extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
1171 union drbd_state, enum chg_state_flags); 1199 union drbd_state,
1172extern int __drbd_set_state(struct drbd_conf *, union drbd_state, 1200 union drbd_state,
1173 enum chg_state_flags, struct completion *done); 1201 enum chg_state_flags);
1202extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
1203 enum chg_state_flags,
1204 struct completion *done);
1174extern void print_st_err(struct drbd_conf *, union drbd_state, 1205extern void print_st_err(struct drbd_conf *, union drbd_state,
1175 union drbd_state, int); 1206 union drbd_state, int);
1176extern int drbd_thread_start(struct drbd_thread *thi); 1207extern int drbd_thread_start(struct drbd_thread *thi);
@@ -1195,7 +1226,7 @@ extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1195extern int drbd_send_protocol(struct drbd_conf *mdev); 1226extern int drbd_send_protocol(struct drbd_conf *mdev);
1196extern int drbd_send_uuids(struct drbd_conf *mdev); 1227extern int drbd_send_uuids(struct drbd_conf *mdev);
1197extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1228extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1198extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); 1229extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
1199extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); 1230extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
1200extern int _drbd_send_state(struct drbd_conf *mdev); 1231extern int _drbd_send_state(struct drbd_conf *mdev);
1201extern int drbd_send_state(struct drbd_conf *mdev); 1232extern int drbd_send_state(struct drbd_conf *mdev);
@@ -1220,11 +1251,10 @@ extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
1220 struct p_data *dp, int data_size); 1251 struct p_data *dp, int data_size);
1221extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, 1252extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
1222 sector_t sector, int blksize, u64 block_id); 1253 sector_t sector, int blksize, u64 block_id);
1254extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
1223extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, 1255extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
1224 struct drbd_epoch_entry *e); 1256 struct drbd_epoch_entry *e);
1225extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); 1257extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
1226extern int _drbd_send_barrier(struct drbd_conf *mdev,
1227 struct drbd_tl_epoch *barrier);
1228extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, 1258extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1229 sector_t sector, int size, u64 block_id); 1259 sector_t sector, int size, u64 block_id);
1230extern int drbd_send_drequest_csum(struct drbd_conf *mdev, 1260extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
@@ -1235,14 +1265,13 @@ extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size)
1235 1265
1236extern int drbd_send_bitmap(struct drbd_conf *mdev); 1266extern int drbd_send_bitmap(struct drbd_conf *mdev);
1237extern int _drbd_send_bitmap(struct drbd_conf *mdev); 1267extern int _drbd_send_bitmap(struct drbd_conf *mdev);
1238extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); 1268extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
1239extern void drbd_free_bc(struct drbd_backing_dev *ldev); 1269extern void drbd_free_bc(struct drbd_backing_dev *ldev);
1240extern void drbd_mdev_cleanup(struct drbd_conf *mdev); 1270extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1271void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
1241 1272
1242/* drbd_meta-data.c (still in drbd_main.c) */
1243extern void drbd_md_sync(struct drbd_conf *mdev); 1273extern void drbd_md_sync(struct drbd_conf *mdev);
1244extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); 1274extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1245/* maybe define them below as inline? */
1246extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); 1275extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1247extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); 1276extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
1248extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); 1277extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
@@ -1261,10 +1290,12 @@ extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
1261extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, 1290extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
1262 int (*io_fn)(struct drbd_conf *), 1291 int (*io_fn)(struct drbd_conf *),
1263 void (*done)(struct drbd_conf *, int), 1292 void (*done)(struct drbd_conf *, int),
1264 char *why); 1293 char *why, enum bm_flag flags);
1294extern int drbd_bitmap_io(struct drbd_conf *mdev,
1295 int (*io_fn)(struct drbd_conf *),
1296 char *why, enum bm_flag flags);
1265extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); 1297extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1266extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1298extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1267extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
1268extern void drbd_go_diskless(struct drbd_conf *mdev); 1299extern void drbd_go_diskless(struct drbd_conf *mdev);
1269extern void drbd_ldev_destroy(struct drbd_conf *mdev); 1300extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1270 1301
@@ -1313,6 +1344,7 @@ struct bm_extent {
1313 1344
1314#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ 1345#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
1315#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ 1346#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
1347#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
1316 1348
1317/* drbd_bitmap.c */ 1349/* drbd_bitmap.c */
1318/* 1350/*
@@ -1390,7 +1422,9 @@ struct bm_extent {
1390 * you should use 64bit OS for that much storage, anyways. */ 1422 * you should use 64bit OS for that much storage, anyways. */
1391#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) 1423#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
1392#else 1424#else
1393#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32) 1425/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
1426#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
1427/* corresponds to (1UL << 38) bits right now. */
1394#endif 1428#endif
1395#endif 1429#endif
1396 1430
@@ -1398,7 +1432,7 @@ struct bm_extent {
1398 * With a value of 8 all IO in one 128K block make it to the same slot of the 1432 * With a value of 8 all IO in one 128K block make it to the same slot of the
1399 * hash table. */ 1433 * hash table. */
1400#define HT_SHIFT 8 1434#define HT_SHIFT 8
1401#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT)) 1435#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
1402 1436
1403#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ 1437#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
1404 1438
@@ -1410,16 +1444,20 @@ extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new
1410extern void drbd_bm_cleanup(struct drbd_conf *mdev); 1444extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1411extern void drbd_bm_set_all(struct drbd_conf *mdev); 1445extern void drbd_bm_set_all(struct drbd_conf *mdev);
1412extern void drbd_bm_clear_all(struct drbd_conf *mdev); 1446extern void drbd_bm_clear_all(struct drbd_conf *mdev);
1447/* set/clear/test only a few bits at a time */
1413extern int drbd_bm_set_bits( 1448extern int drbd_bm_set_bits(
1414 struct drbd_conf *mdev, unsigned long s, unsigned long e); 1449 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1415extern int drbd_bm_clear_bits( 1450extern int drbd_bm_clear_bits(
1416 struct drbd_conf *mdev, unsigned long s, unsigned long e); 1451 struct drbd_conf *mdev, unsigned long s, unsigned long e);
1417/* bm_set_bits variant for use while holding drbd_bm_lock */ 1452extern int drbd_bm_count_bits(
1453 struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1454/* bm_set_bits variant for use while holding drbd_bm_lock,
1455 * may process the whole bitmap in one go */
1418extern void _drbd_bm_set_bits(struct drbd_conf *mdev, 1456extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
1419 const unsigned long s, const unsigned long e); 1457 const unsigned long s, const unsigned long e);
1420extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); 1458extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
1421extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); 1459extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1422extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); 1460extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
1423extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); 1461extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1424extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); 1462extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1425extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, 1463extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
@@ -1427,6 +1465,8 @@ extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1427extern size_t drbd_bm_words(struct drbd_conf *mdev); 1465extern size_t drbd_bm_words(struct drbd_conf *mdev);
1428extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); 1466extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
1429extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); 1467extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
1468
1469#define DRBD_END_OF_BITMAP (~(unsigned long)0)
1430extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1470extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
1431/* bm_find_next variants for use while you hold drbd_bm_lock() */ 1471/* bm_find_next variants for use while you hold drbd_bm_lock() */
1432extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); 1472extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
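DRBD_END_OF_BITMAP (~0UL) gives drbd_bm_find_next() an explicit sentinel, presumably returned once no further set bit exists. A minimal userspace analogue of the resulting caller loop, using a plain unsigned long array instead of the real bitmap object:

#include <stdio.h>

#define END_OF_BITMAP   (~(unsigned long)0)
#define LBITS           (8 * sizeof(unsigned long))

/* toy find_next: first set bit at or after 'fo', or the sentinel */
static unsigned long find_next(const unsigned long *bm, unsigned long nbits,
                               unsigned long fo)
{
        for (; fo < nbits; fo++)
                if (bm[fo / LBITS] & (1UL << (fo % LBITS)))
                        return fo;
        return END_OF_BITMAP;
}

int main(void)
{
        unsigned long bm[2] = { 0, 0 };
        unsigned long bit;

        bm[0] |= 1UL << 3;
        bm[1] |= 1UL << 1;

        for (bit = find_next(bm, 2 * LBITS, 0);
             bit != END_OF_BITMAP;
             bit = find_next(bm, 2 * LBITS, bit + 1))
                printf("set bit at %lu\n", bit);
        return 0;
}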
@@ -1437,14 +1477,12 @@ extern int drbd_bm_rs_done(struct drbd_conf *mdev);
1437/* for receive_bitmap */ 1477/* for receive_bitmap */
1438extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, 1478extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
1439 size_t number, unsigned long *buffer); 1479 size_t number, unsigned long *buffer);
1440/* for _drbd_send_bitmap and drbd_bm_write_sect */ 1480/* for _drbd_send_bitmap */
1441extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, 1481extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
1442 size_t number, unsigned long *buffer); 1482 size_t number, unsigned long *buffer);
1443 1483
1444extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); 1484extern void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags);
1445extern void drbd_bm_unlock(struct drbd_conf *mdev); 1485extern void drbd_bm_unlock(struct drbd_conf *mdev);
1446
1447extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
1448/* drbd_main.c */ 1486/* drbd_main.c */
1449 1487
1450extern struct kmem_cache *drbd_request_cache; 1488extern struct kmem_cache *drbd_request_cache;
@@ -1467,7 +1505,7 @@ extern void drbd_free_mdev(struct drbd_conf *mdev);
1467extern int proc_details; 1505extern int proc_details;
1468 1506
1469/* drbd_req */ 1507/* drbd_req */
1470extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); 1508extern int drbd_make_request(struct request_queue *q, struct bio *bio);
1471extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); 1509extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
1472extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); 1510extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
1473extern int is_valid_ar_handle(struct drbd_request *, sector_t); 1511extern int is_valid_ar_handle(struct drbd_request *, sector_t);
@@ -1482,8 +1520,9 @@ enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew =
1482extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1520extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1483extern void resync_after_online_grow(struct drbd_conf *); 1521extern void resync_after_online_grow(struct drbd_conf *);
1484extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1522extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1485extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1523extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
1486 int force); 1524 enum drbd_role new_role,
1525 int force);
1487extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); 1526extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
1488extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev); 1527extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
1489extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); 1528extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
@@ -1499,6 +1538,7 @@ extern int drbd_resync_finished(struct drbd_conf *mdev);
1499extern int drbd_md_sync_page_io(struct drbd_conf *mdev, 1538extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1500 struct drbd_backing_dev *bdev, sector_t sector, int rw); 1539 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1501extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); 1540extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1541extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
1502 1542
1503static inline void ov_oos_print(struct drbd_conf *mdev) 1543static inline void ov_oos_print(struct drbd_conf *mdev)
1504{ 1544{
@@ -1522,21 +1562,23 @@ extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
1522extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); 1562extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
1523extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); 1563extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
1524extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); 1564extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
1525extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); 1565extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
1526extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); 1566extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
1527extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); 1567extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
1528extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
1529extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); 1568extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
1530extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); 1569extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
1531extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); 1570extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
1532extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); 1571extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
1533extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); 1572extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1534extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); 1573extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
1574extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
1575extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
1535 1576
1536extern void resync_timer_fn(unsigned long data); 1577extern void resync_timer_fn(unsigned long data);
1578extern void start_resync_timer_fn(unsigned long data);
1537 1579
1538/* drbd_receiver.c */ 1580/* drbd_receiver.c */
1539extern int drbd_rs_should_slow_down(struct drbd_conf *mdev); 1581extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
1540extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, 1582extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1541 const unsigned rw, const int fault_type); 1583 const unsigned rw, const int fault_type);
1542extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1584extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1619,16 +1661,16 @@ extern int drbd_rs_del_all(struct drbd_conf *mdev);
1619extern void drbd_rs_failed_io(struct drbd_conf *mdev, 1661extern void drbd_rs_failed_io(struct drbd_conf *mdev,
1620 sector_t sector, int size); 1662 sector_t sector, int size);
1621extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); 1663extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
1664extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
1622extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, 1665extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
1623 int size, const char *file, const unsigned int line); 1666 int size, const char *file, const unsigned int line);
1624#define drbd_set_in_sync(mdev, sector, size) \ 1667#define drbd_set_in_sync(mdev, sector, size) \
1625 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) 1668 __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
1626extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, 1669extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1627 int size, const char *file, const unsigned int line); 1670 int size, const char *file, const unsigned int line);
1628#define drbd_set_out_of_sync(mdev, sector, size) \ 1671#define drbd_set_out_of_sync(mdev, sector, size) \
1629 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) 1672 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1630extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); 1673extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
1631extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
1632extern void drbd_al_shrink(struct drbd_conf *mdev); 1674extern void drbd_al_shrink(struct drbd_conf *mdev);
1633 1675
1634 1676
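drbd_set_out_of_sync() stays a thin macro so the caller's file and line reach the __drbd_* worker; only the return type changes here. The wrapper pattern itself, in a self-contained form with invented names:

#include <stdio.h>

static int __set_out_of_sync(unsigned long sector, int size,
                             const char *file, unsigned int line)
{
        /* a real implementation would flip bitmap bits; this one just logs */
        fprintf(stderr, "%s:%u: %d bytes at sector %lu marked out of sync\n",
                file, line, size, sector);
        return 1;       /* e.g. "something actually changed" */
}

#define set_out_of_sync(sector, size) \
        __set_out_of_sync(sector, size, __FILE__, __LINE__)

int main(void)
{
        set_out_of_sync(2048UL, 4096);
        return 0;
}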
@@ -1747,11 +1789,11 @@ static inline void drbd_state_unlock(struct drbd_conf *mdev)
1747 wake_up(&mdev->misc_wait); 1789 wake_up(&mdev->misc_wait);
1748} 1790}
1749 1791
1750static inline int _drbd_set_state(struct drbd_conf *mdev, 1792static inline enum drbd_state_rv
1751 union drbd_state ns, enum chg_state_flags flags, 1793_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1752 struct completion *done) 1794 enum chg_state_flags flags, struct completion *done)
1753{ 1795{
1754 int rv; 1796 enum drbd_state_rv rv;
1755 1797
1756 read_lock(&global_state_lock); 1798 read_lock(&global_state_lock);
1757 rv = __drbd_set_state(mdev, ns, flags, done); 1799 rv = __drbd_set_state(mdev, ns, flags, done);
@@ -1982,17 +2024,17 @@ static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
1982 2024
1983static inline void drbd_thread_stop(struct drbd_thread *thi) 2025static inline void drbd_thread_stop(struct drbd_thread *thi)
1984{ 2026{
1985 _drbd_thread_stop(thi, FALSE, TRUE); 2027 _drbd_thread_stop(thi, false, true);
1986} 2028}
1987 2029
1988static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) 2030static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
1989{ 2031{
1990 _drbd_thread_stop(thi, FALSE, FALSE); 2032 _drbd_thread_stop(thi, false, false);
1991} 2033}
1992 2034
1993static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) 2035static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
1994{ 2036{
1995 _drbd_thread_stop(thi, TRUE, FALSE); 2037 _drbd_thread_stop(thi, true, false);
1996} 2038}
1997 2039
1998/* counts how many answer packets we expect from our peer, 2040/* counts how many answer packets we expect from our peer,
@@ -2146,17 +2188,18 @@ extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
2146static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, 2188static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2147 unsigned long *bits_left, unsigned int *per_mil_done) 2189 unsigned long *bits_left, unsigned int *per_mil_done)
2148{ 2190{
2149 /* 2191 /* this is to break it at compile time when we change that, in case we
2150 * this is to break it at compile time when we change that 2192 * want to support more than (1<<32) bits on a 32bit arch. */
2151 * (we may feel 4TB maximum storage per drbd is not enough)
2152 */
2153 typecheck(unsigned long, mdev->rs_total); 2193 typecheck(unsigned long, mdev->rs_total);
2154 2194
2155 /* note: both rs_total and rs_left are in bits, i.e. in 2195 /* note: both rs_total and rs_left are in bits, i.e. in
2156 * units of BM_BLOCK_SIZE. 2196 * units of BM_BLOCK_SIZE.
2157 * for the percentage, we don't care. */ 2197 * for the percentage, we don't care. */
2158 2198
2159 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; 2199 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2200 *bits_left = mdev->ov_left;
2201 else
2202 *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2160 /* >> 10 to prevent overflow, 2203 /* >> 10 to prevent overflow,
2161 * +1 to prevent division by zero */ 2204 * +1 to prevent division by zero */
2162 if (*bits_left > mdev->rs_total) { 2205 if (*bits_left > mdev->rs_total) {
@@ -2171,10 +2214,19 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2171 *bits_left, mdev->rs_total, mdev->rs_failed); 2214 *bits_left, mdev->rs_total, mdev->rs_failed);
2172 *per_mil_done = 0; 2215 *per_mil_done = 0;
2173 } else { 2216 } else {
2174 /* make sure the calculation happens in long context */ 2217 /* Make sure the division happens in long context.
2175 unsigned long tmp = 1000UL - 2218 * We allow up to one petabyte storage right now,
2176 (*bits_left >> 10)*1000UL 2219 * at a granularity of 4k per bit that is 2**38 bits.
2177 / ((mdev->rs_total >> 10) + 1UL); 2220 * After shift right and multiplication by 1000,
2221 * this should still fit easily into a 32bit long,
2222 * so we don't need a 64bit division on 32bit arch.
2223 * Note: currently we don't support such large bitmaps on 32bit
2224 * arch anyways, but no harm done to be prepared for it here.
2225 */
2226 unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
2227 unsigned long left = *bits_left >> shift;
2228 unsigned long total = 1UL + (mdev->rs_total >> shift);
2229 unsigned long tmp = 1000UL - left * 1000UL/total;
2178 *per_mil_done = tmp; 2230 *per_mil_done = tmp;
2179 } 2231 }
2180} 2232}
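The rewritten per-mil computation shifts both counters right before multiplying by 1000, so (per the comment above) the intermediate product still fits a 32bit unsigned long and no 64bit division is needed. A standalone sketch of the same arithmetic with one worked value:

#include <stdio.h>

static unsigned int per_mil_done(unsigned long bits_left, unsigned long rs_total)
{
        unsigned int shift = rs_total >= (1ULL << 32) ? 16 : 10;
        unsigned long left = bits_left >> shift;
        unsigned long total = 1UL + (rs_total >> shift);

        return 1000UL - left * 1000UL / total;
}

int main(void)
{
        /* 1,000,000 bits total, 250,000 still to go: ~75% done, prints 751 */
        printf("%u per mil\n", per_mil_done(250000UL, 1000000UL));
        return 0;
}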
@@ -2193,8 +2245,9 @@ static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
2193 return mxb; 2245 return mxb;
2194} 2246}
2195 2247
2196static inline int drbd_state_is_stable(union drbd_state s) 2248static inline int drbd_state_is_stable(struct drbd_conf *mdev)
2197{ 2249{
2250 union drbd_state s = mdev->state;
2198 2251
2199 /* DO NOT add a default clause, we want the compiler to warn us 2252 /* DO NOT add a default clause, we want the compiler to warn us
2200 * for any newly introduced state we may have forgotten to add here */ 2253 * for any newly introduced state we may have forgotten to add here */
@@ -2211,11 +2264,9 @@ static inline int drbd_state_is_stable(union drbd_state s)
2211 case C_VERIFY_T: 2264 case C_VERIFY_T:
2212 case C_PAUSED_SYNC_S: 2265 case C_PAUSED_SYNC_S:
2213 case C_PAUSED_SYNC_T: 2266 case C_PAUSED_SYNC_T:
2214 /* maybe stable, look at the disk state */ 2267 case C_AHEAD:
2215 break; 2268 case C_BEHIND:
2216 2269 /* transitional states, IO allowed */
2217 /* no new io accepted during tansitional states
2218 * like handshake or teardown */
2219 case C_DISCONNECTING: 2270 case C_DISCONNECTING:
2220 case C_UNCONNECTED: 2271 case C_UNCONNECTED:
2221 case C_TIMEOUT: 2272 case C_TIMEOUT:
@@ -2226,7 +2277,15 @@ static inline int drbd_state_is_stable(union drbd_state s)
2226 case C_WF_REPORT_PARAMS: 2277 case C_WF_REPORT_PARAMS:
2227 case C_STARTING_SYNC_S: 2278 case C_STARTING_SYNC_S:
2228 case C_STARTING_SYNC_T: 2279 case C_STARTING_SYNC_T:
2280 break;
2281
2282 /* Allow IO in BM exchange states with new protocols */
2229 case C_WF_BITMAP_S: 2283 case C_WF_BITMAP_S:
2284 if (mdev->agreed_pro_version < 96)
2285 return 0;
2286 break;
2287
2288 /* no new io accepted in these states */
2230 case C_WF_BITMAP_T: 2289 case C_WF_BITMAP_T:
2231 case C_WF_SYNC_UUID: 2290 case C_WF_SYNC_UUID:
2232 case C_MASK: 2291 case C_MASK:
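The "DO NOT add a default clause" comment above relies on the compiler's -Wswitch diagnostic (part of -Wall): with every enumerator listed and no default, forgetting to handle a newly added connection state produces a warning at build time. A tiny illustration with a made-up enum:

#include <stdio.h>

enum toy_conn { TC_STANDALONE, TC_CONNECTED, TC_SYNC_TARGET };

static int toy_is_stable(enum toy_conn c)
{
        switch (c) {
        case TC_STANDALONE:
        case TC_CONNECTED:
                return 1;
        case TC_SYNC_TARGET:
                return 0;
        /* no default: a new enumerator missing here triggers -Wswitch */
        }
        return 0;
}

int main(void)
{
        printf("stable: %d\n", toy_is_stable(TC_SYNC_TARGET));
        return 0;
}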
@@ -2261,41 +2320,47 @@ static inline int is_susp(union drbd_state s)
2261 return s.susp || s.susp_nod || s.susp_fen; 2320 return s.susp || s.susp_nod || s.susp_fen;
2262} 2321}
2263 2322
2264static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) 2323static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
2265{ 2324{
2266 int mxb = drbd_get_max_buffers(mdev); 2325 int mxb = drbd_get_max_buffers(mdev);
2267 2326
2268 if (is_susp(mdev->state)) 2327 if (is_susp(mdev->state))
2269 return 0; 2328 return false;
2270 if (test_bit(SUSPEND_IO, &mdev->flags)) 2329 if (test_bit(SUSPEND_IO, &mdev->flags))
2271 return 0; 2330 return false;
2272 2331
2273 /* to avoid potential deadlock or bitmap corruption, 2332 /* to avoid potential deadlock or bitmap corruption,
2274 * in various places, we only allow new application io 2333 * in various places, we only allow new application io
2275 * to start during "stable" states. */ 2334 * to start during "stable" states. */
2276 2335
2277 /* no new io accepted when attaching or detaching the disk */ 2336 /* no new io accepted when attaching or detaching the disk */
2278 if (!drbd_state_is_stable(mdev->state)) 2337 if (!drbd_state_is_stable(mdev))
2279 return 0; 2338 return false;
2280 2339
2281 /* since some older kernels don't have atomic_add_unless, 2340 /* since some older kernels don't have atomic_add_unless,
2282 * and we are within the spinlock anyways, we have this workaround. */ 2341 * and we are within the spinlock anyways, we have this workaround. */
2283 if (atomic_read(&mdev->ap_bio_cnt) > mxb) 2342 if (atomic_read(&mdev->ap_bio_cnt) > mxb)
2284 return 0; 2343 return false;
2285 if (test_bit(BITMAP_IO, &mdev->flags)) 2344 if (test_bit(BITMAP_IO, &mdev->flags))
2286 return 0; 2345 return false;
2287 return 1; 2346 return true;
2288} 2347}
2289 2348
2290/* I'd like to use wait_event_lock_irq, 2349static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
2291 * but I'm not sure when it got introduced,
2292 * and not sure when it has 3 or 4 arguments */
2293static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2294{ 2350{
2295 /* compare with after_state_ch, 2351 bool rv = false;
2296 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ 2352
2297 DEFINE_WAIT(wait); 2353 spin_lock_irq(&mdev->req_lock);
2354 rv = may_inc_ap_bio(mdev);
2355 if (rv)
2356 atomic_add(count, &mdev->ap_bio_cnt);
2357 spin_unlock_irq(&mdev->req_lock);
2358
2359 return rv;
2360}
2298 2361
2362static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2363{
2299 /* we wait here 2364 /* we wait here
2300 * as long as the device is suspended 2365 * as long as the device is suspended
2301 * until the bitmap is no longer on the fly during connection 2366 * until the bitmap is no longer on the fly during connection
@@ -2304,16 +2369,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2304 * to avoid races with the reconnect code, 2369 * to avoid races with the reconnect code,
2305 * we need to atomic_inc within the spinlock. */ 2370 * we need to atomic_inc within the spinlock. */
2306 2371
2307 spin_lock_irq(&mdev->req_lock); 2372 wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
2308 while (!__inc_ap_bio_cond(mdev)) {
2309 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
2310 spin_unlock_irq(&mdev->req_lock);
2311 schedule();
2312 finish_wait(&mdev->misc_wait, &wait);
2313 spin_lock_irq(&mdev->req_lock);
2314 }
2315 atomic_add(count, &mdev->ap_bio_cnt);
2316 spin_unlock_irq(&mdev->req_lock);
2317} 2373}
2318 2374
2319static inline void dec_ap_bio(struct drbd_conf *mdev) 2375static inline void dec_ap_bio(struct drbd_conf *mdev)
@@ -2333,9 +2389,11 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
2333 } 2389 }
2334} 2390}
2335 2391
2336static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) 2392static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
2337{ 2393{
2394 int changed = mdev->ed_uuid != val;
2338 mdev->ed_uuid = val; 2395 mdev->ed_uuid = val;
2396 return changed;
2339} 2397}
2340 2398
2341static inline int seq_cmp(u32 a, u32 b) 2399static inline int seq_cmp(u32 a, u32 b)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8a43ce0edeed..dfc85f32d317 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -85,7 +85,8 @@ MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); 85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86MODULE_VERSION(REL_VERSION); 86MODULE_VERSION(REL_VERSION);
87MODULE_LICENSE("GPL"); 87MODULE_LICENSE("GPL");
88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); 88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
89MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); 90MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90 91
91#include <linux/moduleparam.h> 92#include <linux/moduleparam.h>
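The new MODULE_PARM_DESC string is assembled at compile time from DRBD_MINOR_COUNT_MIN/MAX via the kernel's two-level __stringify() macro. The same trick in plain C; the constants below just mirror the old "(1-255)" text for illustration:

#include <stdio.h>

#define __stringify_1(x)        #x
#define __stringify(x)          __stringify_1(x)    /* expand x, then stringize */

#define DRBD_MINOR_COUNT_MIN    1
#define DRBD_MINOR_COUNT_MAX    255

int main(void)
{
        puts("Maximum number of drbd devices ("
             __stringify(DRBD_MINOR_COUNT_MIN) "-"
             __stringify(DRBD_MINOR_COUNT_MAX) ")");
        return 0;
}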
@@ -115,7 +116,7 @@ module_param(fault_devs, int, 0644);
115#endif 116#endif
116 117
117/* module parameter, defined */ 118/* module parameter, defined */
118unsigned int minor_count = 32; 119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
119int disable_sendpage; 120int disable_sendpage;
120int allow_oos; 121int allow_oos;
121unsigned int cn_idx = CN_IDX_DRBD; 122unsigned int cn_idx = CN_IDX_DRBD;
@@ -335,6 +336,7 @@ bail:
335 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 336 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336} 337}
337 338
339
338/** 340/**
339 * _tl_restart() - Walks the transfer log, and applies an action to all requests 341 * _tl_restart() - Walks the transfer log, and applies an action to all requests
340 * @mdev: DRBD device. 342 * @mdev: DRBD device.
@@ -456,7 +458,7 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
456} 458}
457 459
458/** 460/**
459 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one 461 * cl_wide_st_chg() - true if the state change is a cluster wide one
460 * @mdev: DRBD device. 462 * @mdev: DRBD device.
461 * @os: old (current) state. 463 * @os: old (current) state.
462 * @ns: new (wanted) state. 464 * @ns: new (wanted) state.
@@ -473,12 +475,13 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
473 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); 475 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474} 476}
475 477
476int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 478enum drbd_state_rv
477 union drbd_state mask, union drbd_state val) 479drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
480 union drbd_state mask, union drbd_state val)
478{ 481{
479 unsigned long flags; 482 unsigned long flags;
480 union drbd_state os, ns; 483 union drbd_state os, ns;
481 int rv; 484 enum drbd_state_rv rv;
482 485
483 spin_lock_irqsave(&mdev->req_lock, flags); 486 spin_lock_irqsave(&mdev->req_lock, flags);
484 os = mdev->state; 487 os = mdev->state;
@@ -502,20 +505,22 @@ void drbd_force_state(struct drbd_conf *mdev,
502 drbd_change_state(mdev, CS_HARD, mask, val); 505 drbd_change_state(mdev, CS_HARD, mask, val);
503} 506}
504 507
505static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); 508static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
506static int is_valid_state_transition(struct drbd_conf *, 509static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
507 union drbd_state, union drbd_state); 510 union drbd_state,
511 union drbd_state);
508static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 512static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
509 union drbd_state ns, const char **warn_sync_abort); 513 union drbd_state ns, const char **warn_sync_abort);
510int drbd_send_state_req(struct drbd_conf *, 514int drbd_send_state_req(struct drbd_conf *,
511 union drbd_state, union drbd_state); 515 union drbd_state, union drbd_state);
512 516
513static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, 517static enum drbd_state_rv
514 union drbd_state mask, union drbd_state val) 518_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
519 union drbd_state val)
515{ 520{
516 union drbd_state os, ns; 521 union drbd_state os, ns;
517 unsigned long flags; 522 unsigned long flags;
518 int rv; 523 enum drbd_state_rv rv;
519 524
520 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) 525 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
521 return SS_CW_SUCCESS; 526 return SS_CW_SUCCESS;
@@ -536,7 +541,7 @@ static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
536 if (rv == SS_SUCCESS) { 541 if (rv == SS_SUCCESS) {
537 rv = is_valid_state_transition(mdev, ns, os); 542 rv = is_valid_state_transition(mdev, ns, os);
538 if (rv == SS_SUCCESS) 543 if (rv == SS_SUCCESS)
539 rv = 0; /* cont waiting, otherwise fail. */ 544 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
540 } 545 }
541 } 546 }
542 spin_unlock_irqrestore(&mdev->req_lock, flags); 547 spin_unlock_irqrestore(&mdev->req_lock, flags);
@@ -554,14 +559,14 @@ static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
554 * Should not be called directly, use drbd_request_state() or 559 * Should not be called directly, use drbd_request_state() or
555 * _drbd_request_state(). 560 * _drbd_request_state().
556 */ 561 */
557static int drbd_req_state(struct drbd_conf *mdev, 562static enum drbd_state_rv
558 union drbd_state mask, union drbd_state val, 563drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
559 enum chg_state_flags f) 564 union drbd_state val, enum chg_state_flags f)
560{ 565{
561 struct completion done; 566 struct completion done;
562 unsigned long flags; 567 unsigned long flags;
563 union drbd_state os, ns; 568 union drbd_state os, ns;
564 int rv; 569 enum drbd_state_rv rv;
565 570
566 init_completion(&done); 571 init_completion(&done);
567 572
@@ -636,10 +641,11 @@ abort:
636 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE 641 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
637 * flag, or when logging of failed state change requests is not desired. 642 * flag, or when logging of failed state change requests is not desired.
638 */ 643 */
639int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, 644enum drbd_state_rv
640 union drbd_state val, enum chg_state_flags f) 645_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
646 union drbd_state val, enum chg_state_flags f)
641{ 647{
642 int rv; 648 enum drbd_state_rv rv;
643 649
644 wait_event(mdev->state_wait, 650 wait_event(mdev->state_wait,
645 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); 651 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
@@ -663,8 +669,8 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
663 ); 669 );
664} 670}
665 671
666void print_st_err(struct drbd_conf *mdev, 672void print_st_err(struct drbd_conf *mdev, union drbd_state os,
667 union drbd_state os, union drbd_state ns, int err) 673 union drbd_state ns, enum drbd_state_rv err)
668{ 674{
669 if (err == SS_IN_TRANSIENT_STATE) 675 if (err == SS_IN_TRANSIENT_STATE)
670 return; 676 return;
@@ -674,32 +680,18 @@ void print_st_err(struct drbd_conf *mdev,
674} 680}
675 681
676 682
677#define drbd_peer_str drbd_role_str
678#define drbd_pdsk_str drbd_disk_str
679
680#define drbd_susp_str(A) ((A) ? "1" : "0")
681#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
682#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
683#define drbd_user_isp_str(A) ((A) ? "1" : "0")
684
685#define PSC(A) \
686 ({ if (ns.A != os.A) { \
687 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
688 drbd_##A##_str(os.A), \
689 drbd_##A##_str(ns.A)); \
690 } })
691
692/** 683/**
693 * is_valid_state() - Returns an SS_ error code if ns is not valid 684 * is_valid_state() - Returns an SS_ error code if ns is not valid
694 * @mdev: DRBD device. 685 * @mdev: DRBD device.
695 * @ns: State to consider. 686 * @ns: State to consider.
696 */ 687 */
697static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) 688static enum drbd_state_rv
689is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
698{ 690{
699 /* See drbd_state_sw_errors in drbd_strings.c */ 691 /* See drbd_state_sw_errors in drbd_strings.c */
700 692
701 enum drbd_fencing_p fp; 693 enum drbd_fencing_p fp;
702 int rv = SS_SUCCESS; 694 enum drbd_state_rv rv = SS_SUCCESS;
703 695
704 fp = FP_DONT_CARE; 696 fp = FP_DONT_CARE;
705 if (get_ldev(mdev)) { 697 if (get_ldev(mdev)) {
@@ -762,10 +754,11 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
762 * @ns: new state. 754 * @ns: new state.
763 * @os: old state. 755 * @os: old state.
764 */ 756 */
765static int is_valid_state_transition(struct drbd_conf *mdev, 757static enum drbd_state_rv
766 union drbd_state ns, union drbd_state os) 758is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
759 union drbd_state os)
767{ 760{
768 int rv = SS_SUCCESS; 761 enum drbd_state_rv rv = SS_SUCCESS;
769 762
770 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && 763 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
771 os.conn > C_CONNECTED) 764 os.conn > C_CONNECTED)
@@ -800,6 +793,10 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
800 os.conn < C_CONNECTED) 793 os.conn < C_CONNECTED)
801 rv = SS_NEED_CONNECTION; 794 rv = SS_NEED_CONNECTION;
802 795
796 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
797 && os.conn < C_WF_REPORT_PARAMS)
798 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
799
803 return rv; 800 return rv;
804} 801}
805 802
@@ -817,6 +814,7 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
817 union drbd_state ns, const char **warn_sync_abort) 814 union drbd_state ns, const char **warn_sync_abort)
818{ 815{
819 enum drbd_fencing_p fp; 816 enum drbd_fencing_p fp;
817 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
820 818
821 fp = FP_DONT_CARE; 819 fp = FP_DONT_CARE;
822 if (get_ldev(mdev)) { 820 if (get_ldev(mdev)) {
@@ -869,56 +867,6 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
869 ns.conn = C_CONNECTED; 867 ns.conn = C_CONNECTED;
870 } 868 }
871 869
872 if (ns.conn >= C_CONNECTED &&
873 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
874 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
875 switch (ns.conn) {
876 case C_WF_BITMAP_T:
877 case C_PAUSED_SYNC_T:
878 ns.disk = D_OUTDATED;
879 break;
880 case C_CONNECTED:
881 case C_WF_BITMAP_S:
882 case C_SYNC_SOURCE:
883 case C_PAUSED_SYNC_S:
884 ns.disk = D_UP_TO_DATE;
885 break;
886 case C_SYNC_TARGET:
887 ns.disk = D_INCONSISTENT;
888 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
889 break;
890 }
891 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
892 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
893 }
894
895 if (ns.conn >= C_CONNECTED &&
896 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
897 switch (ns.conn) {
898 case C_CONNECTED:
899 case C_WF_BITMAP_T:
900 case C_PAUSED_SYNC_T:
901 case C_SYNC_TARGET:
902 ns.pdsk = D_UP_TO_DATE;
903 break;
904 case C_WF_BITMAP_S:
905 case C_PAUSED_SYNC_S:
906 /* remap any consistent state to D_OUTDATED,
907 * but disallow "upgrade" of not even consistent states.
908 */
909 ns.pdsk =
910 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
911 ? os.pdsk : D_OUTDATED;
912 break;
913 case C_SYNC_SOURCE:
914 ns.pdsk = D_INCONSISTENT;
915 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
916 break;
917 }
918 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
919 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
920 }
921
922 /* Connection breaks down before we finished "Negotiating" */ 870 /* Connection breaks down before we finished "Negotiating" */
923 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && 871 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
924 get_ldev_if_state(mdev, D_NEGOTIATING)) { 872 get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -933,6 +881,94 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
933 put_ldev(mdev); 881 put_ldev(mdev);
934 } 882 }
935 883
884 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
885 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
886 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
887 ns.disk = D_UP_TO_DATE;
888 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
889 ns.pdsk = D_UP_TO_DATE;
890 }
891
892 /* Implications of the connection stat on the disk states */
893 disk_min = D_DISKLESS;
894 disk_max = D_UP_TO_DATE;
895 pdsk_min = D_INCONSISTENT;
896 pdsk_max = D_UNKNOWN;
897 switch ((enum drbd_conns)ns.conn) {
898 case C_WF_BITMAP_T:
899 case C_PAUSED_SYNC_T:
900 case C_STARTING_SYNC_T:
901 case C_WF_SYNC_UUID:
902 case C_BEHIND:
903 disk_min = D_INCONSISTENT;
904 disk_max = D_OUTDATED;
905 pdsk_min = D_UP_TO_DATE;
906 pdsk_max = D_UP_TO_DATE;
907 break;
908 case C_VERIFY_S:
909 case C_VERIFY_T:
910 disk_min = D_UP_TO_DATE;
911 disk_max = D_UP_TO_DATE;
912 pdsk_min = D_UP_TO_DATE;
913 pdsk_max = D_UP_TO_DATE;
914 break;
915 case C_CONNECTED:
916 disk_min = D_DISKLESS;
917 disk_max = D_UP_TO_DATE;
918 pdsk_min = D_DISKLESS;
919 pdsk_max = D_UP_TO_DATE;
920 break;
921 case C_WF_BITMAP_S:
922 case C_PAUSED_SYNC_S:
923 case C_STARTING_SYNC_S:
924 case C_AHEAD:
925 disk_min = D_UP_TO_DATE;
926 disk_max = D_UP_TO_DATE;
927 pdsk_min = D_INCONSISTENT;
928 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
929 break;
930 case C_SYNC_TARGET:
931 disk_min = D_INCONSISTENT;
932 disk_max = D_INCONSISTENT;
933 pdsk_min = D_UP_TO_DATE;
934 pdsk_max = D_UP_TO_DATE;
935 break;
936 case C_SYNC_SOURCE:
937 disk_min = D_UP_TO_DATE;
938 disk_max = D_UP_TO_DATE;
939 pdsk_min = D_INCONSISTENT;
940 pdsk_max = D_INCONSISTENT;
941 break;
942 case C_STANDALONE:
943 case C_DISCONNECTING:
944 case C_UNCONNECTED:
945 case C_TIMEOUT:
946 case C_BROKEN_PIPE:
947 case C_NETWORK_FAILURE:
948 case C_PROTOCOL_ERROR:
949 case C_TEAR_DOWN:
950 case C_WF_CONNECTION:
951 case C_WF_REPORT_PARAMS:
952 case C_MASK:
953 break;
954 }
955 if (ns.disk > disk_max)
956 ns.disk = disk_max;
957
958 if (ns.disk < disk_min) {
959 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
960 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
961 ns.disk = disk_min;
962 }
963 if (ns.pdsk > pdsk_max)
964 ns.pdsk = pdsk_max;
965
966 if (ns.pdsk < pdsk_min) {
967 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
968 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
969 ns.pdsk = pdsk_min;
970 }
971
936 if (fp == FP_STONITH && 972 if (fp == FP_STONITH &&
937 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && 973 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
938 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) 974 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
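The two removed switch blocks that set disk/pdsk implicitly are replaced by one table of [min, max] bounds per connection state plus a generic clamp. A toy version of the clamp step; states here are invented, and only the Sync Target bound (local disk pinned to Inconsistent) is taken from the hunk above:

#include <stdio.h>

enum toy_disk { TD_DISKLESS, TD_INCONSISTENT, TD_OUTDATED, TD_CONSISTENT, TD_UP_TO_DATE };

static enum toy_disk clamp_disk(enum toy_disk d, enum toy_disk lo, enum toy_disk hi)
{
        if (d > hi)
                return hi;
        if (d < lo) {
                fprintf(stderr, "implicitly raising disk state %d -> %d\n", d, lo);
                return lo;
        }
        return d;
}

int main(void)
{
        enum toy_disk d = clamp_disk(TD_UP_TO_DATE, TD_INCONSISTENT, TD_INCONSISTENT);

        printf("disk state after clamp: %d\n", d);
        return 0;
}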
@@ -961,6 +997,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
961/* helper for __drbd_set_state */ 997/* helper for __drbd_set_state */
962static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) 998static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
963{ 999{
1000 if (mdev->agreed_pro_version < 90)
1001 mdev->ov_start_sector = 0;
1002 mdev->rs_total = drbd_bm_bits(mdev);
1003 mdev->ov_position = 0;
964 if (cs == C_VERIFY_T) { 1004 if (cs == C_VERIFY_T) {
965 /* starting online verify from an arbitrary position 1005 /* starting online verify from an arbitrary position
966 * does not fit well into the existing protocol. 1006 * does not fit well into the existing protocol.
@@ -970,11 +1010,15 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
970 mdev->ov_start_sector = ~(sector_t)0; 1010 mdev->ov_start_sector = ~(sector_t)0;
971 } else { 1011 } else {
972 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); 1012 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
973 if (bit >= mdev->rs_total) 1013 if (bit >= mdev->rs_total) {
974 mdev->ov_start_sector = 1014 mdev->ov_start_sector =
975 BM_BIT_TO_SECT(mdev->rs_total - 1); 1015 BM_BIT_TO_SECT(mdev->rs_total - 1);
1016 mdev->rs_total = 1;
1017 } else
1018 mdev->rs_total -= bit;
976 mdev->ov_position = mdev->ov_start_sector; 1019 mdev->ov_position = mdev->ov_start_sector;
977 } 1020 }
1021 mdev->ov_left = mdev->rs_total;
978} 1022}
979 1023
980static void drbd_resume_al(struct drbd_conf *mdev) 1024static void drbd_resume_al(struct drbd_conf *mdev)
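set_ov_position() now initializes rs_total and ov_position unconditionally and clamps an out-of-range start sector to the last bitmap bit. A standalone sketch of that bookkeeping, assuming the 4k-per-bit granularity quoted elsewhere in this patch and 512-byte sectors (8 sectors per bit):

#include <stdio.h>

#define SECT_TO_BIT(s)  ((s) >> 3)                          /* 8 sectors per 4k bit */
#define BIT_TO_SECT(b)  ((unsigned long long)(b) << 3)

int main(void)
{
        unsigned long long rs_total = 1ULL << 20;   /* bitmap bits, ~4 GiB of data */
        unsigned long long start_sector = 80;
        unsigned long long bit = SECT_TO_BIT(start_sector);

        if (bit >= rs_total) {          /* start beyond the end: clamp */
                start_sector = BIT_TO_SECT(rs_total - 1);
                rs_total = 1;
        } else {
                rs_total -= bit;        /* only the tail remains to verify */
        }
        printf("start bit %llu, %llu bits left to verify\n", bit, rs_total);
        return 0;
}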
@@ -992,12 +1036,12 @@ static void drbd_resume_al(struct drbd_conf *mdev)
992 * 1036 *
993 * Caller needs to hold req_lock, and global_state_lock. Do not call directly. 1037 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
994 */ 1038 */
995int __drbd_set_state(struct drbd_conf *mdev, 1039enum drbd_state_rv
996 union drbd_state ns, enum chg_state_flags flags, 1040__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
997 struct completion *done) 1041 enum chg_state_flags flags, struct completion *done)
998{ 1042{
999 union drbd_state os; 1043 union drbd_state os;
1000 int rv = SS_SUCCESS; 1044 enum drbd_state_rv rv = SS_SUCCESS;
1001 const char *warn_sync_abort = NULL; 1045 const char *warn_sync_abort = NULL;
1002 struct after_state_chg_work *ascw; 1046 struct after_state_chg_work *ascw;
1003 1047
@@ -1033,22 +1077,46 @@ int __drbd_set_state(struct drbd_conf *mdev,
1033 dev_warn(DEV, "%s aborted.\n", warn_sync_abort); 1077 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1034 1078
1035 { 1079 {
1036 char *pbp, pb[300]; 1080 char *pbp, pb[300];
1037 pbp = pb; 1081 pbp = pb;
1038 *pbp = 0; 1082 *pbp = 0;
1039 PSC(role); 1083 if (ns.role != os.role)
1040 PSC(peer); 1084 pbp += sprintf(pbp, "role( %s -> %s ) ",
1041 PSC(conn); 1085 drbd_role_str(os.role),
1042 PSC(disk); 1086 drbd_role_str(ns.role));
1043 PSC(pdsk); 1087 if (ns.peer != os.peer)
1044 if (is_susp(ns) != is_susp(os)) 1088 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1045 pbp += sprintf(pbp, "susp( %s -> %s ) ", 1089 drbd_role_str(os.peer),
1046 drbd_susp_str(is_susp(os)), 1090 drbd_role_str(ns.peer));
1047 drbd_susp_str(is_susp(ns))); 1091 if (ns.conn != os.conn)
1048 PSC(aftr_isp); 1092 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1049 PSC(peer_isp); 1093 drbd_conn_str(os.conn),
1050 PSC(user_isp); 1094 drbd_conn_str(ns.conn));
1051 dev_info(DEV, "%s\n", pb); 1095 if (ns.disk != os.disk)
1096 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1097 drbd_disk_str(os.disk),
1098 drbd_disk_str(ns.disk));
1099 if (ns.pdsk != os.pdsk)
1100 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1101 drbd_disk_str(os.pdsk),
1102 drbd_disk_str(ns.pdsk));
1103 if (is_susp(ns) != is_susp(os))
1104 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1105 is_susp(os),
1106 is_susp(ns));
1107 if (ns.aftr_isp != os.aftr_isp)
1108 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1109 os.aftr_isp,
1110 ns.aftr_isp);
1111 if (ns.peer_isp != os.peer_isp)
1112 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1113 os.peer_isp,
1114 ns.peer_isp);
1115 if (ns.user_isp != os.user_isp)
1116 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1117 os.user_isp,
1118 ns.user_isp);
1119 dev_info(DEV, "%s\n", pb);
1052 } 1120 }
1053 1121
1054 /* solve the race between becoming unconfigured, 1122 /* solve the race between becoming unconfigured,
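The PSC() helper macro is unrolled into explicit appends; since sprintf() returns the number of characters written, "pbp += sprintf(pbp, ...)" keeps the write pointer at the end of the partially built report. The idiom in isolation, with invented fields:

#include <stdio.h>

int main(void)
{
        char pb[300], *pbp = pb;
        int old_role = 0, new_role = 1;
        int old_conn = 2, new_conn = 2;

        *pbp = 0;
        if (new_role != old_role)
                pbp += sprintf(pbp, "role( %d -> %d ) ", old_role, new_role);
        if (new_conn != old_conn)
                pbp += sprintf(pbp, "conn( %d -> %d ) ", old_conn, new_conn);

        if (pbp != pb)          /* report only when something changed */
                printf("%s\n", pb);
        return 0;
}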
@@ -1074,6 +1142,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
1074 atomic_inc(&mdev->local_cnt); 1142 atomic_inc(&mdev->local_cnt);
1075 1143
1076 mdev->state = ns; 1144 mdev->state = ns;
1145
1146 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1147 drbd_print_uuids(mdev, "attached to UUIDs");
1148
1077 wake_up(&mdev->misc_wait); 1149 wake_up(&mdev->misc_wait);
1078 wake_up(&mdev->state_wait); 1150 wake_up(&mdev->state_wait);
1079 1151
@@ -1081,7 +1153,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
1081 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && 1153 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1082 ns.conn < C_CONNECTED) { 1154 ns.conn < C_CONNECTED) {
1083 mdev->ov_start_sector = 1155 mdev->ov_start_sector =
1084 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); 1156 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1085 dev_info(DEV, "Online Verify reached sector %llu\n", 1157 dev_info(DEV, "Online Verify reached sector %llu\n",
1086 (unsigned long long)mdev->ov_start_sector); 1158 (unsigned long long)mdev->ov_start_sector);
1087 } 1159 }
@@ -1106,14 +1178,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
1106 unsigned long now = jiffies; 1178 unsigned long now = jiffies;
1107 int i; 1179 int i;
1108 1180
1109 mdev->ov_position = 0; 1181 set_ov_position(mdev, ns.conn);
1110 mdev->rs_total = drbd_bm_bits(mdev);
1111 if (mdev->agreed_pro_version >= 90)
1112 set_ov_position(mdev, ns.conn);
1113 else
1114 mdev->ov_start_sector = 0;
1115 mdev->ov_left = mdev->rs_total
1116 - BM_SECT_TO_BIT(mdev->ov_position);
1117 mdev->rs_start = now; 1182 mdev->rs_start = now;
1118 mdev->rs_last_events = 0; 1183 mdev->rs_last_events = 0;
1119 mdev->rs_last_sect_ev = 0; 1184 mdev->rs_last_sect_ev = 0;
@@ -1121,10 +1186,12 @@ int __drbd_set_state(struct drbd_conf *mdev,
1121 mdev->ov_last_oos_start = 0; 1186 mdev->ov_last_oos_start = 0;
1122 1187
1123 for (i = 0; i < DRBD_SYNC_MARKS; i++) { 1188 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1124 mdev->rs_mark_left[i] = mdev->rs_total; 1189 mdev->rs_mark_left[i] = mdev->ov_left;
1125 mdev->rs_mark_time[i] = now; 1190 mdev->rs_mark_time[i] = now;
1126 } 1191 }
1127 1192
1193 drbd_rs_controller_reset(mdev);
1194
1128 if (ns.conn == C_VERIFY_S) { 1195 if (ns.conn == C_VERIFY_S) {
1129 dev_info(DEV, "Starting Online Verify from sector %llu\n", 1196 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1130 (unsigned long long)mdev->ov_position); 1197 (unsigned long long)mdev->ov_position);
@@ -1228,6 +1295,26 @@ static void abw_start_sync(struct drbd_conf *mdev, int rv)
1228 } 1295 }
1229} 1296}
1230 1297
1298int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1299 int (*io_fn)(struct drbd_conf *),
1300 char *why, enum bm_flag flags)
1301{
1302 int rv;
1303
1304 D_ASSERT(current == mdev->worker.task);
1305
1306 /* open coded non-blocking drbd_suspend_io(mdev); */
1307 set_bit(SUSPEND_IO, &mdev->flags);
1308
1309 drbd_bm_lock(mdev, why, flags);
1310 rv = io_fn(mdev);
1311 drbd_bm_unlock(mdev);
1312
1313 drbd_resume_io(mdev);
1314
1315 return rv;
1316}
1317
1231/** 1318/**
1232 * after_state_ch() - Perform after state change actions that may sleep 1319 * after_state_ch() - Perform after state change actions that may sleep
1233 * @mdev: DRBD device. 1320 * @mdev: DRBD device.
@@ -1266,16 +1353,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1266 1353
1267 nsm.i = -1; 1354 nsm.i = -1;
1268 if (ns.susp_nod) { 1355 if (ns.susp_nod) {
1269 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { 1356 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1270 if (ns.conn == C_CONNECTED) 1357 what = resend;
1271 what = resend, nsm.susp_nod = 0;
1272 else /* ns.conn > C_CONNECTED */
1273 dev_err(DEV, "Unexpected Resynd going on!\n");
1274 }
1275 1358
1276 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) 1359 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1277 what = restart_frozen_disk_io, nsm.susp_nod = 0; 1360 what = restart_frozen_disk_io;
1278 1361
1362 if (what != nothing)
1363 nsm.susp_nod = 0;
1279 } 1364 }
1280 1365
1281 if (ns.susp_fen) { 1366 if (ns.susp_fen) {
@@ -1306,13 +1391,30 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1306 spin_unlock_irq(&mdev->req_lock); 1391 spin_unlock_irq(&mdev->req_lock);
1307 } 1392 }
1308 1393
1394 /* Became sync source. With protocol >= 96, we still need to send out
1395 * the sync uuid now. Need to do that before any drbd_send_state, or
1396 * the other side may go "paused sync" before receiving the sync uuids,
1397 * which is unexpected. */
1398 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1399 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1400 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1401 drbd_gen_and_send_sync_uuid(mdev);
1402 put_ldev(mdev);
1403 }
1404
1309 /* Do not change the order of the if above and the two below... */ 1405 /* Do not change the order of the if above and the two below... */
1310 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ 1406 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1311 drbd_send_uuids(mdev); 1407 drbd_send_uuids(mdev);
1312 drbd_send_state(mdev); 1408 drbd_send_state(mdev);
1313 } 1409 }
1314 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) 1410 /* No point in queuing send_bitmap if we don't have a connection
1315 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); 1411 * anymore, so check also the _current_ state, not only the new state
1412 * at the time this work was queued. */
1413 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1414 mdev->state.conn == C_WF_BITMAP_S)
1415 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1416 "send_bitmap (WFBitMapS)",
1417 BM_LOCKED_TEST_ALLOWED);
1316 1418
1317 /* Lost contact to peer's copy of the data */ 1419 /* Lost contact to peer's copy of the data */
1318 if ((os.pdsk >= D_INCONSISTENT && 1420 if ((os.pdsk >= D_INCONSISTENT &&
@@ -1343,7 +1445,23 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1343 1445
1344 /* D_DISKLESS Peer becomes secondary */ 1446 /* D_DISKLESS Peer becomes secondary */
1345 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1447 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1346 drbd_al_to_on_disk_bm(mdev); 1448 /* We may still be Primary ourselves.
1449 * No harm done if the bitmap still changes,
1450 * redirtied pages will follow later. */
1451 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1452 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1453 put_ldev(mdev);
1454 }
1455
1456 /* Write out all changed bits on demote.
1457 * Though, no need to do that just yet
1458 * if there is a resync still going on */
1459 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1460 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1461 /* No changes to the bitmap expected this time, so assert that,
1462 * even though no harm would be done if it did change. */
1463 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1464 "demote", BM_LOCKED_TEST_ALLOWED);
1347 put_ldev(mdev); 1465 put_ldev(mdev);
1348 } 1466 }
1349 1467
@@ -1371,15 +1489,23 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1371 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1489 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1372 drbd_send_state(mdev); 1490 drbd_send_state(mdev);
1373 1491
1492 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1493 drbd_send_state(mdev);
1494
1374 /* We are in the process of starting a full sync... */ 1495 /* We are in the process of starting a full sync... */
1375 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 1496 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1376 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) 1497 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1377 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); 1498 /* no other bitmap changes expected during this phase */
1499 drbd_queue_bitmap_io(mdev,
1500 &drbd_bmio_set_n_write, &abw_start_sync,
1501 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1378 1502
1379 /* We are invalidating ourselves... */ 1503 /* We are invalidating ourselves... */
1380 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && 1504 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1381 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) 1505 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1382 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); 1506 /* other bitmap operation expected during this phase */
1507 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1508 "set_n_write from invalidate", BM_LOCKED_MASK);
1383 1509
1384 /* first half of local IO error, failure to attach, 1510 /* first half of local IO error, failure to attach,
1385 * or administrative detach */ 1511 * or administrative detach */
@@ -1434,8 +1560,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1434 1560
1435 if (drbd_send_state(mdev)) 1561 if (drbd_send_state(mdev))
1436 dev_warn(DEV, "Notified peer that I'm now diskless.\n"); 1562 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1437 else
1438 dev_err(DEV, "Sending state for being diskless failed\n");
1439 /* corresponding get_ldev in __drbd_set_state 1563 /* corresponding get_ldev in __drbd_set_state
1440 * this may finally trigger drbd_ldev_destroy. */ 1564 * this may finally trigger drbd_ldev_destroy. */
1441 put_ldev(mdev); 1565 put_ldev(mdev);
@@ -1459,6 +1583,19 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1459 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) 1583 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1460 drbd_send_state(mdev); 1584 drbd_send_state(mdev);
1461 1585
1586 /* This triggers bitmap writeout of potentially still unwritten pages
1587 * if the resync finished cleanly, or aborted because of peer disk
1588 * failure, or because of connection loss.
1589 * For resync aborted because of local disk failure, we cannot do
1590 * any bitmap writeout anymore.
1591 * No harm done if some bits change during this phase.
1592 */
1593 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1594 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1595 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
1596 put_ldev(mdev);
1597 }
1598
1462 /* free tl_hash if we Got thawed and are C_STANDALONE */ 1599 /* free tl_hash if we Got thawed and are C_STANDALONE */
1463 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash) 1600 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1464 drbd_free_tl_hash(mdev); 1601 drbd_free_tl_hash(mdev);
@@ -1559,7 +1696,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1559 if (!try_module_get(THIS_MODULE)) { 1696 if (!try_module_get(THIS_MODULE)) {
1560 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); 1697 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1561 spin_unlock_irqrestore(&thi->t_lock, flags); 1698 spin_unlock_irqrestore(&thi->t_lock, flags);
1562 return FALSE; 1699 return false;
1563 } 1700 }
1564 1701
1565 init_completion(&thi->stop); 1702 init_completion(&thi->stop);
@@ -1576,7 +1713,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1576 dev_err(DEV, "Couldn't start thread\n"); 1713 dev_err(DEV, "Couldn't start thread\n");
1577 1714
1578 module_put(THIS_MODULE); 1715 module_put(THIS_MODULE);
1579 return FALSE; 1716 return false;
1580 } 1717 }
1581 spin_lock_irqsave(&thi->t_lock, flags); 1718 spin_lock_irqsave(&thi->t_lock, flags);
1582 thi->task = nt; 1719 thi->task = nt;
@@ -1596,7 +1733,7 @@ int drbd_thread_start(struct drbd_thread *thi)
1596 break; 1733 break;
1597 } 1734 }
1598 1735
1599 return TRUE; 1736 return true;
1600} 1737}
1601 1738
1602 1739
@@ -1694,8 +1831,8 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1694{ 1831{
1695 int sent, ok; 1832 int sent, ok;
1696 1833
1697 ERR_IF(!h) return FALSE; 1834 ERR_IF(!h) return false;
1698 ERR_IF(!size) return FALSE; 1835 ERR_IF(!size) return false;
1699 1836
1700 h->magic = BE_DRBD_MAGIC; 1837 h->magic = BE_DRBD_MAGIC;
1701 h->command = cpu_to_be16(cmd); 1838 h->command = cpu_to_be16(cmd);
@@ -1704,8 +1841,8 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1704 sent = drbd_send(mdev, sock, h, size, msg_flags); 1841 sent = drbd_send(mdev, sock, h, size, msg_flags);
1705 1842
1706 ok = (sent == size); 1843 ok = (sent == size);
1707 if (!ok) 1844 if (!ok && !signal_pending(current))
1708 dev_err(DEV, "short sent %s size=%d sent=%d\n", 1845 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1709 cmdname(cmd), (int)size, sent); 1846 cmdname(cmd), (int)size, sent);
1710 return ok; 1847 return ok;
1711} 1848}
@@ -1840,7 +1977,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
1840 else { 1977 else {
1841 dev_err(DEV, "--dry-run is not supported by peer"); 1978 dev_err(DEV, "--dry-run is not supported by peer");
1842 kfree(p); 1979 kfree(p);
1843 return 0; 1980 return -1;
1844 } 1981 }
1845 } 1982 }
1846 p->conn_flags = cpu_to_be32(cf); 1983 p->conn_flags = cpu_to_be32(cf);
@@ -1888,12 +2025,36 @@ int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1888 return _drbd_send_uuids(mdev, 8); 2025 return _drbd_send_uuids(mdev, 8);
1889} 2026}
1890 2027
2028void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2029{
2030 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2031 u64 *uuid = mdev->ldev->md.uuid;
2032 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2033 text,
2034 (unsigned long long)uuid[UI_CURRENT],
2035 (unsigned long long)uuid[UI_BITMAP],
2036 (unsigned long long)uuid[UI_HISTORY_START],
2037 (unsigned long long)uuid[UI_HISTORY_END]);
2038 put_ldev(mdev);
2039 } else {
2040 dev_info(DEV, "%s effective data uuid: %016llX\n",
2041 text,
2042 (unsigned long long)mdev->ed_uuid);
2043 }
2044}
1891 2045
1892int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) 2046int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
1893{ 2047{
1894 struct p_rs_uuid p; 2048 struct p_rs_uuid p;
2049 u64 uuid;
1895 2050
1896 p.uuid = cpu_to_be64(val); 2051 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2052
2053 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2054 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2055 drbd_print_uuids(mdev, "updated sync UUID");
2056 drbd_md_sync(mdev);
2057 p.uuid = cpu_to_be64(uuid);
1897 2058
1898 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, 2059 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1899 (struct p_header80 *)&p, sizeof(p)); 2060 (struct p_header80 *)&p, sizeof(p));
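drbd_print_uuids() casts each u64 to unsigned long long before handing it to %016llX, which keeps the format string correct no matter how u64 is typedef'd on a given architecture. The same portability idiom in userspace:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t uuid = 0x0123456789ABCDEFULL;

        /* the cast makes %016llX safe regardless of the underlying 64bit type */
        printf("current UUID: %016llX\n", (unsigned long long)uuid);
        return 0;
}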
@@ -1921,7 +2082,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
1921 p.d_size = cpu_to_be64(d_size); 2082 p.d_size = cpu_to_be64(d_size);
1922 p.u_size = cpu_to_be64(u_size); 2083 p.u_size = cpu_to_be64(u_size);
1923 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 2084 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1924 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); 2085 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1925 p.queue_order_type = cpu_to_be16(q_order_type); 2086 p.queue_order_type = cpu_to_be16(q_order_type);
1926 p.dds_flags = cpu_to_be16(flags); 2087 p.dds_flags = cpu_to_be16(flags);
1927 2088
@@ -1972,7 +2133,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
1972 (struct p_header80 *)&p, sizeof(p)); 2133 (struct p_header80 *)&p, sizeof(p));
1973} 2134}
1974 2135
1975int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) 2136int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
1976{ 2137{
1977 struct p_req_state_reply p; 2138 struct p_req_state_reply p;
1978 2139
@@ -2076,9 +2237,15 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2076 return len; 2237 return len;
2077} 2238}
2078 2239
2079enum { OK, FAILED, DONE } 2240/**
2241 * send_bitmap_rle_or_plain
2242 *
2243 * Return 0 when done, 1 when another iteration is needed, and a negative error
2244 * code upon failure.
2245 */
2246static int
2080send_bitmap_rle_or_plain(struct drbd_conf *mdev, 2247send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2081 struct p_header80 *h, struct bm_xfer_ctx *c) 2248 struct p_header80 *h, struct bm_xfer_ctx *c)
2082{ 2249{
2083 struct p_compressed_bm *p = (void*)h; 2250 struct p_compressed_bm *p = (void*)h;
2084 unsigned long num_words; 2251 unsigned long num_words;
@@ -2088,7 +2255,7 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2088 len = fill_bitmap_rle_bits(mdev, p, c); 2255 len = fill_bitmap_rle_bits(mdev, p, c);
2089 2256
2090 if (len < 0) 2257 if (len < 0)
2091 return FAILED; 2258 return -EIO;
2092 2259
2093 if (len) { 2260 if (len) {
2094 DCBP_set_code(p, RLE_VLI_Bits); 2261 DCBP_set_code(p, RLE_VLI_Bits);
@@ -2118,11 +2285,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2118 if (c->bit_offset > c->bm_bits) 2285 if (c->bit_offset > c->bm_bits)
2119 c->bit_offset = c->bm_bits; 2286 c->bit_offset = c->bm_bits;
2120 } 2287 }
2121 ok = ok ? ((len == 0) ? DONE : OK) : FAILED; 2288 if (ok) {
2122 2289 if (len == 0) {
2123 if (ok == DONE) 2290 INFO_bm_xfer_stats(mdev, "send", c);
2124 INFO_bm_xfer_stats(mdev, "send", c); 2291 return 0;
2125 return ok; 2292 } else
2293 return 1;
2294 }
2295 return -EIO;
2126} 2296}
2127 2297
2128/* See the comment at receive_bitmap() */ 2298/* See the comment at receive_bitmap() */
@@ -2130,16 +2300,16 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
2130{ 2300{
2131 struct bm_xfer_ctx c; 2301 struct bm_xfer_ctx c;
2132 struct p_header80 *p; 2302 struct p_header80 *p;
2133 int ret; 2303 int err;
2134 2304
2135 ERR_IF(!mdev->bitmap) return FALSE; 2305 ERR_IF(!mdev->bitmap) return false;
2136 2306
2137 /* maybe we should use some per thread scratch page, 2307 /* maybe we should use some per thread scratch page,
2138 * and allocate that during initial device creation? */ 2308 * and allocate that during initial device creation? */
2139 p = (struct p_header80 *) __get_free_page(GFP_NOIO); 2309 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2140 if (!p) { 2310 if (!p) {
2141 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); 2311 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2142 return FALSE; 2312 return false;
2143 } 2313 }
2144 2314
2145 if (get_ldev(mdev)) { 2315 if (get_ldev(mdev)) {
@@ -2165,11 +2335,11 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
2165 }; 2335 };
2166 2336
2167 do { 2337 do {
2168 ret = send_bitmap_rle_or_plain(mdev, p, &c); 2338 err = send_bitmap_rle_or_plain(mdev, p, &c);
2169 } while (ret == OK); 2339 } while (err > 0);
2170 2340
2171 free_page((unsigned long) p); 2341 free_page((unsigned long) p);
2172 return (ret == DONE); 2342 return err == 0;
2173} 2343}
2174 2344
2175int drbd_send_bitmap(struct drbd_conf *mdev) 2345int drbd_send_bitmap(struct drbd_conf *mdev)
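The hunk above replaces the old OK/FAILED/DONE enum with a plain int convention: send_bitmap_rle_or_plain() now returns 0 when the transfer is complete, 1 when the caller should invoke it again, and a negative error code on failure, which is what the do { ... } while (err > 0) loop in _drbd_send_bitmap() relies on. As a rough user-space illustration of that calling convention (not DRBD code; do_chunk() is a made-up stand-in):

#include <stdio.h>

/* do_chunk() stands in for send_bitmap_rle_or_plain():
 * 0 = done, 1 = call again, negative = error. */
static int do_chunk(int *remaining)
{
	if (*remaining < 0)
		return -5;	/* pretend this is -EIO */
	if (*remaining == 0)
		return 0;	/* transfer complete */
	(*remaining)--;
	return 1;		/* more chunks to send, call again */
}

int main(void)
{
	int remaining = 3;
	int err;

	/* same shape as the do { ... } while (err > 0) loop above */
	do {
		err = do_chunk(&remaining);
	} while (err > 0);

	printf("finished with %s\n", err == 0 ? "success" : "error");
	return 0;
}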
@@ -2192,7 +2362,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2192 p.set_size = cpu_to_be32(set_size); 2362 p.set_size = cpu_to_be32(set_size);
2193 2363
2194 if (mdev->state.conn < C_CONNECTED) 2364 if (mdev->state.conn < C_CONNECTED)
2195 return FALSE; 2365 return false;
2196 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, 2366 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2197 (struct p_header80 *)&p, sizeof(p)); 2367 (struct p_header80 *)&p, sizeof(p));
2198 return ok; 2368 return ok;
@@ -2220,7 +2390,7 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2220 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); 2390 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2221 2391
2222 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) 2392 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2223 return FALSE; 2393 return false;
2224 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, 2394 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2225 (struct p_header80 *)&p, sizeof(p)); 2395 (struct p_header80 *)&p, sizeof(p));
2226 return ok; 2396 return ok;
@@ -2326,8 +2496,8 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2326} 2496}
2327 2497
2328/* called on sndtimeo 2498/* called on sndtimeo
2329 * returns FALSE if we should retry, 2499 * returns false if we should retry,
2330 * TRUE if we think connection is dead 2500 * true if we think connection is dead
2331 */ 2501 */
2332static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) 2502static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2333{ 2503{
@@ -2340,7 +2510,7 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
2340 || mdev->state.conn < C_CONNECTED; 2510 || mdev->state.conn < C_CONNECTED;
2341 2511
2342 if (drop_it) 2512 if (drop_it)
2343 return TRUE; 2513 return true;
2344 2514
2345 drop_it = !--mdev->ko_count; 2515 drop_it = !--mdev->ko_count;
2346 if (!drop_it) { 2516 if (!drop_it) {
@@ -2531,13 +2701,39 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2531 if (ok && dgs) { 2701 if (ok && dgs) {
2532 dgb = mdev->int_dig_out; 2702 dgb = mdev->int_dig_out;
2533 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2703 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2534 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2704 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2535 } 2705 }
2536 if (ok) { 2706 if (ok) {
2537 if (mdev->net_conf->wire_protocol == DRBD_PROT_A) 2707 /* For protocol A, we have to memcpy the payload into
2708 * socket buffers, as we may complete right away
2709 * as soon as we handed it over to tcp, at which point the data
2710 * pages may become invalid.
2711 *
2712 * With data integrity enabled, we copy it as well, so we can be
2713 * sure that even if the bio pages are still being modified, it
2714 * won't change the data on the wire; thus, if the digest checks
2715 * out ok after sending on this side but does not match on the
2716 * receiving side, we have certainly detected corruption elsewhere.
2717 */
2718 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2538 ok = _drbd_send_bio(mdev, req->master_bio); 2719 ok = _drbd_send_bio(mdev, req->master_bio);
2539 else 2720 else
2540 ok = _drbd_send_zc_bio(mdev, req->master_bio); 2721 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2722
2723 /* double check digest, sometimes buffers have been modified in flight. */
2724 if (dgs > 0 && dgs <= 64) {
2725 /* 64 bytes, 512 bits, is the largest digest size
2726 * currently supported in kernel crypto. */
2727 unsigned char digest[64];
2728 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2729 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2730 dev_warn(DEV,
2731 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2732 (unsigned long long)req->sector, req->size);
2733 }
2734 } /* else if (dgs > 64) {
2735 ... Be noisy about digest too large ...
2736 } */
2541 } 2737 }
2542 2738
2543 drbd_put_data_sock(mdev); 2739 drbd_put_data_sock(mdev);
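The digest re-check added above guards against upper layers modifying pages while they are in flight: the digest is computed and sent first, and after the payload has been pushed to the socket it is recomputed over the same bio and compared, so a buffer changed during the write is at least reported. A minimal user-space sketch of the same idea, using a toy checksum instead of the kernel crypto digest (toy_csum() and payload are invented names for this example, not DRBD code):

#include <stdio.h>
#include <stdint.h>

/* toy_csum() stands in for the crypto digest used by drbd_csum_bio(). */
static uint32_t toy_csum(const unsigned char *buf, size_t len)
{
	uint32_t sum = 0;
	while (len--)
		sum = sum * 31u + *buf++;
	return sum;
}

int main(void)
{
	unsigned char payload[16] = "stable payload!";
	uint32_t sent_digest = toy_csum(payload, sizeof(payload));

	/* an upper layer modifies the buffer after the digest was "sent" */
	payload[0] ^= 0xff;

	if (toy_csum(payload, sizeof(payload)) != sent_digest)
		printf("Digest mismatch, buffer modified during write\n");
	return 0;
}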
@@ -2587,7 +2783,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2587 if (ok && dgs) { 2783 if (ok && dgs) {
2588 dgb = mdev->int_dig_out; 2784 dgb = mdev->int_dig_out;
2589 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); 2785 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2590 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2786 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2591 } 2787 }
2592 if (ok) 2788 if (ok)
2593 ok = _drbd_send_zc_ee(mdev, e); 2789 ok = _drbd_send_zc_ee(mdev, e);
@@ -2597,6 +2793,16 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2597 return ok; 2793 return ok;
2598} 2794}
2599 2795
2796int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2797{
2798 struct p_block_desc p;
2799
2800 p.sector = cpu_to_be64(req->sector);
2801 p.blksize = cpu_to_be32(req->size);
2802
2803 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2804}
2805
2600/* 2806/*
2601 drbd_send distinguishes two cases: 2807 drbd_send distinguishes two cases:
2602 2808
@@ -2770,6 +2976,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2770 atomic_set(&mdev->pp_in_use_by_net, 0); 2976 atomic_set(&mdev->pp_in_use_by_net, 0);
2771 atomic_set(&mdev->rs_sect_in, 0); 2977 atomic_set(&mdev->rs_sect_in, 0);
2772 atomic_set(&mdev->rs_sect_ev, 0); 2978 atomic_set(&mdev->rs_sect_ev, 0);
2979 atomic_set(&mdev->ap_in_flight, 0);
2773 2980
2774 mutex_init(&mdev->md_io_mutex); 2981 mutex_init(&mdev->md_io_mutex);
2775 mutex_init(&mdev->data.mutex); 2982 mutex_init(&mdev->data.mutex);
@@ -2798,19 +3005,27 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2798 INIT_LIST_HEAD(&mdev->unplug_work.list); 3005 INIT_LIST_HEAD(&mdev->unplug_work.list);
2799 INIT_LIST_HEAD(&mdev->go_diskless.list); 3006 INIT_LIST_HEAD(&mdev->go_diskless.list);
2800 INIT_LIST_HEAD(&mdev->md_sync_work.list); 3007 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3008 INIT_LIST_HEAD(&mdev->start_resync_work.list);
2801 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 3009 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2802 3010
2803 mdev->resync_work.cb = w_resync_inactive; 3011 mdev->resync_work.cb = w_resync_timer;
2804 mdev->unplug_work.cb = w_send_write_hint; 3012 mdev->unplug_work.cb = w_send_write_hint;
2805 mdev->go_diskless.cb = w_go_diskless; 3013 mdev->go_diskless.cb = w_go_diskless;
2806 mdev->md_sync_work.cb = w_md_sync; 3014 mdev->md_sync_work.cb = w_md_sync;
2807 mdev->bm_io_work.w.cb = w_bitmap_io; 3015 mdev->bm_io_work.w.cb = w_bitmap_io;
3016 mdev->start_resync_work.cb = w_start_resync;
2808 init_timer(&mdev->resync_timer); 3017 init_timer(&mdev->resync_timer);
2809 init_timer(&mdev->md_sync_timer); 3018 init_timer(&mdev->md_sync_timer);
3019 init_timer(&mdev->start_resync_timer);
3020 init_timer(&mdev->request_timer);
2810 mdev->resync_timer.function = resync_timer_fn; 3021 mdev->resync_timer.function = resync_timer_fn;
2811 mdev->resync_timer.data = (unsigned long) mdev; 3022 mdev->resync_timer.data = (unsigned long) mdev;
2812 mdev->md_sync_timer.function = md_sync_timer_fn; 3023 mdev->md_sync_timer.function = md_sync_timer_fn;
2813 mdev->md_sync_timer.data = (unsigned long) mdev; 3024 mdev->md_sync_timer.data = (unsigned long) mdev;
3025 mdev->start_resync_timer.function = start_resync_timer_fn;
3026 mdev->start_resync_timer.data = (unsigned long) mdev;
3027 mdev->request_timer.function = request_timer_fn;
3028 mdev->request_timer.data = (unsigned long) mdev;
2814 3029
2815 init_waitqueue_head(&mdev->misc_wait); 3030 init_waitqueue_head(&mdev->misc_wait);
2816 init_waitqueue_head(&mdev->state_wait); 3031 init_waitqueue_head(&mdev->state_wait);
@@ -2881,6 +3096,8 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2881 D_ASSERT(list_empty(&mdev->resync_work.list)); 3096 D_ASSERT(list_empty(&mdev->resync_work.list));
2882 D_ASSERT(list_empty(&mdev->unplug_work.list)); 3097 D_ASSERT(list_empty(&mdev->unplug_work.list));
2883 D_ASSERT(list_empty(&mdev->go_diskless.list)); 3098 D_ASSERT(list_empty(&mdev->go_diskless.list));
3099
3100 drbd_set_defaults(mdev);
2884} 3101}
2885 3102
2886 3103
@@ -2923,7 +3140,7 @@ static void drbd_destroy_mempools(void)
2923static int drbd_create_mempools(void) 3140static int drbd_create_mempools(void)
2924{ 3141{
2925 struct page *page; 3142 struct page *page;
2926 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; 3143 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
2927 int i; 3144 int i;
2928 3145
2929 /* prepare our caches and mempools */ 3146 /* prepare our caches and mempools */
@@ -3087,11 +3304,20 @@ static void drbd_cleanup(void)
3087 3304
3088 unregister_reboot_notifier(&drbd_notifier); 3305 unregister_reboot_notifier(&drbd_notifier);
3089 3306
3307 /* first remove proc,
3308 * drbdsetup uses its presence to detect
3309 * whether DRBD is loaded.
3310 * If we were to get stuck in proc removal
3311 * with netlink already deregistered,
3312 * some drbdsetup commands may wait forever
3313 * for an answer.
3314 */
3315 if (drbd_proc)
3316 remove_proc_entry("drbd", NULL);
3317
3090 drbd_nl_cleanup(); 3318 drbd_nl_cleanup();
3091 3319
3092 if (minor_table) { 3320 if (minor_table) {
3093 if (drbd_proc)
3094 remove_proc_entry("drbd", NULL);
3095 i = minor_count; 3321 i = minor_count;
3096 while (i--) 3322 while (i--)
3097 drbd_delete_device(i); 3323 drbd_delete_device(i);
@@ -3119,7 +3345,7 @@ static int drbd_congested(void *congested_data, int bdi_bits)
3119 char reason = '-'; 3345 char reason = '-';
3120 int r = 0; 3346 int r = 0;
3121 3347
3122 if (!__inc_ap_bio_cond(mdev)) { 3348 if (!may_inc_ap_bio(mdev)) {
3123 /* DRBD has frozen IO */ 3349 /* DRBD has frozen IO */
3124 r = bdi_bits; 3350 r = bdi_bits;
3125 reason = 'd'; 3351 reason = 'd';
@@ -3172,7 +3398,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3172 goto out_no_disk; 3398 goto out_no_disk;
3173 mdev->vdisk = disk; 3399 mdev->vdisk = disk;
3174 3400
3175 set_disk_ro(disk, TRUE); 3401 set_disk_ro(disk, true);
3176 3402
3177 disk->queue = q; 3403 disk->queue = q;
3178 disk->major = DRBD_MAJOR; 3404 disk->major = DRBD_MAJOR;
@@ -3188,8 +3414,8 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3188 q->backing_dev_info.congested_fn = drbd_congested; 3414 q->backing_dev_info.congested_fn = drbd_congested;
3189 q->backing_dev_info.congested_data = mdev; 3415 q->backing_dev_info.congested_data = mdev;
3190 3416
3191 blk_queue_make_request(q, drbd_make_request_26); 3417 blk_queue_make_request(q, drbd_make_request);
3192 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); 3418 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3193 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 3419 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3194 blk_queue_merge_bvec(q, drbd_merge_bvec); 3420 blk_queue_merge_bvec(q, drbd_merge_bvec);
3195 q->queue_lock = &mdev->req_lock; 3421 q->queue_lock = &mdev->req_lock;
@@ -3251,6 +3477,7 @@ void drbd_free_mdev(struct drbd_conf *mdev)
3251 put_disk(mdev->vdisk); 3477 put_disk(mdev->vdisk);
3252 blk_cleanup_queue(mdev->rq_queue); 3478 blk_cleanup_queue(mdev->rq_queue);
3253 free_cpumask_var(mdev->cpu_mask); 3479 free_cpumask_var(mdev->cpu_mask);
3480 drbd_free_tl_hash(mdev);
3254 kfree(mdev); 3481 kfree(mdev);
3255} 3482}
3256 3483
@@ -3266,7 +3493,7 @@ int __init drbd_init(void)
3266 return -EINVAL; 3493 return -EINVAL;
3267 } 3494 }
3268 3495
3269 if (1 > minor_count || minor_count > 255) { 3496 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3270 printk(KERN_ERR 3497 printk(KERN_ERR
3271 "drbd: invalid minor_count (%d)\n", minor_count); 3498 "drbd: invalid minor_count (%d)\n", minor_count);
3272#ifdef MODULE 3499#ifdef MODULE
@@ -3448,7 +3675,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
3448 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 3675 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3449 /* this was a try anyways ... */ 3676 /* this was a try anyways ... */
3450 dev_err(DEV, "meta data update failed!\n"); 3677 dev_err(DEV, "meta data update failed!\n");
3451 drbd_chk_io_error(mdev, 1, TRUE); 3678 drbd_chk_io_error(mdev, 1, true);
3452 } 3679 }
3453 3680
3454 /* Update mdev->ldev->md.la_size_sect, 3681 /* Update mdev->ldev->md.la_size_sect,
@@ -3464,7 +3691,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
3464 * @mdev: DRBD device. 3691 * @mdev: DRBD device.
3465 * @bdev: Device from which the meta data should be read in. 3692 * @bdev: Device from which the meta data should be read in.
3466 * 3693 *
3467 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case 3694 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3468 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. 3695 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3469 */ 3696 */
3470int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 3697int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
@@ -3534,28 +3761,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3534 return rv; 3761 return rv;
3535} 3762}
3536 3763
3537static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3538{
3539 static char *uuid_str[UI_EXTENDED_SIZE] = {
3540 [UI_CURRENT] = "CURRENT",
3541 [UI_BITMAP] = "BITMAP",
3542 [UI_HISTORY_START] = "HISTORY_START",
3543 [UI_HISTORY_END] = "HISTORY_END",
3544 [UI_SIZE] = "SIZE",
3545 [UI_FLAGS] = "FLAGS",
3546 };
3547
3548 if (index >= UI_EXTENDED_SIZE) {
3549 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3550 return;
3551 }
3552
3553 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3554 uuid_str[index],
3555 (unsigned long long)mdev->ldev->md.uuid[index]);
3556}
3557
3558
3559/** 3764/**
3560 * drbd_md_mark_dirty() - Mark meta data super block as dirty 3765 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3561 * @mdev: DRBD device. 3766 * @mdev: DRBD device.
@@ -3585,10 +3790,8 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3585{ 3790{
3586 int i; 3791 int i;
3587 3792
3588 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { 3793 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3589 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; 3794 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3590 debug_drbd_uuid(mdev, i+1);
3591 }
3592} 3795}
3593 3796
3594void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) 3797void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
@@ -3603,7 +3806,6 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3603 } 3806 }
3604 3807
3605 mdev->ldev->md.uuid[idx] = val; 3808 mdev->ldev->md.uuid[idx] = val;
3606 debug_drbd_uuid(mdev, idx);
3607 drbd_md_mark_dirty(mdev); 3809 drbd_md_mark_dirty(mdev);
3608} 3810}
3609 3811
@@ -3613,7 +3815,6 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3613 if (mdev->ldev->md.uuid[idx]) { 3815 if (mdev->ldev->md.uuid[idx]) {
3614 drbd_uuid_move_history(mdev); 3816 drbd_uuid_move_history(mdev);
3615 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; 3817 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3616 debug_drbd_uuid(mdev, UI_HISTORY_START);
3617 } 3818 }
3618 _drbd_uuid_set(mdev, idx, val); 3819 _drbd_uuid_set(mdev, idx, val);
3619} 3820}
@@ -3628,14 +3829,16 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3628void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) 3829void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3629{ 3830{
3630 u64 val; 3831 u64 val;
3832 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3833
3834 if (bm_uuid)
3835 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3631 3836
3632 dev_info(DEV, "Creating new current UUID\n");
3633 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3634 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; 3837 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3635 debug_drbd_uuid(mdev, UI_BITMAP);
3636 3838
3637 get_random_bytes(&val, sizeof(u64)); 3839 get_random_bytes(&val, sizeof(u64));
3638 _drbd_uuid_set(mdev, UI_CURRENT, val); 3840 _drbd_uuid_set(mdev, UI_CURRENT, val);
3841 drbd_print_uuids(mdev, "new current UUID");
3639 /* get it to stable storage _now_ */ 3842 /* get it to stable storage _now_ */
3640 drbd_md_sync(mdev); 3843 drbd_md_sync(mdev);
3641} 3844}
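drbd_uuid_new_current() above rotates the UUID ring: the previous current UUID is remembered in the bitmap slot and a fresh random current UUID is generated, so the pair records which writes happened since the peers diverged. A toy illustration of that rotation (plain C; rand() stands in for get_random_bytes(), and the tiny uuid[] array is invented for this sketch):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

enum { UI_CURRENT, UI_BITMAP, UI_SIZE };

int main(void)
{
	uint64_t uuid[UI_SIZE] = { 0x1111222233334444ULL, 0 };

	srand((unsigned)time(NULL));
	uuid[UI_BITMAP] = uuid[UI_CURRENT];	/* remember the old current UUID */
	uuid[UI_CURRENT] = ((uint64_t)rand() << 32) | (uint64_t)rand();	/* new random current */

	printf("current %016llX bitmap %016llX\n",
	       (unsigned long long)uuid[UI_CURRENT],
	       (unsigned long long)uuid[UI_BITMAP]);
	return 0;
}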
@@ -3649,16 +3852,12 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3649 drbd_uuid_move_history(mdev); 3852 drbd_uuid_move_history(mdev);
3650 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; 3853 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3651 mdev->ldev->md.uuid[UI_BITMAP] = 0; 3854 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3652 debug_drbd_uuid(mdev, UI_HISTORY_START);
3653 debug_drbd_uuid(mdev, UI_BITMAP);
3654 } else { 3855 } else {
3655 if (mdev->ldev->md.uuid[UI_BITMAP]) 3856 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3656 dev_warn(DEV, "bm UUID already set"); 3857 if (bm_uuid)
3657 3858 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3658 mdev->ldev->md.uuid[UI_BITMAP] = val;
3659 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3660 3859
3661 debug_drbd_uuid(mdev, UI_BITMAP); 3860 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3662 } 3861 }
3663 drbd_md_mark_dirty(mdev); 3862 drbd_md_mark_dirty(mdev);
3664} 3863}
@@ -3714,15 +3913,19 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3714static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) 3913static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3715{ 3914{
3716 struct bm_io_work *work = container_of(w, struct bm_io_work, w); 3915 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3717 int rv; 3916 int rv = -EIO;
3718 3917
3719 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); 3918 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3720 3919
3721 drbd_bm_lock(mdev, work->why); 3920 if (get_ldev(mdev)) {
3722 rv = work->io_fn(mdev); 3921 drbd_bm_lock(mdev, work->why, work->flags);
3723 drbd_bm_unlock(mdev); 3922 rv = work->io_fn(mdev);
3923 drbd_bm_unlock(mdev);
3924 put_ldev(mdev);
3925 }
3724 3926
3725 clear_bit(BITMAP_IO, &mdev->flags); 3927 clear_bit(BITMAP_IO, &mdev->flags);
3928 smp_mb__after_clear_bit();
3726 wake_up(&mdev->misc_wait); 3929 wake_up(&mdev->misc_wait);
3727 3930
3728 if (work->done) 3931 if (work->done)
@@ -3730,6 +3933,7 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3730 3933
3731 clear_bit(BITMAP_IO_QUEUED, &mdev->flags); 3934 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3732 work->why = NULL; 3935 work->why = NULL;
3936 work->flags = 0;
3733 3937
3734 return 1; 3938 return 1;
3735} 3939}
@@ -3784,7 +3988,7 @@ void drbd_go_diskless(struct drbd_conf *mdev)
3784void drbd_queue_bitmap_io(struct drbd_conf *mdev, 3988void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3785 int (*io_fn)(struct drbd_conf *), 3989 int (*io_fn)(struct drbd_conf *),
3786 void (*done)(struct drbd_conf *, int), 3990 void (*done)(struct drbd_conf *, int),
3787 char *why) 3991 char *why, enum bm_flag flags)
3788{ 3992{
3789 D_ASSERT(current == mdev->worker.task); 3993 D_ASSERT(current == mdev->worker.task);
3790 3994
@@ -3798,15 +4002,15 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3798 mdev->bm_io_work.io_fn = io_fn; 4002 mdev->bm_io_work.io_fn = io_fn;
3799 mdev->bm_io_work.done = done; 4003 mdev->bm_io_work.done = done;
3800 mdev->bm_io_work.why = why; 4004 mdev->bm_io_work.why = why;
4005 mdev->bm_io_work.flags = flags;
3801 4006
4007 spin_lock_irq(&mdev->req_lock);
3802 set_bit(BITMAP_IO, &mdev->flags); 4008 set_bit(BITMAP_IO, &mdev->flags);
3803 if (atomic_read(&mdev->ap_bio_cnt) == 0) { 4009 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3804 if (list_empty(&mdev->bm_io_work.w.list)) { 4010 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3805 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3806 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); 4011 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3807 } else
3808 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3809 } 4012 }
4013 spin_unlock_irq(&mdev->req_lock);
3810} 4014}
3811 4015
3812/** 4016/**
@@ -3818,19 +4022,22 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3818 * freezes application IO while the actual IO operation runs. This 4022 * freezes application IO while the actual IO operation runs. This
3819 * function MAY NOT be called from worker context. 4023 * function MAY NOT be called from worker context.
3820 */ 4024 */
3821int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) 4025int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4026 char *why, enum bm_flag flags)
3822{ 4027{
3823 int rv; 4028 int rv;
3824 4029
3825 D_ASSERT(current != mdev->worker.task); 4030 D_ASSERT(current != mdev->worker.task);
3826 4031
3827 drbd_suspend_io(mdev); 4032 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4033 drbd_suspend_io(mdev);
3828 4034
3829 drbd_bm_lock(mdev, why); 4035 drbd_bm_lock(mdev, why, flags);
3830 rv = io_fn(mdev); 4036 rv = io_fn(mdev);
3831 drbd_bm_unlock(mdev); 4037 drbd_bm_unlock(mdev);
3832 4038
3833 drbd_resume_io(mdev); 4039 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4040 drbd_resume_io(mdev);
3834 4041
3835 return rv; 4042 return rv;
3836} 4043}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fe81c851ca88..03b29f78a37d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -288,10 +288,11 @@ void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
288 dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n"); 288 dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
289} 289}
290 290
291int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) 291enum drbd_state_rv
292drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
292{ 293{
293 const int max_tries = 4; 294 const int max_tries = 4;
294 int r = 0; 295 enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
295 int try = 0; 296 int try = 0;
296 int forced = 0; 297 int forced = 0;
297 union drbd_state mask, val; 298 union drbd_state mask, val;
@@ -306,17 +307,17 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
306 val.i = 0; val.role = new_role; 307 val.i = 0; val.role = new_role;
307 308
308 while (try++ < max_tries) { 309 while (try++ < max_tries) {
309 r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); 310 rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
310 311
311 /* in case we first succeeded to outdate, 312 /* in case we first succeeded to outdate,
312 * but now suddenly could establish a connection */ 313 * but now suddenly could establish a connection */
313 if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { 314 if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
314 val.pdsk = 0; 315 val.pdsk = 0;
315 mask.pdsk = 0; 316 mask.pdsk = 0;
316 continue; 317 continue;
317 } 318 }
318 319
319 if (r == SS_NO_UP_TO_DATE_DISK && force && 320 if (rv == SS_NO_UP_TO_DATE_DISK && force &&
320 (mdev->state.disk < D_UP_TO_DATE && 321 (mdev->state.disk < D_UP_TO_DATE &&
321 mdev->state.disk >= D_INCONSISTENT)) { 322 mdev->state.disk >= D_INCONSISTENT)) {
322 mask.disk = D_MASK; 323 mask.disk = D_MASK;
@@ -325,7 +326,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
325 continue; 326 continue;
326 } 327 }
327 328
328 if (r == SS_NO_UP_TO_DATE_DISK && 329 if (rv == SS_NO_UP_TO_DATE_DISK &&
329 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { 330 mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
330 D_ASSERT(mdev->state.pdsk == D_UNKNOWN); 331 D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
331 nps = drbd_try_outdate_peer(mdev); 332 nps = drbd_try_outdate_peer(mdev);
@@ -341,9 +342,9 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
341 continue; 342 continue;
342 } 343 }
343 344
344 if (r == SS_NOTHING_TO_DO) 345 if (rv == SS_NOTHING_TO_DO)
345 goto fail; 346 goto fail;
346 if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { 347 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
347 nps = drbd_try_outdate_peer(mdev); 348 nps = drbd_try_outdate_peer(mdev);
348 349
349 if (force && nps > D_OUTDATED) { 350 if (force && nps > D_OUTDATED) {
@@ -356,25 +357,24 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
356 357
357 continue; 358 continue;
358 } 359 }
359 if (r == SS_TWO_PRIMARIES) { 360 if (rv == SS_TWO_PRIMARIES) {
360 /* Maybe the peer is detected as dead very soon... 361 /* Maybe the peer is detected as dead very soon...
361 retry at most once more in this case. */ 362 retry at most once more in this case. */
362 __set_current_state(TASK_INTERRUPTIBLE); 363 schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
363 schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
364 if (try < max_tries) 364 if (try < max_tries)
365 try = max_tries - 1; 365 try = max_tries - 1;
366 continue; 366 continue;
367 } 367 }
368 if (r < SS_SUCCESS) { 368 if (rv < SS_SUCCESS) {
369 r = _drbd_request_state(mdev, mask, val, 369 rv = _drbd_request_state(mdev, mask, val,
370 CS_VERBOSE + CS_WAIT_COMPLETE); 370 CS_VERBOSE + CS_WAIT_COMPLETE);
371 if (r < SS_SUCCESS) 371 if (rv < SS_SUCCESS)
372 goto fail; 372 goto fail;
373 } 373 }
374 break; 374 break;
375 } 375 }
376 376
377 if (r < SS_SUCCESS) 377 if (rv < SS_SUCCESS)
378 goto fail; 378 goto fail;
379 379
380 if (forced) 380 if (forced)
@@ -384,7 +384,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
384 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); 384 wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
385 385
386 if (new_role == R_SECONDARY) { 386 if (new_role == R_SECONDARY) {
387 set_disk_ro(mdev->vdisk, TRUE); 387 set_disk_ro(mdev->vdisk, true);
388 if (get_ldev(mdev)) { 388 if (get_ldev(mdev)) {
389 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 389 mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
390 put_ldev(mdev); 390 put_ldev(mdev);
@@ -394,7 +394,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
394 mdev->net_conf->want_lose = 0; 394 mdev->net_conf->want_lose = 0;
395 put_net_conf(mdev); 395 put_net_conf(mdev);
396 } 396 }
397 set_disk_ro(mdev->vdisk, FALSE); 397 set_disk_ro(mdev->vdisk, false);
398 if (get_ldev(mdev)) { 398 if (get_ldev(mdev)) {
399 if (((mdev->state.conn < C_CONNECTED || 399 if (((mdev->state.conn < C_CONNECTED ||
400 mdev->state.pdsk <= D_FAILED) 400 mdev->state.pdsk <= D_FAILED)
@@ -406,10 +406,8 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
406 } 406 }
407 } 407 }
408 408
409 if ((new_role == R_SECONDARY) && get_ldev(mdev)) { 409 /* writeout of activity log covered areas of the bitmap
410 drbd_al_to_on_disk_bm(mdev); 410 * to stable storage is already done in the after-state-change work */
411 put_ldev(mdev);
412 }
413 411
414 if (mdev->state.conn >= C_WF_REPORT_PARAMS) { 412 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
415 /* if this was forced, we should consider sync */ 413 /* if this was forced, we should consider sync */
@@ -423,7 +421,7 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
423 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); 421 kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
424 fail: 422 fail:
425 mutex_unlock(&mdev->state_mutex); 423 mutex_unlock(&mdev->state_mutex);
426 return r; 424 return rv;
427} 425}
428 426
429static struct drbd_conf *ensure_mdev(int minor, int create) 427static struct drbd_conf *ensure_mdev(int minor, int create)
@@ -528,17 +526,19 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
528 } 526 }
529} 527}
530 528
529/* input size is expected to be in KB */
531char *ppsize(char *buf, unsigned long long size) 530char *ppsize(char *buf, unsigned long long size)
532{ 531{
533 /* Needs 9 bytes at max. */ 532 /* Needs 9 bytes at max including trailing NUL:
533 * -1ULL ==> "16384 EB" */
534 static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; 534 static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
535 int base = 0; 535 int base = 0;
536 while (size >= 10000) { 536 while (size >= 10000 && base < sizeof(units)-1) {
537 /* shift + round */ 537 /* shift + round */
538 size = (size >> 10) + !!(size & (1<<9)); 538 size = (size >> 10) + !!(size & (1<<9));
539 base++; 539 base++;
540 } 540 }
541 sprintf(buf, "%lu %cB", (long)size, units[base]); 541 sprintf(buf, "%u %cB", (unsigned)size, units[base]);
542 542
543 return buf; 543 return buf;
544} 544}
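The loop in ppsize() above scales a KB value into the next larger unit by dividing by 1024 with round-to-nearest: (size >> 10) drops ten bits, and !!(size & (1<<9)) adds one whenever the discarded remainder is at least 512. A small stand-alone sketch of just that rounding step (illustrative only, not the driver code):

#include <stdio.h>

int main(void)
{
	unsigned long long size = 1536;	/* 1.5 MiB, expressed in KB as ppsize() expects */

	/* same rounding step as in ppsize(): divide by 1024, round to nearest */
	unsigned long long scaled = (size >> 10) + !!(size & (1 << 9));

	printf("%llu KB ~ %llu MB\n", size, scaled);	/* prints "1536 KB ~ 2 MB" */
	return 0;
}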
@@ -642,11 +642,19 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_
642 || prev_size != mdev->ldev->md.md_size_sect; 642 || prev_size != mdev->ldev->md.md_size_sect;
643 643
644 if (la_size_changed || md_moved) { 644 if (la_size_changed || md_moved) {
645 int err;
646
645 drbd_al_shrink(mdev); /* All extents inactive. */ 647 drbd_al_shrink(mdev); /* All extents inactive. */
646 dev_info(DEV, "Writing the whole bitmap, %s\n", 648 dev_info(DEV, "Writing the whole bitmap, %s\n",
647 la_size_changed && md_moved ? "size changed and md moved" : 649 la_size_changed && md_moved ? "size changed and md moved" :
648 la_size_changed ? "size changed" : "md moved"); 650 la_size_changed ? "size changed" : "md moved");
649 rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ 651 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
652 err = drbd_bitmap_io(mdev, &drbd_bm_write,
653 "size changed", BM_LOCKED_MASK);
654 if (err) {
655 rv = dev_size_error;
656 goto out;
657 }
650 drbd_md_mark_dirty(mdev); 658 drbd_md_mark_dirty(mdev);
651 } 659 }
652 660
@@ -765,22 +773,21 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
765 return 0; 773 return 0;
766} 774}
767 775
768void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) 776void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local)
769{ 777{
770 struct request_queue * const q = mdev->rq_queue; 778 struct request_queue * const q = mdev->rq_queue;
771 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 779 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
772 int max_segments = mdev->ldev->dc.max_bio_bvecs; 780 int max_segments = mdev->ldev->dc.max_bio_bvecs;
781 int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
773 782
774 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
775
776 blk_queue_max_hw_sectors(q, max_seg_s >> 9);
777 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
778 blk_queue_max_segment_size(q, max_seg_s);
779 blk_queue_logical_block_size(q, 512); 783 blk_queue_logical_block_size(q, 512);
780 blk_queue_segment_boundary(q, PAGE_SIZE-1); 784 blk_queue_max_hw_sectors(q, max_hw_sectors);
781 blk_stack_limits(&q->limits, &b->limits, 0); 785 /* This is the workaround for "bio would need to, but cannot, be split" */
786 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
787 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
788 blk_queue_stack_limits(q, b);
782 789
783 dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); 790 dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9);
784 791
785 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 792 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
786 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 793 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
@@ -850,7 +857,7 @@ static void drbd_suspend_al(struct drbd_conf *mdev)
850static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 857static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
851 struct drbd_nl_cfg_reply *reply) 858 struct drbd_nl_cfg_reply *reply)
852{ 859{
853 enum drbd_ret_codes retcode; 860 enum drbd_ret_code retcode;
854 enum determine_dev_size dd; 861 enum determine_dev_size dd;
855 sector_t max_possible_sectors; 862 sector_t max_possible_sectors;
856 sector_t min_md_device_sectors; 863 sector_t min_md_device_sectors;
@@ -858,8 +865,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
858 struct block_device *bdev; 865 struct block_device *bdev;
859 struct lru_cache *resync_lru = NULL; 866 struct lru_cache *resync_lru = NULL;
860 union drbd_state ns, os; 867 union drbd_state ns, os;
861 unsigned int max_seg_s; 868 unsigned int max_bio_size;
862 int rv; 869 enum drbd_state_rv rv;
863 int cp_discovered = 0; 870 int cp_discovered = 0;
864 int logical_block_size; 871 int logical_block_size;
865 872
@@ -1005,9 +1012,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1005 /* and for any other previously queued work */ 1012 /* and for any other previously queued work */
1006 drbd_flush_workqueue(mdev); 1013 drbd_flush_workqueue(mdev);
1007 1014
1008 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); 1015 rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
1016 retcode = rv; /* FIXME: Type mismatch. */
1009 drbd_resume_io(mdev); 1017 drbd_resume_io(mdev);
1010 if (retcode < SS_SUCCESS) 1018 if (rv < SS_SUCCESS)
1011 goto fail; 1019 goto fail;
1012 1020
1013 if (!get_ldev_if_state(mdev, D_ATTACHING)) 1021 if (!get_ldev_if_state(mdev, D_ATTACHING))
@@ -1109,20 +1117,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1109 mdev->read_cnt = 0; 1117 mdev->read_cnt = 0;
1110 mdev->writ_cnt = 0; 1118 mdev->writ_cnt = 0;
1111 1119
1112 max_seg_s = DRBD_MAX_SEGMENT_SIZE; 1120 max_bio_size = DRBD_MAX_BIO_SIZE;
1113 if (mdev->state.conn == C_CONNECTED) { 1121 if (mdev->state.conn == C_CONNECTED) {
1114 /* We are Primary, Connected, and now attach a new local 1122 /* We are Primary, Connected, and now attach a new local
1115 * backing store. We must not increase the user visible maximum 1123 * backing store. We must not increase the user visible maximum
1116 * bio size on this device to something the peer may not be 1124 * bio size on this device to something the peer may not be
1117 * able to handle. */ 1125 * able to handle. */
1118 if (mdev->agreed_pro_version < 94) 1126 if (mdev->agreed_pro_version < 94)
1119 max_seg_s = queue_max_segment_size(mdev->rq_queue); 1127 max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
1120 else if (mdev->agreed_pro_version == 94) 1128 else if (mdev->agreed_pro_version == 94)
1121 max_seg_s = DRBD_MAX_SIZE_H80_PACKET; 1129 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
1122 /* else: drbd 8.3.9 and later, stay with default */ 1130 /* else: drbd 8.3.9 and later, stay with default */
1123 } 1131 }
1124 1132
1125 drbd_setup_queue_param(mdev, max_seg_s); 1133 drbd_setup_queue_param(mdev, max_bio_size);
1126 1134
1127 /* If I am currently not R_PRIMARY, 1135 /* If I am currently not R_PRIMARY,
1128 * but meta data primary indicator is set, 1136 * but meta data primary indicator is set,
@@ -1154,12 +1162,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1154 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { 1162 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1155 dev_info(DEV, "Assuming that all blocks are out of sync " 1163 dev_info(DEV, "Assuming that all blocks are out of sync "
1156 "(aka FullSync)\n"); 1164 "(aka FullSync)\n");
1157 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { 1165 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
1166 "set_n_write from attaching", BM_LOCKED_MASK)) {
1158 retcode = ERR_IO_MD_DISK; 1167 retcode = ERR_IO_MD_DISK;
1159 goto force_diskless_dec; 1168 goto force_diskless_dec;
1160 } 1169 }
1161 } else { 1170 } else {
1162 if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { 1171 if (drbd_bitmap_io(mdev, &drbd_bm_read,
1172 "read from attaching", BM_LOCKED_MASK) < 0) {
1163 retcode = ERR_IO_MD_DISK; 1173 retcode = ERR_IO_MD_DISK;
1164 goto force_diskless_dec; 1174 goto force_diskless_dec;
1165 } 1175 }
@@ -1167,7 +1177,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1167 1177
1168 if (cp_discovered) { 1178 if (cp_discovered) {
1169 drbd_al_apply_to_bm(mdev); 1179 drbd_al_apply_to_bm(mdev);
1170 drbd_al_to_on_disk_bm(mdev); 1180 if (drbd_bitmap_io(mdev, &drbd_bm_write,
1181 "crashed primary apply AL", BM_LOCKED_MASK)) {
1182 retcode = ERR_IO_MD_DISK;
1183 goto force_diskless_dec;
1184 }
1171 } 1185 }
1172 1186
1173 if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) 1187 if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
@@ -1279,7 +1293,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1279 struct drbd_nl_cfg_reply *reply) 1293 struct drbd_nl_cfg_reply *reply)
1280{ 1294{
1281 int i, ns; 1295 int i, ns;
1282 enum drbd_ret_codes retcode; 1296 enum drbd_ret_code retcode;
1283 struct net_conf *new_conf = NULL; 1297 struct net_conf *new_conf = NULL;
1284 struct crypto_hash *tfm = NULL; 1298 struct crypto_hash *tfm = NULL;
1285 struct crypto_hash *integrity_w_tfm = NULL; 1299 struct crypto_hash *integrity_w_tfm = NULL;
@@ -1324,6 +1338,8 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1324 new_conf->wire_protocol = DRBD_PROT_C; 1338 new_conf->wire_protocol = DRBD_PROT_C;
1325 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; 1339 new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
1326 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; 1340 new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
1341 new_conf->on_congestion = DRBD_ON_CONGESTION_DEF;
1342 new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF;
1327 1343
1328 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { 1344 if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
1329 retcode = ERR_MANDATORY_TAG; 1345 retcode = ERR_MANDATORY_TAG;
@@ -1345,6 +1361,11 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1345 } 1361 }
1346 } 1362 }
1347 1363
1364 if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
1365 retcode = ERR_CONG_NOT_PROTO_A;
1366 goto fail;
1367 }
1368
1348 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { 1369 if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
1349 retcode = ERR_DISCARD; 1370 retcode = ERR_DISCARD;
1350 goto fail; 1371 goto fail;
@@ -1525,6 +1546,21 @@ static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1525 struct drbd_nl_cfg_reply *reply) 1546 struct drbd_nl_cfg_reply *reply)
1526{ 1547{
1527 int retcode; 1548 int retcode;
1549 struct disconnect dc;
1550
1551 memset(&dc, 0, sizeof(struct disconnect));
1552 if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
1553 retcode = ERR_MANDATORY_TAG;
1554 goto fail;
1555 }
1556
1557 if (dc.force) {
1558 spin_lock_irq(&mdev->req_lock);
1559 if (mdev->state.conn >= C_WF_CONNECTION)
1560 _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
1561 spin_unlock_irq(&mdev->req_lock);
1562 goto done;
1563 }
1528 1564
1529 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); 1565 retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
1530 1566
@@ -1842,6 +1878,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1842{ 1878{
1843 int retcode; 1879 int retcode;
1844 1880
1881 /* If there is still bitmap IO pending, probably because of a previous
1882 * resync that just finished, wait for it before requesting a new resync. */
1883 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1884
1845 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 1885 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
1846 1886
1847 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) 1887 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
@@ -1877,6 +1917,10 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
1877{ 1917{
1878 int retcode; 1918 int retcode;
1879 1919
1920 /* If there is still bitmap IO pending, probably because of a previous
1921 * resync that just finished, wait for it before requesting a new resync. */
1922 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1923
1880 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); 1924 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
1881 1925
1882 if (retcode < SS_SUCCESS) { 1926 if (retcode < SS_SUCCESS) {
@@ -1885,9 +1929,9 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
1885 into a full resync. */ 1929 into a full resync. */
1886 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); 1930 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
1887 if (retcode >= SS_SUCCESS) { 1931 if (retcode >= SS_SUCCESS) {
1888 /* open coded drbd_bitmap_io() */
1889 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, 1932 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
1890 "set_n_write from invalidate_peer")) 1933 "set_n_write from invalidate_peer",
1934 BM_LOCKED_SET_ALLOWED))
1891 retcode = ERR_IO_MD_DISK; 1935 retcode = ERR_IO_MD_DISK;
1892 } 1936 }
1893 } else 1937 } else
@@ -1914,9 +1958,17 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1914 struct drbd_nl_cfg_reply *reply) 1958 struct drbd_nl_cfg_reply *reply)
1915{ 1959{
1916 int retcode = NO_ERROR; 1960 int retcode = NO_ERROR;
1961 union drbd_state s;
1917 1962
1918 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) 1963 if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
1919 retcode = ERR_PAUSE_IS_CLEAR; 1964 s = mdev->state;
1965 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
1966 retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
1967 s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
1968 } else {
1969 retcode = ERR_PAUSE_IS_CLEAR;
1970 }
1971 }
1920 1972
1921 reply->ret_code = retcode; 1973 reply->ret_code = retcode;
1922 return 0; 1974 return 0;
@@ -2054,6 +2106,11 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2054 reply->ret_code = ERR_MANDATORY_TAG; 2106 reply->ret_code = ERR_MANDATORY_TAG;
2055 return 0; 2107 return 0;
2056 } 2108 }
2109
2110 /* If there is still bitmap IO pending, e.g. previous resync or verify
2111 * that just finished, wait for it before requesting a new resync. */
2112 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2113
2057 /* w_make_ov_request expects position to be aligned */ 2114 /* w_make_ov_request expects position to be aligned */
2058 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; 2115 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
2059 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); 2116 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
@@ -2097,7 +2154,8 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
2097 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ 2154 drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
2098 2155
2099 if (args.clear_bm) { 2156 if (args.clear_bm) {
2100 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); 2157 err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
2158 "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
2101 if (err) { 2159 if (err) {
2102 dev_err(DEV, "Writing bitmap failed with %d\n",err); 2160 dev_err(DEV, "Writing bitmap failed with %d\n",err);
2103 retcode = ERR_IO_MD_DISK; 2161 retcode = ERR_IO_MD_DISK;
@@ -2105,6 +2163,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
2105 if (skip_initial_sync) { 2163 if (skip_initial_sync) {
2106 drbd_send_uuids_skip_initial_sync(mdev); 2164 drbd_send_uuids_skip_initial_sync(mdev);
2107 _drbd_uuid_set(mdev, UI_BITMAP, 0); 2165 _drbd_uuid_set(mdev, UI_BITMAP, 0);
2166 drbd_print_uuids(mdev, "cleared bitmap UUID");
2108 spin_lock_irq(&mdev->req_lock); 2167 spin_lock_irq(&mdev->req_lock);
2109 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 2168 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
2110 CS_VERBOSE, NULL); 2169 CS_VERBOSE, NULL);
@@ -2189,7 +2248,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2189 goto fail; 2248 goto fail;
2190 } 2249 }
2191 2250
2192 if (nlp->packet_type >= P_nl_after_last_packet) { 2251 if (nlp->packet_type >= P_nl_after_last_packet ||
2252 nlp->packet_type == P_return_code_only) {
2193 retcode = ERR_PACKET_NR; 2253 retcode = ERR_PACKET_NR;
2194 goto fail; 2254 goto fail;
2195 } 2255 }
@@ -2205,7 +2265,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2205 reply_size += cm->reply_body_size; 2265 reply_size += cm->reply_body_size;
2206 2266
2207 /* allocation not in the IO path, cqueue thread context */ 2267 /* allocation not in the IO path, cqueue thread context */
2208 cn_reply = kmalloc(reply_size, GFP_KERNEL); 2268 cn_reply = kzalloc(reply_size, GFP_KERNEL);
2209 if (!cn_reply) { 2269 if (!cn_reply) {
2210 retcode = ERR_NOMEM; 2270 retcode = ERR_NOMEM;
2211 goto fail; 2271 goto fail;
@@ -2213,7 +2273,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
2213 reply = (struct drbd_nl_cfg_reply *) cn_reply->data; 2273 reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
2214 2274
2215 reply->packet_type = 2275 reply->packet_type =
2216 cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; 2276 cm->reply_body_size ? nlp->packet_type : P_return_code_only;
2217 reply->minor = nlp->drbd_minor; 2277 reply->minor = nlp->drbd_minor;
2218 reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */ 2278 reply->ret_code = NO_ERROR; /* Might be modified by cm->function. */
2219 /* reply->tag_list; might be modified by cm->function. */ 2279 /* reply->tag_list; might be modified by cm->function. */
@@ -2376,7 +2436,7 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2376 /* receiver thread context, which is not in the writeout path (of this node), 2436 /* receiver thread context, which is not in the writeout path (of this node),
2377 * but may be in the writeout path of the _other_ node. 2437 * but may be in the writeout path of the _other_ node.
2378 * GFP_NOIO to avoid potential "distributed deadlock". */ 2438 * GFP_NOIO to avoid potential "distributed deadlock". */
2379 cn_reply = kmalloc( 2439 cn_reply = kzalloc(
2380 sizeof(struct cn_msg)+ 2440 sizeof(struct cn_msg)+
2381 sizeof(struct drbd_nl_cfg_reply)+ 2441 sizeof(struct drbd_nl_cfg_reply)+
2382 sizeof(struct dump_ee_tag_len_struct)+ 2442 sizeof(struct dump_ee_tag_len_struct)+
@@ -2398,10 +2458,11 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2398 tl = tl_add_int(tl, T_ee_sector, &e->sector); 2458 tl = tl_add_int(tl, T_ee_sector, &e->sector);
2399 tl = tl_add_int(tl, T_ee_block_id, &e->block_id); 2459 tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
2400 2460
2461 /* dump the first 32k */
2462 len = min_t(unsigned, e->size, 32 << 10);
2401 put_unaligned(T_ee_data, tl++); 2463 put_unaligned(T_ee_data, tl++);
2402 put_unaligned(e->size, tl++); 2464 put_unaligned(len, tl++);
2403 2465
2404 len = e->size;
2405 page = e->pages; 2466 page = e->pages;
2406 page_chain_for_each(page) { 2467 page_chain_for_each(page) {
2407 void *d = kmap_atomic(page, KM_USER0); 2468 void *d = kmap_atomic(page, KM_USER0);
@@ -2410,6 +2471,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2410 kunmap_atomic(d, KM_USER0); 2471 kunmap_atomic(d, KM_USER0);
2411 tl = (unsigned short*)((char*)tl + l); 2472 tl = (unsigned short*)((char*)tl + l);
2412 len -= l; 2473 len -= l;
2474 if (len == 0)
2475 break;
2413 } 2476 }
2414 put_unaligned(TT_END, tl++); /* Close the tag list */ 2477 put_unaligned(TT_END, tl++); /* Close the tag list */
2415 2478
@@ -2508,6 +2571,7 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2508 (struct drbd_nl_cfg_reply *)cn_reply->data; 2571 (struct drbd_nl_cfg_reply *)cn_reply->data;
2509 int rr; 2572 int rr;
2510 2573
2574 memset(buffer, 0, sizeof(buffer));
2511 cn_reply->id = req->id; 2575 cn_reply->id = req->id;
2512 2576
2513 cn_reply->seq = req->seq; 2577 cn_reply->seq = req->seq;
@@ -2515,6 +2579,7 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
2515 cn_reply->len = sizeof(struct drbd_nl_cfg_reply); 2579 cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
2516 cn_reply->flags = 0; 2580 cn_reply->flags = 0;
2517 2581
2582 reply->packet_type = P_return_code_only;
2518 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; 2583 reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
2519 reply->ret_code = ret_code; 2584 reply->ret_code = ret_code;
2520 2585
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 7e6ac307e2de..2959cdfb77f5 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -34,6 +34,7 @@
34#include "drbd_int.h" 34#include "drbd_int.h"
35 35
36static int drbd_proc_open(struct inode *inode, struct file *file); 36static int drbd_proc_open(struct inode *inode, struct file *file);
37static int drbd_proc_release(struct inode *inode, struct file *file);
37 38
38 39
39struct proc_dir_entry *drbd_proc; 40struct proc_dir_entry *drbd_proc;
@@ -42,9 +43,22 @@ const struct file_operations drbd_proc_fops = {
42 .open = drbd_proc_open, 43 .open = drbd_proc_open,
43 .read = seq_read, 44 .read = seq_read,
44 .llseek = seq_lseek, 45 .llseek = seq_lseek,
45 .release = single_release, 46 .release = drbd_proc_release,
46}; 47};
47 48
49void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
50{
51 /* v is in kB/sec. We don't expect TiByte/sec yet. */
52 if (unlikely(v >= 1000000)) {
53 /* cool: > GiByte/s */
54 seq_printf(seq, "%ld,", v / 1000000);
55 v %= 1000000;
56 seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
57 } else if (likely(v >= 1000))
58 seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
59 else
60 seq_printf(seq, "%ld", v);
61}
48 62
49/*lge 63/*lge
50 * progress bars shamelessly adapted from driver/md/md.c 64 * progress bars shamelessly adapted from driver/md/md.c
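seq_printf_with_thousands_grouping(), added above, prints a rate given in kB/s with comma grouping: the millions group is emitted first, then the remaining value is reduced modulo one million and printed as zero-padded three-digit thousands and units groups. A rough user-space analogue for values below 10^9 kB/s (print_with_grouping() is a made-up name, not the kernel seq_file helper):

#include <stdio.h>

static void print_with_grouping(long v)
{
	if (v >= 1000000)
		printf("%ld,%03ld,%03ld", v / 1000000, (v / 1000) % 1000, v % 1000);
	else if (v >= 1000)
		printf("%ld,%03ld", v / 1000, v % 1000);
	else
		printf("%ld", v);
}

int main(void)
{
	print_with_grouping(1234567);	/* prints 1,234,567 */
	printf(" K/sec\n");
	return 0;
}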
@@ -71,10 +85,15 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
71 seq_printf(seq, "."); 85 seq_printf(seq, ".");
72 seq_printf(seq, "] "); 86 seq_printf(seq, "] ");
73 87
74 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); 88 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
75 /* if more than 1 GB display in MB */ 89 seq_printf(seq, "verified:");
76 if (mdev->rs_total > 0x100000L) 90 else
77 seq_printf(seq, "(%lu/%lu)M\n\t", 91 seq_printf(seq, "sync'ed:");
92 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
93
94 /* if more than a few GB, display in MB */
95 if (mdev->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
96 seq_printf(seq, "(%lu/%lu)M",
78 (unsigned long) Bit2KB(rs_left >> 10), 97 (unsigned long) Bit2KB(rs_left >> 10),
79 (unsigned long) Bit2KB(mdev->rs_total >> 10)); 98 (unsigned long) Bit2KB(mdev->rs_total >> 10));
80 else 99 else
@@ -94,6 +113,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
94 /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is 113 /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
95 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at 114 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
96 * least DRBD_SYNC_MARK_STEP time before it will be modified. */ 115 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
116 /* ------------------------ ~18s average ------------------------ */
97 i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS; 117 i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
98 dt = (jiffies - mdev->rs_mark_time[i]) / HZ; 118 dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
99 if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) 119 if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
@@ -107,14 +127,24 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
107 seq_printf(seq, "finish: %lu:%02lu:%02lu", 127 seq_printf(seq, "finish: %lu:%02lu:%02lu",
108 rt / 3600, (rt % 3600) / 60, rt % 60); 128 rt / 3600, (rt % 3600) / 60, rt % 60);
109 129
110 /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
111 dbdt = Bit2KB(db/dt); 130 dbdt = Bit2KB(db/dt);
112 if (dbdt > 1000) 131 seq_printf(seq, " speed: ");
113 seq_printf(seq, " speed: %ld,%03ld", 132 seq_printf_with_thousands_grouping(seq, dbdt);
114 dbdt/1000, dbdt % 1000); 133 seq_printf(seq, " (");
115 else 134 /* ------------------------- ~3s average ------------------------ */
116 seq_printf(seq, " speed: %ld", dbdt); 135 if (proc_details >= 1) {
136 /* this is what drbd_rs_should_slow_down() uses */
137 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
138 dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
139 if (!dt)
140 dt++;
141 db = mdev->rs_mark_left[i] - rs_left;
142 dbdt = Bit2KB(db/dt);
143 seq_printf_with_thousands_grouping(seq, dbdt);
144 seq_printf(seq, " -- ");
145 }
117 146
147 /* --------------------- long term average ---------------------- */
118 /* mean speed since syncer started 148 /* mean speed since syncer started
119 * we do account for PausedSync periods */ 149 * we do account for PausedSync periods */
120 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; 150 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
@@ -122,20 +152,34 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
122 dt = 1; 152 dt = 1;
123 db = mdev->rs_total - rs_left; 153 db = mdev->rs_total - rs_left;
124 dbdt = Bit2KB(db/dt); 154 dbdt = Bit2KB(db/dt);
125 if (dbdt > 1000) 155 seq_printf_with_thousands_grouping(seq, dbdt);
126 seq_printf(seq, " (%ld,%03ld)", 156 seq_printf(seq, ")");
127 dbdt/1000, dbdt % 1000);
128 else
129 seq_printf(seq, " (%ld)", dbdt);
130 157
131 if (mdev->state.conn == C_SYNC_TARGET) { 158 if (mdev->state.conn == C_SYNC_TARGET ||
132 if (mdev->c_sync_rate > 1000) 159 mdev->state.conn == C_VERIFY_S) {
133 seq_printf(seq, " want: %d,%03d", 160 seq_printf(seq, " want: ");
134 mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000); 161 seq_printf_with_thousands_grouping(seq, mdev->c_sync_rate);
135 else
136 seq_printf(seq, " want: %d", mdev->c_sync_rate);
137 } 162 }
138 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); 163 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
164
165 if (proc_details >= 1) {
166 /* 64 bit:
167 * we convert to sectors in the display below. */
168 unsigned long bm_bits = drbd_bm_bits(mdev);
169 unsigned long bit_pos;
170 if (mdev->state.conn == C_VERIFY_S ||
171 mdev->state.conn == C_VERIFY_T)
172 bit_pos = bm_bits - mdev->ov_left;
173 else
174 bit_pos = mdev->bm_resync_fo;
175 /* Total sectors may be slightly off for oddly
176 * sized devices. So what. */
177 seq_printf(seq,
178 "\t%3d%% sector pos: %llu/%llu\n",
179 (int)(bit_pos / (bm_bits/100+1)),
180 (unsigned long long)bit_pos * BM_SECT_PER_BIT,
181 (unsigned long long)bm_bits * BM_SECT_PER_BIT);
182 }
139} 183}
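
A quick worked example of the detail line above, assuming the usual 4 KiB bitmap granularity (BM_BLOCK_SHIFT == 12, so BM_SECT_PER_BIT == 8): a 1 TiB backing device has drbd_bm_bits() == 268,435,456, i.e. 2,147,483,648 sectors in total. With bit_pos == 100,000,000 the line prints roughly " 37% sector pos: 800000000/2147483648", since 100,000,000 / (268,435,456/100 + 1) == 37. Under the same assumption, the "more than a few GB" threshold (4UL << (30 - BM_BLOCK_SHIFT)) works out to 1,048,576 bits, i.e. 4 GiB.
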
140 184
141static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) 185static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -232,20 +276,16 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
232 mdev->epochs, 276 mdev->epochs,
233 write_ordering_chars[mdev->write_ordering] 277 write_ordering_chars[mdev->write_ordering]
234 ); 278 );
235 seq_printf(seq, " oos:%lu\n", 279 seq_printf(seq, " oos:%llu\n",
236 Bit2KB(drbd_bm_total_weight(mdev))); 280 Bit2KB((unsigned long long)
281 drbd_bm_total_weight(mdev)));
237 } 282 }
238 if (mdev->state.conn == C_SYNC_SOURCE || 283 if (mdev->state.conn == C_SYNC_SOURCE ||
239 mdev->state.conn == C_SYNC_TARGET) 284 mdev->state.conn == C_SYNC_TARGET ||
285 mdev->state.conn == C_VERIFY_S ||
286 mdev->state.conn == C_VERIFY_T)
240 drbd_syncer_progress(mdev, seq); 287 drbd_syncer_progress(mdev, seq);
241 288
242 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
243 seq_printf(seq, "\t%3d%% %lu/%lu\n",
244 (int)((mdev->rs_total-mdev->ov_left) /
245 (mdev->rs_total/100+1)),
246 mdev->rs_total - mdev->ov_left,
247 mdev->rs_total);
248
249 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { 289 if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
250 lc_seq_printf_stats(seq, mdev->resync); 290 lc_seq_printf_stats(seq, mdev->resync);
251 lc_seq_printf_stats(seq, mdev->act_log); 291 lc_seq_printf_stats(seq, mdev->act_log);
@@ -265,7 +305,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
265 305
266static int drbd_proc_open(struct inode *inode, struct file *file) 306static int drbd_proc_open(struct inode *inode, struct file *file)
267{ 307{
268 return single_open(file, drbd_seq_show, PDE(inode)->data); 308 if (try_module_get(THIS_MODULE))
309 return single_open(file, drbd_seq_show, PDE(inode)->data);
310 return -ENODEV;
311}
312
313static int drbd_proc_release(struct inode *inode, struct file *file)
314{
315 module_put(THIS_MODULE);
316 return single_release(inode, file);
269} 317}
270 318
271/* PROC FS stuff end */ 319/* PROC FS stuff end */
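
The open/release pair above pins the module for as long as a /proc handle is open. A self-contained sketch of the same pattern for a hypothetical foo module (illustration only; the foo_* names are made up, 2.6.38-era proc/seq_file API):

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int foo_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "hello from foo\n");
	return 0;
}

static int foo_proc_open(struct inode *inode, struct file *file)
{
	int err;

	/* pin the module: rmmod now fails while the file is open */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;
	err = single_open(file, foo_seq_show, NULL);
	if (err)
		module_put(THIS_MODULE);
	return err;
}

static int foo_proc_release(struct inode *inode, struct file *file)
{
	/* drop the reference taken in ->open */
	module_put(THIS_MODULE);
	return single_release(inode, file);
}

static const struct file_operations foo_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = foo_proc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = foo_proc_release,
};

Note the extra module_put() when single_open() fails; without it, an open that cannot allocate its seq_file would leak a module reference.
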
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8e68be939deb..fe1564c7d8b6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -277,7 +277,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
277 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; 277 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
278 int i; 278 int i;
279 279
280 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) 280 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
281 i = page_chain_free(page); 281 i = page_chain_free(page);
282 else { 282 else {
283 struct page *tmp; 283 struct page *tmp;
@@ -319,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
319 struct page *page; 319 struct page *page;
320 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 320 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
321 321
322 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) 322 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
323 return NULL; 323 return NULL;
324 324
325 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); 325 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
@@ -725,16 +725,16 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
725 char tb[4]; 725 char tb[4];
726 726
727 if (!*sock) 727 if (!*sock)
728 return FALSE; 728 return false;
729 729
730 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); 730 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
731 731
732 if (rr > 0 || rr == -EAGAIN) { 732 if (rr > 0 || rr == -EAGAIN) {
733 return TRUE; 733 return true;
734 } else { 734 } else {
735 sock_release(*sock); 735 sock_release(*sock);
736 *sock = NULL; 736 *sock = NULL;
737 return FALSE; 737 return false;
738 } 738 }
739} 739}
740 740
@@ -768,8 +768,7 @@ static int drbd_connect(struct drbd_conf *mdev)
768 if (s || ++try >= 3) 768 if (s || ++try >= 3)
769 break; 769 break;
770 /* give the other side time to call bind() & listen() */ 770 /* give the other side time to call bind() & listen() */
771 __set_current_state(TASK_INTERRUPTIBLE); 771 schedule_timeout_interruptible(HZ / 10);
772 schedule_timeout(HZ / 10);
773 } 772 }
774 773
775 if (s) { 774 if (s) {
@@ -788,8 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev)
788 } 787 }
789 788
790 if (sock && msock) { 789 if (sock && msock) {
791 __set_current_state(TASK_INTERRUPTIBLE); 790 schedule_timeout_interruptible(HZ / 10);
792 schedule_timeout(HZ / 10);
793 ok = drbd_socket_okay(mdev, &sock); 791 ok = drbd_socket_okay(mdev, &sock);
794 ok = drbd_socket_okay(mdev, &msock) && ok; 792 ok = drbd_socket_okay(mdev, &msock) && ok;
795 if (ok) 793 if (ok)
@@ -906,7 +904,7 @@ retry:
906 put_ldev(mdev); 904 put_ldev(mdev);
907 } 905 }
908 906
909 if (!drbd_send_protocol(mdev)) 907 if (drbd_send_protocol(mdev) == -1)
910 return -1; 908 return -1;
911 drbd_send_sync_param(mdev, &mdev->sync_conf); 909 drbd_send_sync_param(mdev, &mdev->sync_conf);
912 drbd_send_sizes(mdev, 0, 0); 910 drbd_send_sizes(mdev, 0, 0);
@@ -914,6 +912,7 @@ retry:
914 drbd_send_state(mdev); 912 drbd_send_state(mdev);
915 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 913 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
916 clear_bit(RESIZE_PENDING, &mdev->flags); 914 clear_bit(RESIZE_PENDING, &mdev->flags);
915 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
917 916
918 return 1; 917 return 1;
919 918
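
The timer armed above presumably drives a per-device callback that periodically re-checks how long the oldest pending request has been outstanding. A generic sketch of that arm/re-arm pattern with the timer API of this kernel generation (the body and policy of request_timer_fn are an assumption, not taken from this patch; struct drbd_conf comes from drbd_int.h):

static void request_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	/* ... inspect the oldest pending request, complain/abort if too old ... */

	/* re-arm so the check runs roughly once per second */
	mod_timer(&mdev->request_timer, jiffies + HZ);
}

static void init_request_timer(struct drbd_conf *mdev)
{
	setup_timer(&mdev->request_timer, request_timer_fn, (unsigned long) mdev);
}
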
@@ -932,8 +931,9 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
932 931
933 r = drbd_recv(mdev, h, sizeof(*h)); 932 r = drbd_recv(mdev, h, sizeof(*h));
934 if (unlikely(r != sizeof(*h))) { 933 if (unlikely(r != sizeof(*h))) {
935 dev_err(DEV, "short read expecting header on sock: r=%d\n", r); 934 if (!signal_pending(current))
936 return FALSE; 935 dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
936 return false;
937 } 937 }
938 938
939 if (likely(h->h80.magic == BE_DRBD_MAGIC)) { 939 if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
@@ -947,11 +947,11 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
947 be32_to_cpu(h->h80.magic), 947 be32_to_cpu(h->h80.magic),
948 be16_to_cpu(h->h80.command), 948 be16_to_cpu(h->h80.command),
949 be16_to_cpu(h->h80.length)); 949 be16_to_cpu(h->h80.length));
950 return FALSE; 950 return false;
951 } 951 }
952 mdev->last_received = jiffies; 952 mdev->last_received = jiffies;
953 953
954 return TRUE; 954 return true;
955} 955}
956 956
957static void drbd_flush(struct drbd_conf *mdev) 957static void drbd_flush(struct drbd_conf *mdev)
@@ -1074,6 +1074,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1074 * @mdev: DRBD device. 1074 * @mdev: DRBD device.
1075 * @e: epoch entry 1075 * @e: epoch entry
1076 * @rw: flag field, see bio->bi_rw 1076 * @rw: flag field, see bio->bi_rw
1077 *
1078 * May spread the pages to multiple bios,
1079 * depending on bio_add_page restrictions.
1080 *
1081 * Returns 0 if all bios have been submitted,
1082 * -ENOMEM if we could not allocate enough bios,
1083 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1084 * single page to an empty bio (which should never happen and likely indicates
1085 * that the lower level IO stack is in some way broken). This has been observed
1086 * on certain Xen deployments.
1077 */ 1087 */
1078/* TODO allocate from our own bio_set. */ 1088/* TODO allocate from our own bio_set. */
1079int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, 1089int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
@@ -1086,6 +1096,7 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1086 unsigned ds = e->size; 1096 unsigned ds = e->size;
1087 unsigned n_bios = 0; 1097 unsigned n_bios = 0;
1088 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; 1098 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1099 int err = -ENOMEM;
1089 1100
1090 /* In most cases, we will only need one bio. But in case the lower 1101 /* In most cases, we will only need one bio. But in case the lower
1091 * level restrictions happen to be different at this offset on this 1102 * level restrictions happen to be different at this offset on this
@@ -1111,8 +1122,17 @@ next_bio:
1111 page_chain_for_each(page) { 1122 page_chain_for_each(page) {
1112 unsigned len = min_t(unsigned, ds, PAGE_SIZE); 1123 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1113 if (!bio_add_page(bio, page, len, 0)) { 1124 if (!bio_add_page(bio, page, len, 0)) {
1114 /* a single page must always be possible! */ 1125 /* A single page must always be possible!
1115 BUG_ON(bio->bi_vcnt == 0); 1126 * But in case it fails anyways,
1127 * we deal with it, and complain (below). */
1128 if (bio->bi_vcnt == 0) {
1129 dev_err(DEV,
1130 "bio_add_page failed for len=%u, "
1131 "bi_vcnt=0 (bi_sector=%llu)\n",
1132 len, (unsigned long long)bio->bi_sector);
1133 err = -ENOSPC;
1134 goto fail;
1135 }
1116 goto next_bio; 1136 goto next_bio;
1117 } 1137 }
1118 ds -= len; 1138 ds -= len;
@@ -1138,7 +1158,7 @@ fail:
1138 bios = bios->bi_next; 1158 bios = bios->bi_next;
1139 bio_put(bio); 1159 bio_put(bio);
1140 } 1160 }
1141 return -ENOMEM; 1161 return err;
1142} 1162}
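
The control flow in drbd_submit_ee() boils down to: keep adding pages to the current bio; if bio_add_page() refuses and the bio already holds pages, chain a fresh bio and retry; if it refuses even an empty bio, give up with -ENOSPC. A stripped-down, self-contained sketch of that pattern (hypothetical helper, 2.6.38-era block API, none of drbd's epoch-entry bookkeeping):

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blkdev.h>

static void chained_end_io(struct bio *bio, int error)
{
	bio_put(bio);
}

/* Write nr_pages pages starting at 'sector', splitting into as many bios
 * as the lower device demands.  Sketch only: completions are ignored. */
static int submit_pages_chained(struct block_device *bdev, struct page **pages,
				unsigned nr_pages, sector_t sector, int rw)
{
	struct bio *bio = NULL;
	unsigned i = 0;

	while (i < nr_pages) {
		if (!bio) {
			bio = bio_alloc(GFP_NOIO,
					min_t(unsigned, nr_pages - i, BIO_MAX_PAGES));
			if (!bio)
				return -ENOMEM;
			bio->bi_bdev = bdev;
			bio->bi_sector = sector + (i << (PAGE_SHIFT - 9));
			bio->bi_end_io = chained_end_io;
		}
		if (!bio_add_page(bio, pages[i], PAGE_SIZE, 0)) {
			if (bio->bi_vcnt == 0) {
				/* even one page refused: lower IO stack is broken */
				bio_put(bio);
				return -ENOSPC;
			}
			/* this bio is full at this offset: submit, start a new one */
			submit_bio(rw, bio);
			bio = NULL;
			continue;
		}
		i++;
	}
	if (bio)
		submit_bio(rw, bio);
	return 0;
}
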
1143 1163
1144static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 1164static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -1160,7 +1180,7 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1160 switch (mdev->write_ordering) { 1180 switch (mdev->write_ordering) {
1161 case WO_none: 1181 case WO_none:
1162 if (rv == FE_RECYCLED) 1182 if (rv == FE_RECYCLED)
1163 return TRUE; 1183 return true;
1164 1184
1165 /* receiver context, in the writeout path of the other node. 1185 /* receiver context, in the writeout path of the other node.
1166 * avoid potential distributed deadlock */ 1186 * avoid potential distributed deadlock */
@@ -1188,10 +1208,10 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1188 D_ASSERT(atomic_read(&epoch->active) == 0); 1208 D_ASSERT(atomic_read(&epoch->active) == 0);
1189 D_ASSERT(epoch->flags == 0); 1209 D_ASSERT(epoch->flags == 0);
1190 1210
1191 return TRUE; 1211 return true;
1192 default: 1212 default:
1193 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering); 1213 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1194 return FALSE; 1214 return false;
1195 } 1215 }
1196 1216
1197 epoch->flags = 0; 1217 epoch->flags = 0;
@@ -1209,7 +1229,7 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1209 } 1229 }
1210 spin_unlock(&mdev->epoch_lock); 1230 spin_unlock(&mdev->epoch_lock);
1211 1231
1212 return TRUE; 1232 return true;
1213} 1233}
1214 1234
1215/* used from receive_RSDataReply (recv_resync_read) 1235/* used from receive_RSDataReply (recv_resync_read)
@@ -1231,21 +1251,25 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1231 if (dgs) { 1251 if (dgs) {
1232 rr = drbd_recv(mdev, dig_in, dgs); 1252 rr = drbd_recv(mdev, dig_in, dgs);
1233 if (rr != dgs) { 1253 if (rr != dgs) {
1234 dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", 1254 if (!signal_pending(current))
1235 rr, dgs); 1255 dev_warn(DEV,
1256 "short read receiving data digest: read %d expected %d\n",
1257 rr, dgs);
1236 return NULL; 1258 return NULL;
1237 } 1259 }
1238 } 1260 }
1239 1261
1240 data_size -= dgs; 1262 data_size -= dgs;
1241 1263
1264 ERR_IF(data_size == 0) return NULL;
1242 ERR_IF(data_size & 0x1ff) return NULL; 1265 ERR_IF(data_size & 0x1ff) return NULL;
1243 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; 1266 ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
1244 1267
1245	/* even though we trust our peer,	1268	/* even though we trust our peer,
1246 * we sometimes have to double check. */ 1269 * we sometimes have to double check. */
1247 if (sector + (data_size>>9) > capacity) { 1270 if (sector + (data_size>>9) > capacity) {
1248 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n", 1271 dev_err(DEV, "request from peer beyond end of local disk: "
1272 "capacity: %llus < sector: %llus + size: %u\n",
1249 (unsigned long long)capacity, 1273 (unsigned long long)capacity,
1250 (unsigned long long)sector, data_size); 1274 (unsigned long long)sector, data_size);
1251 return NULL; 1275 return NULL;
@@ -1264,15 +1288,16 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1264 unsigned len = min_t(int, ds, PAGE_SIZE); 1288 unsigned len = min_t(int, ds, PAGE_SIZE);
1265 data = kmap(page); 1289 data = kmap(page);
1266 rr = drbd_recv(mdev, data, len); 1290 rr = drbd_recv(mdev, data, len);
1267 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) { 1291 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
1268 dev_err(DEV, "Fault injection: Corrupting data on receive\n"); 1292 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1269 data[0] = data[0] ^ (unsigned long)-1; 1293 data[0] = data[0] ^ (unsigned long)-1;
1270 } 1294 }
1271 kunmap(page); 1295 kunmap(page);
1272 if (rr != len) { 1296 if (rr != len) {
1273 drbd_free_ee(mdev, e); 1297 drbd_free_ee(mdev, e);
1274 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1298 if (!signal_pending(current))
1275 rr, len); 1299 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1300 rr, len);
1276 return NULL; 1301 return NULL;
1277 } 1302 }
1278 ds -= rr; 1303 ds -= rr;
@@ -1281,7 +1306,8 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1281 if (dgs) { 1306 if (dgs) {
1282 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv); 1307 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1283 if (memcmp(dig_in, dig_vv, dgs)) { 1308 if (memcmp(dig_in, dig_vv, dgs)) {
1284 dev_err(DEV, "Digest integrity check FAILED.\n"); 1309 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1310 (unsigned long long)sector, data_size);
1285 drbd_bcast_ee(mdev, "digest failed", 1311 drbd_bcast_ee(mdev, "digest failed",
1286 dgs, dig_in, dig_vv, e); 1312 dgs, dig_in, dig_vv, e);
1287 drbd_free_ee(mdev, e); 1313 drbd_free_ee(mdev, e);
@@ -1302,7 +1328,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1302 void *data; 1328 void *data;
1303 1329
1304 if (!data_size) 1330 if (!data_size)
1305 return TRUE; 1331 return true;
1306 1332
1307 page = drbd_pp_alloc(mdev, 1, 1); 1333 page = drbd_pp_alloc(mdev, 1, 1);
1308 1334
@@ -1311,8 +1337,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1311 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); 1337 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1312 if (rr != min_t(int, data_size, PAGE_SIZE)) { 1338 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1313 rv = 0; 1339 rv = 0;
1314 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1340 if (!signal_pending(current))
1315 rr, min_t(int, data_size, PAGE_SIZE)); 1341 dev_warn(DEV,
1342 "short read receiving data: read %d expected %d\n",
1343 rr, min_t(int, data_size, PAGE_SIZE));
1316 break; 1344 break;
1317 } 1345 }
1318 data_size -= rr; 1346 data_size -= rr;
@@ -1337,8 +1365,10 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1337 if (dgs) { 1365 if (dgs) {
1338 rr = drbd_recv(mdev, dig_in, dgs); 1366 rr = drbd_recv(mdev, dig_in, dgs);
1339 if (rr != dgs) { 1367 if (rr != dgs) {
1340 dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", 1368 if (!signal_pending(current))
1341 rr, dgs); 1369 dev_warn(DEV,
1370 "short read receiving data reply digest: read %d expected %d\n",
1371 rr, dgs);
1342 return 0; 1372 return 0;
1343 } 1373 }
1344 } 1374 }
@@ -1359,9 +1389,10 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1359 expect); 1389 expect);
1360 kunmap(bvec->bv_page); 1390 kunmap(bvec->bv_page);
1361 if (rr != expect) { 1391 if (rr != expect) {
1362 dev_warn(DEV, "short read receiving data reply: " 1392 if (!signal_pending(current))
1363 "read %d expected %d\n", 1393 dev_warn(DEV, "short read receiving data reply: "
1364 rr, expect); 1394 "read %d expected %d\n",
1395 rr, expect);
1365 return 0; 1396 return 0;
1366 } 1397 }
1367 data_size -= rr; 1398 data_size -= rr;
@@ -1425,11 +1456,10 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1425 1456
1426 atomic_add(data_size >> 9, &mdev->rs_sect_ev); 1457 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1427 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) 1458 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1428 return TRUE; 1459 return true;
1429 1460
1430 /* drbd_submit_ee currently fails for one reason only: 1461 /* don't care for the reason here */
1431 * not being able to allocate enough bios. 1462 dev_err(DEV, "submit failed, triggering re-connect\n");
1432 * Is dropping the connection going to help? */
1433 spin_lock_irq(&mdev->req_lock); 1463 spin_lock_irq(&mdev->req_lock);
1434 list_del(&e->w.list); 1464 list_del(&e->w.list);
1435 spin_unlock_irq(&mdev->req_lock); 1465 spin_unlock_irq(&mdev->req_lock);
@@ -1437,7 +1467,7 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1437 drbd_free_ee(mdev, e); 1467 drbd_free_ee(mdev, e);
1438fail: 1468fail:
1439 put_ldev(mdev); 1469 put_ldev(mdev);
1440 return FALSE; 1470 return false;
1441} 1471}
1442 1472
1443static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 1473static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -1454,7 +1484,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
1454 spin_unlock_irq(&mdev->req_lock); 1484 spin_unlock_irq(&mdev->req_lock);
1455 if (unlikely(!req)) { 1485 if (unlikely(!req)) {
1456 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); 1486 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1457 return FALSE; 1487 return false;
1458 } 1488 }
1459 1489
1460 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid 1490 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
@@ -1611,15 +1641,15 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1611 return ret; 1641 return ret;
1612} 1642}
1613 1643
1614static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf) 1644/* see also bio_flags_to_wire()
1645 * DRBD_REQ_*, because we need to semantically map the flags to data packet
1646 * flags and back. We may replicate to other kernel versions. */
1647static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
1615{ 1648{
1616 if (mdev->agreed_pro_version >= 95) 1649 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1617 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | 1650 (dpf & DP_FUA ? REQ_FUA : 0) |
1618 (dpf & DP_FUA ? REQ_FUA : 0) | 1651 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
1619 (dpf & DP_FLUSH ? REQ_FUA : 0) | 1652 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
1620 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
1621 else
1622 return dpf & DP_RW_SYNC ? REQ_SYNC : 0;
1623} 1653}
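
The comment above points at a sending-side counterpart doing the opposite translation; it presumably looks roughly like the sketch below (assumed from the DP_*/REQ_* pairs used here and from the pre-95 compatibility case that just moved out of this function; definitions come from drbd_int.h):

/* sketch of the inverse mapping, bio flags -> wire flags */
static u32 sketch_bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
	if (mdev->agreed_pro_version >= 95)
		return (bi_rw & REQ_SYNC    ? DP_RW_SYNC : 0) |
		       (bi_rw & REQ_FUA     ? DP_FUA     : 0) |
		       (bi_rw & REQ_FLUSH   ? DP_FLUSH   : 0) |
		       (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
	/* peers older than protocol 95 only understand the SYNC hint */
	return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
}
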
1624 1654
1625/* mirrored write */ 1655/* mirrored write */
@@ -1632,9 +1662,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1632 u32 dp_flags; 1662 u32 dp_flags;
1633 1663
1634 if (!get_ldev(mdev)) { 1664 if (!get_ldev(mdev)) {
1635 if (__ratelimit(&drbd_ratelimit_state))
1636 dev_err(DEV, "Can not write mirrored data block "
1637 "to local disk.\n");
1638 spin_lock(&mdev->peer_seq_lock); 1665 spin_lock(&mdev->peer_seq_lock);
1639 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) 1666 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1640 mdev->peer_seq++; 1667 mdev->peer_seq++;
@@ -1654,23 +1681,23 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1654 e = read_in_block(mdev, p->block_id, sector, data_size); 1681 e = read_in_block(mdev, p->block_id, sector, data_size);
1655 if (!e) { 1682 if (!e) {
1656 put_ldev(mdev); 1683 put_ldev(mdev);
1657 return FALSE; 1684 return false;
1658 } 1685 }
1659 1686
1660 e->w.cb = e_end_block; 1687 e->w.cb = e_end_block;
1661 1688
1689 dp_flags = be32_to_cpu(p->dp_flags);
1690 rw |= wire_flags_to_bio(mdev, dp_flags);
1691
1692 if (dp_flags & DP_MAY_SET_IN_SYNC)
1693 e->flags |= EE_MAY_SET_IN_SYNC;
1694
1662 spin_lock(&mdev->epoch_lock); 1695 spin_lock(&mdev->epoch_lock);
1663 e->epoch = mdev->current_epoch; 1696 e->epoch = mdev->current_epoch;
1664 atomic_inc(&e->epoch->epoch_size); 1697 atomic_inc(&e->epoch->epoch_size);
1665 atomic_inc(&e->epoch->active); 1698 atomic_inc(&e->epoch->active);
1666 spin_unlock(&mdev->epoch_lock); 1699 spin_unlock(&mdev->epoch_lock);
1667 1700
1668 dp_flags = be32_to_cpu(p->dp_flags);
1669 rw |= write_flags_to_bio(mdev, dp_flags);
1670
1671 if (dp_flags & DP_MAY_SET_IN_SYNC)
1672 e->flags |= EE_MAY_SET_IN_SYNC;
1673
1674 /* I'm the receiver, I do hold a net_cnt reference. */ 1701 /* I'm the receiver, I do hold a net_cnt reference. */
1675 if (!mdev->net_conf->two_primaries) { 1702 if (!mdev->net_conf->two_primaries) {
1676 spin_lock_irq(&mdev->req_lock); 1703 spin_lock_irq(&mdev->req_lock);
@@ -1773,7 +1800,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1773 put_ldev(mdev); 1800 put_ldev(mdev);
1774 wake_asender(mdev); 1801 wake_asender(mdev);
1775 finish_wait(&mdev->misc_wait, &wait); 1802 finish_wait(&mdev->misc_wait, &wait);
1776 return TRUE; 1803 return true;
1777 } 1804 }
1778 1805
1779 if (signal_pending(current)) { 1806 if (signal_pending(current)) {
@@ -1829,11 +1856,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1829 } 1856 }
1830 1857
1831 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0) 1858 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
1832 return TRUE; 1859 return true;
1833 1860
1834 /* drbd_submit_ee currently fails for one reason only: 1861 /* don't care for the reason here */
1835 * not being able to allocate enough bios. 1862 dev_err(DEV, "submit failed, triggering re-connect\n");
1836 * Is dropping the connection going to help? */
1837 spin_lock_irq(&mdev->req_lock); 1863 spin_lock_irq(&mdev->req_lock);
1838 list_del(&e->w.list); 1864 list_del(&e->w.list);
1839 hlist_del_init(&e->colision); 1865 hlist_del_init(&e->colision);
@@ -1842,12 +1868,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1842 drbd_al_complete_io(mdev, e->sector); 1868 drbd_al_complete_io(mdev, e->sector);
1843 1869
1844out_interrupted: 1870out_interrupted:
1845 /* yes, the epoch_size now is imbalanced. 1871 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
1846 * but we drop the connection anyways, so we don't have a chance to
1847 * receive a barrier... atomic_inc(&mdev->epoch_size); */
1848 put_ldev(mdev); 1872 put_ldev(mdev);
1849 drbd_free_ee(mdev, e); 1873 drbd_free_ee(mdev, e);
1850 return FALSE; 1874 return false;
1851} 1875}
1852 1876
1853/* We may throttle resync, if the lower device seems to be busy, 1877/* We may throttle resync, if the lower device seems to be busy,
@@ -1861,10 +1885,11 @@ out_interrupted:
1861 * The current sync rate used here uses only the most recent two step marks, 1885 * The current sync rate used here uses only the most recent two step marks,
1862 * to have a short time average so we can react faster. 1886 * to have a short time average so we can react faster.
1863 */ 1887 */
1864int drbd_rs_should_slow_down(struct drbd_conf *mdev) 1888int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
1865{ 1889{
1866 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk; 1890 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
1867 unsigned long db, dt, dbdt; 1891 unsigned long db, dt, dbdt;
1892 struct lc_element *tmp;
1868 int curr_events; 1893 int curr_events;
1869 int throttle = 0; 1894 int throttle = 0;
1870 1895
@@ -1872,9 +1897,22 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev)
1872 if (mdev->sync_conf.c_min_rate == 0) 1897 if (mdev->sync_conf.c_min_rate == 0)
1873 return 0; 1898 return 0;
1874 1899
1900 spin_lock_irq(&mdev->al_lock);
1901 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
1902 if (tmp) {
1903 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
1904 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
1905 spin_unlock_irq(&mdev->al_lock);
1906 return 0;
1907 }
1908 /* Do not slow down if app IO is already waiting for this extent */
1909 }
1910 spin_unlock_irq(&mdev->al_lock);
1911
1875 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 1912 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
1876 (int)part_stat_read(&disk->part0, sectors[1]) - 1913 (int)part_stat_read(&disk->part0, sectors[1]) -
1877 atomic_read(&mdev->rs_sect_ev); 1914 atomic_read(&mdev->rs_sect_ev);
1915
1878 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) { 1916 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
1879 unsigned long rs_left; 1917 unsigned long rs_left;
1880 int i; 1918 int i;
@@ -1883,8 +1921,12 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev)
1883 1921
1884 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, 1922 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
1885 * approx. */ 1923 * approx. */
1886 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS; 1924 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
1887 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; 1925
1926 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
1927 rs_left = mdev->ov_left;
1928 else
1929 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
1888 1930
1889 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ; 1931 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
1890 if (!dt) 1932 if (!dt)
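
Moving the index from DRBD_SYNC_MARKS-2 to DRBD_SYNC_MARKS-1 bases the short average on the most recently completed mark, shrinking the window by one mark step, which is the "~3s average" label added in drbd_proc.c above (assuming the usual DRBD_SYNC_MARK_STEP of 3*HZ from drbd_int.h). The tail of the function, outside this hunk, presumably still ends in the plain comparison against the configured floor, along the lines of:

	if (dbdt > mdev->sync_conf.c_min_rate)
		throttle = 1;
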
@@ -1912,15 +1954,15 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
1912 sector = be64_to_cpu(p->sector); 1954 sector = be64_to_cpu(p->sector);
1913 size = be32_to_cpu(p->blksize); 1955 size = be32_to_cpu(p->blksize);
1914 1956
1915 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1957 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
1916 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1958 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1917 (unsigned long long)sector, size); 1959 (unsigned long long)sector, size);
1918 return FALSE; 1960 return false;
1919 } 1961 }
1920 if (sector + (size>>9) > capacity) { 1962 if (sector + (size>>9) > capacity) {
1921 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, 1963 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1922 (unsigned long long)sector, size); 1964 (unsigned long long)sector, size);
1923 return FALSE; 1965 return false;
1924 } 1966 }
1925 1967
1926 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { 1968 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
@@ -1957,7 +1999,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
1957 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); 1999 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1958 if (!e) { 2000 if (!e) {
1959 put_ldev(mdev); 2001 put_ldev(mdev);
1960 return FALSE; 2002 return false;
1961 } 2003 }
1962 2004
1963 switch (cmd) { 2005 switch (cmd) {
@@ -1970,6 +2012,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
1970 case P_RS_DATA_REQUEST: 2012 case P_RS_DATA_REQUEST:
1971 e->w.cb = w_e_end_rsdata_req; 2013 e->w.cb = w_e_end_rsdata_req;
1972 fault_type = DRBD_FAULT_RS_RD; 2014 fault_type = DRBD_FAULT_RS_RD;
2015 /* used in the sector offset progress display */
2016 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
1973 break; 2017 break;
1974 2018
1975 case P_OV_REPLY: 2019 case P_OV_REPLY:
@@ -1991,7 +2035,11 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
1991 if (cmd == P_CSUM_RS_REQUEST) { 2035 if (cmd == P_CSUM_RS_REQUEST) {
1992 D_ASSERT(mdev->agreed_pro_version >= 89); 2036 D_ASSERT(mdev->agreed_pro_version >= 89);
1993 e->w.cb = w_e_end_csum_rs_req; 2037 e->w.cb = w_e_end_csum_rs_req;
2038 /* used in the sector offset progress display */
2039 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
1994 } else if (cmd == P_OV_REPLY) { 2040 } else if (cmd == P_OV_REPLY) {
2041 /* track progress, we may need to throttle */
2042 atomic_add(size >> 9, &mdev->rs_sect_in);
1995 e->w.cb = w_e_end_ov_reply; 2043 e->w.cb = w_e_end_ov_reply;
1996 dec_rs_pending(mdev); 2044 dec_rs_pending(mdev);
1997 /* drbd_rs_begin_io done when we sent this request, 2045 /* drbd_rs_begin_io done when we sent this request,
@@ -2003,9 +2051,16 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
2003 case P_OV_REQUEST: 2051 case P_OV_REQUEST:
2004 if (mdev->ov_start_sector == ~(sector_t)0 && 2052 if (mdev->ov_start_sector == ~(sector_t)0 &&
2005 mdev->agreed_pro_version >= 90) { 2053 mdev->agreed_pro_version >= 90) {
2054 unsigned long now = jiffies;
2055 int i;
2006 mdev->ov_start_sector = sector; 2056 mdev->ov_start_sector = sector;
2007 mdev->ov_position = sector; 2057 mdev->ov_position = sector;
2008 mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); 2058 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2059 mdev->rs_total = mdev->ov_left;
2060 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2061 mdev->rs_mark_left[i] = mdev->ov_left;
2062 mdev->rs_mark_time[i] = now;
2063 }
2009 dev_info(DEV, "Online Verify start sector: %llu\n", 2064 dev_info(DEV, "Online Verify start sector: %llu\n",
2010 (unsigned long long)sector); 2065 (unsigned long long)sector);
2011 } 2066 }
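
Worked example of the initialization above, again assuming 4 KiB bitmap granularity (so BM_SECT_TO_BIT(s) == s >> 3): an online verify starting at sector 2,000,000 on a device with drbd_bm_bits() == 268,435,456 sets ov_left = 268,435,456 - 250,000 = 268,185,456 and resets rs_total to the same value, so the progress display starts near 0%. Seeding every rs_mark_left[]/rs_mark_time[] slot with ov_left and the current jiffies keeps the first speed samples from seeing a huge phantom backlog.
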
@@ -2042,9 +2097,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
2042 * we would also throttle its application reads. 2097 * we would also throttle its application reads.
2043 * In that case, throttling is done on the SyncTarget only. 2098 * In that case, throttling is done on the SyncTarget only.
2044 */ 2099 */
2045 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev)) 2100 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2046 msleep(100); 2101 schedule_timeout_uninterruptible(HZ/10);
2047 if (drbd_rs_begin_io(mdev, e->sector)) 2102 if (drbd_rs_begin_io(mdev, sector))
2048 goto out_free_e; 2103 goto out_free_e;
2049 2104
2050submit_for_resync: 2105submit_for_resync:
@@ -2057,11 +2112,10 @@ submit:
2057 spin_unlock_irq(&mdev->req_lock); 2112 spin_unlock_irq(&mdev->req_lock);
2058 2113
2059 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0) 2114 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2060 return TRUE; 2115 return true;
2061 2116
2062 /* drbd_submit_ee currently fails for one reason only: 2117 /* don't care for the reason here */
2063 * not being able to allocate enough bios. 2118 dev_err(DEV, "submit failed, triggering re-connect\n");
2064 * Is dropping the connection going to help? */
2065 spin_lock_irq(&mdev->req_lock); 2119 spin_lock_irq(&mdev->req_lock);
2066 list_del(&e->w.list); 2120 list_del(&e->w.list);
2067 spin_unlock_irq(&mdev->req_lock); 2121 spin_unlock_irq(&mdev->req_lock);
@@ -2070,7 +2124,7 @@ submit:
2070out_free_e: 2124out_free_e:
2071 put_ldev(mdev); 2125 put_ldev(mdev);
2072 drbd_free_ee(mdev, e); 2126 drbd_free_ee(mdev, e);
2073 return FALSE; 2127 return false;
2074} 2128}
2075 2129
2076static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) 2130static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
@@ -2147,10 +2201,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2147 2201
2148static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) 2202static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2149{ 2203{
2150 int self, peer, hg, rv = -100; 2204 int hg, rv = -100;
2151
2152 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2153 peer = mdev->p_uuid[UI_BITMAP] & 1;
2154 2205
2155 switch (mdev->net_conf->after_sb_1p) { 2206 switch (mdev->net_conf->after_sb_1p) {
2156 case ASB_DISCARD_YOUNGER_PRI: 2207 case ASB_DISCARD_YOUNGER_PRI:
@@ -2177,12 +2228,14 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2177 case ASB_CALL_HELPER: 2228 case ASB_CALL_HELPER:
2178 hg = drbd_asb_recover_0p(mdev); 2229 hg = drbd_asb_recover_0p(mdev);
2179 if (hg == -1 && mdev->state.role == R_PRIMARY) { 2230 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2180 self = drbd_set_role(mdev, R_SECONDARY, 0); 2231 enum drbd_state_rv rv2;
2232
2233 drbd_set_role(mdev, R_SECONDARY, 0);
2181 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2234 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2182 * we might be here in C_WF_REPORT_PARAMS which is transient. 2235 * we might be here in C_WF_REPORT_PARAMS which is transient.
2183 * we do not need to wait for the after state change work either. */ 2236 * we do not need to wait for the after state change work either. */
2184 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); 2237 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2185 if (self != SS_SUCCESS) { 2238 if (rv2 != SS_SUCCESS) {
2186 drbd_khelper(mdev, "pri-lost-after-sb"); 2239 drbd_khelper(mdev, "pri-lost-after-sb");
2187 } else { 2240 } else {
2188 dev_warn(DEV, "Successfully gave up primary role.\n"); 2241 dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2197,10 +2250,7 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2197 2250
2198static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) 2251static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2199{ 2252{
2200 int self, peer, hg, rv = -100; 2253 int hg, rv = -100;
2201
2202 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2203 peer = mdev->p_uuid[UI_BITMAP] & 1;
2204 2254
2205 switch (mdev->net_conf->after_sb_2p) { 2255 switch (mdev->net_conf->after_sb_2p) {
2206 case ASB_DISCARD_YOUNGER_PRI: 2256 case ASB_DISCARD_YOUNGER_PRI:
@@ -2220,11 +2270,13 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2220 case ASB_CALL_HELPER: 2270 case ASB_CALL_HELPER:
2221 hg = drbd_asb_recover_0p(mdev); 2271 hg = drbd_asb_recover_0p(mdev);
2222 if (hg == -1) { 2272 if (hg == -1) {
2273 enum drbd_state_rv rv2;
2274
2223 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2275 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2224 * we might be here in C_WF_REPORT_PARAMS which is transient. 2276 * we might be here in C_WF_REPORT_PARAMS which is transient.
2225 * we do not need to wait for the after state change work either. */ 2277 * we do not need to wait for the after state change work either. */
2226 self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); 2278 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2227 if (self != SS_SUCCESS) { 2279 if (rv2 != SS_SUCCESS) {
2228 drbd_khelper(mdev, "pri-lost-after-sb"); 2280 drbd_khelper(mdev, "pri-lost-after-sb");
2229 } else { 2281 } else {
2230 dev_warn(DEV, "Successfully gave up primary role.\n"); 2282 dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2263,6 +2315,8 @@ static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2263 -2 C_SYNC_TARGET set BitMap 2315 -2 C_SYNC_TARGET set BitMap
2264 -100 after split brain, disconnect 2316 -100 after split brain, disconnect
2265-1000 unrelated data 2317-1000 unrelated data
2318-1091 requires proto 91
2319-1096 requires proto 96
2266 */ 2320 */
2267static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) 2321static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2268{ 2322{
@@ -2292,7 +2346,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2292 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { 2346 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2293 2347
2294 if (mdev->agreed_pro_version < 91) 2348 if (mdev->agreed_pro_version < 91)
2295 return -1001; 2349 return -1091;
2296 2350
2297 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && 2351 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2298 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { 2352 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
@@ -2313,7 +2367,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2313 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { 2367 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2314 2368
2315 if (mdev->agreed_pro_version < 91) 2369 if (mdev->agreed_pro_version < 91)
2316 return -1001; 2370 return -1091;
2317 2371
2318 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && 2372 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2319 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { 2373 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
@@ -2358,17 +2412,22 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2358 *rule_nr = 51; 2412 *rule_nr = 51;
2359 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); 2413 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2360 if (self == peer) { 2414 if (self == peer) {
2361 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 2415 if (mdev->agreed_pro_version < 96 ?
2362 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); 2416 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2363 if (self == peer) { 2417 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2418 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
2364 /* The last P_SYNC_UUID did not get through. Undo the last start of 2419 /* The last P_SYNC_UUID did not get through. Undo the last start of
2365 resync as sync source modifications of the peer's UUIDs. */ 2420 resync as sync source modifications of the peer's UUIDs. */
2366 2421
2367 if (mdev->agreed_pro_version < 91) 2422 if (mdev->agreed_pro_version < 91)
2368 return -1001; 2423 return -1091;
2369 2424
2370 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; 2425 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2371 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; 2426 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2427
2428 dev_info(DEV, "Did not get last syncUUID packet, corrected:\n");
2429 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2430
2372 return -1; 2431 return -1;
2373 } 2432 }
2374 } 2433 }
@@ -2390,20 +2449,20 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2390 *rule_nr = 71; 2449 *rule_nr = 71;
2391 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 2450 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2392 if (self == peer) { 2451 if (self == peer) {
2393 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); 2452 if (mdev->agreed_pro_version < 96 ?
2394 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); 2453 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2395 if (self == peer) { 2454 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2455 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
2396 /* The last P_SYNC_UUID did not get through. Undo the last start of 2456 /* The last P_SYNC_UUID did not get through. Undo the last start of
2397 resync as sync source modifications of our UUIDs. */ 2457 resync as sync source modifications of our UUIDs. */
2398 2458
2399 if (mdev->agreed_pro_version < 91) 2459 if (mdev->agreed_pro_version < 91)
2400 return -1001; 2460 return -1091;
2401 2461
2402 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); 2462 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2403 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); 2463 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2404 2464
2405 dev_info(DEV, "Undid last start of resync:\n"); 2465 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
2406
2407 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, 2466 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2408 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); 2467 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2409 2468
@@ -2466,8 +2525,8 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2466 dev_alert(DEV, "Unrelated data, aborting!\n"); 2525 dev_alert(DEV, "Unrelated data, aborting!\n");
2467 return C_MASK; 2526 return C_MASK;
2468 } 2527 }
2469 if (hg == -1001) { 2528 if (hg < -1000) {
2470 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); 2529 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
2471 return C_MASK; 2530 return C_MASK;
2472 } 2531 }
2473 2532
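
Worked example of the new encoding: rule 51 hit while the agreed protocol is older than 91 returns -1091, so the alert above prints -hg - 1000 = -(-1091) - 1000 = 91 as the minimum protocol version both sides would need.
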
@@ -2566,7 +2625,8 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2566 2625
2567 if (abs(hg) >= 2) { 2626 if (abs(hg) >= 2) {
2568 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); 2627 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2569 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) 2628 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
2629 BM_LOCKED_SET_ALLOWED))
2570 return C_MASK; 2630 return C_MASK;
2571 } 2631 }
2572 2632
@@ -2660,7 +2720,7 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig
2660 unsigned char *my_alg = mdev->net_conf->integrity_alg; 2720 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2661 2721
2662 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) 2722 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2663 return FALSE; 2723 return false;
2664 2724
2665 p_integrity_alg[SHARED_SECRET_MAX-1] = 0; 2725 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2666 if (strcmp(p_integrity_alg, my_alg)) { 2726 if (strcmp(p_integrity_alg, my_alg)) {
@@ -2671,11 +2731,11 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig
2671 my_alg[0] ? my_alg : (unsigned char *)"<not-used>"); 2731 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2672 } 2732 }
2673 2733
2674 return TRUE; 2734 return true;
2675 2735
2676disconnect: 2736disconnect:
2677 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2737 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2678 return FALSE; 2738 return false;
2679} 2739}
2680 2740
2681/* helper function 2741/* helper function
@@ -2707,7 +2767,7 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2707 2767
2708static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size) 2768static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
2709{ 2769{
2710 int ok = TRUE; 2770 int ok = true;
2711 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95; 2771 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
2712 unsigned int header_size, data_size, exp_max_sz; 2772 unsigned int header_size, data_size, exp_max_sz;
2713 struct crypto_hash *verify_tfm = NULL; 2773 struct crypto_hash *verify_tfm = NULL;
@@ -2725,7 +2785,7 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
2725 if (packet_size > exp_max_sz) { 2785 if (packet_size > exp_max_sz) {
2726 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", 2786 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2727 packet_size, exp_max_sz); 2787 packet_size, exp_max_sz);
2728 return FALSE; 2788 return false;
2729 } 2789 }
2730 2790
2731 if (apv <= 88) { 2791 if (apv <= 88) {
@@ -2745,7 +2805,7 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
2745 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); 2805 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2746 2806
2747 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size) 2807 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
2748 return FALSE; 2808 return false;
2749 2809
2750 mdev->sync_conf.rate = be32_to_cpu(p->rate); 2810 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2751 2811
@@ -2755,11 +2815,11 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
2755 dev_err(DEV, "verify-alg too long, " 2815 dev_err(DEV, "verify-alg too long, "
2756 "peer wants %u, accepting only %u byte\n", 2816 "peer wants %u, accepting only %u byte\n",
2757 data_size, SHARED_SECRET_MAX); 2817 data_size, SHARED_SECRET_MAX);
2758 return FALSE; 2818 return false;
2759 } 2819 }
2760 2820
2761 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) 2821 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2762 return FALSE; 2822 return false;
2763 2823
2764 /* we expect NUL terminated string */ 2824 /* we expect NUL terminated string */
2765 /* but just in case someone tries to be evil */ 2825 /* but just in case someone tries to be evil */
@@ -2853,7 +2913,7 @@ disconnect:
2853 /* but free the verify_tfm again, if csums_tfm did not work out */ 2913 /* but free the verify_tfm again, if csums_tfm did not work out */
2854 crypto_free_hash(verify_tfm); 2914 crypto_free_hash(verify_tfm);
2855 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2915 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2856 return FALSE; 2916 return false;
2857} 2917}
2858 2918
2859static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) 2919static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
@@ -2879,7 +2939,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2879{ 2939{
2880 struct p_sizes *p = &mdev->data.rbuf.sizes; 2940 struct p_sizes *p = &mdev->data.rbuf.sizes;
2881 enum determine_dev_size dd = unchanged; 2941 enum determine_dev_size dd = unchanged;
2882 unsigned int max_seg_s; 2942 unsigned int max_bio_size;
2883 sector_t p_size, p_usize, my_usize; 2943 sector_t p_size, p_usize, my_usize;
2884 int ldsc = 0; /* local disk size changed */ 2944 int ldsc = 0; /* local disk size changed */
2885 enum dds_flags ddsf; 2945 enum dds_flags ddsf;
@@ -2890,7 +2950,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2890 if (p_size == 0 && mdev->state.disk == D_DISKLESS) { 2950 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2891 dev_err(DEV, "some backing storage is needed\n"); 2951 dev_err(DEV, "some backing storage is needed\n");
2892 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2952 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2893 return FALSE; 2953 return false;
2894 } 2954 }
2895 2955
2896 /* just store the peer's disk size for now. 2956 /* just store the peer's disk size for now.
@@ -2927,18 +2987,17 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2927 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 2987 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2928 mdev->ldev->dc.disk_size = my_usize; 2988 mdev->ldev->dc.disk_size = my_usize;
2929 put_ldev(mdev); 2989 put_ldev(mdev);
2930 return FALSE; 2990 return false;
2931 } 2991 }
2932 put_ldev(mdev); 2992 put_ldev(mdev);
2933 } 2993 }
2934#undef min_not_zero
2935 2994
2936 ddsf = be16_to_cpu(p->dds_flags); 2995 ddsf = be16_to_cpu(p->dds_flags);
2937 if (get_ldev(mdev)) { 2996 if (get_ldev(mdev)) {
2938 dd = drbd_determin_dev_size(mdev, ddsf); 2997 dd = drbd_determin_dev_size(mdev, ddsf);
2939 put_ldev(mdev); 2998 put_ldev(mdev);
2940 if (dd == dev_size_error) 2999 if (dd == dev_size_error)
2941 return FALSE; 3000 return false;
2942 drbd_md_sync(mdev); 3001 drbd_md_sync(mdev);
2943 } else { 3002 } else {
2944 /* I am diskless, need to accept the peer's size. */ 3003 /* I am diskless, need to accept the peer's size. */
@@ -2952,14 +3011,14 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2952 } 3011 }
2953 3012
2954 if (mdev->agreed_pro_version < 94) 3013 if (mdev->agreed_pro_version < 94)
2955 max_seg_s = be32_to_cpu(p->max_segment_size); 3014 max_bio_size = be32_to_cpu(p->max_bio_size);
2956 else if (mdev->agreed_pro_version == 94) 3015 else if (mdev->agreed_pro_version == 94)
2957 max_seg_s = DRBD_MAX_SIZE_H80_PACKET; 3016 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
2958 else /* drbd 8.3.8 onwards */ 3017 else /* drbd 8.3.8 onwards */
2959 max_seg_s = DRBD_MAX_SEGMENT_SIZE; 3018 max_bio_size = DRBD_MAX_BIO_SIZE;
2960 3019
2961 if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) 3020 if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
2962 drbd_setup_queue_param(mdev, max_seg_s); 3021 drbd_setup_queue_param(mdev, max_bio_size);
2963 3022
2964 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); 3023 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
2965 put_ldev(mdev); 3024 put_ldev(mdev);
@@ -2985,14 +3044,14 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2985 } 3044 }
2986 } 3045 }
2987 3046
2988 return TRUE; 3047 return true;
2989} 3048}
2990 3049
2991static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 3050static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2992{ 3051{
2993 struct p_uuids *p = &mdev->data.rbuf.uuids; 3052 struct p_uuids *p = &mdev->data.rbuf.uuids;
2994 u64 *p_uuid; 3053 u64 *p_uuid;
2995 int i; 3054 int i, updated_uuids = 0;
2996 3055
2997 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); 3056 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
2998 3057
@@ -3009,7 +3068,7 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3009 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", 3068 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3010 (unsigned long long)mdev->ed_uuid); 3069 (unsigned long long)mdev->ed_uuid);
3011 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3070 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3012 return FALSE; 3071 return false;
3013 } 3072 }
3014 3073
3015 if (get_ldev(mdev)) { 3074 if (get_ldev(mdev)) {
@@ -3021,19 +3080,21 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3021 if (skip_initial_sync) { 3080 if (skip_initial_sync) {
3022 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); 3081 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3023 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, 3082 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3024 "clear_n_write from receive_uuids"); 3083 "clear_n_write from receive_uuids",
3084 BM_LOCKED_TEST_ALLOWED);
3025 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); 3085 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3026 _drbd_uuid_set(mdev, UI_BITMAP, 0); 3086 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3027 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), 3087 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3028 CS_VERBOSE, NULL); 3088 CS_VERBOSE, NULL);
3029 drbd_md_sync(mdev); 3089 drbd_md_sync(mdev);
3090 updated_uuids = 1;
3030 } 3091 }
3031 put_ldev(mdev); 3092 put_ldev(mdev);
3032 } else if (mdev->state.disk < D_INCONSISTENT && 3093 } else if (mdev->state.disk < D_INCONSISTENT &&
3033 mdev->state.role == R_PRIMARY) { 3094 mdev->state.role == R_PRIMARY) {
3034 /* I am a diskless primary, the peer just created a new current UUID 3095 /* I am a diskless primary, the peer just created a new current UUID
3035 for me. */ 3096 for me. */
3036 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); 3097 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3037 } 3098 }
3038 3099
3039 /* Before we test for the disk state, we should wait until an eventually 3100 /* Before we test for the disk state, we should wait until an eventually
@@ -3042,9 +3103,12 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3042 new disk state... */ 3103 new disk state... */
3043 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); 3104 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3044 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) 3105 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3045 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); 3106 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3046 3107
3047 return TRUE; 3108 if (updated_uuids)
3109 drbd_print_uuids(mdev, "receiver updated UUIDs to");
3110
3111 return true;
3048} 3112}
3049 3113
3050/** 3114/**
@@ -3081,7 +3145,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
3081{ 3145{
3082 struct p_req_state *p = &mdev->data.rbuf.req_state; 3146 struct p_req_state *p = &mdev->data.rbuf.req_state;
3083 union drbd_state mask, val; 3147 union drbd_state mask, val;
3084 int rv; 3148 enum drbd_state_rv rv;
3085 3149
3086 mask.i = be32_to_cpu(p->mask); 3150 mask.i = be32_to_cpu(p->mask);
3087 val.i = be32_to_cpu(p->val); 3151 val.i = be32_to_cpu(p->val);
@@ -3089,7 +3153,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
3089 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && 3153 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3090 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { 3154 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3091 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); 3155 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3092 return TRUE; 3156 return true;
3093 } 3157 }
3094 3158
3095 mask = convert_state(mask); 3159 mask = convert_state(mask);
@@ -3100,7 +3164,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
3100 drbd_send_sr_reply(mdev, rv); 3164 drbd_send_sr_reply(mdev, rv);
3101 drbd_md_sync(mdev); 3165 drbd_md_sync(mdev);
3102 3166
3103 return TRUE; 3167 return true;
3104} 3168}
3105 3169
3106static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 3170static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -3145,7 +3209,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3145 peer_state.conn == C_CONNECTED) { 3209 peer_state.conn == C_CONNECTED) {
3146 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) 3210 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3147 drbd_resync_finished(mdev); 3211 drbd_resync_finished(mdev);
3148 return TRUE; 3212 return true;
3149 } 3213 }
3150 } 3214 }
3151 3215
@@ -3161,6 +3225,9 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3161 if (ns.conn == C_WF_REPORT_PARAMS) 3225 if (ns.conn == C_WF_REPORT_PARAMS)
3162 ns.conn = C_CONNECTED; 3226 ns.conn = C_CONNECTED;
3163 3227
3228 if (peer_state.conn == C_AHEAD)
3229 ns.conn = C_BEHIND;
3230
3164 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && 3231 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3165 get_ldev_if_state(mdev, D_NEGOTIATING)) { 3232 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3166 int cr; /* consider resync */ 3233 int cr; /* consider resync */
@@ -3195,10 +3262,10 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3195 real_peer_disk = D_DISKLESS; 3262 real_peer_disk = D_DISKLESS;
3196 } else { 3263 } else {
3197 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags)) 3264 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3198 return FALSE; 3265 return false;
3199 D_ASSERT(os.conn == C_WF_REPORT_PARAMS); 3266 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
3200 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3267 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3201 return FALSE; 3268 return false;
3202 } 3269 }
3203 } 3270 }
3204 } 3271 }
@@ -3223,7 +3290,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3223 drbd_uuid_new_current(mdev); 3290 drbd_uuid_new_current(mdev);
3224 clear_bit(NEW_CUR_UUID, &mdev->flags); 3291 clear_bit(NEW_CUR_UUID, &mdev->flags);
3225 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0)); 3292 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3226 return FALSE; 3293 return false;
3227 } 3294 }
3228 rv = _drbd_set_state(mdev, ns, cs_flags, NULL); 3295 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
3229 ns = mdev->state; 3296 ns = mdev->state;
@@ -3231,7 +3298,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3231 3298
3232 if (rv < SS_SUCCESS) { 3299 if (rv < SS_SUCCESS) {
3233 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 3300 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3234 return FALSE; 3301 return false;
3235 } 3302 }
3236 3303
3237 if (os.conn > C_WF_REPORT_PARAMS) { 3304 if (os.conn > C_WF_REPORT_PARAMS) {
@@ -3249,7 +3316,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3249 3316
3250 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ 3317 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3251 3318
3252 return TRUE; 3319 return true;
3253} 3320}
3254 3321
3255static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 3322static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -3258,6 +3325,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
3258 3325
3259 wait_event(mdev->misc_wait, 3326 wait_event(mdev->misc_wait,
3260 mdev->state.conn == C_WF_SYNC_UUID || 3327 mdev->state.conn == C_WF_SYNC_UUID ||
3328 mdev->state.conn == C_BEHIND ||
3261 mdev->state.conn < C_CONNECTED || 3329 mdev->state.conn < C_CONNECTED ||
3262 mdev->state.disk < D_NEGOTIATING); 3330 mdev->state.disk < D_NEGOTIATING);
3263 3331
@@ -3269,32 +3337,42 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
3269 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); 3337 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3270 _drbd_uuid_set(mdev, UI_BITMAP, 0UL); 3338 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3271 3339
3340 drbd_print_uuids(mdev, "updated sync uuid");
3272 drbd_start_resync(mdev, C_SYNC_TARGET); 3341 drbd_start_resync(mdev, C_SYNC_TARGET);
3273 3342
3274 put_ldev(mdev); 3343 put_ldev(mdev);
3275 } else 3344 } else
3276 dev_err(DEV, "Ignoring SyncUUID packet!\n"); 3345 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3277 3346
3278 return TRUE; 3347 return true;
3279} 3348}
3280 3349
3281enum receive_bitmap_ret { OK, DONE, FAILED }; 3350/**
3282 3351 * receive_bitmap_plain
3283static enum receive_bitmap_ret 3352 *
3353 * Return 0 when done, 1 when another iteration is needed, and a negative error
3354 * code upon failure.
3355 */
3356static int
3284receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size, 3357receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3285 unsigned long *buffer, struct bm_xfer_ctx *c) 3358 unsigned long *buffer, struct bm_xfer_ctx *c)
3286{ 3359{
3287 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); 3360 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3288 unsigned want = num_words * sizeof(long); 3361 unsigned want = num_words * sizeof(long);
3362 int err;
3289 3363
3290 if (want != data_size) { 3364 if (want != data_size) {
3291 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size); 3365 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
3292 return FAILED; 3366 return -EIO;
3293 } 3367 }
3294 if (want == 0) 3368 if (want == 0)
3295 return DONE; 3369 return 0;
3296 if (drbd_recv(mdev, buffer, want) != want) 3370 err = drbd_recv(mdev, buffer, want);
3297 return FAILED; 3371 if (err != want) {
3372 if (err >= 0)
3373 err = -EIO;
3374 return err;
3375 }
3298 3376
3299 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); 3377 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3300 3378
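
The hunk above drops the old FAILED/DONE/OK enum and instead maps short or failed drbd_recv() reads onto a negative errno (-EIO). A minimal userspace analogue of that short-read handling, using POSIX recv() rather than the driver's drbd_recv() (the helper name and loop are illustrative, not from the driver):

/* Sketch: read exactly "want" bytes from a socket, turning short reads
 * and errors into -EIO, in the spirit of the patched receive_bitmap_plain().
 * Userspace POSIX sockets, not kernel code. */
#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

int recv_exact(int fd, void *buf, size_t want)
{
        char *p = buf;
        size_t got = 0;

        while (got < want) {
                ssize_t r = recv(fd, p + got, want - got, 0);
                if (r == 0)
                        return -EIO;    /* peer closed: short read */
                if (r < 0) {
                        if (errno == EINTR)
                                continue;
                        return -EIO;    /* any other error becomes -EIO */
                }
                got += (size_t)r;
        }
        return 0;                       /* success: exactly "want" bytes */
}
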
@@ -3303,10 +3381,16 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3303 if (c->bit_offset > c->bm_bits) 3381 if (c->bit_offset > c->bm_bits)
3304 c->bit_offset = c->bm_bits; 3382 c->bit_offset = c->bm_bits;
3305 3383
3306 return OK; 3384 return 1;
3307} 3385}
3308 3386
3309static enum receive_bitmap_ret 3387/**
3388 * recv_bm_rle_bits
3389 *
3390 * Return 0 when done, 1 when another iteration is needed, and a negative error
3391 * code upon failure.
3392 */
3393static int
3310recv_bm_rle_bits(struct drbd_conf *mdev, 3394recv_bm_rle_bits(struct drbd_conf *mdev,
3311 struct p_compressed_bm *p, 3395 struct p_compressed_bm *p,
3312 struct bm_xfer_ctx *c) 3396 struct bm_xfer_ctx *c)
@@ -3326,18 +3410,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3326 3410
3327 bits = bitstream_get_bits(&bs, &look_ahead, 64); 3411 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3328 if (bits < 0) 3412 if (bits < 0)
3329 return FAILED; 3413 return -EIO;
3330 3414
3331 for (have = bits; have > 0; s += rl, toggle = !toggle) { 3415 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3332 bits = vli_decode_bits(&rl, look_ahead); 3416 bits = vli_decode_bits(&rl, look_ahead);
3333 if (bits <= 0) 3417 if (bits <= 0)
3334 return FAILED; 3418 return -EIO;
3335 3419
3336 if (toggle) { 3420 if (toggle) {
3337 e = s + rl -1; 3421 e = s + rl -1;
3338 if (e >= c->bm_bits) { 3422 if (e >= c->bm_bits) {
3339 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); 3423 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3340 return FAILED; 3424 return -EIO;
3341 } 3425 }
3342 _drbd_bm_set_bits(mdev, s, e); 3426 _drbd_bm_set_bits(mdev, s, e);
3343 } 3427 }
@@ -3347,14 +3431,14 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3347 have, bits, look_ahead, 3431 have, bits, look_ahead,
3348 (unsigned int)(bs.cur.b - p->code), 3432 (unsigned int)(bs.cur.b - p->code),
3349 (unsigned int)bs.buf_len); 3433 (unsigned int)bs.buf_len);
3350 return FAILED; 3434 return -EIO;
3351 } 3435 }
3352 look_ahead >>= bits; 3436 look_ahead >>= bits;
3353 have -= bits; 3437 have -= bits;
3354 3438
3355 bits = bitstream_get_bits(&bs, &tmp, 64 - have); 3439 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3356 if (bits < 0) 3440 if (bits < 0)
3357 return FAILED; 3441 return -EIO;
3358 look_ahead |= tmp << have; 3442 look_ahead |= tmp << have;
3359 have += bits; 3443 have += bits;
3360 } 3444 }
@@ -3362,10 +3446,16 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
3362 c->bit_offset = s; 3446 c->bit_offset = s;
3363 bm_xfer_ctx_bit_to_word_offset(c); 3447 bm_xfer_ctx_bit_to_word_offset(c);
3364 3448
3365 return (s == c->bm_bits) ? DONE : OK; 3449 return (s != c->bm_bits);
3366} 3450}
3367 3451
3368static enum receive_bitmap_ret 3452/**
3453 * decode_bitmap_c
3454 *
3455 * Return 0 when done, 1 when another iteration is needed, and a negative error
3456 * code upon failure.
3457 */
3458static int
3369decode_bitmap_c(struct drbd_conf *mdev, 3459decode_bitmap_c(struct drbd_conf *mdev,
3370 struct p_compressed_bm *p, 3460 struct p_compressed_bm *p,
3371 struct bm_xfer_ctx *c) 3461 struct bm_xfer_ctx *c)
@@ -3379,7 +3469,7 @@ decode_bitmap_c(struct drbd_conf *mdev,
3379 3469
3380 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); 3470 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3381 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); 3471 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3382 return FAILED; 3472 return -EIO;
3383} 3473}
3384 3474
3385void INFO_bm_xfer_stats(struct drbd_conf *mdev, 3475void INFO_bm_xfer_stats(struct drbd_conf *mdev,
@@ -3428,13 +3518,13 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
3428{ 3518{
3429 struct bm_xfer_ctx c; 3519 struct bm_xfer_ctx c;
3430 void *buffer; 3520 void *buffer;
3431 enum receive_bitmap_ret ret; 3521 int err;
3432 int ok = FALSE; 3522 int ok = false;
3433 struct p_header80 *h = &mdev->data.rbuf.header.h80; 3523 struct p_header80 *h = &mdev->data.rbuf.header.h80;
3434 3524
3435 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); 3525 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
3436 3526 /* you are supposed to send additional out-of-sync information
3437 drbd_bm_lock(mdev, "receive bitmap"); 3527 * if you actually set bits during this phase */
3438 3528
3439 /* maybe we should use some per thread scratch page, 3529 /* maybe we should use some per thread scratch page,
3440 * and allocate that during initial device creation? */ 3530 * and allocate that during initial device creation? */
@@ -3449,9 +3539,9 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
3449 .bm_words = drbd_bm_words(mdev), 3539 .bm_words = drbd_bm_words(mdev),
3450 }; 3540 };
3451 3541
3452 do { 3542 for(;;) {
3453 if (cmd == P_BITMAP) { 3543 if (cmd == P_BITMAP) {
3454 ret = receive_bitmap_plain(mdev, data_size, buffer, &c); 3544 err = receive_bitmap_plain(mdev, data_size, buffer, &c);
3455 } else if (cmd == P_COMPRESSED_BITMAP) { 3545 } else if (cmd == P_COMPRESSED_BITMAP) {
3456 /* MAYBE: sanity check that we speak proto >= 90, 3546 /* MAYBE: sanity check that we speak proto >= 90,
3457 * and the feature is enabled! */ 3547 * and the feature is enabled! */
@@ -3468,9 +3558,9 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
3468 goto out; 3558 goto out;
3469 if (data_size <= (sizeof(*p) - sizeof(p->head))) { 3559 if (data_size <= (sizeof(*p) - sizeof(p->head))) {
3470 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size); 3560 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
3471 return FAILED; 3561 goto out;
3472 } 3562 }
3473 ret = decode_bitmap_c(mdev, p, &c); 3563 err = decode_bitmap_c(mdev, p, &c);
3474 } else { 3564 } else {
3475 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd); 3565 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
3476 goto out; 3566 goto out;
@@ -3479,24 +3569,26 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
3479 c.packets[cmd == P_BITMAP]++; 3569 c.packets[cmd == P_BITMAP]++;
3480 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size; 3570 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
3481 3571
3482 if (ret != OK) 3572 if (err <= 0) {
3573 if (err < 0)
3574 goto out;
3483 break; 3575 break;
3484 3576 }
3485 if (!drbd_recv_header(mdev, &cmd, &data_size)) 3577 if (!drbd_recv_header(mdev, &cmd, &data_size))
3486 goto out; 3578 goto out;
3487 } while (ret == OK); 3579 }
3488 if (ret == FAILED)
3489 goto out;
3490 3580
3491 INFO_bm_xfer_stats(mdev, "receive", &c); 3581 INFO_bm_xfer_stats(mdev, "receive", &c);
3492 3582
3493 if (mdev->state.conn == C_WF_BITMAP_T) { 3583 if (mdev->state.conn == C_WF_BITMAP_T) {
3584 enum drbd_state_rv rv;
3585
3494 ok = !drbd_send_bitmap(mdev); 3586 ok = !drbd_send_bitmap(mdev);
3495 if (!ok) 3587 if (!ok)
3496 goto out; 3588 goto out;
3497 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ 3589 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3498 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); 3590 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3499 D_ASSERT(ok == SS_SUCCESS); 3591 D_ASSERT(rv == SS_SUCCESS);
3500 } else if (mdev->state.conn != C_WF_BITMAP_S) { 3592 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3501 /* admin may have requested C_DISCONNECTING, 3593 /* admin may have requested C_DISCONNECTING,
3502 * other threads may have noticed network errors */ 3594 * other threads may have noticed network errors */
@@ -3504,7 +3596,7 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
3504 drbd_conn_str(mdev->state.conn)); 3596 drbd_conn_str(mdev->state.conn));
3505 } 3597 }
3506 3598
3507 ok = TRUE; 3599 ok = true;
3508 out: 3600 out:
3509 drbd_bm_unlock(mdev); 3601 drbd_bm_unlock(mdev);
3510 if (ok && mdev->state.conn == C_WF_BITMAP_S) 3602 if (ok && mdev->state.conn == C_WF_BITMAP_S)
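
Across these hunks the bitmap receive helpers switch to the common convention documented in their new comments: negative error code on failure, 0 when done, 1 when another iteration is needed, with receive_bitmap() driving them from a for(;;) loop. A compact, self-contained illustration of that calling convention (the step() helper and its countdown state are invented for the example):

/* Sketch of the "0 = done, 1 = more, <0 = error" convention adopted for
 * receive_bitmap_plain()/decode_bitmap_c() and consumed by the for(;;)
 * loop in receive_bitmap(). */
#include <errno.h>
#include <stdio.h>

static int step(int *remaining)
{
        if (*remaining < 0)
                return -EIO;            /* failure: negative errno */
        if (*remaining == 0)
                return 0;               /* done */
        (*remaining)--;
        return 1;                       /* keep iterating */
}

int main(void)
{
        int remaining = 3;
        int err = 0;

        for (;;) {
                err = step(&remaining);
                if (err <= 0) {
                        if (err < 0)
                                fprintf(stderr, "failed: %d\n", err);
                        break;          /* 0 falls through as success */
                }
                /* err == 1: receive and decode the next packet here */
        }
        return err < 0 ? 1 : 0;
}
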
@@ -3538,7 +3630,26 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, u
3538 * with the data requests being unplugged */ 3630 * with the data requests being unplugged */
3539 drbd_tcp_quickack(mdev->data.socket); 3631 drbd_tcp_quickack(mdev->data.socket);
3540 3632
3541 return TRUE; 3633 return true;
3634}
3635
3636static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3637{
3638 struct p_block_desc *p = &mdev->data.rbuf.block_desc;
3639
3640 switch (mdev->state.conn) {
3641 case C_WF_SYNC_UUID:
3642 case C_WF_BITMAP_T:
3643 case C_BEHIND:
3644 break;
3645 default:
3646 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
3647 drbd_conn_str(mdev->state.conn));
3648 }
3649
3650 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
3651
3652 return true;
3542} 3653}
3543 3654
3544typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); 3655typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
@@ -3571,6 +3682,7 @@ static struct data_cmd drbd_cmd_handler[] = {
3571 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 3682 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3572 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 3683 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3573 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, 3684 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
3685 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
3574 /* anything missing from this table is in 3686 /* anything missing from this table is in
3575 * the asender_tbl, see get_asender_cmd */ 3687 * the asender_tbl, see get_asender_cmd */
3576 [P_MAX_CMD] = { 0, 0, NULL }, 3688 [P_MAX_CMD] = { 0, 0, NULL },
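
receive_out_of_sync() is wired into drbd_cmd_handler[] the same way as the existing packets: the table maps a packet code to an expected payload size and a handler. A stand-alone sketch of that dispatch pattern (packet codes, sizes and handler names below are invented for the example, not the driver's):

/* Toy packet dispatch table in the style of drbd_cmd_handler[]:
 * indexed by command code, each entry gives the expected payload size
 * and the handler to call. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum pkt { PKT_PING, PKT_DATA, PKT_OUT_OF_SYNC, PKT_MAX };

struct cmd_entry {
        size_t payload_size;                    /* expected bytes after the header */
        bool (*handler)(const void *payload);   /* returns true on success */
};

static bool handle_ping(const void *p)        { (void)p; puts("ping"); return true; }
static bool handle_data(const void *p)        { (void)p; puts("data"); return true; }
static bool handle_out_of_sync(const void *p) { (void)p; puts("oos");  return true; }

static const struct cmd_entry cmd_table[PKT_MAX] = {
        [PKT_PING]        = { 0,  handle_ping },
        [PKT_DATA]        = { 16, handle_data },
        [PKT_OUT_OF_SYNC] = { 12, handle_out_of_sync },
};

static bool dispatch(unsigned cmd, const void *payload, size_t len)
{
        if (cmd >= PKT_MAX || !cmd_table[cmd].handler)
                return false;                   /* unknown command */
        if (len != cmd_table[cmd].payload_size)
                return false;                   /* malformed packet */
        return cmd_table[cmd].handler(payload);
}

int main(void)
{
        return dispatch(PKT_OUT_OF_SYNC, "123456789012", 12) ? 0 : 1;
}
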
@@ -3610,7 +3722,8 @@ static void drbdd(struct drbd_conf *mdev)
3610 if (shs) { 3722 if (shs) {
3611 rv = drbd_recv(mdev, &header->h80.payload, shs); 3723 rv = drbd_recv(mdev, &header->h80.payload, shs);
3612 if (unlikely(rv != shs)) { 3724 if (unlikely(rv != shs)) {
3613 dev_err(DEV, "short read while reading sub header: rv=%d\n", rv); 3725 if (!signal_pending(current))
3726 dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
3614 goto err_out; 3727 goto err_out;
3615 } 3728 }
3616 } 3729 }
@@ -3682,9 +3795,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3682 3795
3683 if (mdev->state.conn == C_STANDALONE) 3796 if (mdev->state.conn == C_STANDALONE)
3684 return; 3797 return;
3685 if (mdev->state.conn >= C_WF_CONNECTION)
3686 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3687 drbd_conn_str(mdev->state.conn));
3688 3798
3689 /* asender does not clean up anything. it must not interfere, either */ 3799 /* asender does not clean up anything. it must not interfere, either */
3690 drbd_thread_stop(&mdev->asender); 3800 drbd_thread_stop(&mdev->asender);
@@ -3713,6 +3823,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3713 atomic_set(&mdev->rs_pending_cnt, 0); 3823 atomic_set(&mdev->rs_pending_cnt, 0);
3714 wake_up(&mdev->misc_wait); 3824 wake_up(&mdev->misc_wait);
3715 3825
3826 del_timer(&mdev->request_timer);
3827
3716 /* make sure syncer is stopped and w_resume_next_sg queued */ 3828 /* make sure syncer is stopped and w_resume_next_sg queued */
3717 del_timer_sync(&mdev->resync_timer); 3829 del_timer_sync(&mdev->resync_timer);
3718 resync_timer_fn((unsigned long)mdev); 3830 resync_timer_fn((unsigned long)mdev);
@@ -3758,13 +3870,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3758 if (os.conn == C_DISCONNECTING) { 3870 if (os.conn == C_DISCONNECTING) {
3759 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0); 3871 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3760 3872
3761 if (!is_susp(mdev->state)) {
3762 /* we must not free the tl_hash
3763 * while application io is still on the fly */
3764 wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3765 drbd_free_tl_hash(mdev);
3766 }
3767
3768 crypto_free_hash(mdev->cram_hmac_tfm); 3873 crypto_free_hash(mdev->cram_hmac_tfm);
3769 mdev->cram_hmac_tfm = NULL; 3874 mdev->cram_hmac_tfm = NULL;
3770 3875
@@ -3773,6 +3878,10 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3773 drbd_request_state(mdev, NS(conn, C_STANDALONE)); 3878 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3774 } 3879 }
3775 3880
3881 /* serialize with bitmap writeout triggered by the state change,
3882 * if any. */
3883 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
3884
3776 /* tcp_close and release of sendpage pages can be deferred. I don't 3885 /* tcp_close and release of sendpage pages can be deferred. I don't
3777 * want to use SO_LINGER, because apparently it can be deferred for 3886 * want to use SO_LINGER, because apparently it can be deferred for
3778 * more than 20 seconds (longest time I checked). 3887 * more than 20 seconds (longest time I checked).
@@ -3873,7 +3982,8 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
3873 rv = drbd_recv(mdev, &p->head.payload, expect); 3982 rv = drbd_recv(mdev, &p->head.payload, expect);
3874 3983
3875 if (rv != expect) { 3984 if (rv != expect) {
3876 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); 3985 if (!signal_pending(current))
3986 dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
3877 return 0; 3987 return 0;
3878 } 3988 }
3879 3989
@@ -3975,7 +4085,8 @@ static int drbd_do_auth(struct drbd_conf *mdev)
3975 rv = drbd_recv(mdev, peers_ch, length); 4085 rv = drbd_recv(mdev, peers_ch, length);
3976 4086
3977 if (rv != length) { 4087 if (rv != length) {
3978 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); 4088 if (!signal_pending(current))
4089 dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
3979 rv = 0; 4090 rv = 0;
3980 goto fail; 4091 goto fail;
3981 } 4092 }
@@ -4022,7 +4133,8 @@ static int drbd_do_auth(struct drbd_conf *mdev)
4022 rv = drbd_recv(mdev, response , resp_size); 4133 rv = drbd_recv(mdev, response , resp_size);
4023 4134
4024 if (rv != resp_size) { 4135 if (rv != resp_size) {
4025 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); 4136 if (!signal_pending(current))
4137 dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
4026 rv = 0; 4138 rv = 0;
4027 goto fail; 4139 goto fail;
4028 } 4140 }
@@ -4074,8 +4186,7 @@ int drbdd_init(struct drbd_thread *thi)
4074 h = drbd_connect(mdev); 4186 h = drbd_connect(mdev);
4075 if (h == 0) { 4187 if (h == 0) {
4076 drbd_disconnect(mdev); 4188 drbd_disconnect(mdev);
4077 __set_current_state(TASK_INTERRUPTIBLE); 4189 schedule_timeout_interruptible(HZ);
4078 schedule_timeout(HZ);
4079 } 4190 }
4080 if (h == -1) { 4191 if (h == -1) {
4081 dev_warn(DEV, "Discarding network configuration.\n"); 4192 dev_warn(DEV, "Discarding network configuration.\n");
@@ -4113,7 +4224,7 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
4113 } 4224 }
4114 wake_up(&mdev->state_wait); 4225 wake_up(&mdev->state_wait);
4115 4226
4116 return TRUE; 4227 return true;
4117} 4228}
4118 4229
4119static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h) 4230static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4129,7 +4240,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
4129 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags)) 4240 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4130 wake_up(&mdev->misc_wait); 4241 wake_up(&mdev->misc_wait);
4131 4242
4132 return TRUE; 4243 return true;
4133} 4244}
4134 4245
4135static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h) 4246static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4152,7 +4263,7 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
4152 dec_rs_pending(mdev); 4263 dec_rs_pending(mdev);
4153 atomic_add(blksize >> 9, &mdev->rs_sect_in); 4264 atomic_add(blksize >> 9, &mdev->rs_sect_in);
4154 4265
4155 return TRUE; 4266 return true;
4156} 4267}
4157 4268
4158/* when we receive the ACK for a write request, 4269/* when we receive the ACK for a write request,
@@ -4176,8 +4287,6 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4176 return req; 4287 return req;
4177 } 4288 }
4178 } 4289 }
4179 dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4180 (void *)(unsigned long)id, (unsigned long long)sector);
4181 return NULL; 4290 return NULL;
4182} 4291}
4183 4292
@@ -4195,15 +4304,17 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
4195 req = validator(mdev, id, sector); 4304 req = validator(mdev, id, sector);
4196 if (unlikely(!req)) { 4305 if (unlikely(!req)) {
4197 spin_unlock_irq(&mdev->req_lock); 4306 spin_unlock_irq(&mdev->req_lock);
4198 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); 4307
4199 return FALSE; 4308 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
4309 (void *)(unsigned long)id, (unsigned long long)sector);
4310 return false;
4200 } 4311 }
4201 __req_mod(req, what, &m); 4312 __req_mod(req, what, &m);
4202 spin_unlock_irq(&mdev->req_lock); 4313 spin_unlock_irq(&mdev->req_lock);
4203 4314
4204 if (m.bio) 4315 if (m.bio)
4205 complete_master_bio(mdev, &m); 4316 complete_master_bio(mdev, &m);
4206 return TRUE; 4317 return true;
4207} 4318}
4208 4319
4209static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h) 4320static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4218,7 +4329,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4218 if (is_syncer_block_id(p->block_id)) { 4329 if (is_syncer_block_id(p->block_id)) {
4219 drbd_set_in_sync(mdev, sector, blksize); 4330 drbd_set_in_sync(mdev, sector, blksize);
4220 dec_rs_pending(mdev); 4331 dec_rs_pending(mdev);
4221 return TRUE; 4332 return true;
4222 } 4333 }
4223 switch (be16_to_cpu(h->command)) { 4334 switch (be16_to_cpu(h->command)) {
4224 case P_RS_WRITE_ACK: 4335 case P_RS_WRITE_ACK:
@@ -4239,7 +4350,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4239 break; 4350 break;
4240 default: 4351 default:
4241 D_ASSERT(0); 4352 D_ASSERT(0);
4242 return FALSE; 4353 return false;
4243 } 4354 }
4244 4355
4245 return validate_req_change_req_state(mdev, p->block_id, sector, 4356 return validate_req_change_req_state(mdev, p->block_id, sector,
@@ -4250,20 +4361,44 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
4250{ 4361{
4251 struct p_block_ack *p = (struct p_block_ack *)h; 4362 struct p_block_ack *p = (struct p_block_ack *)h;
4252 sector_t sector = be64_to_cpu(p->sector); 4363 sector_t sector = be64_to_cpu(p->sector);
4253 4364 int size = be32_to_cpu(p->blksize);
4254 if (__ratelimit(&drbd_ratelimit_state)) 4365 struct drbd_request *req;
4255 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); 4366 struct bio_and_error m;
4256 4367
4257 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4368 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4258 4369
4259 if (is_syncer_block_id(p->block_id)) { 4370 if (is_syncer_block_id(p->block_id)) {
4260 int size = be32_to_cpu(p->blksize);
4261 dec_rs_pending(mdev); 4371 dec_rs_pending(mdev);
4262 drbd_rs_failed_io(mdev, sector, size); 4372 drbd_rs_failed_io(mdev, sector, size);
4263 return TRUE; 4373 return true;
4264 } 4374 }
4265 return validate_req_change_req_state(mdev, p->block_id, sector, 4375
4266 _ack_id_to_req, __func__ , neg_acked); 4376 spin_lock_irq(&mdev->req_lock);
4377 req = _ack_id_to_req(mdev, p->block_id, sector);
4378 if (!req) {
4379 spin_unlock_irq(&mdev->req_lock);
4380 if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
4381 mdev->net_conf->wire_protocol == DRBD_PROT_B) {
4382 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
4383 The master bio might already be completed, therefore the
4384 request is no longer in the collision hash.
4385 => Do not try to validate block_id as request. */
4386 /* In Protocol B we might already have got a P_RECV_ACK
 4387 but then get a P_NEG_ACK afterwards. */
4388 drbd_set_out_of_sync(mdev, sector, size);
4389 return true;
4390 } else {
4391 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
4392 (void *)(unsigned long)p->block_id, (unsigned long long)sector);
4393 return false;
4394 }
4395 }
4396 __req_mod(req, neg_acked, &m);
4397 spin_unlock_irq(&mdev->req_lock);
4398
4399 if (m.bio)
4400 complete_master_bio(mdev, &m);
4401 return true;
4267} 4402}
4268 4403
4269static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h) 4404static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4294,11 +4429,20 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
4294 4429
4295 if (get_ldev_if_state(mdev, D_FAILED)) { 4430 if (get_ldev_if_state(mdev, D_FAILED)) {
4296 drbd_rs_complete_io(mdev, sector); 4431 drbd_rs_complete_io(mdev, sector);
4297 drbd_rs_failed_io(mdev, sector, size); 4432 switch (be16_to_cpu(h->command)) {
4433 case P_NEG_RS_DREPLY:
4434 drbd_rs_failed_io(mdev, sector, size);
4435 case P_RS_CANCEL:
4436 break;
4437 default:
4438 D_ASSERT(0);
4439 put_ldev(mdev);
4440 return false;
4441 }
4298 put_ldev(mdev); 4442 put_ldev(mdev);
4299 } 4443 }
4300 4444
4301 return TRUE; 4445 return true;
4302} 4446}
4303 4447
4304static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) 4448static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4307,7 +4451,14 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4307 4451
4308 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); 4452 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4309 4453
4310 return TRUE; 4454 if (mdev->state.conn == C_AHEAD &&
4455 atomic_read(&mdev->ap_in_flight) == 0 &&
4456 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
4457 mdev->start_resync_timer.expires = jiffies + HZ;
4458 add_timer(&mdev->start_resync_timer);
4459 }
4460
4461 return true;
4311} 4462}
4312 4463
4313static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h) 4464static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4328,12 +4479,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4328 ov_oos_print(mdev); 4479 ov_oos_print(mdev);
4329 4480
4330 if (!get_ldev(mdev)) 4481 if (!get_ldev(mdev))
4331 return TRUE; 4482 return true;
4332 4483
4333 drbd_rs_complete_io(mdev, sector); 4484 drbd_rs_complete_io(mdev, sector);
4334 dec_rs_pending(mdev); 4485 dec_rs_pending(mdev);
4335 4486
4336 if (--mdev->ov_left == 0) { 4487 --mdev->ov_left;
4488
4489 /* let's advance progress step marks only for every other megabyte */
4490 if ((mdev->ov_left & 0x200) == 0x200)
4491 drbd_advance_rs_marks(mdev, mdev->ov_left);
4492
4493 if (mdev->ov_left == 0) {
4337 w = kmalloc(sizeof(*w), GFP_NOIO); 4494 w = kmalloc(sizeof(*w), GFP_NOIO);
4338 if (w) { 4495 if (w) {
4339 w->cb = w_ov_finished; 4496 w->cb = w_ov_finished;
@@ -4345,12 +4502,12 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4345 } 4502 }
4346 } 4503 }
4347 put_ldev(mdev); 4504 put_ldev(mdev);
4348 return TRUE; 4505 return true;
4349} 4506}
4350 4507
4351static int got_skip(struct drbd_conf *mdev, struct p_header80 *h) 4508static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
4352{ 4509{
4353 return TRUE; 4510 return true;
4354} 4511}
4355 4512
4356struct asender_cmd { 4513struct asender_cmd {
@@ -4378,6 +4535,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4378 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4535 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4379 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4536 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4380 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, 4537 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
4538 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
4381 [P_MAX_CMD] = { 0, NULL }, 4539 [P_MAX_CMD] = { 0, NULL },
4382 }; 4540 };
4383 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4541 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ad3fc6228f27..5c0c8be1bb0a 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -140,9 +140,14 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
140 struct hlist_node *n; 140 struct hlist_node *n;
141 struct hlist_head *slot; 141 struct hlist_head *slot;
142 142
143 /* before we can signal completion to the upper layers, 143 /* Before we can signal completion to the upper layers,
144 * we may need to close the current epoch */ 144 * we may need to close the current epoch.
145 * We can skip this, if this request has not even been sent, because we
146 * did not have a fully established connection yet/anymore, during
147 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
148 */
145 if (mdev->state.conn >= C_CONNECTED && 149 if (mdev->state.conn >= C_CONNECTED &&
150 (s & RQ_NET_SENT) != 0 &&
146 req->epoch == mdev->newest_tle->br_number) 151 req->epoch == mdev->newest_tle->br_number)
147 queue_barrier(mdev); 152 queue_barrier(mdev);
148 153
@@ -440,7 +445,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
440 req->rq_state |= RQ_LOCAL_COMPLETED; 445 req->rq_state |= RQ_LOCAL_COMPLETED;
441 req->rq_state &= ~RQ_LOCAL_PENDING; 446 req->rq_state &= ~RQ_LOCAL_PENDING;
442 447
443 __drbd_chk_io_error(mdev, FALSE); 448 __drbd_chk_io_error(mdev, false);
444 _req_may_be_done_not_susp(req, m); 449 _req_may_be_done_not_susp(req, m);
445 put_ldev(mdev); 450 put_ldev(mdev);
446 break; 451 break;
@@ -461,7 +466,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
461 466
462 D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 467 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
463 468
464 __drbd_chk_io_error(mdev, FALSE); 469 __drbd_chk_io_error(mdev, false);
465 put_ldev(mdev); 470 put_ldev(mdev);
466 471
467 /* no point in retrying if there is no good remote data, 472 /* no point in retrying if there is no good remote data,
@@ -545,6 +550,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
545 550
546 break; 551 break;
547 552
553 case queue_for_send_oos:
554 req->rq_state |= RQ_NET_QUEUED;
555 req->w.cb = w_send_oos;
556 drbd_queue_work(&mdev->data.work, &req->w);
557 break;
558
559 case oos_handed_to_network:
560 /* actually the same */
548 case send_canceled: 561 case send_canceled:
549 /* treat it the same */ 562 /* treat it the same */
550 case send_failed: 563 case send_failed:
@@ -558,6 +571,9 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
558 571
559 case handed_over_to_network: 572 case handed_over_to_network:
560 /* assert something? */ 573 /* assert something? */
574 if (bio_data_dir(req->master_bio) == WRITE)
575 atomic_add(req->size>>9, &mdev->ap_in_flight);
576
561 if (bio_data_dir(req->master_bio) == WRITE && 577 if (bio_data_dir(req->master_bio) == WRITE &&
562 mdev->net_conf->wire_protocol == DRBD_PROT_A) { 578 mdev->net_conf->wire_protocol == DRBD_PROT_A) {
563 /* this is what is dangerous about protocol A: 579 /* this is what is dangerous about protocol A:
@@ -591,6 +607,9 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
591 dec_ap_pending(mdev); 607 dec_ap_pending(mdev);
592 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 608 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
593 req->rq_state |= RQ_NET_DONE; 609 req->rq_state |= RQ_NET_DONE;
610 if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
611 atomic_sub(req->size>>9, &mdev->ap_in_flight);
612
594 /* if it is still queued, we may not complete it here. 613 /* if it is still queued, we may not complete it here.
595 * it will be canceled soon. */ 614 * it will be canceled soon. */
596 if (!(req->rq_state & RQ_NET_QUEUED)) 615 if (!(req->rq_state & RQ_NET_QUEUED))
@@ -628,14 +647,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
628 req->rq_state |= RQ_NET_OK; 647 req->rq_state |= RQ_NET_OK;
629 D_ASSERT(req->rq_state & RQ_NET_PENDING); 648 D_ASSERT(req->rq_state & RQ_NET_PENDING);
630 dec_ap_pending(mdev); 649 dec_ap_pending(mdev);
650 atomic_sub(req->size>>9, &mdev->ap_in_flight);
631 req->rq_state &= ~RQ_NET_PENDING; 651 req->rq_state &= ~RQ_NET_PENDING;
632 _req_may_be_done_not_susp(req, m); 652 _req_may_be_done_not_susp(req, m);
633 break; 653 break;
634 654
635 case neg_acked: 655 case neg_acked:
636 /* assert something? */ 656 /* assert something? */
637 if (req->rq_state & RQ_NET_PENDING) 657 if (req->rq_state & RQ_NET_PENDING) {
638 dec_ap_pending(mdev); 658 dec_ap_pending(mdev);
659 atomic_sub(req->size>>9, &mdev->ap_in_flight);
660 }
639 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); 661 req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
640 662
641 req->rq_state |= RQ_NET_DONE; 663 req->rq_state |= RQ_NET_DONE;
@@ -690,8 +712,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
690 dev_err(DEV, "FIXME (barrier_acked but pending)\n"); 712 dev_err(DEV, "FIXME (barrier_acked but pending)\n");
691 list_move(&req->tl_requests, &mdev->out_of_sequence_requests); 713 list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
692 } 714 }
693 D_ASSERT(req->rq_state & RQ_NET_SENT); 715 if ((req->rq_state & RQ_NET_MASK) != 0) {
694 req->rq_state |= RQ_NET_DONE; 716 req->rq_state |= RQ_NET_DONE;
717 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
718 atomic_sub(req->size>>9, &mdev->ap_in_flight);
719 }
695 _req_may_be_done(req, m); /* Allowed while state.susp */ 720 _req_may_be_done(req, m); /* Allowed while state.susp */
696 break; 721 break;
697 722
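
The ap_in_flight adjustments in these __req_mod() hunks keep a running count of write payload (in sectors, req->size>>9) that has been handed to the network but not yet acknowledged; that counter is what the new congestion check in drbd_make_request_common() compares against cong_fill. A hypothetical userspace version of the same bookkeeping, using C11 atomics in place of the kernel's atomic_t:

/* Sketch of in-flight sector accounting: add when a write is handed to
 * the network, subtract when it is acked, nacked or retired. */
#include <stdatomic.h>
#include <stdint.h>

static atomic_long ap_in_flight;        /* sectors on the wire, unacked */

static void on_handed_to_network(uint32_t size_bytes)
{
        atomic_fetch_add(&ap_in_flight, (long)(size_bytes >> 9));
}

static void on_write_acked(uint32_t size_bytes)
{
        atomic_fetch_sub(&ap_in_flight, (long)(size_bytes >> 9));
}

static long sectors_in_flight(void)
{
        return atomic_load(&ap_in_flight);
}

int main(void)
{
        on_handed_to_network(4096);     /* 8 sectors go out */
        on_write_acked(4096);           /* ...and come back acked */
        return sectors_in_flight() != 0;
}
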
@@ -738,14 +763,14 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s
738 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); 763 return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
739} 764}
740 765
741static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) 766static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
742{ 767{
743 const int rw = bio_rw(bio); 768 const int rw = bio_rw(bio);
744 const int size = bio->bi_size; 769 const int size = bio->bi_size;
745 const sector_t sector = bio->bi_sector; 770 const sector_t sector = bio->bi_sector;
746 struct drbd_tl_epoch *b = NULL; 771 struct drbd_tl_epoch *b = NULL;
747 struct drbd_request *req; 772 struct drbd_request *req;
748 int local, remote; 773 int local, remote, send_oos = 0;
749 int err = -EIO; 774 int err = -EIO;
750 int ret = 0; 775 int ret = 0;
751 776
@@ -759,6 +784,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
759 bio_endio(bio, -ENOMEM); 784 bio_endio(bio, -ENOMEM);
760 return 0; 785 return 0;
761 } 786 }
787 req->start_time = start_time;
762 788
763 local = get_ldev(mdev); 789 local = get_ldev(mdev);
764 if (!local) { 790 if (!local) {
@@ -808,9 +834,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
808 drbd_al_begin_io(mdev, sector); 834 drbd_al_begin_io(mdev, sector);
809 } 835 }
810 836
811 remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || 837 remote = remote && drbd_should_do_remote(mdev->state);
812 (mdev->state.pdsk == D_INCONSISTENT && 838 send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
813 mdev->state.conn >= C_CONNECTED)); 839 D_ASSERT(!(remote && send_oos));
814 840
815 if (!(local || remote) && !is_susp(mdev->state)) { 841 if (!(local || remote) && !is_susp(mdev->state)) {
816 if (__ratelimit(&drbd_ratelimit_state)) 842 if (__ratelimit(&drbd_ratelimit_state))
@@ -824,7 +850,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
824 * but there is a race between testing the bit and pointer outside the 850 * but there is a race between testing the bit and pointer outside the
825 * spinlock, and grabbing the spinlock. 851 * spinlock, and grabbing the spinlock.
826 * if we lost that race, we retry. */ 852 * if we lost that race, we retry. */
827 if (rw == WRITE && remote && 853 if (rw == WRITE && (remote || send_oos) &&
828 mdev->unused_spare_tle == NULL && 854 mdev->unused_spare_tle == NULL &&
829 test_bit(CREATE_BARRIER, &mdev->flags)) { 855 test_bit(CREATE_BARRIER, &mdev->flags)) {
830allocate_barrier: 856allocate_barrier:
@@ -842,18 +868,19 @@ allocate_barrier:
842 if (is_susp(mdev->state)) { 868 if (is_susp(mdev->state)) {
843 /* If we got suspended, use the retry mechanism of 869 /* If we got suspended, use the retry mechanism of
844 generic_make_request() to restart processing of this 870 generic_make_request() to restart processing of this
845 bio. In the next call to drbd_make_request_26 871 bio. In the next call to drbd_make_request
846 we sleep in inc_ap_bio() */ 872 we sleep in inc_ap_bio() */
847 ret = 1; 873 ret = 1;
848 spin_unlock_irq(&mdev->req_lock); 874 spin_unlock_irq(&mdev->req_lock);
849 goto fail_free_complete; 875 goto fail_free_complete;
850 } 876 }
851 877
852 if (remote) { 878 if (remote || send_oos) {
853 remote = (mdev->state.pdsk == D_UP_TO_DATE || 879 remote = drbd_should_do_remote(mdev->state);
854 (mdev->state.pdsk == D_INCONSISTENT && 880 send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
855 mdev->state.conn >= C_CONNECTED)); 881 D_ASSERT(!(remote && send_oos));
856 if (!remote) 882
883 if (!(remote || send_oos))
857 dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); 884 dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
858 if (!(local || remote)) { 885 if (!(local || remote)) {
859 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 886 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
@@ -866,7 +893,7 @@ allocate_barrier:
866 mdev->unused_spare_tle = b; 893 mdev->unused_spare_tle = b;
867 b = NULL; 894 b = NULL;
868 } 895 }
869 if (rw == WRITE && remote && 896 if (rw == WRITE && (remote || send_oos) &&
870 mdev->unused_spare_tle == NULL && 897 mdev->unused_spare_tle == NULL &&
871 test_bit(CREATE_BARRIER, &mdev->flags)) { 898 test_bit(CREATE_BARRIER, &mdev->flags)) {
872 /* someone closed the current epoch 899 /* someone closed the current epoch
@@ -889,7 +916,7 @@ allocate_barrier:
889 * barrier packet. To get the write ordering right, we only have to 916 * barrier packet. To get the write ordering right, we only have to
890 * make sure that, if this is a write request and it triggered a 917 * make sure that, if this is a write request and it triggered a
891 * barrier packet, this request is queued within the same spinlock. */ 918 * barrier packet, this request is queued within the same spinlock. */
892 if (remote && mdev->unused_spare_tle && 919 if ((remote || send_oos) && mdev->unused_spare_tle &&
893 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { 920 test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
894 _tl_add_barrier(mdev, mdev->unused_spare_tle); 921 _tl_add_barrier(mdev, mdev->unused_spare_tle);
895 mdev->unused_spare_tle = NULL; 922 mdev->unused_spare_tle = NULL;
@@ -937,6 +964,34 @@ allocate_barrier:
937 ? queue_for_net_write 964 ? queue_for_net_write
938 : queue_for_net_read); 965 : queue_for_net_read);
939 } 966 }
967 if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
968 _req_mod(req, queue_for_send_oos);
969
970 if (remote &&
971 mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
972 int congested = 0;
973
974 if (mdev->net_conf->cong_fill &&
975 atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
976 dev_info(DEV, "Congestion-fill threshold reached\n");
977 congested = 1;
978 }
979
980 if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
981 dev_info(DEV, "Congestion-extents threshold reached\n");
982 congested = 1;
983 }
984
985 if (congested) {
986 queue_barrier(mdev); /* last barrier, after mirrored writes */
987
988 if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
989 _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
990 else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
991 _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
992 }
993 }
994
940 spin_unlock_irq(&mdev->req_lock); 995 spin_unlock_irq(&mdev->req_lock);
941 kfree(b); /* if someone else has beaten us to it... */ 996 kfree(b); /* if someone else has beaten us to it... */
942 997
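
The block added above implements the on-congestion policy for protocol 96 and later: once either the fill threshold (data in flight) or the extents threshold (activity-log usage) is crossed, the connection is pushed to Ahead or torn down. A simplified, self-contained sketch of that decision; the enum names and threshold values here are placeholders redeclared for the example, not the driver's definitions:

/* Sketch of the congestion decision in drbd_make_request_common():
 * compare in-flight data and activity-log usage against the two
 * configured thresholds and pick a policy. */
#include <stdio.h>

enum on_congestion { OC_BLOCK, OC_PULL_AHEAD, OC_DISCONNECT };
enum conn_state    { C_CONNECTED, C_AHEAD, C_DISCONNECTING };

struct net_conf {
        enum on_congestion on_congestion;
        long cong_fill;         /* sectors allowed in flight, 0 = disabled */
        int  cong_extents;      /* max activity-log extents in use */
};

static enum conn_state maybe_go_ahead(const struct net_conf *nc,
                                      long sectors_in_flight,
                                      int al_extents_used)
{
        int congested = 0;

        if (nc->on_congestion == OC_BLOCK)
                return C_CONNECTED;             /* classic behaviour: just block */

        if (nc->cong_fill && sectors_in_flight >= nc->cong_fill) {
                printf("congestion-fill threshold reached\n");
                congested = 1;
        }
        if (al_extents_used >= nc->cong_extents) {
                printf("congestion-extents threshold reached\n");
                congested = 1;
        }
        if (!congested)
                return C_CONNECTED;

        return nc->on_congestion == OC_PULL_AHEAD ? C_AHEAD : C_DISCONNECTING;
}

int main(void)
{
        struct net_conf nc = { OC_PULL_AHEAD, 1024, 127 };
        return maybe_go_ahead(&nc, 2048, 10) == C_AHEAD ? 0 : 1;
}
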
@@ -949,9 +1004,9 @@ allocate_barrier:
949 * stable storage, and this is a WRITE, we may not even submit 1004 * stable storage, and this is a WRITE, we may not even submit
950 * this bio. */ 1005 * this bio. */
951 if (get_ldev(mdev)) { 1006 if (get_ldev(mdev)) {
952 if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR 1007 if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
953 : rw == READ ? DRBD_FAULT_DT_RD 1008 : rw == READ ? DRBD_FAULT_DT_RD
954 : DRBD_FAULT_DT_RA)) 1009 : DRBD_FAULT_DT_RA))
955 bio_endio(req->private_bio, -EIO); 1010 bio_endio(req->private_bio, -EIO);
956 else 1011 else
957 generic_make_request(req->private_bio); 1012 generic_make_request(req->private_bio);
@@ -1018,16 +1073,19 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
1018 return 0; 1073 return 0;
1019} 1074}
1020 1075
1021int drbd_make_request_26(struct request_queue *q, struct bio *bio) 1076int drbd_make_request(struct request_queue *q, struct bio *bio)
1022{ 1077{
1023 unsigned int s_enr, e_enr; 1078 unsigned int s_enr, e_enr;
1024 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; 1079 struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
1080 unsigned long start_time;
1025 1081
1026 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { 1082 if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
1027 bio_endio(bio, -EPERM); 1083 bio_endio(bio, -EPERM);
1028 return 0; 1084 return 0;
1029 } 1085 }
1030 1086
1087 start_time = jiffies;
1088
1031 /* 1089 /*
1032 * what we "blindly" assume: 1090 * what we "blindly" assume:
1033 */ 1091 */
@@ -1042,12 +1100,12 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1042 1100
1043 if (likely(s_enr == e_enr)) { 1101 if (likely(s_enr == e_enr)) {
1044 inc_ap_bio(mdev, 1); 1102 inc_ap_bio(mdev, 1);
1045 return drbd_make_request_common(mdev, bio); 1103 return drbd_make_request_common(mdev, bio, start_time);
1046 } 1104 }
1047 1105
1048 /* can this bio be split generically? 1106 /* can this bio be split generically?
1049 * Maybe add our own split-arbitrary-bios function. */ 1107 * Maybe add our own split-arbitrary-bios function. */
1050 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { 1108 if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
1051 /* rather error out here than BUG in bio_split */ 1109 /* rather error out here than BUG in bio_split */
1052 dev_err(DEV, "bio would need to, but cannot, be split: " 1110 dev_err(DEV, "bio would need to, but cannot, be split: "
1053 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", 1111 "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
@@ -1069,11 +1127,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1069 const int sps = 1 << HT_SHIFT; /* sectors per slot */ 1127 const int sps = 1 << HT_SHIFT; /* sectors per slot */
1070 const int mask = sps - 1; 1128 const int mask = sps - 1;
1071 const sector_t first_sectors = sps - (sect & mask); 1129 const sector_t first_sectors = sps - (sect & mask);
1072 bp = bio_split(bio, 1130 bp = bio_split(bio, first_sectors);
1073#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
1074 bio_split_pool,
1075#endif
1076 first_sectors);
1077 1131
1078 /* we need to get a "reference count" (ap_bio_cnt) 1132 /* we need to get a "reference count" (ap_bio_cnt)
1079 * to avoid races with the disconnect/reconnect/suspend code. 1133 * to avoid races with the disconnect/reconnect/suspend code.
@@ -1084,10 +1138,10 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1084 1138
1085 D_ASSERT(e_enr == s_enr + 1); 1139 D_ASSERT(e_enr == s_enr + 1);
1086 1140
1087 while (drbd_make_request_common(mdev, &bp->bio1)) 1141 while (drbd_make_request_common(mdev, &bp->bio1, start_time))
1088 inc_ap_bio(mdev, 1); 1142 inc_ap_bio(mdev, 1);
1089 1143
1090 while (drbd_make_request_common(mdev, &bp->bio2)) 1144 while (drbd_make_request_common(mdev, &bp->bio2, start_time))
1091 inc_ap_bio(mdev, 1); 1145 inc_ap_bio(mdev, 1);
1092 1146
1093 dec_ap_bio(mdev); 1147 dec_ap_bio(mdev);
@@ -1098,7 +1152,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1098} 1152}
1099 1153
1100/* This is called by bio_add_page(). With this function we reduce 1154/* This is called by bio_add_page(). With this function we reduce
1101 * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs 1155 * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
1102 * units (was AL_EXTENTs). 1156 * units (was AL_EXTENTs).
1103 * 1157 *
1104 * we do the calculation within the lower 32bit of the byte offsets, 1158 * we do the calculation within the lower 32bit of the byte offsets,
@@ -1108,7 +1162,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1108 * As long as the BIO is empty we have to allow at least one bvec, 1162 * As long as the BIO is empty we have to allow at least one bvec,
1109 * regardless of size and offset. so the resulting bio may still 1163 * regardless of size and offset. so the resulting bio may still
1110 * cross extent boundaries. those are dealt with (bio_split) in 1164 * cross extent boundaries. those are dealt with (bio_split) in
1111 * drbd_make_request_26. 1165 * drbd_make_request.
1112 */ 1166 */
1113int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) 1167int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
1114{ 1168{
@@ -1118,8 +1172,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1118 unsigned int bio_size = bvm->bi_size; 1172 unsigned int bio_size = bvm->bi_size;
1119 int limit, backing_limit; 1173 int limit, backing_limit;
1120 1174
1121 limit = DRBD_MAX_SEGMENT_SIZE 1175 limit = DRBD_MAX_BIO_SIZE
1122 - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); 1176 - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
1123 if (limit < 0) 1177 if (limit < 0)
1124 limit = 0; 1178 limit = 0;
1125 if (bio_size == 0) { 1179 if (bio_size == 0) {
@@ -1136,3 +1190,42 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1136 } 1190 }
1137 return limit; 1191 return limit;
1138} 1192}
1193
1194void request_timer_fn(unsigned long data)
1195{
1196 struct drbd_conf *mdev = (struct drbd_conf *) data;
1197 struct drbd_request *req; /* oldest request */
1198 struct list_head *le;
1199 unsigned long et = 0; /* effective timeout = ko_count * timeout */
1200
1201 if (get_net_conf(mdev)) {
1202 et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
1203 put_net_conf(mdev);
1204 }
1205 if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
1206 return; /* Recurring timer stopped */
1207
1208 spin_lock_irq(&mdev->req_lock);
1209 le = &mdev->oldest_tle->requests;
1210 if (list_empty(le)) {
1211 spin_unlock_irq(&mdev->req_lock);
1212 mod_timer(&mdev->request_timer, jiffies + et);
1213 return;
1214 }
1215
1216 le = le->prev;
1217 req = list_entry(le, struct drbd_request, tl_requests);
1218 if (time_is_before_eq_jiffies(req->start_time + et)) {
1219 if (req->rq_state & RQ_NET_PENDING) {
1220 dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
1221 _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
1222 } else {
1223 dev_warn(DEV, "Local backing block device frozen?\n");
1224 mod_timer(&mdev->request_timer, jiffies + et);
1225 }
1226 } else {
1227 mod_timer(&mdev->request_timer, req->start_time + et);
1228 }
1229
1230 spin_unlock_irq(&mdev->req_lock);
1231}
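
request_timer_fn() above is the new recurring watchdog: the effective timeout is ko-count times the network timeout, only the oldest request on the transfer log needs to be inspected, and the timer is re-armed for a full period or for the remaining lifetime of that request. A rough userspace approximation of the same arithmetic, with monotonic seconds standing in for jiffies and an invented struct (not the driver's request layout):

/* Sketch of the request watchdog: check only the oldest pending request
 * and report which deadline the timer should be re-armed for next. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct oldest_req {
        time_t start_time;      /* when the request was issued */
        bool   net_pending;     /* still waiting for the peer? */
};

/* Returns the absolute time at which the timer should fire next,
 * or 0 if the watchdog should stop. */
static time_t check_oldest(const struct oldest_req *req,
                           unsigned timeout_s, unsigned ko_count,
                           time_t now)
{
        time_t et = (time_t)timeout_s * ko_count;       /* effective timeout */

        if (!et)
                return 0;                               /* watchdog disabled */
        if (!req)
                return now + et;                        /* nothing pending */

        if (req->start_time + et <= now) {
                if (req->net_pending)
                        fprintf(stderr, "peer missed ko-count * timeout\n");
                else
                        fprintf(stderr, "local backing device frozen?\n");
                return now + et;
        }
        return req->start_time + et;                    /* fire when it would expire */
}

int main(void)
{
        struct oldest_req r = { .start_time = time(NULL) - 100, .net_pending = true };
        return check_oldest(&r, 6, 7, time(NULL)) ? 0 : 1;
}
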
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index ab2bd09d54b4..32e2c3e6a813 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -82,14 +82,16 @@ enum drbd_req_event {
82 to_be_submitted, 82 to_be_submitted,
83 83
84 /* XXX yes, now I am inconsistent... 84 /* XXX yes, now I am inconsistent...
85 * these two are not "events" but "actions" 85 * these are not "events" but "actions"
86 * oh, well... */ 86 * oh, well... */
87 queue_for_net_write, 87 queue_for_net_write,
88 queue_for_net_read, 88 queue_for_net_read,
89 queue_for_send_oos,
89 90
90 send_canceled, 91 send_canceled,
91 send_failed, 92 send_failed,
92 handed_over_to_network, 93 handed_over_to_network,
94 oos_handed_to_network,
93 connection_lost_while_pending, 95 connection_lost_while_pending,
94 read_retry_remote_canceled, 96 read_retry_remote_canceled,
95 recv_acked_by_peer, 97 recv_acked_by_peer,
@@ -289,7 +291,6 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
289 req->epoch = 0; 291 req->epoch = 0;
290 req->sector = bio_src->bi_sector; 292 req->sector = bio_src->bi_sector;
291 req->size = bio_src->bi_size; 293 req->size = bio_src->bi_size;
292 req->start_time = jiffies;
293 INIT_HLIST_NODE(&req->colision); 294 INIT_HLIST_NODE(&req->colision);
294 INIT_LIST_HEAD(&req->tl_requests); 295 INIT_LIST_HEAD(&req->tl_requests);
295 INIT_LIST_HEAD(&req->w.list); 296 INIT_LIST_HEAD(&req->w.list);
@@ -321,6 +322,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
321 struct bio_and_error *m); 322 struct bio_and_error *m);
322extern void complete_master_bio(struct drbd_conf *mdev, 323extern void complete_master_bio(struct drbd_conf *mdev,
323 struct bio_and_error *m); 324 struct bio_and_error *m);
325extern void request_timer_fn(unsigned long data);
324 326
325/* use this if you don't want to deal with calling complete_master_bio() 327/* use this if you don't want to deal with calling complete_master_bio()
326 * outside the spinlock, e.g. when walking some list on cleanup. */ 328 * outside the spinlock, e.g. when walking some list on cleanup. */
@@ -338,23 +340,43 @@ static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
338 return rv; 340 return rv;
339} 341}
340 342
341/* completion of master bio is outside of spinlock. 343/* completion of master bio is outside of our spinlock.
342 * If you need it irqsave, do it your self! 344 * We still may or may not be inside some irqs disabled section
343 * Which means: don't use from bio endio callback. */ 345 * of the lower level driver completion callback, so we need to
346 * spin_lock_irqsave here. */
344static inline int req_mod(struct drbd_request *req, 347static inline int req_mod(struct drbd_request *req,
345 enum drbd_req_event what) 348 enum drbd_req_event what)
346{ 349{
350 unsigned long flags;
347 struct drbd_conf *mdev = req->mdev; 351 struct drbd_conf *mdev = req->mdev;
348 struct bio_and_error m; 352 struct bio_and_error m;
349 int rv; 353 int rv;
350 354
351 spin_lock_irq(&mdev->req_lock); 355 spin_lock_irqsave(&mdev->req_lock, flags);
352 rv = __req_mod(req, what, &m); 356 rv = __req_mod(req, what, &m);
353 spin_unlock_irq(&mdev->req_lock); 357 spin_unlock_irqrestore(&mdev->req_lock, flags);
354 358
355 if (m.bio) 359 if (m.bio)
356 complete_master_bio(mdev, &m); 360 complete_master_bio(mdev, &m);
357 361
358 return rv; 362 return rv;
359} 363}
364
365static inline bool drbd_should_do_remote(union drbd_state s)
366{
367 return s.pdsk == D_UP_TO_DATE ||
368 (s.pdsk >= D_INCONSISTENT &&
369 s.conn >= C_WF_BITMAP_T &&
370 s.conn < C_AHEAD);
371 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
372 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
373 states. */
374}
375static inline bool drbd_should_send_oos(union drbd_state s)
376{
377 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
378 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
379 since we enter state C_AHEAD only if proto >= 96 */
380}
381
360#endif 382#endif
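
The new drbd_should_do_remote() and drbd_should_send_oos() helpers split one routing decision into two predicates that the caller asserts never hold at once: mirror the write while the peer can still take it, or only record it as out of sync once the connection sits in Ahead or WFBitMapS. A stand-alone sketch of how such ordered-enum range checks compose; the enum values and their ordering here are simplified stand-ins, not the driver's state table:

/* Sketch of the two request-routing predicates: an ordered connection
 * enum lets "mirror the write" and "only send out-of-sync info" be
 * expressed as simple range checks. */
#include <stdbool.h>
#include <stdio.h>

enum disk_state { D_DISKLESS, D_INCONSISTENT, D_UP_TO_DATE };
enum conn_state { C_STANDALONE, C_CONNECTED, C_WF_BITMAP_S, C_WF_BITMAP_T,
                  C_SYNC_SOURCE, C_AHEAD, C_BEHIND };

struct state { enum disk_state pdsk; enum conn_state conn; };

static bool should_do_remote(struct state s)
{
        /* peer disk usable and connection not (yet) in Ahead/Behind */
        return s.pdsk == D_UP_TO_DATE ||
               (s.pdsk >= D_INCONSISTENT &&
                s.conn >= C_WF_BITMAP_T && s.conn < C_AHEAD);
}

static bool should_send_oos(struct state s)
{
        /* only out-of-sync bookkeeping: congestion (Ahead) or bitmap send */
        return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
}

int main(void)
{
        struct state ahead = { D_INCONSISTENT, C_AHEAD };

        printf("remote=%d oos=%d\n",
               should_do_remote(ahead), should_send_oos(ahead));
        return 0;
}
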
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 85179e1fb50a..c44a2a602772 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -48,6 +48,8 @@ static const char *drbd_conn_s_names[] = {
48 [C_PAUSED_SYNC_T] = "PausedSyncT", 48 [C_PAUSED_SYNC_T] = "PausedSyncT",
49 [C_VERIFY_S] = "VerifyS", 49 [C_VERIFY_S] = "VerifyS",
50 [C_VERIFY_T] = "VerifyT", 50 [C_VERIFY_T] = "VerifyT",
51 [C_AHEAD] = "Ahead",
52 [C_BEHIND] = "Behind",
51}; 53};
52 54
53static const char *drbd_role_s_names[] = { 55static const char *drbd_role_s_names[] = {
@@ -92,7 +94,7 @@ static const char *drbd_state_sw_errors[] = {
92const char *drbd_conn_str(enum drbd_conns s) 94const char *drbd_conn_str(enum drbd_conns s)
93{ 95{
94 /* enums are unsigned... */ 96 /* enums are unsigned... */
95 return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; 97 return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
96} 98}
97 99
98const char *drbd_role_str(enum drbd_role s) 100const char *drbd_role_str(enum drbd_role s)
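
Because C_AHEAD and C_BEHIND extend the connection enum, drbd_conn_str() has to move its upper bound from C_PAUSED_SYNC_T to C_BEHIND or it would index past the name table. A small sketch of that bounds-checked enum-to-string lookup, with a toy enum and strings rather than the driver's:

/* Sketch of a bounds-checked enum-to-string table in the style of
 * drbd_conn_str(): the guard must track the last named enum value
 * whenever new states are appended. */
#include <stdio.h>

enum conn { CONN_STANDALONE, CONN_CONNECTED, CONN_AHEAD, CONN_BEHIND };

static const char *conn_names[] = {
        [CONN_STANDALONE] = "StandAlone",
        [CONN_CONNECTED]  = "Connected",
        [CONN_AHEAD]      = "Ahead",
        [CONN_BEHIND]     = "Behind",
};

static const char *conn_str(enum conn s)
{
        /* mirrors the driver's "enums are unsigned" assumption:
         * only the upper bound is checked */
        return s > CONN_BEHIND ? "TOO_LARGE" : conn_names[s];
}

int main(void)
{
        printf("%s %s\n", conn_str(CONN_BEHIND), conn_str((enum conn)99));
        return 0;
}
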
@@ -105,7 +107,7 @@ const char *drbd_disk_str(enum drbd_disk_state s)
105 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; 107 return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
106} 108}
107 109
108const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) 110const char *drbd_set_st_err_str(enum drbd_state_rv err)
109{ 111{
110 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : 112 return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
111 err > SS_TWO_PRIMARIES ? "TOO_LARGE" 113 err > SS_TWO_PRIMARIES ? "TOO_LARGE"
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index e027446590d3..f7e6c92f8d03 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,18 +39,17 @@
39#include "drbd_req.h" 39#include "drbd_req.h"
40 40
41static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); 41static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
42static int w_make_resync_request(struct drbd_conf *mdev,
43 struct drbd_work *w, int cancel);
42 44
43 45
44 46
45/* defined here: 47/* endio handlers:
46 drbd_md_io_complete 48 * drbd_md_io_complete (defined here)
47 drbd_endio_sec 49 * drbd_endio_pri (defined here)
48 drbd_endio_pri 50 * drbd_endio_sec (defined here)
49 51 * bm_async_io_complete (defined in drbd_bitmap.c)
50 * more endio handlers: 52 *
51 atodb_endio in drbd_actlog.c
52 drbd_bm_async_io_complete in drbd_bitmap.c
53
54 * For all these callbacks, note the following: 53 * For all these callbacks, note the following:
55 * The callbacks will be called in irq context by the IDE drivers, 54 * The callbacks will be called in irq context by the IDE drivers,
56 * and in Softirqs/Tasklets/BH context by the SCSI drivers. 55 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
@@ -94,7 +93,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
94 if (list_empty(&mdev->read_ee)) 93 if (list_empty(&mdev->read_ee))
95 wake_up(&mdev->ee_wait); 94 wake_up(&mdev->ee_wait);
96 if (test_bit(__EE_WAS_ERROR, &e->flags)) 95 if (test_bit(__EE_WAS_ERROR, &e->flags))
97 __drbd_chk_io_error(mdev, FALSE); 96 __drbd_chk_io_error(mdev, false);
98 spin_unlock_irqrestore(&mdev->req_lock, flags); 97 spin_unlock_irqrestore(&mdev->req_lock, flags);
99 98
100 drbd_queue_work(&mdev->data.work, &e->w); 99 drbd_queue_work(&mdev->data.work, &e->w);
@@ -137,7 +136,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
137 : list_empty(&mdev->active_ee); 136 : list_empty(&mdev->active_ee);
138 137
139 if (test_bit(__EE_WAS_ERROR, &e->flags)) 138 if (test_bit(__EE_WAS_ERROR, &e->flags))
140 __drbd_chk_io_error(mdev, FALSE); 139 __drbd_chk_io_error(mdev, false);
141 spin_unlock_irqrestore(&mdev->req_lock, flags); 140 spin_unlock_irqrestore(&mdev->req_lock, flags);
142 141
143 if (is_syncer_req) 142 if (is_syncer_req)
@@ -163,14 +162,15 @@ void drbd_endio_sec(struct bio *bio, int error)
163 int uptodate = bio_flagged(bio, BIO_UPTODATE); 162 int uptodate = bio_flagged(bio, BIO_UPTODATE);
164 int is_write = bio_data_dir(bio) == WRITE; 163 int is_write = bio_data_dir(bio) == WRITE;
165 164
166 if (error) 165 if (error && __ratelimit(&drbd_ratelimit_state))
167 dev_warn(DEV, "%s: error=%d s=%llus\n", 166 dev_warn(DEV, "%s: error=%d s=%llus\n",
168 is_write ? "write" : "read", error, 167 is_write ? "write" : "read", error,
169 (unsigned long long)e->sector); 168 (unsigned long long)e->sector);
170 if (!error && !uptodate) { 169 if (!error && !uptodate) {
171 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", 170 if (__ratelimit(&drbd_ratelimit_state))
172 is_write ? "write" : "read", 171 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
173 (unsigned long long)e->sector); 172 is_write ? "write" : "read",
173 (unsigned long long)e->sector);
174 /* strange behavior of some lower level drivers... 174 /* strange behavior of some lower level drivers...
175 * fail the request by clearing the uptodate flag, 175 * fail the request by clearing the uptodate flag,
176 * but do not return any error?! */ 176 * but do not return any error?! */
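
The only change above is that the warnings now go through drbd's global drbd_ratelimit_state, so a misbehaving lower-level device cannot flood the log. A generic sketch of the same kernel ratelimit pattern, with a hypothetical local state and message (not drbd's):

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/ratelimit.h>

/* At most 10 messages per 5 seconds; excess callers are skipped and
 * later summarized as "n callbacks suppressed". */
static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 10);

static void report_io_error(unsigned long long sector, int error)
{
        if (__ratelimit(&my_rs))
                pr_warn("io error %d at sector %llu\n", error, sector);
}
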
@@ -250,13 +250,6 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
250 return w_send_read_req(mdev, w, 0); 250 return w_send_read_req(mdev, w, 0);
251} 251}
252 252
253int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
254{
255 ERR_IF(cancel) return 1;
256 dev_err(DEV, "resync inactive, but callback triggered??\n");
257 return 1; /* Simply ignore this! */
258}
259
260void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest) 253void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
261{ 254{
262 struct hash_desc desc; 255 struct hash_desc desc;
@@ -355,7 +348,7 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
355 if (!get_ldev(mdev)) 348 if (!get_ldev(mdev))
356 return -EIO; 349 return -EIO;
357 350
358 if (drbd_rs_should_slow_down(mdev)) 351 if (drbd_rs_should_slow_down(mdev, sector))
359 goto defer; 352 goto defer;
360 353
361 /* GFP_TRY, because if there is no memory available right now, this may 354 /* GFP_TRY, because if there is no memory available right now, this may
@@ -373,9 +366,10 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
373 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) 366 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
374 return 0; 367 return 0;
375 368
376 /* drbd_submit_ee currently fails for one reason only: 369 /* If it failed because of ENOMEM, retry should help. If it failed
377 * not being able to allocate enough bios. 370 * because bio_add_page failed (probably broken lower level driver),
378 * Is dropping the connection going to help? */ 371 * retry may or may not help.
372 * If it does not, you may need to force disconnect. */
379 spin_lock_irq(&mdev->req_lock); 373 spin_lock_irq(&mdev->req_lock);
380 list_del(&e->w.list); 374 list_del(&e->w.list);
381 spin_unlock_irq(&mdev->req_lock); 375 spin_unlock_irq(&mdev->req_lock);
@@ -386,26 +380,25 @@ defer:
386 return -EAGAIN; 380 return -EAGAIN;
387} 381}
388 382
389void resync_timer_fn(unsigned long data) 383int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
390{ 384{
391 struct drbd_conf *mdev = (struct drbd_conf *) data;
392 int queue;
393
394 queue = 1;
395 switch (mdev->state.conn) { 385 switch (mdev->state.conn) {
396 case C_VERIFY_S: 386 case C_VERIFY_S:
397 mdev->resync_work.cb = w_make_ov_request; 387 w_make_ov_request(mdev, w, cancel);
398 break; 388 break;
399 case C_SYNC_TARGET: 389 case C_SYNC_TARGET:
400 mdev->resync_work.cb = w_make_resync_request; 390 w_make_resync_request(mdev, w, cancel);
401 break; 391 break;
402 default:
403 queue = 0;
404 mdev->resync_work.cb = w_resync_inactive;
405 } 392 }
406 393
407 /* harmless race: list_empty outside data.work.q_lock */ 394 return 1;
408 if (list_empty(&mdev->resync_work.list) && queue) 395}
396
397void resync_timer_fn(unsigned long data)
398{
399 struct drbd_conf *mdev = (struct drbd_conf *) data;
400
401 if (list_empty(&mdev->resync_work.list))
409 drbd_queue_work(&mdev->data.work, &mdev->resync_work); 402 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
410} 403}
411 404
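
After this hunk the timer callback carries no policy at all: resync_timer_fn() just re-queues the pre-allocated resync_work item, and the state dispatch happens in w_resync_timer() in the worker's process context. A generic sketch of that "timer only queues, worker decides" split using the stock timer and workqueue APIs and hypothetical names (drbd hands the item to its own drbd_queue_work() rather than schedule_work()):

#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct work_struct my_work;
static struct timer_list my_timer;

/* Process context, run by a worker thread: may sleep, take mutexes,
 * send packets -- the things a timer callback must not do. */
static void my_work_fn(struct work_struct *w)
{
        /* dispatch on current state here, as w_resync_timer() does */
}

/* (Soft)irq context: do the bare minimum and defer to the worker. */
static void my_timer_fn(unsigned long data)
{
        schedule_work(&my_work);
}

static void my_setup(void)
{
        INIT_WORK(&my_work, my_work_fn);
        setup_timer(&my_timer, my_timer_fn, 0);
        mod_timer(&my_timer, jiffies + HZ / 10);
}
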
@@ -438,7 +431,7 @@ static void fifo_add_val(struct fifo_buffer *fb, int value)
438 fb->values[i] += value; 431 fb->values[i] += value;
439} 432}
440 433
441int drbd_rs_controller(struct drbd_conf *mdev) 434static int drbd_rs_controller(struct drbd_conf *mdev)
442{ 435{
443 unsigned int sect_in; /* Number of sectors that came in since the last turn */ 436 unsigned int sect_in; /* Number of sectors that came in since the last turn */
444 unsigned int want; /* The number of sectors we want in the proxy */ 437 unsigned int want; /* The number of sectors we want in the proxy */
@@ -492,29 +485,36 @@ int drbd_rs_controller(struct drbd_conf *mdev)
492 return req_sect; 485 return req_sect;
493} 486}
494 487
495int w_make_resync_request(struct drbd_conf *mdev, 488static int drbd_rs_number_requests(struct drbd_conf *mdev)
496 struct drbd_work *w, int cancel) 489{
490 int number;
491 if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
492 number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
493 mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
494 } else {
495 mdev->c_sync_rate = mdev->sync_conf.rate;
496 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
497 }
498
499 /* ignore the amount of pending requests, the resync controller should
500 * throttle down to incoming reply rate soon enough anyways. */
501 return number;
502}
503
504static int w_make_resync_request(struct drbd_conf *mdev,
505 struct drbd_work *w, int cancel)
497{ 506{
498 unsigned long bit; 507 unsigned long bit;
499 sector_t sector; 508 sector_t sector;
500 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 509 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
501 int max_segment_size; 510 int max_bio_size;
502 int number, rollback_i, size, pe, mx; 511 int number, rollback_i, size;
503 int align, queued, sndbuf; 512 int align, queued, sndbuf;
504 int i = 0; 513 int i = 0;
505 514
506 if (unlikely(cancel)) 515 if (unlikely(cancel))
507 return 1; 516 return 1;
508 517
509 if (unlikely(mdev->state.conn < C_CONNECTED)) {
510 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
511 return 0;
512 }
513
514 if (mdev->state.conn != C_SYNC_TARGET)
515 dev_err(DEV, "%s in w_make_resync_request\n",
516 drbd_conn_str(mdev->state.conn));
517
518 if (mdev->rs_total == 0) { 518 if (mdev->rs_total == 0) {
519 /* empty resync? */ 519 /* empty resync? */
520 drbd_resync_finished(mdev); 520 drbd_resync_finished(mdev);
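
drbd_rs_number_requests() above is the old rate arithmetic factored out of w_make_resync_request(): with a fixed rate it turns "KiB per second" into "requests per timer tick", one request per dirty bitmap bit. A stand-alone back-of-the-envelope version of the fixed-rate branch, assuming (as in drbd) 4 KiB per bitmap bit and a SLEEP_TIME of HZ/10, i.e. a 100 ms tick; HZ = 250 is just an example value:

#include <stdio.h>

#define HZ            250            /* assumed tick rate */
#define SLEEP_TIME    (HZ / 10)      /* resync timer period: 100 ms */
#define BM_BLOCK_SIZE 4096           /* bytes covered by one bitmap bit */

/* Fixed-rate branch of drbd_rs_number_requests():
 * how many 4 KiB resync requests per 100 ms tick for a given rate? */
static int requests_per_tick(int rate_kib_per_sec)
{
        return SLEEP_TIME * rate_kib_per_sec / ((BM_BLOCK_SIZE / 1024) * HZ);
}

int main(void)
{
        int rates[] = { 250, 10240, 102400 };   /* KiB/s */
        for (int i = 0; i < 3; i++)
                printf("%6d KiB/s -> %5d requests (%d KiB) per tick\n",
                       rates[i], requests_per_tick(rates[i]),
                       requests_per_tick(rates[i]) * (BM_BLOCK_SIZE / 1024));
        return 0;
}
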
@@ -527,49 +527,19 @@ int w_make_resync_request(struct drbd_conf *mdev,
527 to continue resync with a broken disk makes no sense at 527 to continue resync with a broken disk makes no sense at
528 all */ 528 all */
529 dev_err(DEV, "Disk broke down during resync!\n"); 529 dev_err(DEV, "Disk broke down during resync!\n");
530 mdev->resync_work.cb = w_resync_inactive;
531 return 1; 530 return 1;
532 } 531 }
533 532
534 /* starting with drbd 8.3.8, we can handle multi-bio EEs, 533 /* starting with drbd 8.3.8, we can handle multi-bio EEs,
535 * if it should be necessary */ 534 * if it should be necessary */
536 max_segment_size = 535 max_bio_size =
537 mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) : 536 mdev->agreed_pro_version < 94 ? queue_max_hw_sectors(mdev->rq_queue) << 9 :
538 mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE; 537 mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_BIO_SIZE;
539 538
540 if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */ 539 number = drbd_rs_number_requests(mdev);
541 number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); 540 if (number == 0)
542 mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
543 } else {
544 mdev->c_sync_rate = mdev->sync_conf.rate;
545 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
546 }
547
548 /* Throttle resync on lower level disk activity, which may also be
549 * caused by application IO on Primary/SyncTarget.
550 * Keep this after the call to drbd_rs_controller, as that assumes
551 * to be called as precisely as possible every SLEEP_TIME,
552 * and would be confused otherwise. */
553 if (drbd_rs_should_slow_down(mdev))
554 goto requeue; 541 goto requeue;
555 542
556 mutex_lock(&mdev->data.mutex);
557 if (mdev->data.socket)
558 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
559 else
560 mx = 1;
561 mutex_unlock(&mdev->data.mutex);
562
563 /* For resync rates >160MB/sec, allow more pending RS requests */
564 if (number > mx)
565 mx = number;
566
567 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
568 pe = atomic_read(&mdev->rs_pending_cnt);
569 if ((pe + number) > mx) {
570 number = mx - pe;
571 }
572
573 for (i = 0; i < number; i++) { 543 for (i = 0; i < number; i++) {
574 /* Stop generating RS requests, when half of the send buffer is filled */ 544 /* Stop generating RS requests, when half of the send buffer is filled */
575 mutex_lock(&mdev->data.mutex); 545 mutex_lock(&mdev->data.mutex);
@@ -588,16 +558,16 @@ next_sector:
588 size = BM_BLOCK_SIZE; 558 size = BM_BLOCK_SIZE;
589 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); 559 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
590 560
591 if (bit == -1UL) { 561 if (bit == DRBD_END_OF_BITMAP) {
592 mdev->bm_resync_fo = drbd_bm_bits(mdev); 562 mdev->bm_resync_fo = drbd_bm_bits(mdev);
593 mdev->resync_work.cb = w_resync_inactive;
594 put_ldev(mdev); 563 put_ldev(mdev);
595 return 1; 564 return 1;
596 } 565 }
597 566
598 sector = BM_BIT_TO_SECT(bit); 567 sector = BM_BIT_TO_SECT(bit);
599 568
600 if (drbd_try_rs_begin_io(mdev, sector)) { 569 if (drbd_rs_should_slow_down(mdev, sector) ||
570 drbd_try_rs_begin_io(mdev, sector)) {
601 mdev->bm_resync_fo = bit; 571 mdev->bm_resync_fo = bit;
602 goto requeue; 572 goto requeue;
603 } 573 }
@@ -608,7 +578,7 @@ next_sector:
608 goto next_sector; 578 goto next_sector;
609 } 579 }
610 580
611#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE 581#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
612 /* try to find some adjacent bits. 582 /* try to find some adjacent bits.
613 * we stop if we have already the maximum req size. 583 * we stop if we have already the maximum req size.
614 * 584 *
@@ -618,7 +588,7 @@ next_sector:
618 align = 1; 588 align = 1;
619 rollback_i = i; 589 rollback_i = i;
620 for (;;) { 590 for (;;) {
621 if (size + BM_BLOCK_SIZE > max_segment_size) 591 if (size + BM_BLOCK_SIZE > max_bio_size)
622 break; 592 break;
623 593
624 /* Be always aligned */ 594 /* Be always aligned */
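
These two hunks only rename the cap from max_segment_size to max_bio_size, but the surrounding loop (only partly visible here) is what makes resync efficient: runs of adjacent dirty bits are merged into one request of up to max_bio_size bytes. A deliberately simplified, runnable model of that packing, ignoring drbd's alignment heuristics and rollback handling; the names and the 16 KiB cap are made up for illustration:

#include <stdio.h>

#define BM_BLOCK_SIZE 4096                 /* bytes per bitmap bit */

/* Greedy merge of consecutive dirty bits into requests of at most
 * max_bio_size bytes.  This only illustrates why a larger max_bio_size
 * means fewer, bigger resync requests. */
static void make_requests(const int *dirty, int nbits, int max_bio_size)
{
        int bit = 0;
        while (bit < nbits) {
                int size;
                if (!dirty[bit]) {          /* skip clean regions */
                        bit++;
                        continue;
                }
                size = BM_BLOCK_SIZE;
                while (bit + size / BM_BLOCK_SIZE < nbits &&
                       dirty[bit + size / BM_BLOCK_SIZE] &&
                       size + BM_BLOCK_SIZE <= max_bio_size)
                        size += BM_BLOCK_SIZE;
                printf("request: bit %d, %d KiB\n", bit, size / 1024);
                bit += size / BM_BLOCK_SIZE;
        }
}

int main(void)
{
        int dirty[] = { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
        make_requests(dirty, 13, 16 * 1024);   /* 16 KiB cap */
        return 0;
}
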
@@ -685,7 +655,6 @@ next_sector:
685 * resync data block, and the last bit is cleared. 655 * resync data block, and the last bit is cleared.
686 * until then resync "work" is "inactive" ... 656 * until then resync "work" is "inactive" ...
687 */ 657 */
688 mdev->resync_work.cb = w_resync_inactive;
689 put_ldev(mdev); 658 put_ldev(mdev);
690 return 1; 659 return 1;
691 } 660 }
@@ -706,27 +675,18 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
706 if (unlikely(cancel)) 675 if (unlikely(cancel))
707 return 1; 676 return 1;
708 677
709 if (unlikely(mdev->state.conn < C_CONNECTED)) { 678 number = drbd_rs_number_requests(mdev);
710 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
711 return 0;
712 }
713
714 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
715 if (atomic_read(&mdev->rs_pending_cnt) > number)
716 goto requeue;
717
718 number -= atomic_read(&mdev->rs_pending_cnt);
719 679
720 sector = mdev->ov_position; 680 sector = mdev->ov_position;
721 for (i = 0; i < number; i++) { 681 for (i = 0; i < number; i++) {
722 if (sector >= capacity) { 682 if (sector >= capacity) {
723 mdev->resync_work.cb = w_resync_inactive;
724 return 1; 683 return 1;
725 } 684 }
726 685
727 size = BM_BLOCK_SIZE; 686 size = BM_BLOCK_SIZE;
728 687
729 if (drbd_try_rs_begin_io(mdev, sector)) { 688 if (drbd_rs_should_slow_down(mdev, sector) ||
689 drbd_try_rs_begin_io(mdev, sector)) {
730 mdev->ov_position = sector; 690 mdev->ov_position = sector;
731 goto requeue; 691 goto requeue;
732 } 692 }
@@ -744,11 +704,33 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
744 mdev->ov_position = sector; 704 mdev->ov_position = sector;
745 705
746 requeue: 706 requeue:
707 mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
747 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); 708 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
748 return 1; 709 return 1;
749} 710}
750 711
751 712
713void start_resync_timer_fn(unsigned long data)
714{
715 struct drbd_conf *mdev = (struct drbd_conf *) data;
716
717 drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
718}
719
720int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
721{
722 if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
723 dev_warn(DEV, "w_start_resync later...\n");
724 mdev->start_resync_timer.expires = jiffies + HZ/10;
725 add_timer(&mdev->start_resync_timer);
726 return 1;
727 }
728
729 drbd_start_resync(mdev, C_SYNC_SOURCE);
730 clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
731 return 1;
732}
733
752int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 734int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
753{ 735{
754 kfree(w); 736 kfree(w);
@@ -782,6 +764,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
782 union drbd_state os, ns; 764 union drbd_state os, ns;
783 struct drbd_work *w; 765 struct drbd_work *w;
784 char *khelper_cmd = NULL; 766 char *khelper_cmd = NULL;
767 int verify_done = 0;
785 768
786 /* Remove all elements from the resync LRU. Since future actions 769 /* Remove all elements from the resync LRU. Since future actions
787 * might set bits in the (main) bitmap, then the entries in the 770 * might set bits in the (main) bitmap, then the entries in the
@@ -792,8 +775,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
792 * queue (or even the read operations for those packets 775 * queue (or even the read operations for those packets
793 * is not finished by now). Retry in 100ms. */ 776 * is not finished by now). Retry in 100ms. */
794 777
795 __set_current_state(TASK_INTERRUPTIBLE); 778 schedule_timeout_interruptible(HZ / 10);
796 schedule_timeout(HZ / 10);
797 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); 779 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
798 if (w) { 780 if (w) {
799 w->cb = w_resync_finished; 781 w->cb = w_resync_finished;
@@ -818,6 +800,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
818 spin_lock_irq(&mdev->req_lock); 800 spin_lock_irq(&mdev->req_lock);
819 os = mdev->state; 801 os = mdev->state;
820 802
803 verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
804
821 /* This protects us against multiple calls (that can happen in the presence 805 /* This protects us against multiple calls (that can happen in the presence
822 of application IO), and against connectivity loss just before we arrive here. */ 806 of application IO), and against connectivity loss just before we arrive here. */
823 if (os.conn <= C_CONNECTED) 807 if (os.conn <= C_CONNECTED)
@@ -827,8 +811,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
827 ns.conn = C_CONNECTED; 811 ns.conn = C_CONNECTED;
828 812
829 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", 813 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
830 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? 814 verify_done ? "Online verify " : "Resync",
831 "Online verify " : "Resync",
832 dt + mdev->rs_paused, mdev->rs_paused, dbdt); 815 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
833 816
834 n_oos = drbd_bm_total_weight(mdev); 817 n_oos = drbd_bm_total_weight(mdev);
@@ -886,14 +869,18 @@ int drbd_resync_finished(struct drbd_conf *mdev)
886 } 869 }
887 } 870 }
888 871
889 drbd_uuid_set_bm(mdev, 0UL); 872 if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
890 873 /* for verify runs, we don't update uuids here,
891 if (mdev->p_uuid) { 874 * so there would be nothing to report. */
892 /* Now the two UUID sets are equal, update what we 875 drbd_uuid_set_bm(mdev, 0UL);
893 * know of the peer. */ 876 drbd_print_uuids(mdev, "updated UUIDs");
894 int i; 877 if (mdev->p_uuid) {
895 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) 878 /* Now the two UUID sets are equal, update what we
896 mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; 879 * know of the peer. */
880 int i;
881 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
882 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
883 }
897 } 884 }
898 } 885 }
899 886
@@ -905,15 +892,11 @@ out:
905 mdev->rs_total = 0; 892 mdev->rs_total = 0;
906 mdev->rs_failed = 0; 893 mdev->rs_failed = 0;
907 mdev->rs_paused = 0; 894 mdev->rs_paused = 0;
908 mdev->ov_start_sector = 0; 895 if (verify_done)
896 mdev->ov_start_sector = 0;
909 897
910 drbd_md_sync(mdev); 898 drbd_md_sync(mdev);
911 899
912 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
913 dev_info(DEV, "Writing the whole bitmap\n");
914 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
915 }
916
917 if (khelper_cmd) 900 if (khelper_cmd)
918 drbd_khelper(mdev, khelper_cmd); 901 drbd_khelper(mdev, khelper_cmd);
919 902
@@ -994,7 +977,9 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
994 put_ldev(mdev); 977 put_ldev(mdev);
995 } 978 }
996 979
997 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 980 if (mdev->state.conn == C_AHEAD) {
981 ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
982 } else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
998 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { 983 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
999 inc_rs_pending(mdev); 984 inc_rs_pending(mdev);
1000 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 985 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -1096,25 +1081,27 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1096 if (unlikely(cancel)) 1081 if (unlikely(cancel))
1097 goto out; 1082 goto out;
1098 1083
1099 if (unlikely((e->flags & EE_WAS_ERROR) != 0))
1100 goto out;
1101
1102 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1084 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1103 /* FIXME if this allocation fails, online verify will not terminate! */
1104 digest = kmalloc(digest_size, GFP_NOIO); 1085 digest = kmalloc(digest_size, GFP_NOIO);
1105 if (digest) { 1086 if (!digest) {
1106 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest); 1087 ok = 0; /* terminate the connection in case the allocation failed */
1107 inc_rs_pending(mdev); 1088 goto out;
1108 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
1109 digest, digest_size, P_OV_REPLY);
1110 if (!ok)
1111 dec_rs_pending(mdev);
1112 kfree(digest);
1113 } 1089 }
1114 1090
1091 if (likely(!(e->flags & EE_WAS_ERROR)))
1092 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1093 else
1094 memset(digest, 0, digest_size);
1095
1096 inc_rs_pending(mdev);
1097 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
1098 digest, digest_size, P_OV_REPLY);
1099 if (!ok)
1100 dec_rs_pending(mdev);
1101 kfree(digest);
1102
1115out: 1103out:
1116 drbd_free_ee(mdev, e); 1104 drbd_free_ee(mdev, e);
1117
1118 dec_unacked(mdev); 1105 dec_unacked(mdev);
1119 1106
1120 return ok; 1107 return ok;
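
The rework above addresses the removed FIXME: previously a failed local read made w_e_end_ov_req() skip the reply entirely, and a failed digest allocation did the same. Now an allocation failure tears down the connection (ok = 0), and a read error is answered with a zeroed digest, which the peer (in w_e_end_ov_reply) simply sees as a mismatch. A toy user-space model of why "always reply, even if with a zero digest" lets the verify run terminate; every name and value in it is made up:

#include <stdio.h>
#include <string.h>

#define DIGEST_SIZE 16

int main(void)
{
        int read_ok[] = { 1, 1, 0, 1 };          /* block 2 hit a read error */
        unsigned char local[DIGEST_SIZE], remote[DIGEST_SIZE];
        int replies = 0, oos = 0;

        memset(remote, 0xab, sizeof(remote));    /* peer's digest of the data */

        for (int i = 0; i < 4; i++) {
                if (read_ok[i])
                        memset(local, 0xab, sizeof(local));  /* pretend checksum */
                else
                        memset(local, 0, sizeof(local));     /* error: zero digest */
                replies++;                        /* a reply is always sent now */
                if (memcmp(local, remote, DIGEST_SIZE))
                        oos++;                    /* mismatch -> marked out of sync */
        }
        printf("replies=%d (verify can finish), out-of-sync blocks=%d\n",
               replies, oos);
        return 0;
}
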
@@ -1129,7 +1116,6 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1129 mdev->ov_last_oos_size = size>>9; 1116 mdev->ov_last_oos_size = size>>9;
1130 } 1117 }
1131 drbd_set_out_of_sync(mdev, sector, size); 1118 drbd_set_out_of_sync(mdev, sector, size);
1132 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1133} 1119}
1134 1120
1135int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1121int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -1165,10 +1151,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1165 eq = !memcmp(digest, di->digest, digest_size); 1151 eq = !memcmp(digest, di->digest, digest_size);
1166 kfree(digest); 1152 kfree(digest);
1167 } 1153 }
1168 } else {
1169 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1170 if (__ratelimit(&drbd_ratelimit_state))
1171 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1172 } 1154 }
1173 1155
1174 dec_unacked(mdev); 1156 dec_unacked(mdev);
@@ -1182,7 +1164,13 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1182 1164
1183 drbd_free_ee(mdev, e); 1165 drbd_free_ee(mdev, e);
1184 1166
1185 if (--mdev->ov_left == 0) { 1167 --mdev->ov_left;
1168
1169 /* let's advance progress step marks only for every other megabyte */
1170 if ((mdev->ov_left & 0x200) == 0x200)
1171 drbd_advance_rs_marks(mdev, mdev->ov_left);
1172
1173 if (mdev->ov_left == 0) {
1186 ov_oos_print(mdev); 1174 ov_oos_print(mdev);
1187 drbd_resync_finished(mdev); 1175 drbd_resync_finished(mdev);
1188 } 1176 }
@@ -1235,6 +1223,22 @@ int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1235 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); 1223 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1236} 1224}
1237 1225
1226int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1227{
1228 struct drbd_request *req = container_of(w, struct drbd_request, w);
1229 int ok;
1230
1231 if (unlikely(cancel)) {
1232 req_mod(req, send_canceled);
1233 return 1;
1234 }
1235
1236 ok = drbd_send_oos(mdev, req);
1237 req_mod(req, oos_handed_to_network);
1238
1239 return ok;
1240}
1241
1238/** 1242/**
1239 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request 1243 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1240 * @mdev: DRBD device. 1244 * @mdev: DRBD device.
@@ -1430,6 +1434,17 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
1430 return retcode; 1434 return retcode;
1431} 1435}
1432 1436
1437void drbd_rs_controller_reset(struct drbd_conf *mdev)
1438{
1439 atomic_set(&mdev->rs_sect_in, 0);
1440 atomic_set(&mdev->rs_sect_ev, 0);
1441 mdev->rs_in_flight = 0;
1442 mdev->rs_planed = 0;
1443 spin_lock(&mdev->peer_seq_lock);
1444 fifo_set(&mdev->rs_plan_s, 0);
1445 spin_unlock(&mdev->peer_seq_lock);
1446}
1447
1433/** 1448/**
1434 * drbd_start_resync() - Start the resync process 1449 * drbd_start_resync() - Start the resync process
1435 * @mdev: DRBD device. 1450 * @mdev: DRBD device.
@@ -1443,13 +1458,18 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1443 union drbd_state ns; 1458 union drbd_state ns;
1444 int r; 1459 int r;
1445 1460
1446 if (mdev->state.conn >= C_SYNC_SOURCE) { 1461 if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
1447 dev_err(DEV, "Resync already running!\n"); 1462 dev_err(DEV, "Resync already running!\n");
1448 return; 1463 return;
1449 } 1464 }
1450 1465
1451 /* In case a previous resync run was aborted by an IO error/detach on the peer. */ 1466 if (mdev->state.conn < C_AHEAD) {
1452 drbd_rs_cancel_all(mdev); 1467 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1468 drbd_rs_cancel_all(mdev);
1469 /* This should be done when we abort the resync. We definitely do not
1470 want to have this for connections going back and forth between
1471 Ahead/Behind and SyncSource/SyncTarget */
1472 }
1453 1473
1454 if (side == C_SYNC_TARGET) { 1474 if (side == C_SYNC_TARGET) {
1455 /* Since application IO was locked out during C_WF_BITMAP_T and 1475 /* Since application IO was locked out during C_WF_BITMAP_T and
@@ -1463,6 +1483,20 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1463 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 1483 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1464 return; 1484 return;
1465 } 1485 }
1486 } else /* C_SYNC_SOURCE */ {
1487 r = drbd_khelper(mdev, "before-resync-source");
1488 r = (r >> 8) & 0xff;
1489 if (r > 0) {
1490 if (r == 3) {
1491 dev_info(DEV, "before-resync-source handler returned %d, "
1492 "ignoring. Old userland tools?", r);
1493 } else {
1494 dev_info(DEV, "before-resync-source handler returned %d, "
1495 "dropping connection.\n", r);
1496 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1497 return;
1498 }
1499 }
1466 } 1500 }
1467 1501
1468 drbd_state_lock(mdev); 1502 drbd_state_lock(mdev);
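
The new before-resync-source branch mirrors the existing before-resync-target handling: drbd_khelper() evidently hands back a wait()-style status, so the handler's exit code is extracted with (r >> 8) & 0xff and then either ignored (3, "old userland tools") or treated as fatal. A stand-alone illustration of that extraction; the shell command is arbitrary and WEXITSTATUS() is the libc spelling of the same shift-and-mask:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

int main(void)
{
        /* Run a helper that exits with status 3, the "ignore" case above. */
        int status = system("exit 3");
        int r = (status >> 8) & 0xff;            /* what the drbd code does */

        printf("raw status 0x%x, exit code %d, WEXITSTATUS %d\n",
               status, r, WEXITSTATUS(status));
        return 0;
}
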
@@ -1472,18 +1506,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1472 return; 1506 return;
1473 } 1507 }
1474 1508
1475 if (side == C_SYNC_TARGET) {
1476 mdev->bm_resync_fo = 0;
1477 } else /* side == C_SYNC_SOURCE */ {
1478 u64 uuid;
1479
1480 get_random_bytes(&uuid, sizeof(u64));
1481 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1482 drbd_send_sync_uuid(mdev, uuid);
1483
1484 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1485 }
1486
1487 write_lock_irq(&global_state_lock); 1509 write_lock_irq(&global_state_lock);
1488 ns = mdev->state; 1510 ns = mdev->state;
1489 1511
@@ -1521,13 +1543,24 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1521 _drbd_pause_after(mdev); 1543 _drbd_pause_after(mdev);
1522 } 1544 }
1523 write_unlock_irq(&global_state_lock); 1545 write_unlock_irq(&global_state_lock);
1524 put_ldev(mdev);
1525 1546
1526 if (r == SS_SUCCESS) { 1547 if (r == SS_SUCCESS) {
1527 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", 1548 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1528 drbd_conn_str(ns.conn), 1549 drbd_conn_str(ns.conn),
1529 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), 1550 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1530 (unsigned long) mdev->rs_total); 1551 (unsigned long) mdev->rs_total);
1552 if (side == C_SYNC_TARGET)
1553 mdev->bm_resync_fo = 0;
1554
1555 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1556 * with w_send_oos, or the sync target will get confused as to
1557 * how much bits to resync. We cannot do that always, because for an
1558 * empty resync and protocol < 95, we need to do it here, as we call
1559 * drbd_resync_finished from here in that case.
1560 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1561 * and from after_state_ch otherwise. */
1562 if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
1563 drbd_gen_and_send_sync_uuid(mdev);
1531 1564
1532 if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) { 1565 if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
1533 /* This still has a race (about when exactly the peers 1566 /* This still has a race (about when exactly the peers
@@ -1547,13 +1580,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1547 drbd_resync_finished(mdev); 1580 drbd_resync_finished(mdev);
1548 } 1581 }
1549 1582
1550 atomic_set(&mdev->rs_sect_in, 0); 1583 drbd_rs_controller_reset(mdev);
1551 atomic_set(&mdev->rs_sect_ev, 0);
1552 mdev->rs_in_flight = 0;
1553 mdev->rs_planed = 0;
1554 spin_lock(&mdev->peer_seq_lock);
1555 fifo_set(&mdev->rs_plan_s, 0);
1556 spin_unlock(&mdev->peer_seq_lock);
1557 /* ns.conn may already be != mdev->state.conn, 1584 /* ns.conn may already be != mdev->state.conn,
1558 * we may have been paused in between, or become paused until 1585 * we may have been paused in between, or become paused until
1559 * the timer triggers. 1586 * the timer triggers.
@@ -1563,6 +1590,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1563 1590
1564 drbd_md_sync(mdev); 1591 drbd_md_sync(mdev);
1565 } 1592 }
1593 put_ldev(mdev);
1566 drbd_state_unlock(mdev); 1594 drbd_state_unlock(mdev);
1567} 1595}
1568 1596
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index 53586fa5ae1b..151f1a37478f 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -39,7 +39,7 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
39 return; 39 return;
40 } 40 }
41 41
42 if (FAULT_ACTIVE(mdev, fault_type)) 42 if (drbd_insert_fault(mdev, fault_type))
43 bio_endio(bio, -EIO); 43 bio_endio(bio, -EIO);
44 else 44 else
45 generic_make_request(bio); 45 generic_make_request(bio);