about summary refs log tree commit diff stats
path: root/drivers/block/drbd
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-05-30 12:05:47 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-05-30 12:05:47 -0400
commit a70f35af4e49f87ba4b6c4b30220fbb66cd74af6 (patch)
tree f81d1c68d332f7ed32048085fa2972c057f62419 /drivers/block/drbd
parent 0d167518e045cc8bb63f0a8a0a85ad4fa4e0044f (diff)
parent 4fd1ffaa122cf66bfb710ced43679413df4f3605 (diff)
Merge branch 'for-3.5/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "Here are the driver related changes for 3.5. It contains:

  - The floppy changes from Jiri. Jiri is now also marked as the
    maintainer of floppy.c, I shall be publically branding his forehead
    with red hot iron at the next opportune moment.

  - A batch of drbd updates and fixes from the linbit crew, as well as
    fixes from others.

  - Two small fixes for xen-blkfront courtesy of Jan."

* 'for-3.5/drivers' of git://git.kernel.dk/linux-block: (70 commits)
  floppy: take over maintainership
  floppy: remove floppy-specific O_EXCL handling
  floppy: convert to delayed work and single-thread wq
  xen-blkfront: module exit handling adjustments
  xen-blkfront: properly name all devices
  drbd: grammar fix in log message
  drbd: check MODULE for THIS_MODULE
  drbd: Restore the request restart logic
  drbd: introduce a bio_set to allocate housekeeping bios from
  drbd: remove unused define
  drbd: bm_page_async_io: properly initialize page->private
  drbd: use the newly introduced page pool for bitmap IO
  drbd: add page pool to be used for meta data IO
  drbd: allow bitmap to change during writeout from resync_finished
  drbd: fix race between drbdadm invalidate/verify and finishing resync
  drbd: fix resend/resubmit of frozen IO
  drbd: Ensure that data_size is not 0 before using data_size-1 as index
  drbd: Delay/reject other state changes while establishing a connection
  drbd: move put_ldev from __req_mod() to the endio callback
  drbd: fix WRITE_ACKED_BY_PEER_AND_SIS to not set RQ_NET_DONE
  ...
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r-- drivers/block/drbd/drbd_actlog.c 104
-rw-r--r-- drivers/block/drbd/drbd_bitmap.c 146
-rw-r--r-- drivers/block/drbd/drbd_int.h 90
-rw-r--r-- drivers/block/drbd/drbd_main.c 357
-rw-r--r-- drivers/block/drbd/drbd_nl.c 48
-rw-r--r-- drivers/block/drbd/drbd_proc.c 2
-rw-r--r-- drivers/block/drbd/drbd_receiver.c 95
-rw-r--r-- drivers/block/drbd/drbd_req.c 132
-rw-r--r-- drivers/block/drbd/drbd_req.h 19
-rw-r--r-- drivers/block/drbd/drbd_worker.c 31
10 files changed, 741 insertions, 283 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index cf0e63dd97da..e54e31b02b88 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {
65 65
66int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); 66int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
67 67
68void *drbd_md_get_buffer(struct drbd_conf *mdev)
69{
70 int r;
71
72 wait_event(mdev->misc_wait,
73 (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
74 mdev->state.disk <= D_FAILED);
75
76 return r ? NULL : page_address(mdev->md_io_page);
77}
78
79void drbd_md_put_buffer(struct drbd_conf *mdev)
80{
81 if (atomic_dec_and_test(&mdev->md_io_in_use))
82 wake_up(&mdev->misc_wait);
83}
84
85static bool md_io_allowed(struct drbd_conf *mdev)
86{
87 enum drbd_disk_state ds = mdev->state.disk;
88 return ds >= D_NEGOTIATING || ds == D_ATTACHING;
89}
90
91void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
92 unsigned int *done)
93{
94 long dt = bdev->dc.disk_timeout * HZ / 10;
95 if (dt == 0)
96 dt = MAX_SCHEDULE_TIMEOUT;
97
98 dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
99 if (dt == 0)
100 dev_err(DEV, "meta-data IO operation timed out\n");
101}
102
68static int _drbd_md_sync_page_io(struct drbd_conf *mdev, 103static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
69 struct drbd_backing_dev *bdev, 104 struct drbd_backing_dev *bdev,
70 struct page *page, sector_t sector, 105 struct page *page, sector_t sector,
71 int rw, int size) 106 int rw, int size)
72{ 107{
73 struct bio *bio; 108 struct bio *bio;
74 struct drbd_md_io md_io;
75 int ok; 109 int ok;
76 110
77 md_io.mdev = mdev; 111 mdev->md_io.done = 0;
78 init_completion(&md_io.event); 112 mdev->md_io.error = -ENODEV;
79 md_io.error = 0;
80 113
81 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) 114 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
82 rw |= REQ_FUA | REQ_FLUSH; 115 rw |= REQ_FUA | REQ_FLUSH;
83 rw |= REQ_SYNC; 116 rw |= REQ_SYNC;
84 117
85 bio = bio_alloc(GFP_NOIO, 1); 118 bio = bio_alloc_drbd(GFP_NOIO);
86 bio->bi_bdev = bdev->md_bdev; 119 bio->bi_bdev = bdev->md_bdev;
87 bio->bi_sector = sector; 120 bio->bi_sector = sector;
88 ok = (bio_add_page(bio, page, size, 0) == size); 121 ok = (bio_add_page(bio, page, size, 0) == size);
89 if (!ok) 122 if (!ok)
90 goto out; 123 goto out;
91 bio->bi_private = &md_io; 124 bio->bi_private = &mdev->md_io;
92 bio->bi_end_io = drbd_md_io_complete; 125 bio->bi_end_io = drbd_md_io_complete;
93 bio->bi_rw = rw; 126 bio->bi_rw = rw;
94 127
128 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
129 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
130 ok = 0;
131 goto out;
132 }
133
134 bio_get(bio); /* one bio_put() is in the completion handler */
135 atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
95 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 136 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
96 bio_endio(bio, -EIO); 137 bio_endio(bio, -EIO);
97 else 138 else
98 submit_bio(rw, bio); 139 submit_bio(rw, bio);
99 wait_for_completion(&md_io.event); 140 wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
100 ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; 141 ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
101 142
102 out: 143 out:
103 bio_put(bio); 144 bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
111 int offset = 0; 152 int offset = 0;
112 struct page *iop = mdev->md_io_page; 153 struct page *iop = mdev->md_io_page;
113 154
114 D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); 155 D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
115 156
116 BUG_ON(!bdev->md_bdev); 157 BUG_ON(!bdev->md_bdev);
117 158
@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
328 return 1; 369 return 1;
329 } 370 }
330 371
331 mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ 372 buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
332 buffer = (struct al_transaction *)page_address(mdev->md_io_page); 373 if (!buffer) {
374 dev_err(DEV, "disk failed while waiting for md_io buffer\n");
375 complete(&((struct update_al_work *)w)->event);
376 put_ldev(mdev);
377 return 1;
378 }
333 379
334 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); 380 buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
335 buffer->tr_number = cpu_to_be32(mdev->al_tr_number); 381 buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
374 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); 420 D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
375 mdev->al_tr_number++; 421 mdev->al_tr_number++;
376 422
377 mutex_unlock(&mdev->md_io_mutex); 423 drbd_md_put_buffer(mdev);
378 424
379 complete(&((struct update_al_work *)w)->event); 425 complete(&((struct update_al_work *)w)->event);
380 put_ldev(mdev); 426 put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
443 /* lock out all other meta data io for now, 489 /* lock out all other meta data io for now,
444 * and make sure the page is mapped. 490 * and make sure the page is mapped.
445 */ 491 */
446 mutex_lock(&mdev->md_io_mutex); 492 buffer = drbd_md_get_buffer(mdev);
447 buffer = page_address(mdev->md_io_page); 493 if (!buffer)
494 return 0;
448 495
449 /* Find the valid transaction in the log */ 496 /* Find the valid transaction in the log */
450 for (i = 0; i <= mx; i++) { 497 for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
452 if (rv == 0) 499 if (rv == 0)
453 continue; 500 continue;
454 if (rv == -1) { 501 if (rv == -1) {
455 mutex_unlock(&mdev->md_io_mutex); 502 drbd_md_put_buffer(mdev);
456 return 0; 503 return 0;
457 } 504 }
458 cnr = be32_to_cpu(buffer->tr_number); 505 cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
478 525
479 if (!found_valid) { 526 if (!found_valid) {
480 dev_warn(DEV, "No usable activity log found.\n"); 527 dev_warn(DEV, "No usable activity log found.\n");
481 mutex_unlock(&mdev->md_io_mutex); 528 drbd_md_put_buffer(mdev);
482 return 1; 529 return 1;
483 } 530 }
484 531
@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
493 rv = drbd_al_read_tr(mdev, bdev, buffer, i); 540 rv = drbd_al_read_tr(mdev, bdev, buffer, i);
494 ERR_IF(rv == 0) goto cancel; 541 ERR_IF(rv == 0) goto cancel;
495 if (rv == -1) { 542 if (rv == -1) {
496 mutex_unlock(&mdev->md_io_mutex); 543 drbd_md_put_buffer(mdev);
497 return 0; 544 return 0;
498 } 545 }
499 546
@@ -534,7 +581,7 @@ cancel:
534 mdev->al_tr_pos = 0; 581 mdev->al_tr_pos = 0;
535 582
536 /* ok, we are done with it */ 583 /* ok, we are done with it */
537 mutex_unlock(&mdev->md_io_mutex); 584 drbd_md_put_buffer(mdev);
538 585
539 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", 586 dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
540 transactions, active_extents); 587 transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
671 else 718 else
672 ext->rs_failed += count; 719 ext->rs_failed += count;
673 if (ext->rs_left < ext->rs_failed) { 720 if (ext->rs_left < ext->rs_failed) {
674 dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " 721 dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
675 "rs_failed=%d count=%d\n", 722 "rs_failed=%d count=%d cstate=%s\n",
676 (unsigned long long)sector, 723 (unsigned long long)sector,
677 ext->lce.lc_number, ext->rs_left, 724 ext->lce.lc_number, ext->rs_left,
678 ext->rs_failed, count); 725 ext->rs_failed, count,
679 dump_stack(); 726 drbd_conn_str(mdev->state.conn));
680 727
681 lc_put(mdev->resync, &ext->lce); 728 /* We don't expect to be able to clear more bits
682 drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); 729 * than have been set when we originally counted
683 return; 730 * the set bits to cache that value in ext->rs_left.
731 * Whatever the reason (disconnect during resync,
732 * delayed local completion of an application write),
733 * try to fix it up by recounting here. */
734 ext->rs_left = drbd_bm_e_weight(mdev, enr);
684 } 735 }
685 } else { 736 } else {
686 /* Normally this element should be in the cache, 737 /* Normally this element should be in the cache,
@@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
1192 put_ldev(mdev); 1243 put_ldev(mdev);
1193 } 1244 }
1194 spin_unlock_irq(&mdev->al_lock); 1245 spin_unlock_irq(&mdev->al_lock);
1246 wake_up(&mdev->al_wait);
1195 1247
1196 return 0; 1248 return 0;
1197} 1249}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 3030201c69d8..b5c5ff53cb57 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
205static void bm_store_page_idx(struct page *page, unsigned long idx) 205static void bm_store_page_idx(struct page *page, unsigned long idx)
206{ 206{
207 BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); 207 BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
208 page_private(page) |= idx; 208 set_page_private(page, idx);
209} 209}
210 210
211static unsigned long bm_page_to_idx(struct page *page) 211static unsigned long bm_page_to_idx(struct page *page)
@@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
886struct bm_aio_ctx { 886struct bm_aio_ctx {
887 struct drbd_conf *mdev; 887 struct drbd_conf *mdev;
888 atomic_t in_flight; 888 atomic_t in_flight;
889 struct completion done; 889 unsigned int done;
890 unsigned flags; 890 unsigned flags;
891#define BM_AIO_COPY_PAGES 1 891#define BM_AIO_COPY_PAGES 1
892 int error; 892 int error;
893 struct kref kref;
893}; 894};
894 895
896static void bm_aio_ctx_destroy(struct kref *kref)
897{
898 struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
899
900 put_ldev(ctx->mdev);
901 kfree(ctx);
902}
903
895/* bv_page may be a copy, or may be the original */ 904/* bv_page may be a copy, or may be the original */
896static void bm_async_io_complete(struct bio *bio, int error) 905static void bm_async_io_complete(struct bio *bio, int error)
897{ 906{
@@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error)
930 939
931 bm_page_unlock_io(mdev, idx); 940 bm_page_unlock_io(mdev, idx);
932 941
933 /* FIXME give back to page pool */
934 if (ctx->flags & BM_AIO_COPY_PAGES) 942 if (ctx->flags & BM_AIO_COPY_PAGES)
935 put_page(bio->bi_io_vec[0].bv_page); 943 mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
936 944
937 bio_put(bio); 945 bio_put(bio);
938 946
939 if (atomic_dec_and_test(&ctx->in_flight)) 947 if (atomic_dec_and_test(&ctx->in_flight)) {
940 complete(&ctx->done); 948 ctx->done = 1;
949 wake_up(&mdev->misc_wait);
950 kref_put(&ctx->kref, &bm_aio_ctx_destroy);
951 }
941} 952}
942 953
943static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) 954static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
944{ 955{
945 /* we are process context. we always get a bio */ 956 struct bio *bio = bio_alloc_drbd(GFP_NOIO);
946 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
947 struct drbd_conf *mdev = ctx->mdev; 957 struct drbd_conf *mdev = ctx->mdev;
948 struct drbd_bitmap *b = mdev->bitmap; 958 struct drbd_bitmap *b = mdev->bitmap;
949 struct page *page; 959 struct page *page;
@@ -966,10 +976,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
966 bm_set_page_unchanged(b->bm_pages[page_nr]); 976 bm_set_page_unchanged(b->bm_pages[page_nr]);
967 977
968 if (ctx->flags & BM_AIO_COPY_PAGES) { 978 if (ctx->flags & BM_AIO_COPY_PAGES) {
969 /* FIXME alloc_page is good enough for now, but actually needs
970 * to use pre-allocated page pool */
971 void *src, *dest; 979 void *src, *dest;
972 page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); 980 page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
973 dest = kmap_atomic(page); 981 dest = kmap_atomic(page);
974 src = kmap_atomic(b->bm_pages[page_nr]); 982 src = kmap_atomic(b->bm_pages[page_nr]);
975 memcpy(dest, src, PAGE_SIZE); 983 memcpy(dest, src, PAGE_SIZE);
@@ -981,6 +989,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
981 989
982 bio->bi_bdev = mdev->ldev->md_bdev; 990 bio->bi_bdev = mdev->ldev->md_bdev;
983 bio->bi_sector = on_disk_sector; 991 bio->bi_sector = on_disk_sector;
992 /* bio_add_page of a single page to an empty bio will always succeed,
993 * according to api. Do we want to assert that? */
984 bio_add_page(bio, page, len, 0); 994 bio_add_page(bio, page, len, 0);
985 bio->bi_private = ctx; 995 bio->bi_private = ctx;
986 bio->bi_end_io = bm_async_io_complete; 996 bio->bi_end_io = bm_async_io_complete;
@@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
999/* 1009/*
1000 * bm_rw: read/write the whole bitmap from/to its on disk location. 1010 * bm_rw: read/write the whole bitmap from/to its on disk location.
1001 */ 1011 */
1002static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) 1012static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
1003{ 1013{
1004 struct bm_aio_ctx ctx = { 1014 struct bm_aio_ctx *ctx;
1005 .mdev = mdev,
1006 .in_flight = ATOMIC_INIT(1),
1007 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1008 .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
1009 };
1010 struct drbd_bitmap *b = mdev->bitmap; 1015 struct drbd_bitmap *b = mdev->bitmap;
1011 int num_pages, i, count = 0; 1016 int num_pages, i, count = 0;
1012 unsigned long now; 1017 unsigned long now;
@@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
1021 * For lazy writeout, we don't care for ongoing changes to the bitmap, 1026 * For lazy writeout, we don't care for ongoing changes to the bitmap,
1022 * as we submit copies of pages anyways. 1027 * as we submit copies of pages anyways.
1023 */ 1028 */
1024 if (!ctx.flags) 1029
1030 ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
1031 if (!ctx)
1032 return -ENOMEM;
1033
1034 *ctx = (struct bm_aio_ctx) {
1035 .mdev = mdev,
1036 .in_flight = ATOMIC_INIT(1),
1037 .done = 0,
1038 .flags = flags,
1039 .error = 0,
1040 .kref = { ATOMIC_INIT(2) },
1041 };
1042
1043 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
1044 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
1045 kfree(ctx);
1046 return -ENODEV;
1047 }
1048
1049 if (!ctx->flags)
1025 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); 1050 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
1026 1051
1027 num_pages = b->bm_number_of_pages; 1052 num_pages = b->bm_number_of_pages;
@@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
1046 continue; 1071 continue;
1047 } 1072 }
1048 } 1073 }
1049 atomic_inc(&ctx.in_flight); 1074 atomic_inc(&ctx->in_flight);
1050 bm_page_io_async(&ctx, i, rw); 1075 bm_page_io_async(ctx, i, rw);
1051 ++count; 1076 ++count;
1052 cond_resched(); 1077 cond_resched();
1053 } 1078 }
1054 1079
1055 /* 1080 /*
1056 * We initialize ctx.in_flight to one to make sure bm_async_io_complete 1081 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
1057 * will not complete() early, and decrement / test it here. If there 1082 * will not set ctx->done early, and decrement / test it here. If there
1058 * are still some bios in flight, we need to wait for them here. 1083 * are still some bios in flight, we need to wait for them here.
1084 * If all IO is done already (or nothing had been submitted), there is
1085 * no need to wait. Still, we need to put the kref associated with the
1086 * "in_flight reached zero, all done" event.
1059 */ 1087 */
1060 if (!atomic_dec_and_test(&ctx.in_flight)) 1088 if (!atomic_dec_and_test(&ctx->in_flight))
1061 wait_for_completion(&ctx.done); 1089 wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
1090 else
1091 kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1092
1062 dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", 1093 dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
1063 rw == WRITE ? "WRITE" : "READ", 1094 rw == WRITE ? "WRITE" : "READ",
1064 count, jiffies - now); 1095 count, jiffies - now);
1065 1096
1066 if (ctx.error) { 1097 if (ctx->error) {
1067 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); 1098 dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
1068 drbd_chk_io_error(mdev, 1, true); 1099 drbd_chk_io_error(mdev, 1, true);
1069 err = -EIO; /* ctx.error ? */ 1100 err = -EIO; /* ctx->error ? */
1070 } 1101 }
1071 1102
1103 if (atomic_read(&ctx->in_flight))
1104 err = -EIO; /* Disk failed during IO... */
1105
1072 now = jiffies; 1106 now = jiffies;
1073 if (rw == WRITE) { 1107 if (rw == WRITE) {
1074 drbd_md_flush(mdev); 1108 drbd_md_flush(mdev);
@@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
1082 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", 1116 dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
1083 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); 1117 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
1084 1118
1119 kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1085 return err; 1120 return err;
1086} 1121}
1087 1122
@@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
1091 */ 1126 */
1092int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) 1127int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
1093{ 1128{
1094 return bm_rw(mdev, READ, 0); 1129 return bm_rw(mdev, READ, 0, 0);
1095} 1130}
1096 1131
1097/** 1132/**
@@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
1102 */ 1137 */
1103int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) 1138int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
1104{ 1139{
1105 return bm_rw(mdev, WRITE, 0); 1140 return bm_rw(mdev, WRITE, 0, 0);
1106} 1141}
1107 1142
1108/** 1143/**
@@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
1112 */ 1147 */
1113int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) 1148int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
1114{ 1149{
1115 return bm_rw(mdev, WRITE, upper_idx); 1150 return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
1151}
1152
1153/**
1154 * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
1155 * @mdev: DRBD device.
1156 *
1157 * Will only write pages that have changed since last IO.
1158 * In contrast to drbd_bm_write(), this will copy the bitmap pages
1159 * to temporary writeout pages. It is intended to trigger a full write-out
1160 * while still allowing the bitmap to change, for example if a resync or online
1161 * verify is aborted due to a failed peer disk, while local IO continues, or
1162 * pending resync acks are still being processed.
1163 */
1164int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
1165{
1166 return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
1116} 1167}
1117 1168
1118 1169
@@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
1130 */ 1181 */
1131int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) 1182int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
1132{ 1183{
1133 struct bm_aio_ctx ctx = { 1184 struct bm_aio_ctx *ctx;
1185 int err;
1186
1187 if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
1188 dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
1189 return 0;
1190 }
1191
1192 ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
1193 if (!ctx)
1194 return -ENOMEM;
1195
1196 *ctx = (struct bm_aio_ctx) {
1134 .mdev = mdev, 1197 .mdev = mdev,
1135 .in_flight = ATOMIC_INIT(1), 1198 .in_flight = ATOMIC_INIT(1),
1136 .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), 1199 .done = 0,
1137 .flags = BM_AIO_COPY_PAGES, 1200 .flags = BM_AIO_COPY_PAGES,
1201 .error = 0,
1202 .kref = { ATOMIC_INIT(2) },
1138 }; 1203 };
1139 1204
1140 if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { 1205 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
1141 dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); 1206 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
1142 return 0; 1207 kfree(ctx);
1208 return -ENODEV;
1143 } 1209 }
1144 1210
1145 bm_page_io_async(&ctx, idx, WRITE_SYNC); 1211 bm_page_io_async(ctx, idx, WRITE_SYNC);
1146 wait_for_completion(&ctx.done); 1212 wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
1147 1213
1148 if (ctx.error) 1214 if (ctx->error)
1149 drbd_chk_io_error(mdev, 1, true); 1215 drbd_chk_io_error(mdev, 1, true);
1150 /* that should force detach, so the in memory bitmap will be 1216 /* that should force detach, so the in memory bitmap will be
1151 * gone in a moment as well. */ 1217 * gone in a moment as well. */
1152 1218
1153 mdev->bm_writ_cnt++; 1219 mdev->bm_writ_cnt++;
1154 return ctx.error; 1220 err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
1221 kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1222 return err;
1155} 1223}
1156 1224
1157/* NOTE 1225/* NOTE
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8d680562ba73..02f013a073a7 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -712,7 +712,6 @@ struct drbd_request {
712 struct list_head tl_requests; /* ring list in the transfer log */ 712 struct list_head tl_requests; /* ring list in the transfer log */
713 struct bio *master_bio; /* master bio pointer */ 713 struct bio *master_bio; /* master bio pointer */
714 unsigned long rq_state; /* see comments above _req_mod() */ 714 unsigned long rq_state; /* see comments above _req_mod() */
715 int seq_num;
716 unsigned long start_time; 715 unsigned long start_time;
717}; 716};
718 717
@@ -851,6 +850,7 @@ enum {
851 NEW_CUR_UUID, /* Create new current UUID when thawing IO */ 850 NEW_CUR_UUID, /* Create new current UUID when thawing IO */
852 AL_SUSPENDED, /* Activity logging is currently suspended. */ 851 AL_SUSPENDED, /* Activity logging is currently suspended. */
853 AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ 852 AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
853 STATE_SENT, /* Do not change state/UUIDs while this is set */
854}; 854};
855 855
856struct drbd_bitmap; /* opaque for drbd_conf */ 856struct drbd_bitmap; /* opaque for drbd_conf */
@@ -862,31 +862,30 @@ enum bm_flag {
862 BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ 862 BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
863 863
864 /* currently locked for bulk operation */ 864 /* currently locked for bulk operation */
865 BM_LOCKED_MASK = 0x7, 865 BM_LOCKED_MASK = 0xf,
866 866
867 /* in detail, that is: */ 867 /* in detail, that is: */
868 BM_DONT_CLEAR = 0x1, 868 BM_DONT_CLEAR = 0x1,
869 BM_DONT_SET = 0x2, 869 BM_DONT_SET = 0x2,
870 BM_DONT_TEST = 0x4, 870 BM_DONT_TEST = 0x4,
871 871
872 /* so we can mark it locked for bulk operation,
873 * and still allow all non-bulk operations */
874 BM_IS_LOCKED = 0x8,
875
872 /* (test bit, count bit) allowed (common case) */ 876 /* (test bit, count bit) allowed (common case) */
873 BM_LOCKED_TEST_ALLOWED = 0x3, 877 BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
874 878
875 /* testing bits, as well as setting new bits allowed, but clearing bits 879 /* testing bits, as well as setting new bits allowed, but clearing bits
876 * would be unexpected. Used during bitmap receive. Setting new bits 880 * would be unexpected. Used during bitmap receive. Setting new bits
877 * requires sending of "out-of-sync" information, though. */ 881 * requires sending of "out-of-sync" information, though. */
878 BM_LOCKED_SET_ALLOWED = 0x1, 882 BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
879 883
880 /* clear is not expected while bitmap is locked for bulk operation */ 884 /* for drbd_bm_write_copy_pages, everything is allowed,
885 * only concurrent bulk operations are locked out. */
886 BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
881}; 887};
882 888
883
884/* TODO sort members for performance
885 * MAYBE group them further */
886
887/* THINK maybe we actually want to use the default "event/%s" worker threads
888 * or similar in linux 2.6, which uses per cpu data and threads.
889 */
890struct drbd_work_queue { 889struct drbd_work_queue {
891 struct list_head q; 890 struct list_head q;
892 struct semaphore s; /* producers up it, worker down()s it */ 891 struct semaphore s; /* producers up it, worker down()s it */
@@ -938,8 +937,7 @@ struct drbd_backing_dev {
938}; 937};
939 938
940struct drbd_md_io { 939struct drbd_md_io {
941 struct drbd_conf *mdev; 940 unsigned int done;
942 struct completion event;
943 int error; 941 int error;
944}; 942};
945 943
@@ -1022,6 +1020,7 @@ struct drbd_conf {
1022 struct drbd_tl_epoch *newest_tle; 1020 struct drbd_tl_epoch *newest_tle;
1023 struct drbd_tl_epoch *oldest_tle; 1021 struct drbd_tl_epoch *oldest_tle;
1024 struct list_head out_of_sequence_requests; 1022 struct list_head out_of_sequence_requests;
1023 struct list_head barrier_acked_requests;
1025 struct hlist_head *tl_hash; 1024 struct hlist_head *tl_hash;
1026 unsigned int tl_hash_s; 1025 unsigned int tl_hash_s;
1027 1026
@@ -1056,6 +1055,8 @@ struct drbd_conf {
1056 struct crypto_hash *csums_tfm; 1055 struct crypto_hash *csums_tfm;
1057 struct crypto_hash *verify_tfm; 1056 struct crypto_hash *verify_tfm;
1058 1057
1058 unsigned long last_reattach_jif;
1059 unsigned long last_reconnect_jif;
1059 struct drbd_thread receiver; 1060 struct drbd_thread receiver;
1060 struct drbd_thread worker; 1061 struct drbd_thread worker;
1061 struct drbd_thread asender; 1062 struct drbd_thread asender;
@@ -1094,7 +1095,8 @@ struct drbd_conf {
1094 wait_queue_head_t ee_wait; 1095 wait_queue_head_t ee_wait;
1095 struct page *md_io_page; /* one page buffer for md_io */ 1096 struct page *md_io_page; /* one page buffer for md_io */
1096 struct page *md_io_tmpp; /* for logical_block_size != 512 */ 1097 struct page *md_io_tmpp; /* for logical_block_size != 512 */
1097 struct mutex md_io_mutex; /* protects the md_io_buffer */ 1098 struct drbd_md_io md_io;
1099 atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
1098 spinlock_t al_lock; 1100 spinlock_t al_lock;
1099 wait_queue_head_t al_wait; 1101 wait_queue_head_t al_wait;
1100 struct lru_cache *act_log; /* activity log */ 1102 struct lru_cache *act_log; /* activity log */
@@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
1228extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1230extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1229extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); 1231extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
1230extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); 1232extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
1231extern int _drbd_send_state(struct drbd_conf *mdev); 1233extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
1232extern int drbd_send_state(struct drbd_conf *mdev); 1234extern int drbd_send_current_state(struct drbd_conf *mdev);
1233extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1235extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1234 enum drbd_packets cmd, struct p_header80 *h, 1236 enum drbd_packets cmd, struct p_header80 *h,
1235 size_t size, unsigned msg_flags); 1237 size_t size, unsigned msg_flags);
@@ -1461,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
1461extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); 1463extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
1462extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); 1464extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
1463extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); 1465extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
1466extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
1464extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, 1467extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
1465 unsigned long al_enr); 1468 unsigned long al_enr);
1466extern size_t drbd_bm_words(struct drbd_conf *mdev); 1469extern size_t drbd_bm_words(struct drbd_conf *mdev);
@@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
1493extern mempool_t *drbd_request_mempool; 1496extern mempool_t *drbd_request_mempool;
1494extern mempool_t *drbd_ee_mempool; 1497extern mempool_t *drbd_ee_mempool;
1495 1498
1496extern struct page *drbd_pp_pool; /* drbd's page pool */ 1499/* drbd's page pool, used to buffer data received from the peer,
1500 * or data requested by the peer.
1501 *
1502 * This does not have an emergency reserve.
1503 *
1504 * When allocating from this pool, it first takes pages from the pool.
1505 * Only if the pool is depleted will it try to allocate from the system.
1506 *
1507 * The assumption is that pages taken from this pool will be processed,
1508 * and given back, "quickly", and then can be recycled, so we can avoid
1509 * frequent calls to alloc_page(), and still will be able to make progress even
1510 * under memory pressure.
1511 */
1512extern struct page *drbd_pp_pool;
1497extern spinlock_t drbd_pp_lock; 1513extern spinlock_t drbd_pp_lock;
1498extern int drbd_pp_vacant; 1514extern int drbd_pp_vacant;
1499extern wait_queue_head_t drbd_pp_wait; 1515extern wait_queue_head_t drbd_pp_wait;
1500 1516
1517/* We also need a standard (emergency-reserve backed) page pool
1518 * for meta data IO (activity log, bitmap).
1519 * We can keep it global, as long as it is used as "N pages at a time".
1520 * 128 should be plenty, currently we probably can get away with as few as 1.
1521 */
1522#define DRBD_MIN_POOL_PAGES 128
1523extern mempool_t *drbd_md_io_page_pool;
1524
1525/* We also need to make sure we get a bio
1526 * when we need it for housekeeping purposes */
1527extern struct bio_set *drbd_md_io_bio_set;
1528/* to allocate from that set */
1529extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
1530
1501extern rwlock_t global_state_lock; 1531extern rwlock_t global_state_lock;
1502 1532
1503extern struct drbd_conf *drbd_new_device(unsigned int minor); 1533extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
1536extern void suspend_other_sg(struct drbd_conf *mdev); 1566extern void suspend_other_sg(struct drbd_conf *mdev);
1537extern int drbd_resync_finished(struct drbd_conf *mdev); 1567extern int drbd_resync_finished(struct drbd_conf *mdev);
1538/* maybe rather drbd_main.c ? */ 1568/* maybe rather drbd_main.c ? */
1569extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
1570extern void drbd_md_put_buffer(struct drbd_conf *mdev);
1539extern int drbd_md_sync_page_io(struct drbd_conf *mdev, 1571extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
1540 struct drbd_backing_dev *bdev, sector_t sector, int rw); 1572 struct drbd_backing_dev *bdev, sector_t sector, int rw);
1573extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
1574 unsigned int *done);
1541extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); 1575extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
1542extern void drbd_rs_controller_reset(struct drbd_conf *mdev); 1576extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
1543 1577
@@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page)
1754#define page_chain_for_each_safe(page, n) \ 1788#define page_chain_for_each_safe(page, n) \
1755 for (; page && ({ n = page_chain_next(page); 1; }); page = n) 1789 for (; page && ({ n = page_chain_next(page); 1; }); page = n)
1756 1790
1757static inline int drbd_bio_has_active_page(struct bio *bio)
1758{
1759 struct bio_vec *bvec;
1760 int i;
1761
1762 __bio_for_each_segment(bvec, bio, i, 0) {
1763 if (page_count(bvec->bv_page) > 1)
1764 return 1;
1765 }
1766
1767 return 0;
1768}
1769
1770static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) 1791static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
1771{ 1792{
1772 struct page *page = e->pages; 1793 struct page *page = e->pages;
@@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
1777 return 0; 1798 return 0;
1778} 1799}
1779 1800
1780
1781static inline void drbd_state_lock(struct drbd_conf *mdev) 1801static inline void drbd_state_lock(struct drbd_conf *mdev)
1782{ 1802{
1783 wait_event(mdev->misc_wait, 1803 wait_event(mdev->misc_wait,
@@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
2230 * Note: currently we don't support such large bitmaps on 32bit 2250 * Note: currently we don't support such large bitmaps on 32bit
2231 * arch anyways, but no harm done to be prepared for it here. 2251 * arch anyways, but no harm done to be prepared for it here.
2232 */ 2252 */
2233 unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; 2253 unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
2234 unsigned long left = *bits_left >> shift; 2254 unsigned long left = *bits_left >> shift;
2235 unsigned long total = 1UL + (mdev->rs_total >> shift); 2255 unsigned long total = 1UL + (mdev->rs_total >> shift);
2236 unsigned long tmp = 1000UL - left * 1000UL/total; 2256 unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
2306 case D_OUTDATED: 2326 case D_OUTDATED:
2307 case D_CONSISTENT: 2327 case D_CONSISTENT:
2308 case D_UP_TO_DATE: 2328 case D_UP_TO_DATE:
2329 case D_FAILED:
2309 /* disk state is stable as well. */ 2330 /* disk state is stable as well. */
2310 break; 2331 break;
2311 2332
2312 /* no new io accepted during transitional states */ 2333 /* no new io accepted during transitional states */
2313 case D_ATTACHING: 2334 case D_ATTACHING:
2314 case D_FAILED:
2315 case D_NEGOTIATING: 2335 case D_NEGOTIATING:
2316 case D_UNKNOWN: 2336 case D_UNKNOWN:
2317 case D_MASK: 2337 case D_MASK:
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 211fc44f84be..920ede2829d6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
139struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ 139struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
140mempool_t *drbd_request_mempool; 140mempool_t *drbd_request_mempool;
141mempool_t *drbd_ee_mempool; 141mempool_t *drbd_ee_mempool;
142mempool_t *drbd_md_io_page_pool;
143struct bio_set *drbd_md_io_bio_set;
142 144
143/* I do not use a standard mempool, because: 145/* I do not use a standard mempool, because:
144 1) I want to hand out the pre-allocated objects first. 146 1) I want to hand out the pre-allocated objects first.
@@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = {
159 .release = drbd_release, 161 .release = drbd_release,
160}; 162};
161 163
162#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) 164static void bio_destructor_drbd(struct bio *bio)
165{
166 bio_free(bio, drbd_md_io_bio_set);
167}
168
169struct bio *bio_alloc_drbd(gfp_t gfp_mask)
170{
171 struct bio *bio;
172
173 if (!drbd_md_io_bio_set)
174 return bio_alloc(gfp_mask, 1);
175
176 bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
177 if (!bio)
178 return NULL;
179 bio->bi_destructor = bio_destructor_drbd;
180 return bio;
181}
163 182
164#ifdef __CHECKER__ 183#ifdef __CHECKER__
165/* When checking with sparse, and this is an inline function, sparse will 184/* When checking with sparse, and this is an inline function, sparse will
@@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev)
208 mdev->oldest_tle = b; 227 mdev->oldest_tle = b;
209 mdev->newest_tle = b; 228 mdev->newest_tle = b;
210 INIT_LIST_HEAD(&mdev->out_of_sequence_requests); 229 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
230 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
211 231
212 mdev->tl_hash = NULL; 232 mdev->tl_hash = NULL;
213 mdev->tl_hash_s = 0; 233 mdev->tl_hash_s = 0;
@@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
246 new->n_writes = 0; 266 new->n_writes = 0;
247 267
248 newest_before = mdev->newest_tle; 268 newest_before = mdev->newest_tle;
249 /* never send a barrier number == 0, because that is special-cased 269 new->br_number = newest_before->br_number+1;
250 * when using TCQ for our write ordering code */
251 new->br_number = (newest_before->br_number+1) ?: 1;
252 if (mdev->newest_tle != new) { 270 if (mdev->newest_tle != new) {
253 mdev->newest_tle->next = new; 271 mdev->newest_tle->next = new;
254 mdev->newest_tle = new; 272 mdev->newest_tle = new;
@@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
311 These have been list_move'd to the out_of_sequence_requests list in 329 These have been list_move'd to the out_of_sequence_requests list in
312 _req_mod(, barrier_acked) above. 330 _req_mod(, barrier_acked) above.
313 */ 331 */
314 list_del_init(&b->requests); 332 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
315 333
316 nob = b->next; 334 nob = b->next;
317 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { 335 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
411 b = tmp; 429 b = tmp;
412 list_splice(&carry_reads, &b->requests); 430 list_splice(&carry_reads, &b->requests);
413 } 431 }
432
433 /* Actions operating on the disk state, also want to work on
434 requests that got barrier acked. */
435 switch (what) {
436 case fail_frozen_disk_io:
437 case restart_frozen_disk_io:
438 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
439 req = list_entry(le, struct drbd_request, tl_requests);
440 _req_mod(req, what);
441 }
442
443 case connection_lost_while_pending:
444 case resend:
445 break;
446 default:
447 dev_err(DEV, "what = %d in _tl_restart()\n", what);
448 }
414} 449}
415 450
416 451
@@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
458} 493}
459 494
460/** 495/**
496 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
497 * @mdev: DRBD device.
498 */
499void tl_abort_disk_io(struct drbd_conf *mdev)
500{
501 struct drbd_tl_epoch *b;
502 struct list_head *le, *tle;
503 struct drbd_request *req;
504
505 spin_lock_irq(&mdev->req_lock);
506 b = mdev->oldest_tle;
507 while (b) {
508 list_for_each_safe(le, tle, &b->requests) {
509 req = list_entry(le, struct drbd_request, tl_requests);
510 if (!(req->rq_state & RQ_LOCAL_PENDING))
511 continue;
512 _req_mod(req, abort_disk_io);
513 }
514 b = b->next;
515 }
516
517 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
518 req = list_entry(le, struct drbd_request, tl_requests);
519 if (!(req->rq_state & RQ_LOCAL_PENDING))
520 continue;
521 _req_mod(req, abort_disk_io);
522 }
523
524 spin_unlock_irq(&mdev->req_lock);
525}
526
527/**
461 * cl_wide_st_chg() - true if the state change is a cluster wide one 528 * cl_wide_st_chg() - true if the state change is a cluster wide one
462 * @mdev: DRBD device. 529 * @mdev: DRBD device.
463 * @os: old (current) state. 530 * @os: old (current) state.
@@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
470 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || 537 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
471 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 538 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
472 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || 539 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
473 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || 540 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
474 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || 541 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
475 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); 542 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
476} 543}
@@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
509static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, 576static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
510 union drbd_state, 577 union drbd_state,
511 union drbd_state); 578 union drbd_state);
579enum sanitize_state_warnings {
580 NO_WARNING,
581 ABORTED_ONLINE_VERIFY,
582 ABORTED_RESYNC,
583 CONNECTION_LOST_NEGOTIATING,
584 IMPLICITLY_UPGRADED_DISK,
585 IMPLICITLY_UPGRADED_PDSK,
586};
512static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 587static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
513 union drbd_state ns, const char **warn_sync_abort); 588 union drbd_state ns, enum sanitize_state_warnings *warn);
514int drbd_send_state_req(struct drbd_conf *, 589int drbd_send_state_req(struct drbd_conf *,
515 union drbd_state, union drbd_state); 590 union drbd_state, union drbd_state);
516 591
@@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
785 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) 860 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
786 rv = SS_IN_TRANSIENT_STATE; 861 rv = SS_IN_TRANSIENT_STATE;
787 862
863 /* While establishing a connection only allow cstate to change.
864 Delay/refuse role changes, detach attach etc... */
865 if (test_bit(STATE_SENT, &mdev->flags) &&
866 !(os.conn == C_WF_REPORT_PARAMS ||
867 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
868 rv = SS_IN_TRANSIENT_STATE;
869
788 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 870 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
789 rv = SS_NEED_CONNECTION; 871 rv = SS_NEED_CONNECTION;
790 872
@@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
803 return rv; 885 return rv;
804} 886}
805 887
888static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
889{
890 static const char *msg_table[] = {
891 [NO_WARNING] = "",
892 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
893 [ABORTED_RESYNC] = "Resync aborted.",
894 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
895 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
896 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
897 };
898
899 if (warn != NO_WARNING)
900 dev_warn(DEV, "%s\n", msg_table[warn]);
901}
902
806/** 903/**
807 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition 904 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
808 * @mdev: DRBD device. 905 * @mdev: DRBD device.
@@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
814 * to D_UNKNOWN. This rule and many more along those lines are in this function. 911 * to D_UNKNOWN. This rule and many more along those lines are in this function.
815 */ 912 */
816static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, 913static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
817 union drbd_state ns, const char **warn_sync_abort) 914 union drbd_state ns, enum sanitize_state_warnings *warn)
818{ 915{
819 enum drbd_fencing_p fp; 916 enum drbd_fencing_p fp;
820 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; 917 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
821 918
919 if (warn)
920 *warn = NO_WARNING;
921
822 fp = FP_DONT_CARE; 922 fp = FP_DONT_CARE;
823 if (get_ldev(mdev)) { 923 if (get_ldev(mdev)) {
824 fp = mdev->ldev->dc.fencing; 924 fp = mdev->ldev->dc.fencing;
@@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
833 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. 933 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
834 * If you try to go into some Sync* state, that shall fail (elsewhere). */ 934 * If you try to go into some Sync* state, that shall fail (elsewhere). */
835 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && 935 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
836 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) 936 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
837 ns.conn = os.conn; 937 ns.conn = os.conn;
838 938
839 /* we cannot fail (again) if we already detached */ 939 /* we cannot fail (again) if we already detached */
840 if (ns.disk == D_FAILED && os.disk == D_DISKLESS) 940 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
841 ns.disk = D_DISKLESS; 941 ns.disk = D_DISKLESS;
842 942
843 /* if we are only D_ATTACHING yet,
844 * we can (and should) go directly to D_DISKLESS. */
845 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
846 ns.disk = D_DISKLESS;
847
848 /* After C_DISCONNECTING only C_STANDALONE may follow */ 943 /* After C_DISCONNECTING only C_STANDALONE may follow */
849 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) 944 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
850 ns.conn = os.conn; 945 ns.conn = os.conn;
@@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
863 /* Abort resync if a disk fails/detaches */ 958 /* Abort resync if a disk fails/detaches */
864 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && 959 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
865 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { 960 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
866 if (warn_sync_abort) 961 if (warn)
867 *warn_sync_abort = 962 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
868 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? 963 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
869 "Online-verify" : "Resync";
870 ns.conn = C_CONNECTED; 964 ns.conn = C_CONNECTED;
871 } 965 }
872 966
@@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
877 ns.disk = mdev->new_state_tmp.disk; 971 ns.disk = mdev->new_state_tmp.disk;
878 ns.pdsk = mdev->new_state_tmp.pdsk; 972 ns.pdsk = mdev->new_state_tmp.pdsk;
879 } else { 973 } else {
880 dev_alert(DEV, "Connection lost while negotiating, no data!\n"); 974 if (warn)
975 *warn = CONNECTION_LOST_NEGOTIATING;
881 ns.disk = D_DISKLESS; 976 ns.disk = D_DISKLESS;
882 ns.pdsk = D_UNKNOWN; 977 ns.pdsk = D_UNKNOWN;
883 } 978 }
@@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
959 ns.disk = disk_max; 1054 ns.disk = disk_max;
960 1055
961 if (ns.disk < disk_min) { 1056 if (ns.disk < disk_min) {
962 dev_warn(DEV, "Implicitly set disk from %s to %s\n", 1057 if (warn)
963 drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); 1058 *warn = IMPLICITLY_UPGRADED_DISK;
964 ns.disk = disk_min; 1059 ns.disk = disk_min;
965 } 1060 }
966 if (ns.pdsk > pdsk_max) 1061 if (ns.pdsk > pdsk_max)
967 ns.pdsk = pdsk_max; 1062 ns.pdsk = pdsk_max;
968 1063
969 if (ns.pdsk < pdsk_min) { 1064 if (ns.pdsk < pdsk_min) {
970 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", 1065 if (warn)
971 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); 1066 *warn = IMPLICITLY_UPGRADED_PDSK;
972 ns.pdsk = pdsk_min; 1067 ns.pdsk = pdsk_min;
973 } 1068 }
974 1069
@@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1045{ 1140{
1046 union drbd_state os; 1141 union drbd_state os;
1047 enum drbd_state_rv rv = SS_SUCCESS; 1142 enum drbd_state_rv rv = SS_SUCCESS;
1048 const char *warn_sync_abort = NULL; 1143 enum sanitize_state_warnings ssw;
1049 struct after_state_chg_work *ascw; 1144 struct after_state_chg_work *ascw;
1050 1145
1051 os = mdev->state; 1146 os = mdev->state;
1052 1147
1053 ns = sanitize_state(mdev, os, ns, &warn_sync_abort); 1148 ns = sanitize_state(mdev, os, ns, &ssw);
1054 1149
1055 if (ns.i == os.i) 1150 if (ns.i == os.i)
1056 return SS_NOTHING_TO_DO; 1151 return SS_NOTHING_TO_DO;
@@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1076 return rv; 1171 return rv;
1077 } 1172 }
1078 1173
1079 if (warn_sync_abort) 1174 print_sanitize_warnings(mdev, ssw);
1080 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1081 1175
1082 { 1176 {
1083 char *pbp, pb[300]; 1177 char *pbp, pb[300];
@@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1243 drbd_thread_stop_nowait(&mdev->receiver); 1337 drbd_thread_stop_nowait(&mdev->receiver);
1244 1338
1245 /* Upon network failure, we need to restart the receiver. */ 1339 /* Upon network failure, we need to restart the receiver. */
1246 if (os.conn > C_TEAR_DOWN && 1340 if (os.conn > C_WF_CONNECTION &&
1247 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1341 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1248 drbd_thread_restart_nowait(&mdev->receiver); 1342 drbd_thread_restart_nowait(&mdev->receiver);
1249 1343
@@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1251 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) 1345 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1252 drbd_resume_al(mdev); 1346 drbd_resume_al(mdev);
1253 1347
1348 /* remember last connect and attach times so request_timer_fn() won't
1349 * kill newly established sessions while we are still trying to thaw
1350 * previously frozen IO */
1351 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1352 mdev->last_reconnect_jif = jiffies;
1353 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1354 ns.disk > D_NEGOTIATING)
1355 mdev->last_reattach_jif = jiffies;
1356
1254 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); 1357 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1255 if (ascw) { 1358 if (ascw) {
1256 ascw->os = os; 1359 ascw->os = os;
@@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1354 /* Here we have the actions that are performed after a 1457 /* Here we have the actions that are performed after a
1355 state change. This function might sleep */ 1458 state change. This function might sleep */
1356 1459
1460 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1461 mod_timer(&mdev->request_timer, jiffies + HZ);
1462
1357 nsm.i = -1; 1463 nsm.i = -1;
1358 if (ns.susp_nod) { 1464 if (ns.susp_nod) {
1359 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) 1465 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1360 what = resend; 1466 what = resend;
1361 1467
1362 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) 1468 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1469 ns.disk > D_NEGOTIATING)
1363 what = restart_frozen_disk_io; 1470 what = restart_frozen_disk_io;
1364 1471
1365 if (what != nothing) 1472 if (what != nothing)
@@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1408 /* Do not change the order of the if above and the two below... */ 1515 /* Do not change the order of the if above and the two below... */
1409 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ 1516 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1410 drbd_send_uuids(mdev); 1517 drbd_send_uuids(mdev);
1411 drbd_send_state(mdev); 1518 drbd_send_state(mdev, ns);
1412 } 1519 }
1413 /* No point in queuing send_bitmap if we don't have a connection 1520 /* No point in queuing send_bitmap if we don't have a connection
1414 * anymore, so check also the _current_ state, not only the new state 1521 * anymore, so check also the _current_ state, not only the new state
@@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1441 } 1548 }
1442 1549
1443 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1550 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1444 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { 1551 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1552 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1445 drbd_uuid_new_current(mdev); 1553 drbd_uuid_new_current(mdev);
1446 drbd_send_uuids(mdev); 1554 drbd_send_uuids(mdev);
1447 } 1555 }
1448
1449 /* D_DISKLESS Peer becomes secondary */ 1556 /* D_DISKLESS Peer becomes secondary */
1450 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1557 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1451 /* We may still be Primary ourselves. 1558 /* We may still be Primary ourselves.
@@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1473 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1580 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1474 drbd_send_sizes(mdev, 0, 0); /* to start sync... */ 1581 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1475 drbd_send_uuids(mdev); 1582 drbd_send_uuids(mdev);
1476 drbd_send_state(mdev); 1583 drbd_send_state(mdev, ns);
1477 } 1584 }
1478 1585
1479 /* We want to pause/continue resync, tell peer. */ 1586 /* We want to pause/continue resync, tell peer. */
1480 if (ns.conn >= C_CONNECTED && 1587 if (ns.conn >= C_CONNECTED &&
1481 ((os.aftr_isp != ns.aftr_isp) || 1588 ((os.aftr_isp != ns.aftr_isp) ||
1482 (os.user_isp != ns.user_isp))) 1589 (os.user_isp != ns.user_isp)))
1483 drbd_send_state(mdev); 1590 drbd_send_state(mdev, ns);
1484 1591
1485 /* In case one of the isp bits got set, suspend other devices. */ 1592 /* In case one of the isp bits got set, suspend other devices. */
1486 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && 1593 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1490 /* Make sure the peer gets informed about eventual state 1597 /* Make sure the peer gets informed about eventual state
1491 changes (ISP bits) while we were in WFReportParams. */ 1598 changes (ISP bits) while we were in WFReportParams. */
1492 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1599 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1493 drbd_send_state(mdev); 1600 drbd_send_state(mdev, ns);
1494 1601
1495 if (os.conn != C_AHEAD && ns.conn == C_AHEAD) 1602 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1496 drbd_send_state(mdev); 1603 drbd_send_state(mdev, ns);
1497 1604
1498 /* We are in the process of starting a full sync... */ 1605 /* We are in the process of starting a full sync... */
1499 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 1606 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1513 /* first half of local IO error, failure to attach, 1620 /* first half of local IO error, failure to attach,
1514 * or administrative detach */ 1621 * or administrative detach */
1515 if (os.disk != D_FAILED && ns.disk == D_FAILED) { 1622 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1516 enum drbd_io_error_p eh; 1623 enum drbd_io_error_p eh = EP_PASS_ON;
1517 int was_io_error; 1624 int was_io_error = 0;
1518 /* corresponding get_ldev was in __drbd_set_state, to serialize 1625 /* corresponding get_ldev was in __drbd_set_state, to serialize
1519 * our cleanup here with the transition to D_DISKLESS, 1626 * our cleanup here with the transition to D_DISKLESS.
1520 * so it is safe to dereference ldev here. */ 1627 * But it is still not safe to dereference ldev here, since
1521 eh = mdev->ldev->dc.on_io_error; 1628 * we might come from a failed Attach before ldev was set. */
1522 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); 1629 if (mdev->ldev) {
1523 1630 eh = mdev->ldev->dc.on_io_error;
1524 /* current state still has to be D_FAILED, 1631 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1525 * there is only one way out: to D_DISKLESS, 1632
1526 * and that may only happen after our put_ldev below. */ 1633 /* Immediately allow completion of all application IO, that waits
1527 if (mdev->state.disk != D_FAILED) 1634 for completion from the local disk. */
1528 dev_err(DEV, 1635 tl_abort_disk_io(mdev);
1529 "ASSERT FAILED: disk is %s during detach\n", 1636
1530 drbd_disk_str(mdev->state.disk)); 1637 /* current state still has to be D_FAILED,
1531 1638 * there is only one way out: to D_DISKLESS,
1532 if (drbd_send_state(mdev)) 1639 * and that may only happen after our put_ldev below. */
1533 dev_warn(DEV, "Notified peer that I am detaching my disk\n"); 1640 if (mdev->state.disk != D_FAILED)
1534 else 1641 dev_err(DEV,
1535 dev_err(DEV, "Sending state for detaching disk failed\n"); 1642 "ASSERT FAILED: disk is %s during detach\n",
1536 1643 drbd_disk_str(mdev->state.disk));
1537 drbd_rs_cancel_all(mdev); 1644
1538 1645 if (ns.conn >= C_CONNECTED)
1539 /* In case we want to get something to stable storage still, 1646 drbd_send_state(mdev, ns);
1540 * this may be the last chance. 1647
1541 * Following put_ldev may transition to D_DISKLESS. */ 1648 drbd_rs_cancel_all(mdev);
1542 drbd_md_sync(mdev); 1649
1650 /* In case we want to get something to stable storage still,
1651 * this may be the last chance.
1652 * Following put_ldev may transition to D_DISKLESS. */
1653 drbd_md_sync(mdev);
1654 }
1543 put_ldev(mdev); 1655 put_ldev(mdev);
1544 1656
1545 if (was_io_error && eh == EP_CALL_HELPER) 1657 if (was_io_error && eh == EP_CALL_HELPER)
@@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1561 mdev->rs_failed = 0; 1673 mdev->rs_failed = 0;
1562 atomic_set(&mdev->rs_pending_cnt, 0); 1674 atomic_set(&mdev->rs_pending_cnt, 0);
1563 1675
1564 if (drbd_send_state(mdev)) 1676 if (ns.conn >= C_CONNECTED)
1565 dev_warn(DEV, "Notified peer that I'm now diskless.\n"); 1677 drbd_send_state(mdev, ns);
1678
1566 /* corresponding get_ldev in __drbd_set_state 1679 /* corresponding get_ldev in __drbd_set_state
1567 * this may finally trigger drbd_ldev_destroy. */ 1680 * this may finally trigger drbd_ldev_destroy. */
1568 put_ldev(mdev); 1681 put_ldev(mdev);
1569 } 1682 }
1570 1683
1571 /* Notify peer that I had a local IO error, and did not detach. */ 1684 /* Notify peer that I had a local IO error, and did not detach. */
1572 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) 1685 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
1573 drbd_send_state(mdev); 1686 drbd_send_state(mdev, ns);
1574 1687
1575 /* Disks got bigger while they were detached */ 1688 /* Disks got bigger while they were detached */
1576 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && 1689 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1588 /* sync target done with resync. Explicitly notify peer, even though 1701 /* sync target done with resync. Explicitly notify peer, even though
1589 * it should (at least for non-empty resyncs) already know itself. */ 1702 * it should (at least for non-empty resyncs) already know itself. */
1590 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) 1703 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1591 drbd_send_state(mdev); 1704 drbd_send_state(mdev, ns);
1705
1706 /* Wake up role changes, that were delayed because of connection establishing */
1707 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1708 clear_bit(STATE_SENT, &mdev->flags);
1709 wake_up(&mdev->state_wait);
1710 }
1592 1711
1593 /* This triggers bitmap writeout of potentially still unwritten pages 1712 /* This triggers bitmap writeout of potentially still unwritten pages
1594 * if the resync finished cleanly, or aborted because of peer disk 1713 * if the resync finished cleanly, or aborted because of peer disk
@@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1598 * No harm done if some bits change during this phase. 1717 * No harm done if some bits change during this phase.
1599 */ 1718 */
1600 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { 1719 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1601 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, 1720 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1602 "write from resync_finished", BM_LOCKED_SET_ALLOWED); 1721 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1603 put_ldev(mdev); 1722 put_ldev(mdev);
1604 } 1723 }
1605 1724
@@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2057 2176
2058 D_ASSERT(mdev->state.disk == D_UP_TO_DATE); 2177 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2059 2178
2060 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; 2179 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2180 if (uuid && uuid != UUID_JUST_CREATED)
2181 uuid = uuid + UUID_NEW_BM_OFFSET;
2182 else
2183 get_random_bytes(&uuid, sizeof(u64));
2061 drbd_uuid_set(mdev, UI_BITMAP, uuid); 2184 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2062 drbd_print_uuids(mdev, "updated sync UUID"); 2185 drbd_print_uuids(mdev, "updated sync UUID");
2063 drbd_md_sync(mdev); 2186 drbd_md_sync(mdev);
@@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2089 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ 2212 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2090 } 2213 }
2091 2214
2215 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2216 if (mdev->agreed_pro_version <= 94)
2217 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2218
2092 p.d_size = cpu_to_be64(d_size); 2219 p.d_size = cpu_to_be64(d_size);
2093 p.u_size = cpu_to_be64(u_size); 2220 p.u_size = cpu_to_be64(u_size);
2094 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 2221 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
@@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2102} 2229}
2103 2230
2104/** 2231/**
2105 * drbd_send_state() - Sends the drbd state to the peer 2232 * drbd_send_current_state() - Sends the drbd state to the peer
2106 * @mdev: DRBD device. 2233 * @mdev: DRBD device.
2107 */ 2234 */
2108int drbd_send_state(struct drbd_conf *mdev) 2235int drbd_send_current_state(struct drbd_conf *mdev)
2109{ 2236{
2110 struct socket *sock; 2237 struct socket *sock;
2111 struct p_state p; 2238 struct p_state p;
@@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev)
2131 return ok; 2258 return ok;
2132} 2259}
2133 2260
2261/**
2262 * drbd_send_state() - After a state change, sends the new state to the peer
2263 * @mdev: DRBD device.
2264 * @state: the state to send, not necessarily the current state.
2265 *
2266 * Each state change queues an "after_state_ch" work, which will eventually
2267 * send the resulting new state to the peer. If more state changes happen
2268 * between queuing and processing of the after_state_ch work, we still
2269 * want to send each intermediary state in the order it occurred.
2270 */
2271int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2272{
2273 struct socket *sock;
2274 struct p_state p;
2275 int ok = 0;
2276
2277 mutex_lock(&mdev->data.mutex);
2278
2279 p.state = cpu_to_be32(state.i);
2280 sock = mdev->data.socket;
2281
2282 if (likely(sock != NULL)) {
2283 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2284 (struct p_header80 *)&p, sizeof(p), 0);
2285 }
2286
2287 mutex_unlock(&mdev->data.mutex);
2288
2289 return ok;
2290}
2291
2134int drbd_send_state_req(struct drbd_conf *mdev, 2292int drbd_send_state_req(struct drbd_conf *mdev,
2135 union drbd_state mask, union drbd_state val) 2293 union drbd_state mask, union drbd_state val)
2136{ 2294{
@@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2615 struct bio_vec *bvec; 2773 struct bio_vec *bvec;
2616 int i; 2774 int i;
2617 /* hint all but last page with MSG_MORE */ 2775 /* hint all but last page with MSG_MORE */
2618 __bio_for_each_segment(bvec, bio, i, 0) { 2776 bio_for_each_segment(bvec, bio, i) {
2619 if (!_drbd_no_send_page(mdev, bvec->bv_page, 2777 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2620 bvec->bv_offset, bvec->bv_len, 2778 bvec->bv_offset, bvec->bv_len,
2621 i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) 2779 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2629 struct bio_vec *bvec; 2787 struct bio_vec *bvec;
2630 int i; 2788 int i;
2631 /* hint all but last page with MSG_MORE */ 2789 /* hint all but last page with MSG_MORE */
2632 __bio_for_each_segment(bvec, bio, i, 0) { 2790 bio_for_each_segment(bvec, bio, i) {
2633 if (!_drbd_send_page(mdev, bvec->bv_page, 2791 if (!_drbd_send_page(mdev, bvec->bv_page,
2634 bvec->bv_offset, bvec->bv_len, 2792 bvec->bv_offset, bvec->bv_len,
2635 i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) 2793 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2695 2853
2696 p.sector = cpu_to_be64(req->sector); 2854 p.sector = cpu_to_be64(req->sector);
2697 p.block_id = (unsigned long)req; 2855 p.block_id = (unsigned long)req;
2698 p.seq_num = cpu_to_be32(req->seq_num = 2856 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2699 atomic_add_return(1, &mdev->packet_seq));
2700 2857
2701 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); 2858 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2702 2859
@@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2987 atomic_set(&mdev->rs_sect_in, 0); 3144 atomic_set(&mdev->rs_sect_in, 0);
2988 atomic_set(&mdev->rs_sect_ev, 0); 3145 atomic_set(&mdev->rs_sect_ev, 0);
2989 atomic_set(&mdev->ap_in_flight, 0); 3146 atomic_set(&mdev->ap_in_flight, 0);
3147 atomic_set(&mdev->md_io_in_use, 0);
2990 3148
2991 mutex_init(&mdev->md_io_mutex);
2992 mutex_init(&mdev->data.mutex); 3149 mutex_init(&mdev->data.mutex);
2993 mutex_init(&mdev->meta.mutex); 3150 mutex_init(&mdev->meta.mutex);
2994 sema_init(&mdev->data.work.s, 0); 3151 sema_init(&mdev->data.work.s, 0);
@@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void)
3126 3283
3127 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ 3284 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3128 3285
3286 if (drbd_md_io_bio_set)
3287 bioset_free(drbd_md_io_bio_set);
3288 if (drbd_md_io_page_pool)
3289 mempool_destroy(drbd_md_io_page_pool);
3129 if (drbd_ee_mempool) 3290 if (drbd_ee_mempool)
3130 mempool_destroy(drbd_ee_mempool); 3291 mempool_destroy(drbd_ee_mempool);
3131 if (drbd_request_mempool) 3292 if (drbd_request_mempool)
@@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void)
3139 if (drbd_al_ext_cache) 3300 if (drbd_al_ext_cache)
3140 kmem_cache_destroy(drbd_al_ext_cache); 3301 kmem_cache_destroy(drbd_al_ext_cache);
3141 3302
3303 drbd_md_io_bio_set = NULL;
3304 drbd_md_io_page_pool = NULL;
3142 drbd_ee_mempool = NULL; 3305 drbd_ee_mempool = NULL;
3143 drbd_request_mempool = NULL; 3306 drbd_request_mempool = NULL;
3144 drbd_ee_cache = NULL; 3307 drbd_ee_cache = NULL;
@@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void)
3162 drbd_bm_ext_cache = NULL; 3325 drbd_bm_ext_cache = NULL;
3163 drbd_al_ext_cache = NULL; 3326 drbd_al_ext_cache = NULL;
3164 drbd_pp_pool = NULL; 3327 drbd_pp_pool = NULL;
3328 drbd_md_io_page_pool = NULL;
3329 drbd_md_io_bio_set = NULL;
3165 3330
3166 /* caches */ 3331 /* caches */
3167 drbd_request_cache = kmem_cache_create( 3332 drbd_request_cache = kmem_cache_create(
@@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void)
3185 goto Enomem; 3350 goto Enomem;
3186 3351
3187 /* mempools */ 3352 /* mempools */
3353#ifdef COMPAT_HAVE_BIOSET_CREATE
3354 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3355 if (drbd_md_io_bio_set == NULL)
3356 goto Enomem;
3357#endif
3358
3359 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3360 if (drbd_md_io_page_pool == NULL)
3361 goto Enomem;
3362
3188 drbd_request_mempool = mempool_create(number, 3363 drbd_request_mempool = mempool_create(number,
3189 mempool_alloc_slab, mempool_free_slab, drbd_request_cache); 3364 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3190 if (drbd_request_mempool == NULL) 3365 if (drbd_request_mempool == NULL)
@@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor)
3262 if (!mdev) 3437 if (!mdev)
3263 return; 3438 return;
3264 3439
3440 del_timer_sync(&mdev->request_timer);
3441
3265 /* paranoia asserts */ 3442 /* paranoia asserts */
3266 if (mdev->open_cnt != 0) 3443 if (mdev->open_cnt != 0)
3267 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, 3444 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
@@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
3666 if (!get_ldev_if_state(mdev, D_FAILED)) 3843 if (!get_ldev_if_state(mdev, D_FAILED))
3667 return; 3844 return;
3668 3845
3669 mutex_lock(&mdev->md_io_mutex); 3846 buffer = drbd_md_get_buffer(mdev);
3670 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); 3847 if (!buffer)
3848 goto out;
3849
3671 memset(buffer, 0, 512); 3850 memset(buffer, 0, 512);
3672 3851
3673 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 3852 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev)
3698 * since we updated it on metadata. */ 3877 * since we updated it on metadata. */
3699 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); 3878 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3700 3879
3701 mutex_unlock(&mdev->md_io_mutex); 3880 drbd_md_put_buffer(mdev);
3881out:
3702 put_ldev(mdev); 3882 put_ldev(mdev);
3703} 3883}
3704 3884
@@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3718 if (!get_ldev_if_state(mdev, D_ATTACHING)) 3898 if (!get_ldev_if_state(mdev, D_ATTACHING))
3719 return ERR_IO_MD_DISK; 3899 return ERR_IO_MD_DISK;
3720 3900
3721 mutex_lock(&mdev->md_io_mutex); 3901 buffer = drbd_md_get_buffer(mdev);
3722 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); 3902 if (!buffer)
3903 goto out;
3723 3904
3724 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { 3905 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3725 /* NOTE: can't do normal error processing here as this is 3906 /* NOTE: can't do normal error processing here as this is
@@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3780 mdev->sync_conf.al_extents = 127; 3961 mdev->sync_conf.al_extents = 127;
3781 3962
3782 err: 3963 err:
3783 mutex_unlock(&mdev->md_io_mutex); 3964 drbd_md_put_buffer(mdev);
3965 out:
3784 put_ldev(mdev); 3966 put_ldev(mdev);
3785 3967
3786 return rv; 3968 return rv;
@@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void)
4183 static char buildtag[38] = "\0uilt-in"; 4365 static char buildtag[38] = "\0uilt-in";
4184 4366
4185 if (buildtag[0] == 0) { 4367 if (buildtag[0] == 0) {
4186#ifdef CONFIG_MODULES 4368#ifdef MODULE
4187 if (THIS_MODULE != NULL) 4369 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4188 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); 4370#else
4189 else 4371 buildtag[0] = 'b';
4190#endif 4372#endif
4191 buildtag[0] = 'b';
4192 } 4373 }
4193 4374
4194 return buildtag; 4375 return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 946166e13953..6d4de6a72e80 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data)
289 */ 289 */
290 spin_lock_irq(&mdev->req_lock); 290 spin_lock_irq(&mdev->req_lock);
291 ns = mdev->state; 291 ns = mdev->state;
292 if (ns.conn < C_WF_REPORT_PARAMS) { 292 if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
293 ns.pdsk = nps; 293 ns.pdsk = nps;
294 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); 294 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
295 } 295 }
@@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
432 /* if this was forced, we should consider sync */ 432 /* if this was forced, we should consider sync */
433 if (forced) 433 if (forced)
434 drbd_send_uuids(mdev); 434 drbd_send_uuids(mdev);
435 drbd_send_state(mdev); 435 drbd_send_current_state(mdev);
436 } 436 }
437 437
438 drbd_md_sync(mdev); 438 drbd_md_sync(mdev);
@@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
845 Because new from 8.3.8 onwards the peer can use multiple 845 Because new from 8.3.8 onwards the peer can use multiple
846 BIOs for a single peer_request */ 846 BIOs for a single peer_request */
847 if (mdev->state.conn >= C_CONNECTED) { 847 if (mdev->state.conn >= C_CONNECTED) {
848 if (mdev->agreed_pro_version < 94) 848 if (mdev->agreed_pro_version < 94) {
849 peer = mdev->peer_max_bio_size; 849 peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
850 else if (mdev->agreed_pro_version == 94) 850 /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
851 } else if (mdev->agreed_pro_version == 94)
851 peer = DRBD_MAX_SIZE_H80_PACKET; 852 peer = DRBD_MAX_SIZE_H80_PACKET;
852 else /* drbd 8.3.8 onwards */ 853 else /* drbd 8.3.8 onwards */
853 peer = DRBD_MAX_BIO_SIZE; 854 peer = DRBD_MAX_BIO_SIZE;
@@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1032 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", 1033 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
1033 (unsigned long long) drbd_get_max_capacity(nbc), 1034 (unsigned long long) drbd_get_max_capacity(nbc),
1034 (unsigned long long) nbc->dc.disk_size); 1035 (unsigned long long) nbc->dc.disk_size);
1035 retcode = ERR_DISK_TO_SMALL; 1036 retcode = ERR_DISK_TOO_SMALL;
1036 goto fail; 1037 goto fail;
1037 } 1038 }
1038 1039
@@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1046 } 1047 }
1047 1048
1048 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { 1049 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
1049 retcode = ERR_MD_DISK_TO_SMALL; 1050 retcode = ERR_MD_DISK_TOO_SMALL;
1050 dev_warn(DEV, "refusing attach: md-device too small, " 1051 dev_warn(DEV, "refusing attach: md-device too small, "
1051 "at least %llu sectors needed for this meta-disk type\n", 1052 "at least %llu sectors needed for this meta-disk type\n",
1052 (unsigned long long) min_md_device_sectors); 1053 (unsigned long long) min_md_device_sectors);
@@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1057 * (we may currently be R_PRIMARY with no local disk...) */ 1058 * (we may currently be R_PRIMARY with no local disk...) */
1058 if (drbd_get_max_capacity(nbc) < 1059 if (drbd_get_max_capacity(nbc) <
1059 drbd_get_capacity(mdev->this_bdev)) { 1060 drbd_get_capacity(mdev->this_bdev)) {
1060 retcode = ERR_DISK_TO_SMALL; 1061 retcode = ERR_DISK_TOO_SMALL;
1061 goto fail; 1062 goto fail;
1062 } 1063 }
1063 1064
@@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1138 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && 1139 if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1139 drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { 1140 drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
1140 dev_warn(DEV, "refusing to truncate a consistent device\n"); 1141 dev_warn(DEV, "refusing to truncate a consistent device\n");
1141 retcode = ERR_DISK_TO_SMALL; 1142 retcode = ERR_DISK_TOO_SMALL;
1142 goto force_diskless_dec; 1143 goto force_diskless_dec;
1143 } 1144 }
1144 1145
@@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1336{ 1337{
1337 enum drbd_ret_code retcode; 1338 enum drbd_ret_code retcode;
1338 int ret; 1339 int ret;
1340 struct detach dt = {};
1341
1342 if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
1343 reply->ret_code = ERR_MANDATORY_TAG;
1344 goto out;
1345 }
1346
1347 if (dt.detach_force) {
1348 drbd_force_state(mdev, NS(disk, D_FAILED));
1349 reply->ret_code = SS_SUCCESS;
1350 goto out;
1351 }
1352
1339 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ 1353 drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1354 drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
1340 retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); 1355 retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
1356 drbd_md_put_buffer(mdev);
1341 /* D_FAILED will transition to DISKLESS. */ 1357 /* D_FAILED will transition to DISKLESS. */
1342 ret = wait_event_interruptible(mdev->misc_wait, 1358 ret = wait_event_interruptible(mdev->misc_wait,
1343 mdev->state.disk != D_FAILED); 1359 mdev->state.disk != D_FAILED);
1344 drbd_resume_io(mdev); 1360 drbd_resume_io(mdev);
1361
1345 if ((int)retcode == (int)SS_IS_DISKLESS) 1362 if ((int)retcode == (int)SS_IS_DISKLESS)
1346 retcode = SS_NOTHING_TO_DO; 1363 retcode = SS_NOTHING_TO_DO;
1347 if (ret) 1364 if (ret)
1348 retcode = ERR_INTR; 1365 retcode = ERR_INTR;
1349 reply->ret_code = retcode; 1366 reply->ret_code = retcode;
1367out:
1350 return 0; 1368 return 0;
1351} 1369}
1352 1370
@@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1711 1729
1712 if (rs.no_resync && mdev->agreed_pro_version < 93) { 1730 if (rs.no_resync && mdev->agreed_pro_version < 93) {
1713 retcode = ERR_NEED_APV_93; 1731 retcode = ERR_NEED_APV_93;
1714 goto fail; 1732 goto fail_ldev;
1715 } 1733 }
1716 1734
1717 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) 1735 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1738 fail: 1756 fail:
1739 reply->ret_code = retcode; 1757 reply->ret_code = retcode;
1740 return 0; 1758 return 0;
1759
1760 fail_ldev:
1761 put_ldev(mdev);
1762 goto fail;
1741} 1763}
1742 1764
1743static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, 1765static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1941 1963
1942 /* If there is still bitmap IO pending, probably because of a previous 1964 /* If there is still bitmap IO pending, probably because of a previous
1943 * resync just being finished, wait for it before requesting a new resync. */ 1965 * resync just being finished, wait for it before requesting a new resync. */
1966 drbd_suspend_io(mdev);
1944 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 1967 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1945 1968
1946 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 1969 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
1959 1982
1960 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 1983 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
1961 } 1984 }
1985 drbd_resume_io(mdev);
1962 1986
1963 reply->ret_code = retcode; 1987 reply->ret_code = retcode;
1964 return 0; 1988 return 0;
@@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
1980 2004
1981 /* If there is still bitmap IO pending, probably because of a previous 2005 /* If there is still bitmap IO pending, probably because of a previous
1982 * resync just being finished, wait for it before requesting a new resync. */ 2006 * resync just being finished, wait for it before requesting a new resync. */
2007 drbd_suspend_io(mdev);
1983 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2008 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
1984 2009
1985 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); 2010 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
@@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
1998 } else 2023 } else
1999 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 2024 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
2000 } 2025 }
2026 drbd_resume_io(mdev);
2001 2027
2002 reply->ret_code = retcode; 2028 reply->ret_code = retcode;
2003 return 0; 2029 return 0;
@@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
2170 2196
2171 /* If there is still bitmap IO pending, e.g. previous resync or verify 2197 /* If there is still bitmap IO pending, e.g. previous resync or verify
2172 * just being finished, wait for it before requesting a new resync. */ 2198 * just being finished, wait for it before requesting a new resync. */
2199 drbd_suspend_io(mdev);
2173 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2200 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2174 2201
2175 /* w_make_ov_request expects position to be aligned */ 2202 /* w_make_ov_request expects position to be aligned */
2176 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; 2203 mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
2177 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); 2204 reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
2205 drbd_resume_io(mdev);
2178 return 0; 2206 return 0;
2179} 2207}
2180 2208
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2959cdfb77f5..869bada2ed06 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
52 if (unlikely(v >= 1000000)) { 52 if (unlikely(v >= 1000000)) {
53 /* cool: > GiByte/s */ 53 /* cool: > GiByte/s */
54 seq_printf(seq, "%ld,", v / 1000000); 54 seq_printf(seq, "%ld,", v / 1000000);
55 v /= 1000000; 55 v %= 1000000;
56 seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); 56 seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
57 } else if (likely(v >= 1000)) 57 } else if (likely(v >= 1000))
58 seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); 58 seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 436f519bed1c..ea4836e0ae98 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what,
466 goto out; 466 goto out;
467 } 467 }
468 (*newsock)->ops = sock->ops; 468 (*newsock)->ops = sock->ops;
469 __module_get((*newsock)->ops->owner);
469 470
470out: 471out:
471 return err; 472 return err;
@@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev)
750{ 751{
751 struct socket *s, *sock, *msock; 752 struct socket *s, *sock, *msock;
752 int try, h, ok; 753 int try, h, ok;
754 enum drbd_state_rv rv;
753 755
754 D_ASSERT(!mdev->data.socket); 756 D_ASSERT(!mdev->data.socket);
755 757
@@ -888,25 +890,32 @@ retry:
888 } 890 }
889 } 891 }
890 892
891 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
892 return 0;
893
894 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; 893 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
895 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 894 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
896 895
897 atomic_set(&mdev->packet_seq, 0); 896 atomic_set(&mdev->packet_seq, 0);
898 mdev->peer_seq = 0; 897 mdev->peer_seq = 0;
899 898
900 drbd_thread_start(&mdev->asender);
901
902 if (drbd_send_protocol(mdev) == -1) 899 if (drbd_send_protocol(mdev) == -1)
903 return -1; 900 return -1;
901 set_bit(STATE_SENT, &mdev->flags);
904 drbd_send_sync_param(mdev, &mdev->sync_conf); 902 drbd_send_sync_param(mdev, &mdev->sync_conf);
905 drbd_send_sizes(mdev, 0, 0); 903 drbd_send_sizes(mdev, 0, 0);
906 drbd_send_uuids(mdev); 904 drbd_send_uuids(mdev);
907 drbd_send_state(mdev); 905 drbd_send_current_state(mdev);
908 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 906 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
909 clear_bit(RESIZE_PENDING, &mdev->flags); 907 clear_bit(RESIZE_PENDING, &mdev->flags);
908
909 spin_lock_irq(&mdev->req_lock);
910 rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
911 if (mdev->state.conn != C_WF_REPORT_PARAMS)
912 clear_bit(STATE_SENT, &mdev->flags);
913 spin_unlock_irq(&mdev->req_lock);
914
915 if (rv < SS_SUCCESS)
916 return 0;
917
918 drbd_thread_start(&mdev->asender);
910 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ 919 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
911 920
912 return 1; 921 return 1;
@@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev)
957 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, 966 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
958 NULL); 967 NULL);
959 if (rv) { 968 if (rv) {
960 dev_err(DEV, "local disk flush failed with status %d\n", rv); 969 dev_info(DEV, "local disk flush failed with status %d\n", rv);
961 /* would rather check on EOPNOTSUPP, but that is not reliable. 970 /* would rather check on EOPNOTSUPP, but that is not reliable.
962 * don't try again for ANY return value != 0 971 * don't try again for ANY return value != 0
963 * if (rv == -EOPNOTSUPP) */ 972 * if (rv == -EOPNOTSUPP) */
@@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1001 1010
1002 if (epoch_size != 0 && 1011 if (epoch_size != 0 &&
1003 atomic_read(&epoch->active) == 0 && 1012 atomic_read(&epoch->active) == 0 &&
1004 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { 1013 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1005 if (!(ev & EV_CLEANUP)) { 1014 if (!(ev & EV_CLEANUP)) {
1006 spin_unlock(&mdev->epoch_lock); 1015 spin_unlock(&mdev->epoch_lock);
1007 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); 1016 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1008 spin_lock(&mdev->epoch_lock); 1017 spin_lock(&mdev->epoch_lock);
1009 } 1018 }
1010 dec_unacked(mdev); 1019 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1020 dec_unacked(mdev);
1011 1021
1012 if (mdev->current_epoch != epoch) { 1022 if (mdev->current_epoch != epoch) {
1013 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); 1023 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1096 /* In most cases, we will only need one bio. But in case the lower 1106 /* In most cases, we will only need one bio. But in case the lower
1097 * level restrictions happen to be different at this offset on this 1107 * level restrictions happen to be different at this offset on this
1098 * side than those of the sending peer, we may need to submit the 1108 * side than those of the sending peer, we may need to submit the
1099 * request in more than one bio. */ 1109 * request in more than one bio.
1110 *
1111 * Plain bio_alloc is good enough here, this is no DRBD internally
1112 * generated bio, but a bio allocated on behalf of the peer.
1113 */
1100next_bio: 1114next_bio:
1101 bio = bio_alloc(GFP_NOIO, nr_pages); 1115 bio = bio_alloc(GFP_NOIO, nr_pages);
1102 if (!bio) { 1116 if (!bio) {
@@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
1583 return ok; 1597 return ok;
1584} 1598}
1585 1599
1600static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
1601{
1602
1603 struct drbd_epoch_entry *rs_e;
1604 bool rv = 0;
1605
1606 spin_lock_irq(&mdev->req_lock);
1607 list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
1608 if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
1609 rv = 1;
1610 break;
1611 }
1612 }
1613 spin_unlock_irq(&mdev->req_lock);
1614
1615 return rv;
1616}
1617
1586/* Called from receive_Data. 1618/* Called from receive_Data.
1587 * Synchronize packets on sock with packets on msock. 1619 * Synchronize packets on sock with packets on msock.
1588 * 1620 *
@@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1826 list_add(&e->w.list, &mdev->active_ee); 1858 list_add(&e->w.list, &mdev->active_ee);
1827 spin_unlock_irq(&mdev->req_lock); 1859 spin_unlock_irq(&mdev->req_lock);
1828 1860
1861 if (mdev->state.conn == C_SYNC_TARGET)
1862 wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
1863
1829 switch (mdev->net_conf->wire_protocol) { 1864 switch (mdev->net_conf->wire_protocol) {
1830 case DRBD_PROT_C: 1865 case DRBD_PROT_C:
1831 inc_unacked(mdev); 1866 inc_unacked(mdev);
@@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
2420 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; 2455 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2421 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; 2456 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2422 2457
2423 dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); 2458 dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
2424 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); 2459 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2425 2460
2426 return -1; 2461 return -1;
@@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
2806 2841
2807 if (apv >= 88) { 2842 if (apv >= 88) {
2808 if (apv == 88) { 2843 if (apv == 88) {
2809 if (data_size > SHARED_SECRET_MAX) { 2844 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
2810 dev_err(DEV, "verify-alg too long, " 2845 dev_err(DEV, "verify-alg of wrong size, "
2811 "peer wants %u, accepting only %u byte\n", 2846 "peer wants %u, accepting only up to %u byte\n",
2812 data_size, SHARED_SECRET_MAX); 2847 data_size, SHARED_SECRET_MAX);
2813 return false; 2848 return false;
2814 } 2849 }
2815 2850
@@ -3168,9 +3203,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3168 os = ns = mdev->state; 3203 os = ns = mdev->state;
3169 spin_unlock_irq(&mdev->req_lock); 3204 spin_unlock_irq(&mdev->req_lock);
3170 3205
3171 /* peer says his disk is uptodate, while we think it is inconsistent, 3206 /* If some other part of the code (asender thread, timeout)
3172 * and this happens while we think we have a sync going on. */ 3207 * already decided to close the connection again,
3173 if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && 3208 * we must not "re-establish" it here. */
3209 if (os.conn <= C_TEAR_DOWN)
3210 return false;
3211
3212 /* If this is the "end of sync" confirmation, usually the peer disk
3213 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
3214 * set) resync started in PausedSyncT, or if the timing of pause-/
3215 * unpause-sync events has been "just right", the peer disk may
3216 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3217 */
3218 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3219 real_peer_disk == D_UP_TO_DATE &&
3174 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { 3220 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3175 /* If we are (becoming) SyncSource, but peer is still in sync 3221 /* If we are (becoming) SyncSource, but peer is still in sync
3176 * preparation, ignore its uptodate-ness to avoid flapping, it 3222 * preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3288 /* Nowadays only used when forcing a node into primary role and 3334 /* Nowadays only used when forcing a node into primary role and
3289 setting its disk to UpToDate with that */ 3335 setting its disk to UpToDate with that */
3290 drbd_send_uuids(mdev); 3336 drbd_send_uuids(mdev);
3291 drbd_send_state(mdev); 3337 drbd_send_current_state(mdev);
3292 } 3338 }
3293 } 3339 }
3294 3340
@@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3776 if (mdev->state.conn == C_STANDALONE) 3822 if (mdev->state.conn == C_STANDALONE)
3777 return; 3823 return;
3778 3824
3825 /* We are about to start the cleanup after connection loss.
3826 * Make sure drbd_make_request knows about that.
3827 * Usually we should be in some network failure state already,
3828 * but just in case we are not, we fix it up here.
3829 */
3830 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
3831
3779 /* asender does not clean up anything. it must not interfere, either */ 3832 /* asender does not clean up anything. it must not interfere, either */
3780 drbd_thread_stop(&mdev->asender); 3833 drbd_thread_stop(&mdev->asender);
3781 drbd_free_sock(mdev); 3834 drbd_free_sock(mdev);
@@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3803 atomic_set(&mdev->rs_pending_cnt, 0); 3856 atomic_set(&mdev->rs_pending_cnt, 0);
3804 wake_up(&mdev->misc_wait); 3857 wake_up(&mdev->misc_wait);
3805 3858
3806 del_timer(&mdev->request_timer);
3807
3808 /* make sure syncer is stopped and w_resume_next_sg queued */ 3859 /* make sure syncer is stopped and w_resume_next_sg queued */
3809 del_timer_sync(&mdev->resync_timer); 3860 del_timer_sync(&mdev->resync_timer);
3810 resync_timer_fn((unsigned long)mdev); 3861 resync_timer_fn((unsigned long)mdev);
@@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4433 4484
4434 if (mdev->state.conn == C_AHEAD && 4485 if (mdev->state.conn == C_AHEAD &&
4435 atomic_read(&mdev->ap_in_flight) == 0 && 4486 atomic_read(&mdev->ap_in_flight) == 0 &&
4436 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { 4487 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
4437 mdev->start_resync_timer.expires = jiffies + HZ; 4488 mdev->start_resync_timer.expires = jiffies + HZ;
4438 add_timer(&mdev->start_resync_timer); 4489 add_timer(&mdev->start_resync_timer);
4439 } 4490 }
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a0f314086e5..9c5c84946b05 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
37 const int rw = bio_data_dir(bio); 37 const int rw = bio_data_dir(bio);
38 int cpu; 38 int cpu;
39 cpu = part_stat_lock(); 39 cpu = part_stat_lock();
40 part_round_stats(cpu, &mdev->vdisk->part0);
40 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); 41 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
41 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); 42 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
42 part_inc_in_flight(&mdev->vdisk->part0, rw); 43 part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
214{ 215{
215 const unsigned long s = req->rq_state; 216 const unsigned long s = req->rq_state;
216 struct drbd_conf *mdev = req->mdev; 217 struct drbd_conf *mdev = req->mdev;
217 /* only WRITES may end up here without a master bio (on barrier ack) */ 218 int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
218 int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
219 219
220 /* we must not complete the master bio, while it is 220 /* we must not complete the master bio, while it is
221 * still being processed by _drbd_send_zc_bio (drbd_send_dblock) 221 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
230 return; 230 return;
231 if (s & RQ_NET_PENDING) 231 if (s & RQ_NET_PENDING)
232 return; 232 return;
233 if (s & RQ_LOCAL_PENDING) 233 if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
234 return; 234 return;
235 235
236 if (req->master_bio) { 236 if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
277 req->master_bio = NULL; 277 req->master_bio = NULL;
278 } 278 }
279 279
280 if (s & RQ_LOCAL_PENDING)
281 return;
282
280 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { 283 if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
281 /* this is disconnected (local only) operation, 284 /* this is disconnected (local only) operation,
282 * or protocol C P_WRITE_ACK, 285 * or protocol C P_WRITE_ACK,
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
429 break; 432 break;
430 433
431 case completed_ok: 434 case completed_ok:
432 if (bio_data_dir(req->master_bio) == WRITE) 435 if (req->rq_state & RQ_WRITE)
433 mdev->writ_cnt += req->size>>9; 436 mdev->writ_cnt += req->size>>9;
434 else 437 else
435 mdev->read_cnt += req->size>>9; 438 mdev->read_cnt += req->size>>9;
@@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
438 req->rq_state &= ~RQ_LOCAL_PENDING; 441 req->rq_state &= ~RQ_LOCAL_PENDING;
439 442
440 _req_may_be_done_not_susp(req, m); 443 _req_may_be_done_not_susp(req, m);
441 put_ldev(mdev); 444 break;
445
446 case abort_disk_io:
447 req->rq_state |= RQ_LOCAL_ABORTED;
448 if (req->rq_state & RQ_WRITE)
449 _req_may_be_done_not_susp(req, m);
450 else
451 goto goto_queue_for_net_read;
442 break; 452 break;
443 453
444 case write_completed_with_error: 454 case write_completed_with_error:
@@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
447 457
448 __drbd_chk_io_error(mdev, false); 458 __drbd_chk_io_error(mdev, false);
449 _req_may_be_done_not_susp(req, m); 459 _req_may_be_done_not_susp(req, m);
450 put_ldev(mdev);
451 break; 460 break;
452 461
453 case read_ahead_completed_with_error: 462 case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
455 req->rq_state |= RQ_LOCAL_COMPLETED; 464 req->rq_state |= RQ_LOCAL_COMPLETED;
456 req->rq_state &= ~RQ_LOCAL_PENDING; 465 req->rq_state &= ~RQ_LOCAL_PENDING;
457 _req_may_be_done_not_susp(req, m); 466 _req_may_be_done_not_susp(req, m);
458 put_ldev(mdev);
459 break; 467 break;
460 468
461 case read_completed_with_error: 469 case read_completed_with_error:
@@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
467 D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 475 D_ASSERT(!(req->rq_state & RQ_NET_MASK));
468 476
469 __drbd_chk_io_error(mdev, false); 477 __drbd_chk_io_error(mdev, false);
470 put_ldev(mdev); 478
479 goto_queue_for_net_read:
471 480
472 /* no point in retrying if there is no good remote data, 481 /* no point in retrying if there is no good remote data,
473 * or we have no connection. */ 482 * or we have no connection. */
@@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
556 drbd_queue_work(&mdev->data.work, &req->w); 565 drbd_queue_work(&mdev->data.work, &req->w);
557 break; 566 break;
558 567
559 case oos_handed_to_network: 568 case read_retry_remote_canceled:
560 /* actually the same */
561 case send_canceled: 569 case send_canceled:
562 /* treat it the same */
563 case send_failed: 570 case send_failed:
564 /* real cleanup will be done from tl_clear. just update flags 571 /* real cleanup will be done from tl_clear. just update flags
565 * so it is no longer marked as on the worker queue */ 572 * so it is no longer marked as on the worker queue */
@@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
589 } 596 }
590 req->rq_state &= ~RQ_NET_QUEUED; 597 req->rq_state &= ~RQ_NET_QUEUED;
591 req->rq_state |= RQ_NET_SENT; 598 req->rq_state |= RQ_NET_SENT;
592 /* because _drbd_send_zc_bio could sleep, and may want to
593 * dereference the bio even after the "write_acked_by_peer" and
594 * "completed_ok" events came in, once we return from
595 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
596 * whether it is done already, and end it. */
597 _req_may_be_done_not_susp(req, m); 599 _req_may_be_done_not_susp(req, m);
598 break; 600 break;
599 601
600 case read_retry_remote_canceled: 602 case oos_handed_to_network:
603 /* Was not set PENDING, no longer QUEUED, so is now DONE
604 * as far as this connection is concerned. */
601 req->rq_state &= ~RQ_NET_QUEUED; 605 req->rq_state &= ~RQ_NET_QUEUED;
602 /* fall through, in case we raced with drbd_disconnect */ 606 req->rq_state |= RQ_NET_DONE;
607 _req_may_be_done_not_susp(req, m);
608 break;
609
603 case connection_lost_while_pending: 610 case connection_lost_while_pending:
604 /* transfer log cleanup after connection loss */ 611 /* transfer log cleanup after connection loss */
605 /* assert something? */ 612 /* assert something? */
@@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
616 _req_may_be_done(req, m); /* Allowed while state.susp */ 623 _req_may_be_done(req, m); /* Allowed while state.susp */
617 break; 624 break;
618 625
619 case write_acked_by_peer_and_sis:
620 req->rq_state |= RQ_NET_SIS;
621 case conflict_discarded_by_peer: 626 case conflict_discarded_by_peer:
622 /* for discarded conflicting writes of multiple primaries, 627 /* for discarded conflicting writes of multiple primaries,
623 * there is no need to keep anything in the tl, potential 628 * there is no need to keep anything in the tl, potential
@@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
628 (unsigned long long)req->sector, req->size); 633 (unsigned long long)req->sector, req->size);
629 req->rq_state |= RQ_NET_DONE; 634 req->rq_state |= RQ_NET_DONE;
630 /* fall through */ 635 /* fall through */
636 case write_acked_by_peer_and_sis:
631 case write_acked_by_peer: 637 case write_acked_by_peer:
638 if (what == write_acked_by_peer_and_sis)
639 req->rq_state |= RQ_NET_SIS;
632 /* protocol C; successfully written on peer. 640 /* protocol C; successfully written on peer.
633 * Nothing to do here. 641 * Nothing more to do here.
634 * We want to keep the tl in place for all protocols, to cater 642 * We want to keep the tl in place for all protocols, to cater
635 * for volatile write-back caches on lower level devices. 643 * for volatile write-back caches on lower level devices. */
636 *
637 * A barrier request is expected to have forced all prior
638 * requests onto stable storage, so completion of a barrier
639 * request could set NET_DONE right here, and not wait for the
640 * P_BARRIER_ACK, but that is an unnecessary optimization. */
641 644
642 /* this makes it effectively the same as for: */
643 case recv_acked_by_peer: 645 case recv_acked_by_peer:
644 /* protocol B; pretends to be successfully written on peer. 646 /* protocol B; pretends to be successfully written on peer.
645 * see also notes above in handed_over_to_network about 647 * see also notes above in handed_over_to_network about
@@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
773 int local, remote, send_oos = 0; 775 int local, remote, send_oos = 0;
774 int err = -EIO; 776 int err = -EIO;
775 int ret = 0; 777 int ret = 0;
778 union drbd_state s;
776 779
777 /* allocate outside of all locks; */ 780 /* allocate outside of all locks; */
778 req = drbd_req_new(mdev, bio); 781 req = drbd_req_new(mdev, bio);
@@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
834 drbd_al_begin_io(mdev, sector); 837 drbd_al_begin_io(mdev, sector);
835 } 838 }
836 839
837 remote = remote && drbd_should_do_remote(mdev->state); 840 s = mdev->state;
838 send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); 841 remote = remote && drbd_should_do_remote(s);
842 send_oos = rw == WRITE && drbd_should_send_oos(s);
839 D_ASSERT(!(remote && send_oos)); 843 D_ASSERT(!(remote && send_oos));
840 844
841 if (!(local || remote) && !is_susp(mdev->state)) { 845 if (!(local || remote) && !is_susp(mdev->state)) {
@@ -867,7 +871,7 @@ allocate_barrier:
867 871
868 if (is_susp(mdev->state)) { 872 if (is_susp(mdev->state)) {
869 /* If we got suspended, use the retry mechanism of 873 /* If we got suspended, use the retry mechanism of
870 generic_make_request() to restart processing of this 874 drbd_make_request() to restart processing of this
871 bio. In the next call to drbd_make_request 875 bio. In the next call to drbd_make_request
872 we sleep in inc_ap_bio() */ 876 we sleep in inc_ap_bio() */
873 ret = 1; 877 ret = 1;
@@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
1091 */ 1095 */
1092 D_ASSERT(bio->bi_size > 0); 1096 D_ASSERT(bio->bi_size > 0);
1093 D_ASSERT((bio->bi_size & 0x1ff) == 0); 1097 D_ASSERT((bio->bi_size & 0x1ff) == 0);
1094 D_ASSERT(bio->bi_idx == 0);
1095 1098
1096 /* to make some things easier, force alignment of requests within the 1099 /* to make some things easier, force alignment of requests within the
1097 * granularity of our hash tables */ 1100 * granularity of our hash tables */
@@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
1099 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; 1102 e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
1100 1103
1101 if (likely(s_enr == e_enr)) { 1104 if (likely(s_enr == e_enr)) {
1102 inc_ap_bio(mdev, 1); 1105 do {
1103 drbd_make_request_common(mdev, bio, start_time); 1106 inc_ap_bio(mdev, 1);
1107 } while (drbd_make_request_common(mdev, bio, start_time));
1104 return; 1108 return;
1105 } 1109 }
1106 1110
@@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data)
1196 struct drbd_conf *mdev = (struct drbd_conf *) data; 1200 struct drbd_conf *mdev = (struct drbd_conf *) data;
1197 struct drbd_request *req; /* oldest request */ 1201 struct drbd_request *req; /* oldest request */
1198 struct list_head *le; 1202 struct list_head *le;
1199 unsigned long et = 0; /* effective timeout = ko_count * timeout */ 1203 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
1204 unsigned long now;
1200 1205
1201 if (get_net_conf(mdev)) { 1206 if (get_net_conf(mdev)) {
1202 et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; 1207 if (mdev->state.conn >= C_WF_REPORT_PARAMS)
1208 ent = mdev->net_conf->timeout*HZ/10
1209 * mdev->net_conf->ko_count;
1203 put_net_conf(mdev); 1210 put_net_conf(mdev);
1204 } 1211 }
1205 if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) 1212 if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
1213 dt = mdev->ldev->dc.disk_timeout * HZ / 10;
1214 put_ldev(mdev);
1215 }
1216 et = min_not_zero(dt, ent);
1217
1218 if (!et)
1206 return; /* Recurring timer stopped */ 1219 return; /* Recurring timer stopped */
1207 1220
1221 now = jiffies;
1222
1208 spin_lock_irq(&mdev->req_lock); 1223 spin_lock_irq(&mdev->req_lock);
1209 le = &mdev->oldest_tle->requests; 1224 le = &mdev->oldest_tle->requests;
1210 if (list_empty(le)) { 1225 if (list_empty(le)) {
1211 spin_unlock_irq(&mdev->req_lock); 1226 spin_unlock_irq(&mdev->req_lock);
1212 mod_timer(&mdev->request_timer, jiffies + et); 1227 mod_timer(&mdev->request_timer, now + et);
1213 return; 1228 return;
1214 } 1229 }
1215 1230
1216 le = le->prev; 1231 le = le->prev;
1217 req = list_entry(le, struct drbd_request, tl_requests); 1232 req = list_entry(le, struct drbd_request, tl_requests);
1218 if (time_is_before_eq_jiffies(req->start_time + et)) {
1219 if (req->rq_state & RQ_NET_PENDING) {
1220 dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
1221 _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
1222 } else {
1223 dev_warn(DEV, "Local backing block device frozen?\n");
1224 mod_timer(&mdev->request_timer, jiffies + et);
1225 }
1226 } else {
1227 mod_timer(&mdev->request_timer, req->start_time + et);
1228 }
1229 1233
1234 /* The request is considered timed out, if
1235 * - we have some effective timeout from the configuration,
1236 * with above state restrictions applied,
1237 * - the oldest request is waiting for a response from the network
1238 * resp. the local disk,
1239 * - the oldest request is in fact older than the effective timeout,
1240 * - the connection was established (resp. disk was attached)
1241 * for longer than the timeout already.
1242 * Note that for 32bit jiffies and very stable connections/disks,
1243 * we may have a wrap around, which is catched by
1244 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
1245 *
1246 * Side effect: once per 32bit wrap-around interval, which means every
1247 * ~198 days with 250 HZ, we have a window where the timeout would need
1248 * to expire twice (worst case) to become effective. Good enough.
1249 */
1250 if (ent && req->rq_state & RQ_NET_PENDING &&
1251 time_after(now, req->start_time + ent) &&
1252 !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
1253 dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
1254 _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
1255 }
1256 if (dt && req->rq_state & RQ_LOCAL_PENDING &&
1257 time_after(now, req->start_time + dt) &&
1258 !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
1259 dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
1260 __drbd_chk_io_error(mdev, 1);
1261 }
1262 nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
1230 spin_unlock_irq(&mdev->req_lock); 1263 spin_unlock_irq(&mdev->req_lock);
1264 mod_timer(&mdev->request_timer, nt);
1231} 1265}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 68a234a5fdc5..3d2111919486 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,7 @@ enum drbd_req_event {
105 read_completed_with_error, 105 read_completed_with_error,
106 read_ahead_completed_with_error, 106 read_ahead_completed_with_error,
107 write_completed_with_error, 107 write_completed_with_error,
108 abort_disk_io,
108 completed_ok, 109 completed_ok,
109 resend, 110 resend,
110 fail_frozen_disk_io, 111 fail_frozen_disk_io,
@@ -118,18 +119,21 @@ enum drbd_req_event {
118 * same time, so we should hold the request lock anyways. 119 * same time, so we should hold the request lock anyways.
119 */ 120 */
120enum drbd_req_state_bits { 121enum drbd_req_state_bits {
121 /* 210 122 /* 3210
122 * 000: no local possible 123 * 0000: no local possible
123 * 001: to be submitted 124 * 0001: to be submitted
124 * UNUSED, we could map: 011: submitted, completion still pending 125 * UNUSED, we could map: 011: submitted, completion still pending
125 * 110: completed ok 126 * 0110: completed ok
126 * 010: completed with error 127 * 0010: completed with error
128 * 1001: Aborted (before completion)
129 * 1x10: Aborted and completed -> free
127 */ 130 */
128 __RQ_LOCAL_PENDING, 131 __RQ_LOCAL_PENDING,
129 __RQ_LOCAL_COMPLETED, 132 __RQ_LOCAL_COMPLETED,
130 __RQ_LOCAL_OK, 133 __RQ_LOCAL_OK,
134 __RQ_LOCAL_ABORTED,
131 135
132 /* 76543 136 /* 87654
133 * 00000: no network possible 137 * 00000: no network possible
134 * 00001: to be send 138 * 00001: to be send
135 * 00011: to be send, on worker queue 139 * 00011: to be send, on worker queue
@@ -199,8 +203,9 @@ enum drbd_req_state_bits {
199#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) 203#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
200#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) 204#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
201#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) 205#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
206#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
202 207
203#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ 208#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
204 209
205#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) 210#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
206#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) 211#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d3e6f6213ba..620c70ff2231 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -70,11 +70,29 @@ rwlock_t global_state_lock;
70void drbd_md_io_complete(struct bio *bio, int error) 70void drbd_md_io_complete(struct bio *bio, int error)
71{ 71{
72 struct drbd_md_io *md_io; 72 struct drbd_md_io *md_io;
73 struct drbd_conf *mdev;
73 74
74 md_io = (struct drbd_md_io *)bio->bi_private; 75 md_io = (struct drbd_md_io *)bio->bi_private;
76 mdev = container_of(md_io, struct drbd_conf, md_io);
77
75 md_io->error = error; 78 md_io->error = error;
76 79
77 complete(&md_io->event); 80 /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
81 * to timeout on the lower level device, and eventually detach from it.
82 * If this io completion runs after that timeout expired, this
83 * drbd_md_put_buffer() may allow us to finally try and re-attach.
84 * During normal operation, this only puts that extra reference
85 * down to 1 again.
86 * Make sure we first drop the reference, and only then signal
87 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
88 * next drbd_md_sync_page_io(), that we trigger the
89 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
90 */
91 drbd_md_put_buffer(mdev);
92 md_io->done = 1;
93 wake_up(&mdev->misc_wait);
94 bio_put(bio);
95 put_ldev(mdev);
78} 96}
79 97
80/* reads on behalf of the partner, 98/* reads on behalf of the partner,
@@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error)
226 spin_lock_irqsave(&mdev->req_lock, flags); 244 spin_lock_irqsave(&mdev->req_lock, flags);
227 __req_mod(req, what, &m); 245 __req_mod(req, what, &m);
228 spin_unlock_irqrestore(&mdev->req_lock, flags); 246 spin_unlock_irqrestore(&mdev->req_lock, flags);
247 put_ldev(mdev);
229 248
230 if (m.bio) 249 if (m.bio)
231 complete_master_bio(mdev, &m); 250 complete_master_bio(mdev, &m);
@@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
290 sg_init_table(&sg, 1); 309 sg_init_table(&sg, 1);
291 crypto_hash_init(&desc); 310 crypto_hash_init(&desc);
292 311
293 __bio_for_each_segment(bvec, bio, i, 0) { 312 bio_for_each_segment(bvec, bio, i) {
294 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); 313 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
295 crypto_hash_update(&desc, &sg, sg.length); 314 crypto_hash_update(&desc, &sg, sg.length);
296 } 315 }
@@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
728 } 747 }
729 748
730 drbd_start_resync(mdev, C_SYNC_SOURCE); 749 drbd_start_resync(mdev, C_SYNC_SOURCE);
731 clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); 750 clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
732 return 1; 751 return 1;
733} 752}
734 753
@@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1519 } 1538 }
1520 1539
1521 drbd_state_lock(mdev); 1540 drbd_state_lock(mdev);
1522 1541 write_lock_irq(&global_state_lock);
1523 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { 1542 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1543 write_unlock_irq(&global_state_lock);
1524 drbd_state_unlock(mdev); 1544 drbd_state_unlock(mdev);
1525 return; 1545 return;
1526 } 1546 }
1527 1547
1528 write_lock_irq(&global_state_lock); 1548 ns.i = mdev->state.i;
1529 ns = mdev->state;
1530 1549
1531 ns.aftr_isp = !_drbd_may_sync_now(mdev); 1550 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1532 1551