path: root/drivers/block
author     Linus Torvalds <torvalds@linux-foundation.org>  2014-08-14 11:10:21 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-14 11:10:21 -0400
commit     d429a3639ca967ce2f35e3e8d4e70caec7149ded (patch)
tree       cad1e5602551b6a744f63ef062de2c2e21cfe39a /drivers/block
parent     4a319a490ca59a746b3d36768c0e29ee19832366 (diff)
parent     99d540018caa920b7a54e2d3048f1dff530b294b (diff)
Merge branch 'for-3.17/drivers' of git://git.kernel.dk/linux-block
Pull block driver changes from Jens Axboe:
 "Nothing out of the ordinary here, this pull request contains:

   - A big round of fixes for bcache from Kent Overstreet, Slava Pestov,
     and Surbhi Palande.  No new features, just a lot of fixes.

   - The usual round of drbd updates from Andreas Gruenbacher, Lars
     Ellenberg, and Philipp Reisner.

   - virtio_blk was converted to blk-mq back in 3.13, but now Ming Lei
     has taken it one step further and added support for actually using
     more than one queue.

   - Addition of an explicit SG_FLAG_Q_AT_HEAD for block/bsg, to
     complement the default behavior of adding to the tail of the
     queue.  From Douglas Gilbert"

* 'for-3.17/drivers' of git://git.kernel.dk/linux-block: (86 commits)
  bcache: Drop unneeded blk_sync_queue() calls
  bcache: add mutex lock for bch_is_open
  bcache: Correct printing of btree_gc_max_duration_ms
  bcache: try to set b->parent properly
  bcache: fix memory corruption in init error path
  bcache: fix crash with incomplete cache set
  bcache: Fix more early shutdown bugs
  bcache: fix use-after-free in btree_gc_coalesce()
  bcache: Fix an infinite loop in journal replay
  bcache: fix crash in bcache_btree_node_alloc_fail tracepoint
  bcache: bcache_write tracepoint was crashing
  bcache: fix typo in bch_bkey_equal_header
  bcache: Allocate bounce buffers with GFP_NOWAIT
  bcache: Make sure to pass GFP_WAIT to mempool_alloc()
  bcache: fix uninterruptible sleep in writeback thread
  bcache: wait for buckets when allocating new btree root
  bcache: fix crash on shutdown in passthrough mode
  bcache: fix lockdep warnings on shutdown
  bcache allocator: send discards with correct size
  bcache: Fix to remove the rcu_sched stalls.
  ...
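As a rough illustration of the SG_FLAG_Q_AT_HEAD item mentioned above (this sketch is not part of the commit; the /dev/sg0 path is a placeholder and the fallback #define is an assumption for older userspace headers), a pass-through caller could request head-of-queue insertion for an SG_IO command like this:

/* Issue a TEST UNIT READY through the sg pass-through interface and
 * ask for head-of-queue insertion with SG_FLAG_Q_AT_HEAD.  Without the
 * flag, the request is queued at the tail, the default the message
 * above refers to. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>
#include <unistd.h>

#ifndef SG_FLAG_Q_AT_HEAD
#define SG_FLAG_Q_AT_HEAD 0x20	/* value from the kernel's uapi scsi/sg.h */
#endif

int main(void)
{
	unsigned char cdb[6] = { 0x00, 0, 0, 0, 0, 0 };	/* TEST UNIT READY */
	unsigned char sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sg0", O_RDWR);	/* placeholder device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.mx_sb_len = sizeof(sense);
	hdr.sbp = sense;
	hdr.dxfer_direction = SG_DXFER_NONE;	/* no data transfer */
	hdr.timeout = 5000;			/* milliseconds */
	hdr.flags = SG_FLAG_Q_AT_HEAD;		/* queue at head instead of tail */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	close(fd);
	return 0;
}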
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/drbd/Makefile         |    1
-rw-r--r--  drivers/block/drbd/drbd_actlog.c    |  518
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c    |  150
-rw-r--r--  drivers/block/drbd/drbd_debugfs.c   |  958
-rw-r--r--  drivers/block/drbd/drbd_debugfs.h   |   39
-rw-r--r--  drivers/block/drbd/drbd_int.h       |  383
-rw-r--r--  drivers/block/drbd/drbd_interval.h  |    4
-rw-r--r--  drivers/block/drbd/drbd_main.c      |  302
-rw-r--r--  drivers/block/drbd/drbd_nl.c        |  110
-rw-r--r--  drivers/block/drbd/drbd_proc.c      |  125
-rw-r--r--  drivers/block/drbd/drbd_receiver.c  |  316
-rw-r--r--  drivers/block/drbd/drbd_req.c       |  527
-rw-r--r--  drivers/block/drbd/drbd_req.h       |    1
-rw-r--r--  drivers/block/drbd/drbd_state.c     |   90
-rw-r--r--  drivers/block/drbd/drbd_worker.c    |  348
-rw-r--r--  drivers/block/virtio_blk.c          |  104
16 files changed, 2758 insertions, 1218 deletions
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 8b450338075e..4464e353c1e8 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
 drbd-y += drbd_interval.o drbd_state.o
 drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
 
 obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 05a1780ffa85..d26a3fa63688 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,34 +92,26 @@ struct __packed al_transaction_on_disk {
 	__be32 context[AL_CONTEXT_PER_TRANSACTION];
 };
 
-struct update_odbm_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	unsigned int enr;
-};
-
-struct update_al_work {
-	struct drbd_work w;
-	struct drbd_device *device;
-	struct completion event;
-	int err;
-};
-
-
-void *drbd_md_get_buffer(struct drbd_device *device)
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
 {
 	int r;
 
 	wait_event(device->misc_wait,
-		   (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
+		   (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
 		   device->state.disk <= D_FAILED);
 
-	return r ? NULL : page_address(device->md_io_page);
+	if (r)
+		return NULL;
+
+	device->md_io.current_use = intent;
+	device->md_io.start_jif = jiffies;
+	device->md_io.submit_jif = device->md_io.start_jif - 1;
+	return page_address(device->md_io.page);
 }
 
 void drbd_md_put_buffer(struct drbd_device *device)
 {
-	if (atomic_dec_and_test(&device->md_io_in_use))
+	if (atomic_dec_and_test(&device->md_io.in_use))
 		wake_up(&device->misc_wait);
 }
 
@@ -145,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
 
 static int _drbd_md_sync_page_io(struct drbd_device *device,
 				 struct drbd_backing_dev *bdev,
-				 struct page *page, sector_t sector,
-				 int rw, int size)
+				 sector_t sector, int rw)
 {
 	struct bio *bio;
+	/* we do all our meta data IO in aligned 4k blocks. */
+	const int size = 4096;
 	int err;
 
 	device->md_io.done = 0;
@@ -156,15 +149,15 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
-	rw |= REQ_SYNC;
+	rw |= REQ_SYNC | REQ_NOIDLE;
 
 	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_iter.bi_sector = sector;
 	err = -EIO;
-	if (bio_add_page(bio, page, size, 0) != size)
+	if (bio_add_page(bio, device->md_io.page, size, 0) != size)
 		goto out;
-	bio->bi_private = &device->md_io;
+	bio->bi_private = device;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
@@ -179,7 +172,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	}
 
 	bio_get(bio); /* one bio_put() is in the completion handler */
-	atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
+	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+	device->md_io.submit_jif = jiffies;
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
@@ -197,9 +191,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 			 sector_t sector, int rw)
 {
 	int err;
-	struct page *iop = device->md_io_page;
-
-	D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);
+	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -214,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 		     current->comm, current->pid, __func__,
 		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-	/* we do all our meta data IO in aligned 4k blocks. */
-	err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
+	err = _drbd_md_sync_page_io(device, bdev, sector, rw);
 	if (err) {
 		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
 			(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -297,26 +288,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
297 return need_transaction; 288 return need_transaction;
298} 289}
299 290
300static int al_write_transaction(struct drbd_device *device, bool delegate); 291static int al_write_transaction(struct drbd_device *device);
301
302/* When called through generic_make_request(), we must delegate
303 * activity log I/O to the worker thread: a further request
304 * submitted via generic_make_request() within the same task
305 * would be queued on current->bio_list, and would only start
306 * after this function returns (see generic_make_request()).
307 *
308 * However, if we *are* the worker, we must not delegate to ourselves.
309 */
310 292
311/* 293void drbd_al_begin_io_commit(struct drbd_device *device)
312 * @delegate: delegate activity log I/O to the worker thread
313 */
314void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
315{ 294{
316 bool locked = false; 295 bool locked = false;
317 296
318 BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
319
320 /* Serialize multiple transactions. 297 /* Serialize multiple transactions.
321 * This uses test_and_set_bit, memory barrier is implicit. 298 * This uses test_and_set_bit, memory barrier is implicit.
322 */ 299 */
@@ -335,7 +312,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
335 rcu_read_unlock(); 312 rcu_read_unlock();
336 313
337 if (write_al_updates) 314 if (write_al_updates)
338 al_write_transaction(device, delegate); 315 al_write_transaction(device);
339 spin_lock_irq(&device->al_lock); 316 spin_lock_irq(&device->al_lock);
340 /* FIXME 317 /* FIXME
341 if (err) 318 if (err)
@@ -352,12 +329,10 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
352/* 329/*
353 * @delegate: delegate activity log I/O to the worker thread 330 * @delegate: delegate activity log I/O to the worker thread
354 */ 331 */
355void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate) 332void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
356{ 333{
357 BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);
358
359 if (drbd_al_begin_io_prepare(device, i)) 334 if (drbd_al_begin_io_prepare(device, i))
360 drbd_al_begin_io_commit(device, delegate); 335 drbd_al_begin_io_commit(device);
361} 336}
362 337
363int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) 338int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
@@ -380,8 +355,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *
380 /* We want all necessary updates for a given request within the same transaction 355 /* We want all necessary updates for a given request within the same transaction
381 * We could first check how many updates are *actually* needed, 356 * We could first check how many updates are *actually* needed,
382 * and use that instead of the worst-case nr_al_extents */ 357 * and use that instead of the worst-case nr_al_extents */
383 if (available_update_slots < nr_al_extents) 358 if (available_update_slots < nr_al_extents) {
384 return -EWOULDBLOCK; 359 /* Too many activity log extents are currently "hot".
360 *
361 * If we have accumulated pending changes already,
362 * we made progress.
363 *
364 * If we cannot get even a single pending change through,
365 * stop the fast path until we made some progress,
366 * or requests to "cold" extents could be starved. */
367 if (!al->pending_changes)
368 __set_bit(__LC_STARVING, &device->act_log->flags);
369 return -ENOBUFS;
370 }
385 371
386 /* Is resync active in this area? */ 372 /* Is resync active in this area? */
387 for (enr = first; enr <= last; enr++) { 373 for (enr = first; enr <= last; enr++) {
@@ -452,15 +438,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr)
452 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 438 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
453} 439}
454 440
455static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
456{
457 return rs_enr >>
458 /* bit to page */
459 ((PAGE_SHIFT + 3) -
460 /* resync extent number to bit */
461 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
462}
463
464static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) 441static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
465{ 442{
466 const unsigned int stripes = device->ldev->md.al_stripes; 443 const unsigned int stripes = device->ldev->md.al_stripes;
@@ -479,8 +456,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
479 return device->ldev->md.md_offset + device->ldev->md.al_offset + t; 456 return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
480} 457}
481 458
482static int 459int al_write_transaction(struct drbd_device *device)
483_al_write_transaction(struct drbd_device *device)
484{ 460{
485 struct al_transaction_on_disk *buffer; 461 struct al_transaction_on_disk *buffer;
486 struct lc_element *e; 462 struct lc_element *e;
@@ -505,7 +481,8 @@ _al_write_transaction(struct drbd_device *device)
505 return -EIO; 481 return -EIO;
506 } 482 }
507 483
508 buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */ 484 /* protects md_io_buffer, al_tr_cycle, ... */
485 buffer = drbd_md_get_buffer(device, __func__);
509 if (!buffer) { 486 if (!buffer) {
510 drbd_err(device, "disk failed while waiting for md_io buffer\n"); 487 drbd_err(device, "disk failed while waiting for md_io buffer\n");
511 put_ldev(device); 488 put_ldev(device);
@@ -590,38 +567,6 @@ _al_write_transaction(struct drbd_device *device)
590 return err; 567 return err;
591} 568}
592 569
593
594static int w_al_write_transaction(struct drbd_work *w, int unused)
595{
596 struct update_al_work *aw = container_of(w, struct update_al_work, w);
597 struct drbd_device *device = aw->device;
598 int err;
599
600 err = _al_write_transaction(device);
601 aw->err = err;
602 complete(&aw->event);
603
604 return err != -EIO ? err : 0;
605}
606
607/* Calls from worker context (see w_restart_disk_io()) need to write the
608 transaction directly. Others came through generic_make_request(),
609 those need to delegate it to the worker. */
610static int al_write_transaction(struct drbd_device *device, bool delegate)
611{
612 if (delegate) {
613 struct update_al_work al_work;
614 init_completion(&al_work.event);
615 al_work.w.cb = w_al_write_transaction;
616 al_work.device = device;
617 drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
618 &al_work.w);
619 wait_for_completion(&al_work.event);
620 return al_work.err;
621 } else
622 return _al_write_transaction(device);
623}
624
625static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 570static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
626{ 571{
627 int rv; 572 int rv;
@@ -682,72 +627,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
682 return 0; 627 return 0;
683} 628}
684 629
685static int w_update_odbm(struct drbd_work *w, int unused) 630static const char *drbd_change_sync_fname[] = {
686{ 631 [RECORD_RS_FAILED] = "drbd_rs_failed_io",
687 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); 632 [SET_IN_SYNC] = "drbd_set_in_sync",
688 struct drbd_device *device = udw->device; 633 [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
689 struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; 634};
690
691 if (!get_ldev(device)) {
692 if (__ratelimit(&drbd_ratelimit_state))
693 drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
694 kfree(udw);
695 return 0;
696 }
697
698 drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
699 put_ldev(device);
700
701 kfree(udw);
702
703 if (drbd_bm_total_weight(device) <= device->rs_failed) {
704 switch (device->state.conn) {
705 case C_SYNC_SOURCE: case C_SYNC_TARGET:
706 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
707 drbd_resync_finished(device);
708 default:
709 /* nothing to do */
710 break;
711 }
712 }
713 drbd_bcast_event(device, &sib);
714
715 return 0;
716}
717
718 635
719/* ATTENTION. The AL's extents are 4MB each, while the extents in the 636/* ATTENTION. The AL's extents are 4MB each, while the extents in the
720 * resync LRU-cache are 16MB each. 637 * resync LRU-cache are 16MB each.
721 * The caller of this function has to hold an get_ldev() reference. 638 * The caller of this function has to hold an get_ldev() reference.
722 * 639 *
640 * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
641 * potentially pulling in (and recounting the corresponding bits)
642 * this resync extent into the resync extent lru cache.
643 *
644 * Returns whether all bits have been cleared for this resync extent,
645 * precisely: (rs_left <= rs_failed)
646 *
723 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 647 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
724 */ 648 */
725static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector, 649static bool update_rs_extent(struct drbd_device *device,
726 int count, int success) 650 unsigned int enr, int count,
651 enum update_sync_bits_mode mode)
727{ 652{
728 struct lc_element *e; 653 struct lc_element *e;
729 struct update_odbm_work *udw;
730
731 unsigned int enr;
732 654
733 D_ASSERT(device, atomic_read(&device->local_cnt)); 655 D_ASSERT(device, atomic_read(&device->local_cnt));
734 656
735 /* I simply assume that a sector/size pair never crosses 657 /* When setting out-of-sync bits,
736 * a 16 MB extent border. (Currently this is true...) */ 658 * we don't need it cached (lc_find).
737 enr = BM_SECT_TO_EXT(sector); 659 * But if it is present in the cache,
738 660 * we should update the cached bit count.
739 e = lc_get(device->resync, enr); 661 * Otherwise, that extent should be in the resync extent lru cache
662 * already -- or we want to pull it in if necessary -- (lc_get),
663 * then update and check rs_left and rs_failed. */
664 if (mode == SET_OUT_OF_SYNC)
665 e = lc_find(device->resync, enr);
666 else
667 e = lc_get(device->resync, enr);
740 if (e) { 668 if (e) {
741 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 669 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
742 if (ext->lce.lc_number == enr) { 670 if (ext->lce.lc_number == enr) {
743 if (success) 671 if (mode == SET_IN_SYNC)
744 ext->rs_left -= count; 672 ext->rs_left -= count;
673 else if (mode == SET_OUT_OF_SYNC)
674 ext->rs_left += count;
745 else 675 else
746 ext->rs_failed += count; 676 ext->rs_failed += count;
747 if (ext->rs_left < ext->rs_failed) { 677 if (ext->rs_left < ext->rs_failed) {
748 drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d " 678 drbd_warn(device, "BAD! enr=%u rs_left=%d "
749 "rs_failed=%d count=%d cstate=%s\n", 679 "rs_failed=%d count=%d cstate=%s\n",
750 (unsigned long long)sector,
751 ext->lce.lc_number, ext->rs_left, 680 ext->lce.lc_number, ext->rs_left,
752 ext->rs_failed, count, 681 ext->rs_failed, count,
753 drbd_conn_str(device->state.conn)); 682 drbd_conn_str(device->state.conn));
@@ -781,34 +710,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
781 ext->lce.lc_number, ext->rs_failed); 710 ext->lce.lc_number, ext->rs_failed);
782 } 711 }
783 ext->rs_left = rs_left; 712 ext->rs_left = rs_left;
784 ext->rs_failed = success ? 0 : count; 713 ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
785 /* we don't keep a persistent log of the resync lru, 714 /* we don't keep a persistent log of the resync lru,
786 * we can commit any change right away. */ 715 * we can commit any change right away. */
787 lc_committed(device->resync); 716 lc_committed(device->resync);
788 } 717 }
789 lc_put(device->resync, &ext->lce); 718 if (mode != SET_OUT_OF_SYNC)
719 lc_put(device->resync, &ext->lce);
790 /* no race, we are within the al_lock! */ 720 /* no race, we are within the al_lock! */
791 721
792 if (ext->rs_left == ext->rs_failed) { 722 if (ext->rs_left <= ext->rs_failed) {
793 ext->rs_failed = 0; 723 ext->rs_failed = 0;
794 724 return true;
795 udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
796 if (udw) {
797 udw->enr = ext->lce.lc_number;
798 udw->w.cb = w_update_odbm;
799 udw->device = device;
800 drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
801 &udw->w);
802 } else {
803 drbd_warn(device, "Could not kmalloc an udw\n");
804 }
805 } 725 }
806 } else { 726 } else if (mode != SET_OUT_OF_SYNC) {
727 /* be quiet if lc_find() did not find it. */
807 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", 728 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
808 device->resync_locked, 729 device->resync_locked,
809 device->resync->nr_elements, 730 device->resync->nr_elements,
810 device->resync->flags); 731 device->resync->flags);
811 } 732 }
733 return false;
812} 734}
813 735
814void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) 736void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -827,105 +749,105 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
827 } 749 }
828} 750}
829 751
830/* clear the bit corresponding to the piece of storage in question: 752/* It is called lazy update, so don't do write-out too often. */
831 * size byte of data starting from sector. Only clear a bits of the affected 753static bool lazy_bitmap_update_due(struct drbd_device *device)
832 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
833 *
834 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
835 *
836 */
837void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
838 const char *file, const unsigned int line)
839{ 754{
840 /* Is called from worker and receiver context _only_ */ 755 return time_after(jiffies, device->rs_last_bcast + 2*HZ);
841 unsigned long sbnr, ebnr, lbnr; 756}
842 unsigned long count = 0;
843 sector_t esector, nr_sectors;
844 int wake_up = 0;
845 unsigned long flags;
846 757
847 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 758static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
848 drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 759{
849 (unsigned long long)sector, size); 760 if (rs_done)
761 set_bit(RS_DONE, &device->flags);
762 /* and also set RS_PROGRESS below */
763 else if (!lazy_bitmap_update_due(device))
850 return; 764 return;
851 }
852
853 if (!get_ldev(device))
854 return; /* no disk, no metadata, no bitmap to clear bits in */
855
856 nr_sectors = drbd_get_capacity(device->this_bdev);
857 esector = sector + (size >> 9) - 1;
858
859 if (!expect(sector < nr_sectors))
860 goto out;
861 if (!expect(esector < nr_sectors))
862 esector = nr_sectors - 1;
863
864 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
865
866 /* we clear it (in sync).
867 * round up start sector, round down end sector. we make sure we only
868 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
869 if (unlikely(esector < BM_SECT_PER_BIT-1))
870 goto out;
871 if (unlikely(esector == (nr_sectors-1)))
872 ebnr = lbnr;
873 else
874 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
875 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
876 765
877 if (sbnr > ebnr) 766 drbd_device_post_work(device, RS_PROGRESS);
878 goto out; 767}
879 768
769static int update_sync_bits(struct drbd_device *device,
770 unsigned long sbnr, unsigned long ebnr,
771 enum update_sync_bits_mode mode)
772{
880 /* 773 /*
881 * ok, (capacity & 7) != 0 sometimes, but who cares... 774 * We keep a count of set bits per resync-extent in the ->rs_left
882 * we count rs_{total,left} in bits, not sectors. 775 * caching member, so we need to loop and work within the resync extent
776 * alignment. Typically this loop will execute exactly once.
883 */ 777 */
884 count = drbd_bm_clear_bits(device, sbnr, ebnr); 778 unsigned long flags;
885 if (count) { 779 unsigned long count = 0;
886 drbd_advance_rs_marks(device, drbd_bm_total_weight(device)); 780 unsigned int cleared = 0;
887 spin_lock_irqsave(&device->al_lock, flags); 781 while (sbnr <= ebnr) {
888 drbd_try_clear_on_disk_bm(device, sector, count, true); 782 /* set temporary boundary bit number to last bit number within
889 spin_unlock_irqrestore(&device->al_lock, flags); 783 * the resync extent of the current start bit number,
890 784 * but cap at provided end bit number */
891 /* just wake_up unconditional now, various lc_chaged(), 785 unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
892 * lc_put() in drbd_try_clear_on_disk_bm(). */ 786 unsigned long c;
893 wake_up = 1; 787
788 if (mode == RECORD_RS_FAILED)
789 /* Only called from drbd_rs_failed_io(), bits
790 * supposedly still set. Recount, maybe some
791 * of the bits have been successfully cleared
792 * by application IO meanwhile.
793 */
794 c = drbd_bm_count_bits(device, sbnr, tbnr);
795 else if (mode == SET_IN_SYNC)
796 c = drbd_bm_clear_bits(device, sbnr, tbnr);
797 else /* if (mode == SET_OUT_OF_SYNC) */
798 c = drbd_bm_set_bits(device, sbnr, tbnr);
799
800 if (c) {
801 spin_lock_irqsave(&device->al_lock, flags);
802 cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
803 spin_unlock_irqrestore(&device->al_lock, flags);
804 count += c;
805 }
806 sbnr = tbnr + 1;
894 } 807 }
895out: 808 if (count) {
896 put_ldev(device); 809 if (mode == SET_IN_SYNC) {
897 if (wake_up) 810 unsigned long still_to_go = drbd_bm_total_weight(device);
811 bool rs_is_done = (still_to_go <= device->rs_failed);
812 drbd_advance_rs_marks(device, still_to_go);
813 if (cleared || rs_is_done)
814 maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
815 } else if (mode == RECORD_RS_FAILED)
816 device->rs_failed += count;
898 wake_up(&device->al_wait); 817 wake_up(&device->al_wait);
818 }
819 return count;
899} 820}
900 821
901/* 822/* clear the bit corresponding to the piece of storage in question:
902 * this is intended to set one request worth of data out of sync. 823 * size byte of data starting from sector. Only clear a bits of the affected
903 * affects at least 1 bit, 824 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
904 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. 825 *
826 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
905 * 827 *
906 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
907 * so this can be _any_ process.
908 */ 828 */
909int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size, 829int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
910 const char *file, const unsigned int line) 830 enum update_sync_bits_mode mode,
831 const char *file, const unsigned int line)
911{ 832{
912 unsigned long sbnr, ebnr, flags; 833 /* Is called from worker and receiver context _only_ */
834 unsigned long sbnr, ebnr, lbnr;
835 unsigned long count = 0;
913 sector_t esector, nr_sectors; 836 sector_t esector, nr_sectors;
914 unsigned int enr, count = 0;
915 struct lc_element *e;
916 837
917 /* this should be an empty REQ_FLUSH */ 838 /* This would be an empty REQ_FLUSH, be silent. */
918 if (size == 0) 839 if ((mode == SET_OUT_OF_SYNC) && size == 0)
919 return 0; 840 return 0;
920 841
921 if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 842 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
922 drbd_err(device, "sector: %llus, size: %d\n", 843 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
923 (unsigned long long)sector, size); 844 drbd_change_sync_fname[mode],
845 (unsigned long long)sector, size);
924 return 0; 846 return 0;
925 } 847 }
926 848
927 if (!get_ldev(device)) 849 if (!get_ldev(device))
928 return 0; /* no disk, no metadata, no bitmap to set bits in */ 850 return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
929 851
930 nr_sectors = drbd_get_capacity(device->this_bdev); 852 nr_sectors = drbd_get_capacity(device->this_bdev);
931 esector = sector + (size >> 9) - 1; 853 esector = sector + (size >> 9) - 1;
@@ -935,25 +857,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
935 if (!expect(esector < nr_sectors)) 857 if (!expect(esector < nr_sectors))
936 esector = nr_sectors - 1; 858 esector = nr_sectors - 1;
937 859
938 /* we set it out of sync, 860 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
939 * we do not need to round anything here */
940 sbnr = BM_SECT_TO_BIT(sector);
941 ebnr = BM_SECT_TO_BIT(esector);
942
943 /* ok, (capacity & 7) != 0 sometimes, but who cares...
944 * we count rs_{total,left} in bits, not sectors. */
945 spin_lock_irqsave(&device->al_lock, flags);
946 count = drbd_bm_set_bits(device, sbnr, ebnr);
947 861
948 enr = BM_SECT_TO_EXT(sector); 862 if (mode == SET_IN_SYNC) {
949 e = lc_find(device->resync, enr); 863 /* Round up start sector, round down end sector. We make sure
950 if (e) 864 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
951 lc_entry(e, struct bm_extent, lce)->rs_left += count; 865 if (unlikely(esector < BM_SECT_PER_BIT-1))
952 spin_unlock_irqrestore(&device->al_lock, flags); 866 goto out;
867 if (unlikely(esector == (nr_sectors-1)))
868 ebnr = lbnr;
869 else
870 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
871 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
872 } else {
873 /* We set it out of sync, or record resync failure.
874 * Should not round anything here. */
875 sbnr = BM_SECT_TO_BIT(sector);
876 ebnr = BM_SECT_TO_BIT(esector);
877 }
953 878
879 count = update_sync_bits(device, sbnr, ebnr, mode);
954out: 880out:
955 put_ldev(device); 881 put_ldev(device);
956
957 return count; 882 return count;
958} 883}
959 884
@@ -1075,6 +1000,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
1075 struct lc_element *e; 1000 struct lc_element *e;
1076 struct bm_extent *bm_ext; 1001 struct bm_extent *bm_ext;
1077 int i; 1002 int i;
1003 bool throttle = drbd_rs_should_slow_down(device, sector, true);
1004
1005 /* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
1006 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
1007 * need to throttle. There is at most one such half-locked extent,
1008 * which is remembered in resync_wenr. */
1009
1010 if (throttle && device->resync_wenr != enr)
1011 return -EAGAIN;
1078 1012
1079 spin_lock_irq(&device->al_lock); 1013 spin_lock_irq(&device->al_lock);
1080 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { 1014 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
@@ -1098,8 +1032,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
1098 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1032 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
1099 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1033 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1100 device->resync_wenr = LC_FREE; 1034 device->resync_wenr = LC_FREE;
1101 if (lc_put(device->resync, &bm_ext->lce) == 0) 1035 if (lc_put(device->resync, &bm_ext->lce) == 0) {
1036 bm_ext->flags = 0;
1102 device->resync_locked--; 1037 device->resync_locked--;
1038 }
1103 wake_up(&device->al_wait); 1039 wake_up(&device->al_wait);
1104 } else { 1040 } else {
1105 drbd_alert(device, "LOGIC BUG\n"); 1041 drbd_alert(device, "LOGIC BUG\n");
@@ -1161,8 +1097,20 @@ proceed:
1161 return 0; 1097 return 0;
1162 1098
1163try_again: 1099try_again:
1164 if (bm_ext) 1100 if (bm_ext) {
1165 device->resync_wenr = enr; 1101 if (throttle) {
1102 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
1103 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
1104 clear_bit(BME_NO_WRITES, &bm_ext->flags);
1105 device->resync_wenr = LC_FREE;
1106 if (lc_put(device->resync, &bm_ext->lce) == 0) {
1107 bm_ext->flags = 0;
1108 device->resync_locked--;
1109 }
1110 wake_up(&device->al_wait);
1111 } else
1112 device->resync_wenr = enr;
1113 }
1166 spin_unlock_irq(&device->al_lock); 1114 spin_unlock_irq(&device->al_lock);
1167 return -EAGAIN; 1115 return -EAGAIN;
1168} 1116}
@@ -1270,69 +1218,3 @@ int drbd_rs_del_all(struct drbd_device *device)
1270 1218
1271 return 0; 1219 return 0;
1272} 1220}
1273
1274/**
1275 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1276 * @device: DRBD device.
1277 * @sector: The sector number.
1278 * @size: Size of failed IO operation, in byte.
1279 */
1280void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
1281{
1282 /* Is called from worker and receiver context _only_ */
1283 unsigned long sbnr, ebnr, lbnr;
1284 unsigned long count;
1285 sector_t esector, nr_sectors;
1286 int wake_up = 0;
1287
1288 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
1289 drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1290 (unsigned long long)sector, size);
1291 return;
1292 }
1293 nr_sectors = drbd_get_capacity(device->this_bdev);
1294 esector = sector + (size >> 9) - 1;
1295
1296 if (!expect(sector < nr_sectors))
1297 return;
1298 if (!expect(esector < nr_sectors))
1299 esector = nr_sectors - 1;
1300
1301 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1302
1303 /*
1304 * round up start sector, round down end sector. we make sure we only
1305 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1306 if (unlikely(esector < BM_SECT_PER_BIT-1))
1307 return;
1308 if (unlikely(esector == (nr_sectors-1)))
1309 ebnr = lbnr;
1310 else
1311 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1312 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1313
1314 if (sbnr > ebnr)
1315 return;
1316
1317 /*
1318 * ok, (capacity & 7) != 0 sometimes, but who cares...
1319 * we count rs_{total,left} in bits, not sectors.
1320 */
1321 spin_lock_irq(&device->al_lock);
1322 count = drbd_bm_count_bits(device, sbnr, ebnr);
1323 if (count) {
1324 device->rs_failed += count;
1325
1326 if (get_ldev(device)) {
1327 drbd_try_clear_on_disk_bm(device, sector, count, false);
1328 put_ldev(device);
1329 }
1330
1331 /* just wake_up unconditional now, various lc_chaged(),
1332 * lc_put() in drbd_try_clear_on_disk_bm(). */
1333 wake_up = 1;
1334 }
1335 spin_unlock_irq(&device->al_lock);
1336 if (wake_up)
1337 wake_up(&device->al_wait);
1338}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 1aa29f8fdfe1..426c97aef900 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -22,6 +22,8 @@
 	the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/bitops.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
@@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number)
353 355
354 for (i = 0; i < number; i++) { 356 for (i = 0; i < number; i++) {
355 if (!pages[i]) { 357 if (!pages[i]) {
356 printk(KERN_ALERT "drbd: bm_free_pages tried to free " 358 pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
357 "a NULL pointer; i=%lu n=%lu\n", 359 i, number);
358 i, number);
359 continue; 360 continue;
360 } 361 }
361 __free_page(pages[i]); 362 __free_page(pages[i]);
@@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
592 end = offset + len; 593 end = offset + len;
593 594
594 if (end > b->bm_words) { 595 if (end > b->bm_words) {
595 printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); 596 pr_alert("bm_memset end > bm_words\n");
596 return; 597 return;
597 } 598 }
598 599
@@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
602 p_addr = bm_map_pidx(b, idx); 603 p_addr = bm_map_pidx(b, idx);
603 bm = p_addr + MLPP(offset); 604 bm = p_addr + MLPP(offset);
604 if (bm+do_now > p_addr + LWPP) { 605 if (bm+do_now > p_addr + LWPP) {
605 printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", 606 pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
606 p_addr, bm, (int)do_now); 607 p_addr, bm, (int)do_now);
607 } else 608 } else
608 memset(bm, c, do_now * sizeof(long)); 609 memset(bm, c, do_now * sizeof(long));
@@ -927,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device)
927 spin_unlock_irq(&b->bm_lock); 928 spin_unlock_irq(&b->bm_lock);
928} 929}
929 930
930struct bm_aio_ctx { 931static void drbd_bm_aio_ctx_destroy(struct kref *kref)
931 struct drbd_device *device;
932 atomic_t in_flight;
933 unsigned int done;
934 unsigned flags;
935#define BM_AIO_COPY_PAGES 1
936#define BM_AIO_WRITE_HINTED 2
937#define BM_WRITE_ALL_PAGES 4
938 int error;
939 struct kref kref;
940};
941
942static void bm_aio_ctx_destroy(struct kref *kref)
943{ 932{
944 struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); 933 struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
934 unsigned long flags;
945 935
936 spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
937 list_del(&ctx->list);
938 spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
946 put_ldev(ctx->device); 939 put_ldev(ctx->device);
947 kfree(ctx); 940 kfree(ctx);
948} 941}
@@ -950,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref)
950/* bv_page may be a copy, or may be the original */ 943/* bv_page may be a copy, or may be the original */
951static void bm_async_io_complete(struct bio *bio, int error) 944static void bm_async_io_complete(struct bio *bio, int error)
952{ 945{
953 struct bm_aio_ctx *ctx = bio->bi_private; 946 struct drbd_bm_aio_ctx *ctx = bio->bi_private;
954 struct drbd_device *device = ctx->device; 947 struct drbd_device *device = ctx->device;
955 struct drbd_bitmap *b = device->bitmap; 948 struct drbd_bitmap *b = device->bitmap;
956 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); 949 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
@@ -993,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error)
993 if (atomic_dec_and_test(&ctx->in_flight)) { 986 if (atomic_dec_and_test(&ctx->in_flight)) {
994 ctx->done = 1; 987 ctx->done = 1;
995 wake_up(&device->misc_wait); 988 wake_up(&device->misc_wait);
996 kref_put(&ctx->kref, &bm_aio_ctx_destroy); 989 kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
997 } 990 }
998} 991}
999 992
1000static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) 993static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
1001{ 994{
1002 struct bio *bio = bio_alloc_drbd(GFP_NOIO); 995 struct bio *bio = bio_alloc_drbd(GFP_NOIO);
1003 struct drbd_device *device = ctx->device; 996 struct drbd_device *device = ctx->device;
1004 struct drbd_bitmap *b = device->bitmap; 997 struct drbd_bitmap *b = device->bitmap;
1005 struct page *page; 998 struct page *page;
1006 unsigned int len; 999 unsigned int len;
1000 unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
1007 1001
1008 sector_t on_disk_sector = 1002 sector_t on_disk_sector =
1009 device->ldev->md.md_offset + device->ldev->md.bm_offset; 1003 device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1049,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
1049/* 1043/*
1050 * bm_rw: read/write the whole bitmap from/to its on disk location. 1044 * bm_rw: read/write the whole bitmap from/to its on disk location.
1051 */ 1045 */
1052static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) 1046static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
1053{ 1047{
1054 struct bm_aio_ctx *ctx; 1048 struct drbd_bm_aio_ctx *ctx;
1055 struct drbd_bitmap *b = device->bitmap; 1049 struct drbd_bitmap *b = device->bitmap;
1056 int num_pages, i, count = 0; 1050 int num_pages, i, count = 0;
1057 unsigned long now; 1051 unsigned long now;
@@ -1067,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1067 * as we submit copies of pages anyways. 1061 * as we submit copies of pages anyways.
1068 */ 1062 */
1069 1063
1070 ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); 1064 ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
1071 if (!ctx) 1065 if (!ctx)
1072 return -ENOMEM; 1066 return -ENOMEM;
1073 1067
1074 *ctx = (struct bm_aio_ctx) { 1068 *ctx = (struct drbd_bm_aio_ctx) {
1075 .device = device, 1069 .device = device,
1070 .start_jif = jiffies,
1076 .in_flight = ATOMIC_INIT(1), 1071 .in_flight = ATOMIC_INIT(1),
1077 .done = 0, 1072 .done = 0,
1078 .flags = flags, 1073 .flags = flags,
@@ -1080,15 +1075,21 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1080 .kref = { ATOMIC_INIT(2) }, 1075 .kref = { ATOMIC_INIT(2) },
1081 }; 1076 };
1082 1077
1083 if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ 1078 if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */
1084 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); 1079 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
1085 kfree(ctx); 1080 kfree(ctx);
1086 return -ENODEV; 1081 return -ENODEV;
1087 } 1082 }
1083 /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
1084 drbd_adm_attach(), after device->ldev was assigned. */
1088 1085
1089 if (!ctx->flags) 1086 if (0 == (ctx->flags & ~BM_AIO_READ))
1090 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); 1087 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
1091 1088
1089 spin_lock_irq(&device->resource->req_lock);
1090 list_add_tail(&ctx->list, &device->pending_bitmap_io);
1091 spin_unlock_irq(&device->resource->req_lock);
1092
1092 num_pages = b->bm_number_of_pages; 1093 num_pages = b->bm_number_of_pages;
1093 1094
1094 now = jiffies; 1095 now = jiffies;
@@ -1098,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1098 /* ignore completely unchanged pages */ 1099 /* ignore completely unchanged pages */
1099 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) 1100 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1100 break; 1101 break;
1101 if (rw & WRITE) { 1102 if (!(flags & BM_AIO_READ)) {
1102 if ((flags & BM_AIO_WRITE_HINTED) && 1103 if ((flags & BM_AIO_WRITE_HINTED) &&
1103 !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, 1104 !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
1104 &page_private(b->bm_pages[i]))) 1105 &page_private(b->bm_pages[i])))
1105 continue; 1106 continue;
1106 1107
1107 if (!(flags & BM_WRITE_ALL_PAGES) && 1108 if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
1108 bm_test_page_unchanged(b->bm_pages[i])) { 1109 bm_test_page_unchanged(b->bm_pages[i])) {
1109 dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); 1110 dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
1110 continue; 1111 continue;
@@ -1118,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1118 } 1119 }
1119 } 1120 }
1120 atomic_inc(&ctx->in_flight); 1121 atomic_inc(&ctx->in_flight);
1121 bm_page_io_async(ctx, i, rw); 1122 bm_page_io_async(ctx, i);
1122 ++count; 1123 ++count;
1123 cond_resched(); 1124 cond_resched();
1124 } 1125 }
@@ -1134,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1134 if (!atomic_dec_and_test(&ctx->in_flight)) 1135 if (!atomic_dec_and_test(&ctx->in_flight))
1135 wait_until_done_or_force_detached(device, device->ldev, &ctx->done); 1136 wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
1136 else 1137 else
1137 kref_put(&ctx->kref, &bm_aio_ctx_destroy); 1138 kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1138 1139
1139 /* summary for global bitmap IO */ 1140 /* summary for global bitmap IO */
1140 if (flags == 0) 1141 if (flags == 0)
1141 drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", 1142 drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
1142 rw == WRITE ? "WRITE" : "READ", 1143 (flags & BM_AIO_READ) ? "READ" : "WRITE",
1143 count, jiffies - now); 1144 count, jiffies - now);
1144 1145
1145 if (ctx->error) { 1146 if (ctx->error) {
@@ -1152,20 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1152 err = -EIO; /* Disk timeout/force-detach during IO... */ 1153 err = -EIO; /* Disk timeout/force-detach during IO... */
1153 1154
1154 now = jiffies; 1155 now = jiffies;
1155 if (rw == WRITE) { 1156 if (flags & BM_AIO_READ) {
1156 drbd_md_flush(device);
1157 } else /* rw == READ */ {
1158 b->bm_set = bm_count_bits(b); 1157 b->bm_set = bm_count_bits(b);
1159 drbd_info(device, "recounting of set bits took additional %lu jiffies\n", 1158 drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
1160 jiffies - now); 1159 jiffies - now);
1161 } 1160 }
1162 now = b->bm_set; 1161 now = b->bm_set;
1163 1162
1164 if (flags == 0) 1163 if ((flags & ~BM_AIO_READ) == 0)
1165 drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", 1164 drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
1166 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); 1165 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
1167 1166
1168 kref_put(&ctx->kref, &bm_aio_ctx_destroy); 1167 kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1169 return err; 1168 return err;
1170} 1169}
1171 1170
@@ -1175,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la
1175 */ 1174 */
1176int drbd_bm_read(struct drbd_device *device) __must_hold(local) 1175int drbd_bm_read(struct drbd_device *device) __must_hold(local)
1177{ 1176{
1178 return bm_rw(device, READ, 0, 0); 1177 return bm_rw(device, BM_AIO_READ, 0);
1179} 1178}
1180 1179
1181/** 1180/**
@@ -1186,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
1186 */ 1185 */
1187int drbd_bm_write(struct drbd_device *device) __must_hold(local) 1186int drbd_bm_write(struct drbd_device *device) __must_hold(local)
1188{ 1187{
1189 return bm_rw(device, WRITE, 0, 0); 1188 return bm_rw(device, 0, 0);
1190} 1189}
1191 1190
1192/** 1191/**
@@ -1197,7 +1196,17 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
1197 */ 1196 */
1198int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) 1197int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
1199{ 1198{
1200 return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0); 1199 return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
1200}
1201
1202/**
1203 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
1204 * @device: DRBD device.
1205 * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
1206 */
1207int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
1208{
1209 return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
1201} 1210}
1202 1211
1203/** 1212/**
@@ -1213,7 +1222,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
1213 */ 1222 */
1214int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) 1223int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
1215{ 1224{
1216 return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0); 1225 return bm_rw(device, BM_AIO_COPY_PAGES, 0);
1217} 1226}
1218 1227
1219/** 1228/**
@@ -1222,62 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
1222 */ 1231 */
1223int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) 1232int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
1224{ 1233{
1225 return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); 1234 return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
1226}
1227
1228/**
1229 * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
1230 * @device: DRBD device.
1231 * @idx: bitmap page index
1232 *
1233 * We don't want to special case on logical_block_size of the backend device,
1234 * so we submit PAGE_SIZE aligned pieces.
1235 * Note that on "most" systems, PAGE_SIZE is 4k.
1236 *
1237 * In case this becomes an issue on systems with larger PAGE_SIZE,
1238 * we may want to change this again to write 4k aligned 4k pieces.
1239 */
1240int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local)
1241{
1242 struct bm_aio_ctx *ctx;
1243 int err;
1244
1245 if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) {
1246 dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx);
1247 return 0;
1248 }
1249
1250 ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
1251 if (!ctx)
1252 return -ENOMEM;
1253
1254 *ctx = (struct bm_aio_ctx) {
1255 .device = device,
1256 .in_flight = ATOMIC_INIT(1),
1257 .done = 0,
1258 .flags = BM_AIO_COPY_PAGES,
1259 .error = 0,
1260 .kref = { ATOMIC_INIT(2) },
1261 };
1262
1263 if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */
1264 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
1265 kfree(ctx);
1266 return -ENODEV;
1267 }
1268
1269 bm_page_io_async(ctx, idx, WRITE_SYNC);
1270 wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
1271
1272 if (ctx->error)
1273 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
1274 /* that causes us to detach, so the in memory bitmap will be
1275 * gone in a moment as well. */
1276
1277 device->bm_writ_cnt++;
1278 err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
1279 kref_put(&ctx->kref, &bm_aio_ctx_destroy);
1280 return err;
1281} 1235}
1282 1236
1283/* NOTE 1237/* NOTE
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000000..5c20b18540b8
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,958 @@
1#define pr_fmt(fmt) "drbd debugfs: " fmt
2#include <linux/kernel.h>
3#include <linux/module.h>
4#include <linux/debugfs.h>
5#include <linux/seq_file.h>
6#include <linux/stat.h>
7#include <linux/jiffies.h>
8#include <linux/list.h>
9
10#include "drbd_int.h"
11#include "drbd_req.h"
12#include "drbd_debugfs.h"
13
14
15/**********************************************************************
16 * Whenever you change the file format, remember to bump the version. *
17 **********************************************************************/
18
19static struct dentry *drbd_debugfs_root;
20static struct dentry *drbd_debugfs_version;
21static struct dentry *drbd_debugfs_resources;
22static struct dentry *drbd_debugfs_minors;
23
24static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
25{
26 if (valid)
27 seq_printf(m, "\t%d", jiffies_to_msecs(dt));
28 else
29 seq_printf(m, "\t-");
30}
31
32static void __seq_print_rq_state_bit(struct seq_file *m,
33 bool is_set, char *sep, const char *set_name, const char *unset_name)
34{
35 if (is_set && set_name) {
36 seq_putc(m, *sep);
37 seq_puts(m, set_name);
38 *sep = '|';
39 } else if (!is_set && unset_name) {
40 seq_putc(m, *sep);
41 seq_puts(m, unset_name);
42 *sep = '|';
43 }
44}
45
46static void seq_print_rq_state_bit(struct seq_file *m,
47 bool is_set, char *sep, const char *set_name)
48{
49 __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
50}
51
52/* pretty print enum drbd_req_state_bits req->rq_state */
53static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
54{
55 unsigned int s = req->rq_state;
56 char sep = ' ';
57 seq_printf(m, "\t0x%08x", s);
58 seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
59
60 /* RQ_WRITE ignored, already reported */
61 seq_puts(m, "\tlocal:");
62 seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
63 seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
64 seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
65 sep = ' ';
66 seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
67 seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
68 seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
69 seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
70 if (sep == ' ')
71 seq_puts(m, " -");
72
73 /* for_each_connection ... */
74 seq_printf(m, "\tnet:");
75 sep = ' ';
76 seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
77 seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
78 seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
79 seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
80 seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
81 seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
82 if (sep == ' ')
83 seq_puts(m, " -");
84
85 seq_printf(m, " :");
86 sep = ' ';
87 seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
88 seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
89 seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
90 if (sep == ' ')
91 seq_puts(m, " -");
92 seq_printf(m, "\n");
93}
94
95static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
96{
97 /* change anything here, fixup header below! */
98 unsigned int s = req->rq_state;
99
100#define RQ_HDR_1 "epoch\tsector\tsize\trw"
101 seq_printf(m, "0x%x\t%llu\t%u\t%s",
102 req->epoch,
103 (unsigned long long)req->i.sector, req->i.size >> 9,
104 (s & RQ_WRITE) ? "W" : "R");
105
106#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
107 seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
108 seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
109 seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
110
111#define RQ_HDR_3 "\tsent\tacked\tdone"
112 seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
113 seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
114 seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
115
116#define RQ_HDR_4 "\tstate\n"
117 seq_print_request_state(m, req);
118}
119#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
120
121static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
122{
123 seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
124 seq_print_one_request(m, req, now);
125}
126
127static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
128{
129 struct drbd_device *device;
130 unsigned int i;
131
132 seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
133 rcu_read_lock();
134 idr_for_each_entry(&resource->devices, device, i) {
135 struct drbd_md_io tmp;
136 /* In theory this is racy,
137 * in the sense that there could have been a
138 * drbd_md_put_buffer(); drbd_md_get_buffer();
139 * between accessing these members here. */
140 tmp = device->md_io;
141 if (atomic_read(&tmp.in_use)) {
142 seq_printf(m, "%u\t%u\t%d\t",
143 device->minor, device->vnr,
144 jiffies_to_msecs(now - tmp.start_jif));
145 if (time_before(tmp.submit_jif, tmp.start_jif))
146 seq_puts(m, "-\t");
147 else
148 seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
149 seq_printf(m, "%s\n", tmp.current_use);
150 }
151 }
152 rcu_read_unlock();
153}
154
155static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
156{
157 struct drbd_device *device;
158 unsigned int i;
159
160 seq_puts(m, "minor\tvnr\tage\t#waiting\n");
161 rcu_read_lock();
162 idr_for_each_entry(&resource->devices, device, i) {
163 unsigned long jif;
164 struct drbd_request *req;
165 int n = atomic_read(&device->ap_actlog_cnt);
166 if (n) {
167 spin_lock_irq(&device->resource->req_lock);
168 req = list_first_entry_or_null(&device->pending_master_completion[1],
169 struct drbd_request, req_pending_master_completion);
170 /* if the oldest request does not wait for the activity log
171 * it is not interesting for us here */
172 if (req && !(req->rq_state & RQ_IN_ACT_LOG))
173 jif = req->start_jif;
174 else
175 req = NULL;
176 spin_unlock_irq(&device->resource->req_lock);
177 }
178 if (n) {
179 seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
180 if (req)
181 seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
182 else
183 seq_puts(m, "-\t");
184 seq_printf(m, "%u\n", n);
185 }
186 }
187 rcu_read_unlock();
188}
189
190static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
191{
192 struct drbd_bm_aio_ctx *ctx;
193 unsigned long start_jif;
194 unsigned int in_flight;
195 unsigned int flags;
196 spin_lock_irq(&device->resource->req_lock);
197 ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
198 if (ctx && ctx->done)
199 ctx = NULL;
200 if (ctx) {
201 start_jif = ctx->start_jif;
202 in_flight = atomic_read(&ctx->in_flight);
203 flags = ctx->flags;
204 }
205 spin_unlock_irq(&device->resource->req_lock);
206 if (ctx) {
207 seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
208 device->minor, device->vnr,
209 (flags & BM_AIO_READ) ? 'R' : 'W',
210 jiffies_to_msecs(now - start_jif),
211 in_flight);
212 }
213}
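/* Note on the locking pattern above: the fields of interest are copied
 * into locals while req_lock is held, and only formatted after the lock
 * has been dropped, so we never print from a drbd_bm_aio_ctx that might
 * complete or be freed once the lock is released. */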
214
215static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
216{
217 struct drbd_device *device;
218 unsigned int i;
219
220 seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
221 rcu_read_lock();
222 idr_for_each_entry(&resource->devices, device, i) {
223 seq_print_device_bitmap_io(m, device, now);
224 }
225 rcu_read_unlock();
226}
227
228/* pretty print enum peer_req->flags */
229static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
230{
231 unsigned long f = peer_req->flags;
232 char sep = ' ';
233
234 __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
235 __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
236 seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
237 seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
238 seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
239
240 if (f & EE_IS_TRIM) {
241 seq_putc(m, sep);
242 sep = '|';
243 if (f & EE_IS_TRIM_USE_ZEROOUT)
244 seq_puts(m, "zero-out");
245 else
246 seq_puts(m, "trim");
247 }
248 seq_putc(m, '\n');
249}
250
251static void seq_print_peer_request(struct seq_file *m,
252 struct drbd_device *device, struct list_head *lh,
253 unsigned long now)
254{
255 bool reported_preparing = false;
256 struct drbd_peer_request *peer_req;
257 list_for_each_entry(peer_req, lh, w.list) {
258 if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
259 continue;
260
261 if (device)
262 seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
263
264 seq_printf(m, "%llu\t%u\t%c\t%u\t",
265 (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
266 (peer_req->flags & EE_WRITE) ? 'W' : 'R',
267 jiffies_to_msecs(now - peer_req->submit_jif));
268 seq_print_peer_request_flags(m, peer_req);
269 if (peer_req->flags & EE_SUBMITTED)
270 break;
271 else
272 reported_preparing = true;
273 }
274}
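/* Per list, the loop above reports at most one peer request that is still
 * preparing and then stops at the first one that was actually submitted;
 * that is enough to show the oldest entries without dumping the whole
 * active_ee/read_ee/sync_ee lists. */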
275
276static void seq_print_device_peer_requests(struct seq_file *m,
277 struct drbd_device *device, unsigned long now)
278{
279 seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
280 spin_lock_irq(&device->resource->req_lock);
281 seq_print_peer_request(m, device, &device->active_ee, now);
282 seq_print_peer_request(m, device, &device->read_ee, now);
283 seq_print_peer_request(m, device, &device->sync_ee, now);
284 spin_unlock_irq(&device->resource->req_lock);
285 if (test_bit(FLUSH_PENDING, &device->flags)) {
286 seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
287 device->minor, device->vnr,
288 jiffies_to_msecs(now - device->flush_jif));
289 }
290}
291
292static void seq_print_resource_pending_peer_requests(struct seq_file *m,
293 struct drbd_resource *resource, unsigned long now)
294{
295 struct drbd_device *device;
296 unsigned int i;
297
298 rcu_read_lock();
299 idr_for_each_entry(&resource->devices, device, i) {
300 seq_print_device_peer_requests(m, device, now);
301 }
302 rcu_read_unlock();
303}
304
305static void seq_print_resource_transfer_log_summary(struct seq_file *m,
306 struct drbd_resource *resource,
307 struct drbd_connection *connection,
308 unsigned long now)
309{
310 struct drbd_request *req;
311 unsigned int count = 0;
312 unsigned int show_state = 0;
313
314 seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
315 spin_lock_irq(&resource->req_lock);
316 list_for_each_entry(req, &connection->transfer_log, tl_requests) {
317 unsigned int tmp = 0;
318 unsigned int s;
319 ++count;
320
321 /* don't disable irq "forever" */
322 if (!(count & 0x1ff)) {
323 struct drbd_request *req_next;
324 kref_get(&req->kref);
325 spin_unlock_irq(&resource->req_lock);
326 cond_resched();
327 spin_lock_irq(&resource->req_lock);
328 req_next = list_next_entry(req, tl_requests);
329 if (kref_put(&req->kref, drbd_req_destroy))
330 req = req_next;
331 if (&req->tl_requests == &connection->transfer_log)
332 break;
333 }
334
335 s = req->rq_state;
336
337 /* This is meant to summarize timing issues, to be able to tell
338 * local disk problems from network problems.
339		 * Skip requests if we have already shown an even older request
340		 * with similar aspects. */
341 if (req->master_bio == NULL)
342 tmp |= 1;
343 if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
344 tmp |= 2;
345 if (s & RQ_NET_MASK) {
346 if (!(s & RQ_NET_SENT))
347 tmp |= 4;
348 if (s & RQ_NET_PENDING)
349 tmp |= 8;
350 if (!(s & RQ_NET_DONE))
351 tmp |= 16;
352 }
353 if ((tmp & show_state) == tmp)
354 continue;
355 show_state |= tmp;
356 seq_printf(m, "%u\t", count);
357 seq_print_minor_vnr_req(m, req, now);
358 if (show_state == 0x1f)
359 break;
360 }
361 spin_unlock_irq(&resource->req_lock);
362}
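/* The transfer log may contain a large number of requests, so the walk
 * above drops the irq-disabling req_lock every 512 entries: the current
 * request is pinned with kref_get() so it cannot be freed across the
 * cond_resched(), and after re-acquiring the lock the cursor is checked
 * against the list head so the walk terminates cleanly if it meanwhile
 * reached the end of the log. */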
363
364/* TODO: transfer_log and friends should be moved to resource */
365static int in_flight_summary_show(struct seq_file *m, void *pos)
366{
367 struct drbd_resource *resource = m->private;
368 struct drbd_connection *connection;
369 unsigned long jif = jiffies;
370
371 connection = first_connection(resource);
372 /* This does not happen, actually.
373 * But be robust and prepare for future code changes. */
374 if (!connection || !kref_get_unless_zero(&connection->kref))
375 return -ESTALE;
376
377 /* BUMP me if you change the file format/content/presentation */
378 seq_printf(m, "v: %u\n\n", 0);
379
380 seq_puts(m, "oldest bitmap IO\n");
381 seq_print_resource_pending_bitmap_io(m, resource, jif);
382 seq_putc(m, '\n');
383
384 seq_puts(m, "meta data IO\n");
385 seq_print_resource_pending_meta_io(m, resource, jif);
386 seq_putc(m, '\n');
387
388 seq_puts(m, "socket buffer stats\n");
389 /* for each connection ... once we have more than one */
390 rcu_read_lock();
391 if (connection->data.socket) {
392 /* open coded SIOCINQ, the "relevant" part */
393 struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
394 int answ = tp->rcv_nxt - tp->copied_seq;
395 seq_printf(m, "unread receive buffer: %u Byte\n", answ);
396 /* open coded SIOCOUTQ, the "relevant" part */
397 answ = tp->write_seq - tp->snd_una;
398 seq_printf(m, "unacked send buffer: %u Byte\n", answ);
399 }
400 rcu_read_unlock();
401 seq_putc(m, '\n');
402
403 seq_puts(m, "oldest peer requests\n");
404 seq_print_resource_pending_peer_requests(m, resource, jif);
405 seq_putc(m, '\n');
406
407 seq_puts(m, "application requests waiting for activity log\n");
408 seq_print_waiting_for_AL(m, resource, jif);
409 seq_putc(m, '\n');
410
411 seq_puts(m, "oldest application requests\n");
412 seq_print_resource_transfer_log_summary(m, resource, connection, jif);
413 seq_putc(m, '\n');
414
415 jif = jiffies - jif;
416 if (jif)
417 seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
418 kref_put(&connection->kref, drbd_destroy_connection);
419 return 0;
420}
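/* With debugfs mounted in its usual place, the output of
 * in_flight_summary_show() is available as
 *	/sys/kernel/debug/drbd/resources/<resource name>/in_flight_summary
 * (see drbd_debugfs_init() and drbd_debugfs_resource_add() below); a plain
 *	cat /sys/kernel/debug/drbd/resources/r0/in_flight_summary
 * (resource name "r0" used as an example) prints the sections generated
 * above: bitmap IO, meta data IO, socket buffer stats, peer requests,
 * requests waiting for the activity log, and the transfer log summary. */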
421
422/* simple_positive(file->f_dentry) or debugfs_positive(),
423 * but neither is "reachable" from here.
424 * So we have our own inline version of it below. :-( */
425static inline int debugfs_positive(struct dentry *dentry)
426{
427 return dentry->d_inode && !d_unhashed(dentry);
428}
429
430/* make sure at *open* time that the respective object won't go away. */
431static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
432 void *data, struct kref *kref,
433 void (*release)(struct kref *))
434{
435 struct dentry *parent;
436 int ret = -ESTALE;
437
438 /* Are we still linked,
439 * or has debugfs_remove() already been called? */
440 parent = file->f_dentry->d_parent;
441 /* not sure if this can happen: */
442 if (!parent || !parent->d_inode)
443 goto out;
444 /* serialize with d_delete() */
445 mutex_lock(&parent->d_inode->i_mutex);
446 /* Make sure the object is still alive */
447 if (debugfs_positive(file->f_dentry)
448 && kref_get_unless_zero(kref))
449 ret = 0;
450 mutex_unlock(&parent->d_inode->i_mutex);
451 if (!ret) {
452 ret = single_open(file, show, data);
453 if (ret)
454 kref_put(kref, release);
455 }
456out:
457 return ret;
458}
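/* Lifetime rule implemented by drbd_single_open(): the object behind a
 * debugfs file is pinned with kref_get_unless_zero() at open time, under
 * the parent's i_mutex to serialize against debugfs_remove(), and the
 * corresponding *_release() hooks below drop that reference again.  An
 * open file descriptor thus keeps the resource/connection/device alive,
 * while an object that is already going away is reported as -ESTALE. */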
459
460static int in_flight_summary_open(struct inode *inode, struct file *file)
461{
462 struct drbd_resource *resource = inode->i_private;
463 return drbd_single_open(file, in_flight_summary_show, resource,
464 &resource->kref, drbd_destroy_resource);
465}
466
467static int in_flight_summary_release(struct inode *inode, struct file *file)
468{
469 struct drbd_resource *resource = inode->i_private;
470 kref_put(&resource->kref, drbd_destroy_resource);
471 return single_release(inode, file);
472}
473
474static const struct file_operations in_flight_summary_fops = {
475 .owner = THIS_MODULE,
476 .open = in_flight_summary_open,
477 .read = seq_read,
478 .llseek = seq_lseek,
479 .release = in_flight_summary_release,
480};
481
482void drbd_debugfs_resource_add(struct drbd_resource *resource)
483{
484 struct dentry *dentry;
485 if (!drbd_debugfs_resources)
486 return;
487
488 dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
489 if (IS_ERR_OR_NULL(dentry))
490 goto fail;
491 resource->debugfs_res = dentry;
492
493 dentry = debugfs_create_dir("volumes", resource->debugfs_res);
494 if (IS_ERR_OR_NULL(dentry))
495 goto fail;
496 resource->debugfs_res_volumes = dentry;
497
498 dentry = debugfs_create_dir("connections", resource->debugfs_res);
499 if (IS_ERR_OR_NULL(dentry))
500 goto fail;
501 resource->debugfs_res_connections = dentry;
502
503 dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
504 resource->debugfs_res, resource,
505 &in_flight_summary_fops);
506 if (IS_ERR_OR_NULL(dentry))
507 goto fail;
508 resource->debugfs_res_in_flight_summary = dentry;
509 return;
510
511fail:
512 drbd_debugfs_resource_cleanup(resource);
513 drbd_err(resource, "failed to create debugfs dentry\n");
514}
515
516static void drbd_debugfs_remove(struct dentry **dp)
517{
518 debugfs_remove(*dp);
519 *dp = NULL;
520}
521
522void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
523{
524 /* it is ok to call debugfs_remove(NULL) */
525 drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
526 drbd_debugfs_remove(&resource->debugfs_res_connections);
527 drbd_debugfs_remove(&resource->debugfs_res_volumes);
528 drbd_debugfs_remove(&resource->debugfs_res);
529}
530
531static void seq_print_one_timing_detail(struct seq_file *m,
532 const struct drbd_thread_timing_details *tdp,
533 unsigned long now)
534{
535 struct drbd_thread_timing_details td;
536 /* No locking...
537 * use temporary assignment to get at consistent data. */
538 do {
539 td = *tdp;
540 } while (td.cb_nr != tdp->cb_nr);
541 if (!td.cb_addr)
542 return;
543 seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
544 td.cb_nr,
545 jiffies_to_msecs(now - td.start_jif),
546 td.caller_fn, td.line,
547 td.cb_addr);
548}
549
550static void seq_print_timing_details(struct seq_file *m,
551 const char *title,
552 unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
553{
554 unsigned int start_idx;
555 unsigned int i;
556
557 seq_printf(m, "%s\n", title);
558 /* If not much is going on, this will result in natural ordering.
559 * If it is very busy, we will possibly skip events, or even see wrap
560 * arounds, which could only be avoided with locking.
561 */
562 start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
563 for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
564 seq_print_one_timing_detail(m, tdp+i, now);
565 for (i = 0; i < start_idx; i++)
566 seq_print_one_timing_detail(m, tdp+i, now);
567}
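/* The timing details form a small ring (DRBD_THREAD_DETAILS_HIST entries,
 * see drbd_int.h below) indexed by the ever-increasing cb_nr.  The reader
 * in seq_print_one_timing_detail() copies an entry until the cb_nr seen in
 * the copy matches the live one, so a torn read from a concurrent update
 * is simply retried; occasional skipped or repeated events are accepted
 * instead of taking a lock in the worker/receiver hot paths. */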
568
569static int callback_history_show(struct seq_file *m, void *ignored)
570{
571 struct drbd_connection *connection = m->private;
572 unsigned long jif = jiffies;
573
574 /* BUMP me if you change the file format/content/presentation */
575 seq_printf(m, "v: %u\n\n", 0);
576
577 seq_puts(m, "n\tage\tcallsite\tfn\n");
578 seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
579 seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
580 return 0;
581}
582
583static int callback_history_open(struct inode *inode, struct file *file)
584{
585 struct drbd_connection *connection = inode->i_private;
586 return drbd_single_open(file, callback_history_show, connection,
587 &connection->kref, drbd_destroy_connection);
588}
589
590static int callback_history_release(struct inode *inode, struct file *file)
591{
592 struct drbd_connection *connection = inode->i_private;
593 kref_put(&connection->kref, drbd_destroy_connection);
594 return single_release(inode, file);
595}
596
597static const struct file_operations connection_callback_history_fops = {
598 .owner = THIS_MODULE,
599 .open = callback_history_open,
600 .read = seq_read,
601 .llseek = seq_lseek,
602 .release = callback_history_release,
603};
604
605static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
606{
607 struct drbd_connection *connection = m->private;
608 unsigned long now = jiffies;
609 struct drbd_request *r1, *r2;
610
611 /* BUMP me if you change the file format/content/presentation */
612 seq_printf(m, "v: %u\n\n", 0);
613
614 spin_lock_irq(&connection->resource->req_lock);
615 r1 = connection->req_next;
616 if (r1)
617 seq_print_minor_vnr_req(m, r1, now);
618 r2 = connection->req_ack_pending;
619 if (r2 && r2 != r1) {
620 r1 = r2;
621 seq_print_minor_vnr_req(m, r1, now);
622 }
623 r2 = connection->req_not_net_done;
624 if (r2 && r2 != r1)
625 seq_print_minor_vnr_req(m, r2, now);
626 spin_unlock_irq(&connection->resource->req_lock);
627 return 0;
628}
629
630static int connection_oldest_requests_open(struct inode *inode, struct file *file)
631{
632 struct drbd_connection *connection = inode->i_private;
633 return drbd_single_open(file, connection_oldest_requests_show, connection,
634 &connection->kref, drbd_destroy_connection);
635}
636
637static int connection_oldest_requests_release(struct inode *inode, struct file *file)
638{
639 struct drbd_connection *connection = inode->i_private;
640 kref_put(&connection->kref, drbd_destroy_connection);
641 return single_release(inode, file);
642}
643
644static const struct file_operations connection_oldest_requests_fops = {
645 .owner = THIS_MODULE,
646 .open = connection_oldest_requests_open,
647 .read = seq_read,
648 .llseek = seq_lseek,
649 .release = connection_oldest_requests_release,
650};
651
652void drbd_debugfs_connection_add(struct drbd_connection *connection)
653{
654 struct dentry *conns_dir = connection->resource->debugfs_res_connections;
655 struct dentry *dentry;
656 if (!conns_dir)
657 return;
658
659	/* Once we enable multiple peers,
660 * these connections will have descriptive names.
661 * For now, it is just the one connection to the (only) "peer". */
662 dentry = debugfs_create_dir("peer", conns_dir);
663 if (IS_ERR_OR_NULL(dentry))
664 goto fail;
665 connection->debugfs_conn = dentry;
666
667 dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
668 connection->debugfs_conn, connection,
669 &connection_callback_history_fops);
670 if (IS_ERR_OR_NULL(dentry))
671 goto fail;
672 connection->debugfs_conn_callback_history = dentry;
673
674 dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
675 connection->debugfs_conn, connection,
676 &connection_oldest_requests_fops);
677 if (IS_ERR_OR_NULL(dentry))
678 goto fail;
679 connection->debugfs_conn_oldest_requests = dentry;
680 return;
681
682fail:
683 drbd_debugfs_connection_cleanup(connection);
684 drbd_err(connection, "failed to create debugfs dentry\n");
685}
686
687void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
688{
689 drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
690 drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
691 drbd_debugfs_remove(&connection->debugfs_conn);
692}
693
694static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
695{
696 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
697
698 seq_printf(m, "%5d %s %s %s\n", bme->rs_left,
699 test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
700 test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
701 test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
702 );
703}
704
705static int device_resync_extents_show(struct seq_file *m, void *ignored)
706{
707 struct drbd_device *device = m->private;
708
709 /* BUMP me if you change the file format/content/presentation */
710 seq_printf(m, "v: %u\n\n", 0);
711
712 if (get_ldev_if_state(device, D_FAILED)) {
713 lc_seq_printf_stats(m, device->resync);
714 lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
715 put_ldev(device);
716 }
717 return 0;
718}
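/* get_ldev_if_state(device, D_FAILED) takes a reference on the local
 * backing device as long as the disk state is at least D_FAILED, which
 * keeps device->resync (and, below, device->act_log) from being torn down
 * by a concurrent detach while their lru_cache contents are dumped;
 * put_ldev() drops that reference again. */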
719
720static int device_act_log_extents_show(struct seq_file *m, void *ignored)
721{
722 struct drbd_device *device = m->private;
723
724 /* BUMP me if you change the file format/content/presentation */
725 seq_printf(m, "v: %u\n\n", 0);
726
727 if (get_ldev_if_state(device, D_FAILED)) {
728 lc_seq_printf_stats(m, device->act_log);
729 lc_seq_dump_details(m, device->act_log, "", NULL);
730 put_ldev(device);
731 }
732 return 0;
733}
734
735static int device_oldest_requests_show(struct seq_file *m, void *ignored)
736{
737 struct drbd_device *device = m->private;
738 struct drbd_resource *resource = device->resource;
739 unsigned long now = jiffies;
740 struct drbd_request *r1, *r2;
741 int i;
742
743 /* BUMP me if you change the file format/content/presentation */
744 seq_printf(m, "v: %u\n\n", 0);
745
746 seq_puts(m, RQ_HDR);
747 spin_lock_irq(&resource->req_lock);
748 /* WRITE, then READ */
749 for (i = 1; i >= 0; --i) {
750 r1 = list_first_entry_or_null(&device->pending_master_completion[i],
751 struct drbd_request, req_pending_master_completion);
752 r2 = list_first_entry_or_null(&device->pending_completion[i],
753 struct drbd_request, req_pending_local);
754 if (r1)
755 seq_print_one_request(m, r1, now);
756 if (r2 && r2 != r1)
757 seq_print_one_request(m, r2, now);
758 }
759 spin_unlock_irq(&resource->req_lock);
760 return 0;
761}
762
763static int device_data_gen_id_show(struct seq_file *m, void *ignored)
764{
765 struct drbd_device *device = m->private;
766 struct drbd_md *md;
767 enum drbd_uuid_index idx;
768
769 if (!get_ldev_if_state(device, D_FAILED))
770 return -ENODEV;
771
772 md = &device->ldev->md;
773 spin_lock_irq(&md->uuid_lock);
774 for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
775 seq_printf(m, "0x%016llX\n", md->uuid[idx]);
776 }
777 spin_unlock_irq(&md->uuid_lock);
778 put_ldev(device);
779 return 0;
780}
781
782#define drbd_debugfs_device_attr(name) \
783static int device_ ## name ## _open(struct inode *inode, struct file *file) \
784{ \
785 struct drbd_device *device = inode->i_private; \
786 return drbd_single_open(file, device_ ## name ## _show, device, \
787 &device->kref, drbd_destroy_device); \
788} \
789static int device_ ## name ## _release(struct inode *inode, struct file *file) \
790{ \
791 struct drbd_device *device = inode->i_private; \
792 kref_put(&device->kref, drbd_destroy_device); \
793 return single_release(inode, file); \
794} \
795static const struct file_operations device_ ## name ## _fops = { \
796 .owner = THIS_MODULE, \
797 .open = device_ ## name ## _open, \
798 .read = seq_read, \
799 .llseek = seq_lseek, \
800 .release = device_ ## name ## _release, \
801};
802
803drbd_debugfs_device_attr(oldest_requests)
804drbd_debugfs_device_attr(act_log_extents)
805drbd_debugfs_device_attr(resync_extents)
806drbd_debugfs_device_attr(data_gen_id)
807
808void drbd_debugfs_device_add(struct drbd_device *device)
809{
810 struct dentry *vols_dir = device->resource->debugfs_res_volumes;
811 char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
812	char vnr_buf[8];   /* volume number vnr is only 16 bit anyway */
813 char *slink_name = NULL;
814
815 struct dentry *dentry;
816 if (!vols_dir || !drbd_debugfs_minors)
817 return;
818
819 snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
820 dentry = debugfs_create_dir(vnr_buf, vols_dir);
821 if (IS_ERR_OR_NULL(dentry))
822 goto fail;
823 device->debugfs_vol = dentry;
824
825 snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
826 slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
827 device->resource->name, device->vnr);
828 if (!slink_name)
829 goto fail;
830 dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
831 kfree(slink_name);
832 slink_name = NULL;
833 if (IS_ERR_OR_NULL(dentry))
834 goto fail;
835 device->debugfs_minor = dentry;
836
837#define DCF(name) do { \
838 dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
839 device->debugfs_vol, device, \
840 &device_ ## name ## _fops); \
841 if (IS_ERR_OR_NULL(dentry)) \
842 goto fail; \
843 device->debugfs_vol_ ## name = dentry; \
844 } while (0)
845
846 DCF(oldest_requests);
847 DCF(act_log_extents);
848 DCF(resync_extents);
849 DCF(data_gen_id);
850#undef DCF
851 return;
852
853fail:
854 drbd_debugfs_device_cleanup(device);
855 drbd_err(device, "failed to create debugfs entries\n");
856}
857
858void drbd_debugfs_device_cleanup(struct drbd_device *device)
859{
860 drbd_debugfs_remove(&device->debugfs_minor);
861 drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
862 drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
863 drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
864 drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
865 drbd_debugfs_remove(&device->debugfs_vol);
866}
867
868void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
869{
870 struct dentry *conn_dir = peer_device->connection->debugfs_conn;
871 struct dentry *dentry;
872 char vnr_buf[8];
873
874 if (!conn_dir)
875 return;
876
877 snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
878 dentry = debugfs_create_dir(vnr_buf, conn_dir);
879 if (IS_ERR_OR_NULL(dentry))
880 goto fail;
881 peer_device->debugfs_peer_dev = dentry;
882 return;
883
884fail:
885 drbd_debugfs_peer_device_cleanup(peer_device);
886 drbd_err(peer_device, "failed to create debugfs entries\n");
887}
888
889void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
890{
891 drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
892}
893
894static int drbd_version_show(struct seq_file *m, void *ignored)
895{
896 seq_printf(m, "# %s\n", drbd_buildtag());
897 seq_printf(m, "VERSION=%s\n", REL_VERSION);
898 seq_printf(m, "API_VERSION=%u\n", API_VERSION);
899 seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
900 seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
901 return 0;
902}
903
904static int drbd_version_open(struct inode *inode, struct file *file)
905{
906 return single_open(file, drbd_version_show, NULL);
907}
908
909static struct file_operations drbd_version_fops = {
910 .owner = THIS_MODULE,
911 .open = drbd_version_open,
912 .llseek = seq_lseek,
913 .read = seq_read,
914 .release = single_release,
915};
916
917/* not __exit, may be indirectly called
918 * from the module-load-failure path as well. */
919void drbd_debugfs_cleanup(void)
920{
921 drbd_debugfs_remove(&drbd_debugfs_resources);
922 drbd_debugfs_remove(&drbd_debugfs_minors);
923 drbd_debugfs_remove(&drbd_debugfs_version);
924 drbd_debugfs_remove(&drbd_debugfs_root);
925}
926
927int __init drbd_debugfs_init(void)
928{
929 struct dentry *dentry;
930
931 dentry = debugfs_create_dir("drbd", NULL);
932 if (IS_ERR_OR_NULL(dentry))
933 goto fail;
934 drbd_debugfs_root = dentry;
935
936 dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
937 if (IS_ERR_OR_NULL(dentry))
938 goto fail;
939 drbd_debugfs_version = dentry;
940
941 dentry = debugfs_create_dir("resources", drbd_debugfs_root);
942 if (IS_ERR_OR_NULL(dentry))
943 goto fail;
944 drbd_debugfs_resources = dentry;
945
946 dentry = debugfs_create_dir("minors", drbd_debugfs_root);
947 if (IS_ERR_OR_NULL(dentry))
948 goto fail;
949 drbd_debugfs_minors = dentry;
950 return 0;
951
952fail:
953 drbd_debugfs_cleanup();
954 if (dentry)
955 return PTR_ERR(dentry);
956 else
957 return -EINVAL;
958}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000000..8bee21340dce
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/debugfs.h>
4
5#include "drbd_int.h"
6
7#ifdef CONFIG_DEBUG_FS
8int __init drbd_debugfs_init(void);
9void drbd_debugfs_cleanup(void);
10
11void drbd_debugfs_resource_add(struct drbd_resource *resource);
12void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
13
14void drbd_debugfs_connection_add(struct drbd_connection *connection);
15void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
16
17void drbd_debugfs_device_add(struct drbd_device *device);
18void drbd_debugfs_device_cleanup(struct drbd_device *device);
19
20void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
21void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
22#else
23
24static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
25static inline void drbd_debugfs_cleanup(void) { }
26
27static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
28static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
29
30static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
31static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
32
33static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
34static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
35
36static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
37static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
38
39#endif
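The #else branch above turns every debugfs hook into an empty static inline (and drbd_debugfs_init() into a stub returning -ENODEV) when CONFIG_DEBUG_FS is not set, so call sites elsewhere in the driver need no #ifdefs of their own. A minimal sketch of such a call site (surrounding code and error handling elided; the device pointer is whatever the caller already holds):

	drbd_debugfs_device_add(device);	/* no-op without CONFIG_DEBUG_FS */
	...
	drbd_debugfs_device_cleanup(device);	/* likewise a no-op */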
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a76ceb344d64..1a000016ccdf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -317,7 +317,63 @@ struct drbd_request {
317 317
318 struct list_head tl_requests; /* ring list in the transfer log */ 318 struct list_head tl_requests; /* ring list in the transfer log */
319 struct bio *master_bio; /* master bio pointer */ 319 struct bio *master_bio; /* master bio pointer */
320 unsigned long start_time; 320
321 /* see struct drbd_device */
322 struct list_head req_pending_master_completion;
323 struct list_head req_pending_local;
324
325 /* for generic IO accounting */
326 unsigned long start_jif;
327
328 /* for DRBD internal statistics */
329
330 /* Minimal set of time stamps to determine if we wait for activity log
331 * transactions, local disk or peer. 32 bit "jiffies" are good enough,
332	 * we don't expect a DRBD request to be stalled for several months.
333 */
334
335 /* before actual request processing */
336 unsigned long in_actlog_jif;
337
338 /* local disk */
339 unsigned long pre_submit_jif;
340
341 /* per connection */
342 unsigned long pre_send_jif;
343 unsigned long acked_jif;
344 unsigned long net_done_jif;
345
346 /* Possibly even more detail to track each phase:
347 * master_completion_jif
348 * how long did it take to complete the master bio
349 * (application visible latency)
350 * allocated_jif
351 * how long the master bio was blocked until we finally allocated
352 * a tracking struct
353 * in_actlog_jif
354 * how long did we wait for activity log transactions
355 *
356 * net_queued_jif
357 * when did we finally queue it for sending
358 * pre_send_jif
359 * when did we start sending it
360 * post_send_jif
361 * how long did we block in the network stack trying to send it
362 * acked_jif
363 * when did we receive (or fake, in protocol A) a remote ACK
364 * net_done_jif
365 * when did we receive final acknowledgement (P_BARRIER_ACK),
366 * or decide, e.g. on connection loss, that we do no longer expect
367 * anything from this peer for this request.
368 *
369 * pre_submit_jif
370 * post_sub_jif
371	 *	when did we start submitting to the lower level device,
372 * and how long did we block in that submit function
373 * local_completion_jif
374 * how long did it take the lower level device to complete this request
375 */
376
321 377
322 /* once it hits 0, we may complete the master_bio */ 378 /* once it hits 0, we may complete the master_bio */
323 atomic_t completion_ref; 379 atomic_t completion_ref;
@@ -366,6 +422,7 @@ struct drbd_peer_request {
366 struct drbd_interval i; 422 struct drbd_interval i;
367 /* see comments on ee flag bits below */ 423 /* see comments on ee flag bits below */
368 unsigned long flags; 424 unsigned long flags;
425 unsigned long submit_jif;
369 union { 426 union {
370 u64 block_id; 427 u64 block_id;
371 struct digest_info *digest; 428 struct digest_info *digest;
@@ -408,6 +465,17 @@ enum {
408 465
409 /* Is set when net_conf had two_primaries set while creating this peer_req */ 466 /* Is set when net_conf had two_primaries set while creating this peer_req */
410 __EE_IN_INTERVAL_TREE, 467 __EE_IN_INTERVAL_TREE,
468
469 /* for debugfs: */
470 /* has this been submitted, or does it still wait for something else? */
471 __EE_SUBMITTED,
472
473 /* this is/was a write request */
474 __EE_WRITE,
475
476 /* this originates from application on peer
477 * (not some resync or verify or other DRBD internal request) */
478 __EE_APPLICATION,
411}; 479};
412#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 480#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
413#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 481#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
@@ -419,6 +487,9 @@ enum {
419#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) 487#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
420#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) 488#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
421#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) 489#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
490#define EE_SUBMITTED (1<<__EE_SUBMITTED)
491#define EE_WRITE (1<<__EE_WRITE)
492#define EE_APPLICATION (1<<__EE_APPLICATION)
422 493
423/* flag bits per device */ 494/* flag bits per device */
424enum { 495enum {
@@ -433,11 +504,11 @@ enum {
433 CONSIDER_RESYNC, 504 CONSIDER_RESYNC,
434 505
435 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 506 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
507
436 SUSPEND_IO, /* suspend application io */ 508 SUSPEND_IO, /* suspend application io */
437 BITMAP_IO, /* suspend application io; 509 BITMAP_IO, /* suspend application io;
438 once no more io in flight, start bitmap io */ 510 once no more io in flight, start bitmap io */
439 BITMAP_IO_QUEUED, /* Started bitmap IO */ 511 BITMAP_IO_QUEUED, /* Started bitmap IO */
440 GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
441 WAS_IO_ERROR, /* Local disk failed, returned IO error */ 512 WAS_IO_ERROR, /* Local disk failed, returned IO error */
442 WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ 513 WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
443 FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ 514 FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
@@ -450,6 +521,20 @@ enum {
450 B_RS_H_DONE, /* Before resync handler done (already executed) */ 521 B_RS_H_DONE, /* Before resync handler done (already executed) */
451 DISCARD_MY_DATA, /* discard_my_data flag per volume */ 522 DISCARD_MY_DATA, /* discard_my_data flag per volume */
452 READ_BALANCE_RR, 523 READ_BALANCE_RR,
524
525 FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush
526 * from drbd_flush_after_epoch() */
527
528 /* cleared only after backing device related structures have been destroyed. */
529 GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */
530
531 /* to be used in drbd_device_post_work() */
532 GO_DISKLESS, /* tell worker to schedule cleanup before detach */
533 DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */
534 MD_SYNC, /* tell worker to call drbd_md_sync() */
535 RS_START, /* tell worker to start resync/OV */
536 RS_PROGRESS, /* tell worker that resync made significant progress */
537 RS_DONE, /* tell worker that resync is done */
453}; 538};
454 539
455struct drbd_bitmap; /* opaque for drbd_device */ 540struct drbd_bitmap; /* opaque for drbd_device */
@@ -531,6 +616,11 @@ struct drbd_backing_dev {
531}; 616};
532 617
533struct drbd_md_io { 618struct drbd_md_io {
619 struct page *page;
620 unsigned long start_jif; /* last call to drbd_md_get_buffer */
621 unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */
622 const char *current_use;
623 atomic_t in_use;
534 unsigned int done; 624 unsigned int done;
535 int error; 625 int error;
536}; 626};
@@ -577,10 +667,18 @@ enum {
577 * and potentially deadlock on, this drbd worker. 667 * and potentially deadlock on, this drbd worker.
578 */ 668 */
579 DISCONNECT_SENT, 669 DISCONNECT_SENT,
670
671 DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
580}; 672};
581 673
582struct drbd_resource { 674struct drbd_resource {
583 char *name; 675 char *name;
676#ifdef CONFIG_DEBUG_FS
677 struct dentry *debugfs_res;
678 struct dentry *debugfs_res_volumes;
679 struct dentry *debugfs_res_connections;
680 struct dentry *debugfs_res_in_flight_summary;
681#endif
584 struct kref kref; 682 struct kref kref;
585 struct idr devices; /* volume number to device mapping */ 683 struct idr devices; /* volume number to device mapping */
586 struct list_head connections; 684 struct list_head connections;
@@ -594,12 +692,28 @@ struct drbd_resource {
594 unsigned susp_nod:1; /* IO suspended because no data */ 692 unsigned susp_nod:1; /* IO suspended because no data */
595 unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ 693 unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
596 694
695 enum write_ordering_e write_ordering;
696
597 cpumask_var_t cpu_mask; 697 cpumask_var_t cpu_mask;
598}; 698};
599 699
700struct drbd_thread_timing_details
701{
702 unsigned long start_jif;
703 void *cb_addr;
704 const char *caller_fn;
705 unsigned int line;
706 unsigned int cb_nr;
707};
708
600struct drbd_connection { 709struct drbd_connection {
601 struct list_head connections; 710 struct list_head connections;
602 struct drbd_resource *resource; 711 struct drbd_resource *resource;
712#ifdef CONFIG_DEBUG_FS
713 struct dentry *debugfs_conn;
714 struct dentry *debugfs_conn_callback_history;
715 struct dentry *debugfs_conn_oldest_requests;
716#endif
603 struct kref kref; 717 struct kref kref;
604 struct idr peer_devices; /* volume number to peer device mapping */ 718 struct idr peer_devices; /* volume number to peer device mapping */
605 enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ 719 enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
@@ -636,7 +750,6 @@ struct drbd_connection {
636 struct drbd_epoch *current_epoch; 750 struct drbd_epoch *current_epoch;
637 spinlock_t epoch_lock; 751 spinlock_t epoch_lock;
638 unsigned int epochs; 752 unsigned int epochs;
639 enum write_ordering_e write_ordering;
640 atomic_t current_tle_nr; /* transfer log epoch number */ 753 atomic_t current_tle_nr; /* transfer log epoch number */
641 unsigned current_tle_writes; /* writes seen within this tl epoch */ 754 unsigned current_tle_writes; /* writes seen within this tl epoch */
642 755
@@ -645,9 +758,22 @@ struct drbd_connection {
645 struct drbd_thread worker; 758 struct drbd_thread worker;
646 struct drbd_thread asender; 759 struct drbd_thread asender;
647 760
761 /* cached pointers,
762 * so we can look up the oldest pending requests more quickly.
763 * protected by resource->req_lock */
764 struct drbd_request *req_next; /* DRBD 9: todo.req_next */
765 struct drbd_request *req_ack_pending;
766 struct drbd_request *req_not_net_done;
767
648 /* sender side */ 768 /* sender side */
649 struct drbd_work_queue sender_work; 769 struct drbd_work_queue sender_work;
650 770
771#define DRBD_THREAD_DETAILS_HIST 16
772 unsigned int w_cb_nr; /* keeps counting up */
773 unsigned int r_cb_nr; /* keeps counting up */
774 struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
775 struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
776
651 struct { 777 struct {
652 /* whether this sender thread 778 /* whether this sender thread
653 * has processed a single write yet. */ 779 * has processed a single write yet. */
@@ -663,11 +789,22 @@ struct drbd_connection {
663 } send; 789 } send;
664}; 790};
665 791
792void __update_timing_details(
793 struct drbd_thread_timing_details *tdp,
794 unsigned int *cb_nr,
795 void *cb,
796 const char *fn, const unsigned int line);
797
798#define update_worker_timing_details(c, cb) \
799 __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
800#define update_receiver_timing_details(c, cb) \
801 __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
802
666struct submit_worker { 803struct submit_worker {
667 struct workqueue_struct *wq; 804 struct workqueue_struct *wq;
668 struct work_struct worker; 805 struct work_struct worker;
669 806
670 spinlock_t lock; 807 /* protected by ..->resource->req_lock */
671 struct list_head writes; 808 struct list_head writes;
672}; 809};
673 810
@@ -675,12 +812,29 @@ struct drbd_peer_device {
675 struct list_head peer_devices; 812 struct list_head peer_devices;
676 struct drbd_device *device; 813 struct drbd_device *device;
677 struct drbd_connection *connection; 814 struct drbd_connection *connection;
815#ifdef CONFIG_DEBUG_FS
816 struct dentry *debugfs_peer_dev;
817#endif
678}; 818};
679 819
680struct drbd_device { 820struct drbd_device {
681 struct drbd_resource *resource; 821 struct drbd_resource *resource;
682 struct list_head peer_devices; 822 struct list_head peer_devices;
683 int vnr; /* volume number within the connection */ 823 struct list_head pending_bitmap_io;
824
825 unsigned long flush_jif;
826#ifdef CONFIG_DEBUG_FS
827 struct dentry *debugfs_minor;
828 struct dentry *debugfs_vol;
829 struct dentry *debugfs_vol_oldest_requests;
830 struct dentry *debugfs_vol_act_log_extents;
831 struct dentry *debugfs_vol_resync_extents;
832 struct dentry *debugfs_vol_data_gen_id;
833#endif
834
835 unsigned int vnr; /* volume number within the connection */
836 unsigned int minor; /* device minor number */
837
684 struct kref kref; 838 struct kref kref;
685 839
686 /* things that are stored as / read from meta data on disk */ 840 /* things that are stored as / read from meta data on disk */
@@ -697,19 +851,10 @@ struct drbd_device {
697 unsigned long last_reattach_jif; 851 unsigned long last_reattach_jif;
698 struct drbd_work resync_work; 852 struct drbd_work resync_work;
699 struct drbd_work unplug_work; 853 struct drbd_work unplug_work;
700 struct drbd_work go_diskless;
701 struct drbd_work md_sync_work;
702 struct drbd_work start_resync_work;
703 struct timer_list resync_timer; 854 struct timer_list resync_timer;
704 struct timer_list md_sync_timer; 855 struct timer_list md_sync_timer;
705 struct timer_list start_resync_timer; 856 struct timer_list start_resync_timer;
706 struct timer_list request_timer; 857 struct timer_list request_timer;
707#ifdef DRBD_DEBUG_MD_SYNC
708 struct {
709 unsigned int line;
710 const char* func;
711 } last_md_mark_dirty;
712#endif
713 858
714 /* Used after attach while negotiating new disk state. */ 859 /* Used after attach while negotiating new disk state. */
715 union drbd_state new_state_tmp; 860 union drbd_state new_state_tmp;
@@ -724,6 +869,7 @@ struct drbd_device {
724 unsigned int al_writ_cnt; 869 unsigned int al_writ_cnt;
725 unsigned int bm_writ_cnt; 870 unsigned int bm_writ_cnt;
726 atomic_t ap_bio_cnt; /* Requests we need to complete */ 871 atomic_t ap_bio_cnt; /* Requests we need to complete */
872 atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
727 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ 873 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
728 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 874 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
729 atomic_t unacked_cnt; /* Need to send replies for */ 875 atomic_t unacked_cnt; /* Need to send replies for */
@@ -733,6 +879,13 @@ struct drbd_device {
733 struct rb_root read_requests; 879 struct rb_root read_requests;
734 struct rb_root write_requests; 880 struct rb_root write_requests;
735 881
882 /* for statistics and timeouts */
883 /* [0] read, [1] write */
884 struct list_head pending_master_completion[2];
885 struct list_head pending_completion[2];
886
887 /* use checksums for *this* resync */
888 bool use_csums;
736 /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ 889 /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
737 unsigned long rs_total; 890 unsigned long rs_total;
738 /* number of resync blocks that failed in this run */ 891 /* number of resync blocks that failed in this run */
@@ -788,9 +941,7 @@ struct drbd_device {
788 atomic_t pp_in_use; /* allocated from page pool */ 941 atomic_t pp_in_use; /* allocated from page pool */
789 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ 942 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
790 wait_queue_head_t ee_wait; 943 wait_queue_head_t ee_wait;
791 struct page *md_io_page; /* one page buffer for md_io */
792 struct drbd_md_io md_io; 944 struct drbd_md_io md_io;
793 atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
794 spinlock_t al_lock; 945 spinlock_t al_lock;
795 wait_queue_head_t al_wait; 946 wait_queue_head_t al_wait;
796 struct lru_cache *act_log; /* activity log */ 947 struct lru_cache *act_log; /* activity log */
@@ -800,7 +951,6 @@ struct drbd_device {
800 atomic_t packet_seq; 951 atomic_t packet_seq;
801 unsigned int peer_seq; 952 unsigned int peer_seq;
802 spinlock_t peer_seq_lock; 953 spinlock_t peer_seq_lock;
803 unsigned int minor;
804 unsigned long comm_bm_set; /* communicated number of set bits. */ 954 unsigned long comm_bm_set; /* communicated number of set bits. */
805 struct bm_io_work bm_io_work; 955 struct bm_io_work bm_io_work;
806 u64 ed_uuid; /* UUID of the exposed data */ 956 u64 ed_uuid; /* UUID of the exposed data */
@@ -824,6 +974,21 @@ struct drbd_device {
824 struct submit_worker submit; 974 struct submit_worker submit;
825}; 975};
826 976
977struct drbd_bm_aio_ctx {
978 struct drbd_device *device;
979 struct list_head list; /* on device->pending_bitmap_io */;
980 unsigned long start_jif;
981 atomic_t in_flight;
982 unsigned int done;
983 unsigned flags;
984#define BM_AIO_COPY_PAGES 1
985#define BM_AIO_WRITE_HINTED 2
986#define BM_AIO_WRITE_ALL_PAGES 4
987#define BM_AIO_READ 8
988 int error;
989 struct kref kref;
990};
991
827struct drbd_config_context { 992struct drbd_config_context {
828 /* assigned from drbd_genlmsghdr */ 993 /* assigned from drbd_genlmsghdr */
829 unsigned int minor; 994 unsigned int minor;
@@ -949,7 +1114,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
949extern int drbd_send_bitmap(struct drbd_device *device); 1114extern int drbd_send_bitmap(struct drbd_device *device);
950extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); 1115extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
951extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); 1116extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
952extern void drbd_free_bc(struct drbd_backing_dev *ldev); 1117extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
953extern void drbd_device_cleanup(struct drbd_device *device); 1118extern void drbd_device_cleanup(struct drbd_device *device);
954void drbd_print_uuids(struct drbd_device *device, const char *text); 1119void drbd_print_uuids(struct drbd_device *device, const char *text);
955 1120
@@ -966,13 +1131,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must
966extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local); 1131extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
967extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local); 1132extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
968extern int drbd_md_test_flag(struct drbd_backing_dev *, int); 1133extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
969#ifndef DRBD_DEBUG_MD_SYNC
970extern void drbd_md_mark_dirty(struct drbd_device *device); 1134extern void drbd_md_mark_dirty(struct drbd_device *device);
971#else
972#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
973extern void drbd_md_mark_dirty_(struct drbd_device *device,
974 unsigned int line, const char *func);
975#endif
976extern void drbd_queue_bitmap_io(struct drbd_device *device, 1135extern void drbd_queue_bitmap_io(struct drbd_device *device,
977 int (*io_fn)(struct drbd_device *), 1136 int (*io_fn)(struct drbd_device *),
978 void (*done)(struct drbd_device *, int), 1137 void (*done)(struct drbd_device *, int),
@@ -983,9 +1142,8 @@ extern int drbd_bitmap_io(struct drbd_device *device,
983extern int drbd_bitmap_io_from_worker(struct drbd_device *device, 1142extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
984 int (*io_fn)(struct drbd_device *), 1143 int (*io_fn)(struct drbd_device *),
985 char *why, enum bm_flag flags); 1144 char *why, enum bm_flag flags);
986extern int drbd_bmio_set_n_write(struct drbd_device *device); 1145extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
987extern int drbd_bmio_clear_n_write(struct drbd_device *device); 1146extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
988extern void drbd_ldev_destroy(struct drbd_device *device);
989 1147
990/* Meta data layout 1148/* Meta data layout
991 * 1149 *
@@ -1105,17 +1263,21 @@ struct bm_extent {
1105/* in which _bitmap_ extent (resp. sector) the bit for a certain 1263/* in which _bitmap_ extent (resp. sector) the bit for a certain
1106 * _storage_ sector is located in */ 1264 * _storage_ sector is located in */
1107#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) 1265#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
1266#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1108 1267
1109/* how much _storage_ sectors we have per bitmap sector */ 1268/* first storage sector a bitmap extent corresponds to */
1110#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) 1269#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
1270/* how much _storage_ sectors we have per bitmap extent */
1111#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) 1271#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
1272/* how many bits are covered by one bitmap extent (resync extent) */
1273#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1274
1275#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1)
1276
1112 1277
1113/* in one sector of the bitmap, we have this many activity_log extents. */ 1278/* in one sector of the bitmap, we have this many activity_log extents. */
1114#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1279#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1115 1280
1116#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1117#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
1118
1119/* the extent in "PER_EXTENT" below is an activity log extent 1281/* the extent in "PER_EXTENT" below is an activity log extent
1120 * we need that many (long words/bytes) to store the bitmap 1282 * we need that many (long words/bytes) to store the bitmap
1121 * of one AL_EXTENT_SIZE chunk of storage. 1283 * of one AL_EXTENT_SIZE chunk of storage.
@@ -1195,11 +1357,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device,
1195 const unsigned long s, const unsigned long e); 1357 const unsigned long s, const unsigned long e);
1196extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); 1358extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
1197extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); 1359extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
1198extern int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local);
1199extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); 1360extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
1200extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); 1361extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
1201extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); 1362extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
1202extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); 1363extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
1364extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
1203extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); 1365extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
1204extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); 1366extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
1205extern size_t drbd_bm_words(struct drbd_device *device); 1367extern size_t drbd_bm_words(struct drbd_device *device);
@@ -1213,7 +1375,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
1213extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); 1375extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
1214extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); 1376extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
1215extern unsigned long drbd_bm_total_weight(struct drbd_device *device); 1377extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
1216extern int drbd_bm_rs_done(struct drbd_device *device);
1217/* for receive_bitmap */ 1378/* for receive_bitmap */
1218extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, 1379extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
1219 size_t number, unsigned long *buffer); 1380 size_t number, unsigned long *buffer);
@@ -1312,7 +1473,7 @@ enum determine_dev_size {
1312extern enum determine_dev_size 1473extern enum determine_dev_size
1313drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); 1474drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
1314extern void resync_after_online_grow(struct drbd_device *); 1475extern void resync_after_online_grow(struct drbd_device *);
1315extern void drbd_reconsider_max_bio_size(struct drbd_device *device); 1476extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
1316extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, 1477extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
1317 enum drbd_role new_role, 1478 enum drbd_role new_role,
1318 int force); 1479 int force);
@@ -1333,7 +1494,7 @@ extern void resume_next_sg(struct drbd_device *device);
1333extern void suspend_other_sg(struct drbd_device *device); 1494extern void suspend_other_sg(struct drbd_device *device);
1334extern int drbd_resync_finished(struct drbd_device *device); 1495extern int drbd_resync_finished(struct drbd_device *device);
1335/* maybe rather drbd_main.c ? */ 1496/* maybe rather drbd_main.c ? */
1336extern void *drbd_md_get_buffer(struct drbd_device *device); 1497extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
1337extern void drbd_md_put_buffer(struct drbd_device *device); 1498extern void drbd_md_put_buffer(struct drbd_device *device);
1338extern int drbd_md_sync_page_io(struct drbd_device *device, 1499extern int drbd_md_sync_page_io(struct drbd_device *device,
1339 struct drbd_backing_dev *bdev, sector_t sector, int rw); 1500 struct drbd_backing_dev *bdev, sector_t sector, int rw);
@@ -1380,7 +1541,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
1380extern int drbd_receiver(struct drbd_thread *thi); 1541extern int drbd_receiver(struct drbd_thread *thi);
1381extern int drbd_asender(struct drbd_thread *thi); 1542extern int drbd_asender(struct drbd_thread *thi);
1382extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1543extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
1383extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); 1544extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
1545 bool throttle_if_app_is_waiting);
1384extern int drbd_submit_peer_request(struct drbd_device *, 1546extern int drbd_submit_peer_request(struct drbd_device *,
1385 struct drbd_peer_request *, const unsigned, 1547 struct drbd_peer_request *, const unsigned,
1386 const int); 1548 const int);
@@ -1464,10 +1626,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
1464{ 1626{
1465 __release(local); 1627 __release(local);
1466 if (!bio->bi_bdev) { 1628 if (!bio->bi_bdev) {
1467 printk(KERN_ERR "drbd%d: drbd_generic_make_request: " 1629 drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
1468 "bio->bi_bdev == NULL\n",
1469 device_to_minor(device));
1470 dump_stack();
1471 bio_endio(bio, -ENODEV); 1630 bio_endio(bio, -ENODEV);
1472 return; 1631 return;
1473 } 1632 }
@@ -1478,7 +1637,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
1478 generic_make_request(bio); 1637 generic_make_request(bio);
1479} 1638}
1480 1639
1481void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); 1640void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1641 enum write_ordering_e wo);
1482 1642
1483/* drbd_proc.c */ 1643/* drbd_proc.c */
1484extern struct proc_dir_entry *drbd_proc; 1644extern struct proc_dir_entry *drbd_proc;
@@ -1489,9 +1649,9 @@ extern const char *drbd_role_str(enum drbd_role s);
1489/* drbd_actlog.c */ 1649/* drbd_actlog.c */
1490extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); 1650extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
1491extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); 1651extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
1492extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); 1652extern void drbd_al_begin_io_commit(struct drbd_device *device);
1493extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); 1653extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
1494extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate); 1654extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
1495extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); 1655extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
1496extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); 1656extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
1497extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); 1657extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
@@ -1501,14 +1661,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
1501extern void drbd_rs_failed_io(struct drbd_device *device, 1661extern void drbd_rs_failed_io(struct drbd_device *device,
1502 sector_t sector, int size); 1662 sector_t sector, int size);
1503extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); 1663extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
1504extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, 1664
1505 int size, const char *file, const unsigned int line); 1665enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
1666extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
1667 enum update_sync_bits_mode mode,
1668 const char *file, const unsigned int line);
1506#define drbd_set_in_sync(device, sector, size) \ 1669#define drbd_set_in_sync(device, sector, size) \
1507 __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__) 1670 __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
1508extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
1509 int size, const char *file, const unsigned int line);
1510#define drbd_set_out_of_sync(device, sector, size) \ 1671#define drbd_set_out_of_sync(device, sector, size) \
1511 __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__) 1672 __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
1673#define drbd_rs_failed_io(device, sector, size) \
1674 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
1512extern void drbd_al_shrink(struct drbd_device *device); 1675extern void drbd_al_shrink(struct drbd_device *device);
1513extern int drbd_initialize_al(struct drbd_device *, void *); 1676extern int drbd_initialize_al(struct drbd_device *, void *);
1514 1677
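
The header hunk above folds the former in-sync / out-of-sync / rs-failed helpers into a single __drbd_change_sync() selected by an enum, while thin macros keep recording the caller via __FILE__/__LINE__. A minimal userspace sketch of that pattern follows; every name in it is illustrative, not part of the kernel tree.

#include <stdio.h>

enum update_mode { RECORD_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };

/* one worker instead of three near-identical functions; the mode selects
 * the behaviour, the file/line arguments record the call site */
static int change_sync(unsigned long sector, int size, enum update_mode mode,
                       const char *file, unsigned int line)
{
        const char *what[] = { "rs-failed", "out-of-sync", "in-sync" };
        printf("%s:%u marks %d bytes at sector %lu as %s\n",
               file, line, size, sector, what[mode]);
        return 0;
}

/* the macros preserve the old, convenient call sites */
#define set_in_sync(s, sz)     change_sync(s, sz, SET_IN_SYNC,     __FILE__, __LINE__)
#define set_out_of_sync(s, sz) change_sync(s, sz, SET_OUT_OF_SYNC, __FILE__, __LINE__)
#define rs_failed_io(s, sz)    change_sync(s, sz, RECORD_FAILED,   __FILE__, __LINE__)

int main(void)
{
        set_out_of_sync(2048, 4096);
        set_in_sync(2048, 4096);
        return 0;
}
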
@@ -1764,25 +1927,38 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
1764} 1927}
1765 1928
1766static inline void 1929static inline void
1767drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) 1930drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
1768{ 1931{
1769 unsigned long flags; 1932 unsigned long flags;
1770 spin_lock_irqsave(&q->q_lock, flags); 1933 spin_lock_irqsave(&q->q_lock, flags);
1771 list_add(&w->list, &q->q); 1934 list_add_tail(&w->list, &q->q);
1772 spin_unlock_irqrestore(&q->q_lock, flags); 1935 spin_unlock_irqrestore(&q->q_lock, flags);
1773 wake_up(&q->q_wait); 1936 wake_up(&q->q_wait);
1774} 1937}
1775 1938
1776static inline void 1939static inline void
1777drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) 1940drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
1778{ 1941{
1779 unsigned long flags; 1942 unsigned long flags;
1780 spin_lock_irqsave(&q->q_lock, flags); 1943 spin_lock_irqsave(&q->q_lock, flags);
1781 list_add_tail(&w->list, &q->q); 1944 if (list_empty_careful(&w->list))
1945 list_add_tail(&w->list, &q->q);
1782 spin_unlock_irqrestore(&q->q_lock, flags); 1946 spin_unlock_irqrestore(&q->q_lock, flags);
1783 wake_up(&q->q_wait); 1947 wake_up(&q->q_wait);
1784} 1948}
1785 1949
1950static inline void
1951drbd_device_post_work(struct drbd_device *device, int work_bit)
1952{
1953 if (!test_and_set_bit(work_bit, &device->flags)) {
1954 struct drbd_connection *connection =
1955 first_peer_device(device)->connection;
1956 struct drbd_work_queue *q = &connection->sender_work;
1957 if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
1958 wake_up(&q->q_wait);
1959 }
1960}
1961
1786extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); 1962extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
1787 1963
1788static inline void wake_asender(struct drbd_connection *connection) 1964static inline void wake_asender(struct drbd_connection *connection)
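
In the hunk above, drbd_device_post_work() sets a per-device work bit and wakes the sender only if the connection's pending flag was not already set, and drbd_queue_work_if_unqueued() refuses to queue a work item twice. A userspace analogue of the "post at most once, wake at most once" idea, using C11 atomics in place of test_and_set_bit()/wake_up() (all names hypothetical):

#include <stdatomic.h>
#include <stdio.h>

#define MD_SYNC_BIT     0
#define GO_DISKLESS_BIT 1

static atomic_ulong device_flags;
static atomic_int   wakeups;

/* post some per-device work exactly once: the bit doubles as an
 * "already queued" marker, so timers and IO completion paths may call
 * this repeatedly without flooding the worker */
static void device_post_work(int work_bit)
{
        unsigned long mask = 1UL << work_bit;

        if (!(atomic_fetch_or(&device_flags, mask) & mask))
                atomic_fetch_add(&wakeups, 1);   /* wake the worker once */
}

int main(void)
{
        device_post_work(MD_SYNC_BIT);
        device_post_work(MD_SYNC_BIT);      /* duplicate, ignored */
        device_post_work(GO_DISKLESS_BIT);
        printf("flags=%#lx wakeups=%d\n",
               (unsigned long)atomic_load(&device_flags),
               atomic_load(&wakeups));          /* flags=0x3 wakeups=2 */
        return 0;
}
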
@@ -1859,7 +2035,7 @@ static inline void inc_ap_pending(struct drbd_device *device)
1859 func, line, \ 2035 func, line, \
1860 atomic_read(&device->which)) 2036 atomic_read(&device->which))
1861 2037
1862#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__) 2038#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
1863static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) 2039static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
1864{ 2040{
1865 if (atomic_dec_and_test(&device->ap_pending_cnt)) 2041 if (atomic_dec_and_test(&device->ap_pending_cnt))
@@ -1878,7 +2054,7 @@ static inline void inc_rs_pending(struct drbd_device *device)
1878 atomic_inc(&device->rs_pending_cnt); 2054 atomic_inc(&device->rs_pending_cnt);
1879} 2055}
1880 2056
1881#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__) 2057#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
1882static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) 2058static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
1883{ 2059{
1884 atomic_dec(&device->rs_pending_cnt); 2060 atomic_dec(&device->rs_pending_cnt);
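
The surrounding hunks only swap __FUNCTION__ for __func__: the latter is the standard C99 predefined identifier, the former a legacy GNU spelling. A tiny, self-contained example of the call-site-reporting macro style these helpers use:

#include <stdio.h>

/* __func__ is standard C99; __FUNCTION__ is a pre-C99 GNU extension */
#define trace() printf("%s:%d reached\n", __func__, __LINE__)

static void demo(void) { trace(); }

int main(void) { demo(); return 0; }
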
@@ -1899,20 +2075,29 @@ static inline void inc_unacked(struct drbd_device *device)
1899 atomic_inc(&device->unacked_cnt); 2075 atomic_inc(&device->unacked_cnt);
1900} 2076}
1901 2077
1902#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__) 2078#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
1903static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) 2079static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
1904{ 2080{
1905 atomic_dec(&device->unacked_cnt); 2081 atomic_dec(&device->unacked_cnt);
1906 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 2082 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
1907} 2083}
1908 2084
1909#define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__) 2085#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
1910static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) 2086static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
1911{ 2087{
1912 atomic_sub(n, &device->unacked_cnt); 2088 atomic_sub(n, &device->unacked_cnt);
1913 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 2089 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
1914} 2090}
1915 2091
2092static inline bool is_sync_state(enum drbd_conns connection_state)
2093{
2094 return
2095 (connection_state == C_SYNC_SOURCE
2096 || connection_state == C_SYNC_TARGET
2097 || connection_state == C_PAUSED_SYNC_S
2098 || connection_state == C_PAUSED_SYNC_T);
2099}
2100
1916/** 2101/**
1917 * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev 2102 * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
1918 * @M: DRBD device. 2103 * @M: DRBD device.
@@ -1924,6 +2109,11 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
1924 2109
1925static inline void put_ldev(struct drbd_device *device) 2110static inline void put_ldev(struct drbd_device *device)
1926{ 2111{
2112 enum drbd_disk_state ds = device->state.disk;
2113	/* We must check the state *before* the atomic_dec becomes visible,
2114	 * or we have a theoretical race where whoever hits zero while the
2115	 * state is still D_FAILED will then see D_DISKLESS in the
2116	 * condition below and call into destroy, where it must not, yet. */
1927 int i = atomic_dec_return(&device->local_cnt); 2117 int i = atomic_dec_return(&device->local_cnt);
1928 2118
1929 /* This may be called from some endio handler, 2119 /* This may be called from some endio handler,
@@ -1932,15 +2122,13 @@ static inline void put_ldev(struct drbd_device *device)
1932 __release(local); 2122 __release(local);
1933 D_ASSERT(device, i >= 0); 2123 D_ASSERT(device, i >= 0);
1934 if (i == 0) { 2124 if (i == 0) {
1935 if (device->state.disk == D_DISKLESS) 2125 if (ds == D_DISKLESS)
1936 /* even internal references gone, safe to destroy */ 2126 /* even internal references gone, safe to destroy */
1937 drbd_ldev_destroy(device); 2127 drbd_device_post_work(device, DESTROY_DISK);
1938 if (device->state.disk == D_FAILED) { 2128 if (ds == D_FAILED)
1939 /* all application IO references gone. */ 2129 /* all application IO references gone. */
1940 if (!test_and_set_bit(GO_DISKLESS, &device->flags)) 2130 if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
1941 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 2131 drbd_device_post_work(device, GO_DISKLESS);
1942 &device->go_diskless);
1943 }
1944 wake_up(&device->misc_wait); 2132 wake_up(&device->misc_wait);
1945 } 2133 }
1946} 2134}
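
The key change in put_ldev() above is that the disk state is sampled *before* the reference count is dropped, so the thread that reaches zero acts on the state that was valid while it still held its reference; the actual teardown is then posted as work rather than done inline. A userspace sketch of that ordering with C11 atomics (names are illustrative only):

#include <stdatomic.h>
#include <stdio.h>

enum disk_state { D_UP_TO_DATE, D_FAILED, D_DISKLESS };

static atomic_int local_cnt = 1;
static _Atomic enum disk_state disk = D_FAILED;

/* drop a reference; the state is read before the decrement, so whoever
 * takes the count to zero decides based on the state it observed while
 * still holding its reference */
static void put_local_ref(void)
{
        enum disk_state ds = atomic_load(&disk);

        if (atomic_fetch_sub(&local_cnt, 1) - 1 == 0) {
                if (ds == D_DISKLESS)
                        printf("last ref gone: destroy backing data\n");
                else if (ds == D_FAILED)
                        printf("last ref gone: schedule go-diskless\n");
        }
}

int main(void)
{
        put_local_ref();
        return 0;
}
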
@@ -1964,54 +2152,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_
1964extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); 2152extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
1965#endif 2153#endif
1966 2154
1967/* you must have a "get_ldev" reference */
1968static inline void drbd_get_syncer_progress(struct drbd_device *device,
1969 unsigned long *bits_left, unsigned int *per_mil_done)
1970{
1971 /* this is to break it at compile time when we change that, in case we
1972 * want to support more than (1<<32) bits on a 32bit arch. */
1973 typecheck(unsigned long, device->rs_total);
1974
1975 /* note: both rs_total and rs_left are in bits, i.e. in
1976 * units of BM_BLOCK_SIZE.
1977 * for the percentage, we don't care. */
1978
1979 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
1980 *bits_left = device->ov_left;
1981 else
1982 *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
1983 /* >> 10 to prevent overflow,
1984 * +1 to prevent division by zero */
1985 if (*bits_left > device->rs_total) {
1986 /* doh. maybe a logic bug somewhere.
1987 * may also be just a race condition
1988 * between this and a disconnect during sync.
1989 * for now, just prevent in-kernel buffer overflow.
1990 */
1991 smp_rmb();
1992 drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
1993 drbd_conn_str(device->state.conn),
1994 *bits_left, device->rs_total, device->rs_failed);
1995 *per_mil_done = 0;
1996 } else {
1997 /* Make sure the division happens in long context.
1998 * We allow up to one petabyte storage right now,
1999 * at a granularity of 4k per bit that is 2**38 bits.
2000 * After shift right and multiplication by 1000,
2001 * this should still fit easily into a 32bit long,
2002 * so we don't need a 64bit division on 32bit arch.
2003 * Note: currently we don't support such large bitmaps on 32bit
2004 * arch anyways, but no harm done to be prepared for it here.
2005 */
2006 unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10;
2007 unsigned long left = *bits_left >> shift;
2008 unsigned long total = 1UL + (device->rs_total >> shift);
2009 unsigned long tmp = 1000UL - left * 1000UL/total;
2010 *per_mil_done = tmp;
2011 }
2012}
2013
2014
2015/* this throttles on-the-fly application requests 2155/* this throttles on-the-fly application requests
2016 * according to max_buffers settings; 2156 * according to max_buffers settings;
2017 * maybe re-implement using semaphores? */ 2157 * maybe re-implement using semaphores? */
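
The hunk above drops the inline drbd_get_syncer_progress() helper from the header. For reference, the per-mille arithmetic it used (shift by 10, or by 16 once the bit count no longer fits in 32 bits, plus one to avoid dividing by zero) as a standalone program:

#include <stdio.h>
#include <limits.h>

/* per-mille of resync done, computed so the division stays well inside
 * unsigned long range even for very large bitmaps on 32bit builds */
static unsigned int per_mil_done(unsigned long bits_left, unsigned long rs_total)
{
        unsigned int shift = rs_total > UINT_MAX ? 16 : 10;
        unsigned long left  = bits_left >> shift;
        unsigned long total = 1UL + (rs_total >> shift);   /* +1: avoid /0 */

        return 1000UL - left * 1000UL / total;
}

int main(void)
{
        /* e.g. 2^30 bits total, a quarter of them still to go */
        printf("%u\n", per_mil_done(1UL << 28, 1UL << 30));   /* ~750 */
        return 0;
}
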
@@ -2201,25 +2341,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device)
2201 return QUEUE_ORDERED_NONE; 2341 return QUEUE_ORDERED_NONE;
2202} 2342}
2203 2343
2204static inline void drbd_md_flush(struct drbd_device *device)
2205{
2206 int r;
2207
2208 if (device->ldev == NULL) {
2209 drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n");
2210 return;
2211 }
2212
2213 if (test_bit(MD_NO_FUA, &device->flags))
2214 return;
2215
2216 r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL);
2217 if (r) {
2218 set_bit(MD_NO_FUA, &device->flags);
2219 drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r);
2220 }
2221}
2222
2223static inline struct drbd_connection *first_connection(struct drbd_resource *resource) 2344static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
2224{ 2345{
2225 return list_first_entry_or_null(&resource->connections, 2346 return list_first_entry_or_null(&resource->connections,
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f38fcb00c10d..f210543f05f4 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -10,7 +10,9 @@ struct drbd_interval {
10 unsigned int size; /* size in bytes */ 10 unsigned int size; /* size in bytes */
11 sector_t end; /* highest interval end in subtree */ 11 sector_t end; /* highest interval end in subtree */
12 int local:1 /* local or remote request? */; 12 int local:1 /* local or remote request? */;
13 int waiting:1; 13 int waiting:1; /* someone is waiting for this to complete */
14 int completed:1; /* this has been completed already;
15 * ignore for conflict detection */
14}; 16};
15 17
16static inline void drbd_clear_interval(struct drbd_interval *i) 18static inline void drbd_clear_interval(struct drbd_interval *i)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 960645c26e6f..9b465bb68487 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -26,7 +26,10 @@
26 26
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/jiffies.h>
30#include <linux/drbd.h> 33#include <linux/drbd.h>
31#include <asm/uaccess.h> 34#include <asm/uaccess.h>
32#include <asm/types.h> 35#include <asm/types.h>
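
The pr_fmt() definition added at the top of drbd_main.c makes every later pr_err()/pr_info() in the file carry the module prefix without repeating it in each format string. A userspace analogue of that compile-time prefixing (the helper names below are made up for the example):

#include <stdio.h>

#define MODNAME "drbd"
#define pr_fmt(fmt) MODNAME ": " fmt
/* GNU-style ## swallows the comma when no arguments are given */
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_info("initialized.\n");
        pr_info("module cleanup done.\n");
        return 0;
}
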
@@ -54,16 +57,14 @@
54#include "drbd_int.h" 57#include "drbd_int.h"
55#include "drbd_protocol.h" 58#include "drbd_protocol.h"
56#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ 59#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
57
58#include "drbd_vli.h" 60#include "drbd_vli.h"
61#include "drbd_debugfs.h"
59 62
60static DEFINE_MUTEX(drbd_main_mutex); 63static DEFINE_MUTEX(drbd_main_mutex);
61static int drbd_open(struct block_device *bdev, fmode_t mode); 64static int drbd_open(struct block_device *bdev, fmode_t mode);
62static void drbd_release(struct gendisk *gd, fmode_t mode); 65static void drbd_release(struct gendisk *gd, fmode_t mode);
63static int w_md_sync(struct drbd_work *w, int unused);
64static void md_sync_timer_fn(unsigned long data); 66static void md_sync_timer_fn(unsigned long data);
65static int w_bitmap_io(struct drbd_work *w, int unused); 67static int w_bitmap_io(struct drbd_work *w, int unused);
66static int w_go_diskless(struct drbd_work *w, int unused);
67 68
68MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 69MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
69 "Lars Ellenberg <lars@linbit.com>"); 70 "Lars Ellenberg <lars@linbit.com>");
@@ -264,7 +265,7 @@ bail:
264 265
265/** 266/**
266 * _tl_restart() - Walks the transfer log, and applies an action to all requests 267 * _tl_restart() - Walks the transfer log, and applies an action to all requests
267 * @device: DRBD device. 268 * @connection: DRBD connection to operate on.
268 * @what: The action/event to perform with all request objects 269 * @what: The action/event to perform with all request objects
269 * 270 *
270 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, 271 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
@@ -662,6 +663,11 @@ static int __send_command(struct drbd_connection *connection, int vnr,
662 msg_flags); 663 msg_flags);
663 if (data && !err) 664 if (data && !err)
664 err = drbd_send_all(connection, sock->socket, data, size, 0); 665 err = drbd_send_all(connection, sock->socket, data, size, 0);
666 /* DRBD protocol "pings" are latency critical.
667 * This is supposed to trigger tcp_push_pending_frames() */
668 if (!err && (cmd == P_PING || cmd == P_PING_ACK))
669 drbd_tcp_nodelay(sock->socket);
670
665 return err; 671 return err;
666} 672}
667 673
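
The comment in the hunk above explains why ping packets get special treatment: they are latency critical and should not linger in the socket buffer. The kernel side uses its own drbd_tcp_nodelay() helper; the userspace equivalent of the same idea is simply disabling Nagle on the socket, roughly like this:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <stdio.h>

/* small, latency-critical messages should not wait to be coalesced;
 * TCP_NODELAY makes the stack push pending frames immediately */
static int make_socket_low_latency(int fd)
{
        int one = 1;
        return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0 || make_socket_low_latency(fd) < 0)
                perror("tcp_nodelay");
        else
                printf("TCP_NODELAY set\n");
        return 0;
}
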
@@ -1636,7 +1642,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
1636 if (peer_device->connection->agreed_pro_version >= 100) { 1642 if (peer_device->connection->agreed_pro_version >= 100) {
1637 if (req->rq_state & RQ_EXP_RECEIVE_ACK) 1643 if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1638 dp_flags |= DP_SEND_RECEIVE_ACK; 1644 dp_flags |= DP_SEND_RECEIVE_ACK;
1639 if (req->rq_state & RQ_EXP_WRITE_ACK) 1645 /* During resync, request an explicit write ack,
1646 * even in protocol != C */
1647 if (req->rq_state & RQ_EXP_WRITE_ACK
1648 || (dp_flags & DP_MAY_SET_IN_SYNC))
1640 dp_flags |= DP_SEND_WRITE_ACK; 1649 dp_flags |= DP_SEND_WRITE_ACK;
1641 } 1650 }
1642 p->dp_flags = cpu_to_be32(dp_flags); 1651 p->dp_flags = cpu_to_be32(dp_flags);
@@ -1900,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
1900 drbd_set_defaults(device); 1909 drbd_set_defaults(device);
1901 1910
1902 atomic_set(&device->ap_bio_cnt, 0); 1911 atomic_set(&device->ap_bio_cnt, 0);
1912 atomic_set(&device->ap_actlog_cnt, 0);
1903 atomic_set(&device->ap_pending_cnt, 0); 1913 atomic_set(&device->ap_pending_cnt, 0);
1904 atomic_set(&device->rs_pending_cnt, 0); 1914 atomic_set(&device->rs_pending_cnt, 0);
1905 atomic_set(&device->unacked_cnt, 0); 1915 atomic_set(&device->unacked_cnt, 0);
@@ -1908,7 +1918,7 @@ void drbd_init_set_defaults(struct drbd_device *device)
1908 atomic_set(&device->rs_sect_in, 0); 1918 atomic_set(&device->rs_sect_in, 0);
1909 atomic_set(&device->rs_sect_ev, 0); 1919 atomic_set(&device->rs_sect_ev, 0);
1910 atomic_set(&device->ap_in_flight, 0); 1920 atomic_set(&device->ap_in_flight, 0);
1911 atomic_set(&device->md_io_in_use, 0); 1921 atomic_set(&device->md_io.in_use, 0);
1912 1922
1913 mutex_init(&device->own_state_mutex); 1923 mutex_init(&device->own_state_mutex);
1914 device->state_mutex = &device->own_state_mutex; 1924 device->state_mutex = &device->own_state_mutex;
@@ -1924,17 +1934,15 @@ void drbd_init_set_defaults(struct drbd_device *device)
1924 INIT_LIST_HEAD(&device->resync_reads); 1934 INIT_LIST_HEAD(&device->resync_reads);
1925 INIT_LIST_HEAD(&device->resync_work.list); 1935 INIT_LIST_HEAD(&device->resync_work.list);
1926 INIT_LIST_HEAD(&device->unplug_work.list); 1936 INIT_LIST_HEAD(&device->unplug_work.list);
1927 INIT_LIST_HEAD(&device->go_diskless.list);
1928 INIT_LIST_HEAD(&device->md_sync_work.list);
1929 INIT_LIST_HEAD(&device->start_resync_work.list);
1930 INIT_LIST_HEAD(&device->bm_io_work.w.list); 1937 INIT_LIST_HEAD(&device->bm_io_work.w.list);
1938 INIT_LIST_HEAD(&device->pending_master_completion[0]);
1939 INIT_LIST_HEAD(&device->pending_master_completion[1]);
1940 INIT_LIST_HEAD(&device->pending_completion[0]);
1941 INIT_LIST_HEAD(&device->pending_completion[1]);
1931 1942
1932 device->resync_work.cb = w_resync_timer; 1943 device->resync_work.cb = w_resync_timer;
1933 device->unplug_work.cb = w_send_write_hint; 1944 device->unplug_work.cb = w_send_write_hint;
1934 device->go_diskless.cb = w_go_diskless;
1935 device->md_sync_work.cb = w_md_sync;
1936 device->bm_io_work.w.cb = w_bitmap_io; 1945 device->bm_io_work.w.cb = w_bitmap_io;
1937 device->start_resync_work.cb = w_start_resync;
1938 1946
1939 init_timer(&device->resync_timer); 1947 init_timer(&device->resync_timer);
1940 init_timer(&device->md_sync_timer); 1948 init_timer(&device->md_sync_timer);
@@ -1992,7 +2000,7 @@ void drbd_device_cleanup(struct drbd_device *device)
1992 drbd_bm_cleanup(device); 2000 drbd_bm_cleanup(device);
1993 } 2001 }
1994 2002
1995 drbd_free_bc(device->ldev); 2003 drbd_free_ldev(device->ldev);
1996 device->ldev = NULL; 2004 device->ldev = NULL;
1997 2005
1998 clear_bit(AL_SUSPENDED, &device->flags); 2006 clear_bit(AL_SUSPENDED, &device->flags);
@@ -2006,7 +2014,6 @@ void drbd_device_cleanup(struct drbd_device *device)
2006 D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); 2014 D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
2007 D_ASSERT(device, list_empty(&device->resync_work.list)); 2015 D_ASSERT(device, list_empty(&device->resync_work.list));
2008 D_ASSERT(device, list_empty(&device->unplug_work.list)); 2016 D_ASSERT(device, list_empty(&device->unplug_work.list));
2009 D_ASSERT(device, list_empty(&device->go_diskless.list));
2010 2017
2011 drbd_set_defaults(device); 2018 drbd_set_defaults(device);
2012} 2019}
@@ -2129,20 +2136,6 @@ Enomem:
2129 return -ENOMEM; 2136 return -ENOMEM;
2130} 2137}
2131 2138
2132static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2133 void *unused)
2134{
2135 /* just so we have it. you never know what interesting things we
2136 * might want to do here some day...
2137 */
2138
2139 return NOTIFY_DONE;
2140}
2141
2142static struct notifier_block drbd_notifier = {
2143 .notifier_call = drbd_notify_sys,
2144};
2145
2146static void drbd_release_all_peer_reqs(struct drbd_device *device) 2139static void drbd_release_all_peer_reqs(struct drbd_device *device)
2147{ 2140{
2148 int rr; 2141 int rr;
@@ -2173,7 +2166,7 @@ void drbd_destroy_device(struct kref *kref)
2173{ 2166{
2174 struct drbd_device *device = container_of(kref, struct drbd_device, kref); 2167 struct drbd_device *device = container_of(kref, struct drbd_device, kref);
2175 struct drbd_resource *resource = device->resource; 2168 struct drbd_resource *resource = device->resource;
2176 struct drbd_connection *connection; 2169 struct drbd_peer_device *peer_device, *tmp_peer_device;
2177 2170
2178 del_timer_sync(&device->request_timer); 2171 del_timer_sync(&device->request_timer);
2179 2172
@@ -2187,7 +2180,7 @@ void drbd_destroy_device(struct kref *kref)
2187 if (device->this_bdev) 2180 if (device->this_bdev)
2188 bdput(device->this_bdev); 2181 bdput(device->this_bdev);
2189 2182
2190 drbd_free_bc(device->ldev); 2183 drbd_free_ldev(device->ldev);
2191 device->ldev = NULL; 2184 device->ldev = NULL;
2192 2185
2193 drbd_release_all_peer_reqs(device); 2186 drbd_release_all_peer_reqs(device);
@@ -2200,15 +2193,20 @@ void drbd_destroy_device(struct kref *kref)
2200 2193
2201 if (device->bitmap) /* should no longer be there. */ 2194 if (device->bitmap) /* should no longer be there. */
2202 drbd_bm_cleanup(device); 2195 drbd_bm_cleanup(device);
2203 __free_page(device->md_io_page); 2196 __free_page(device->md_io.page);
2204 put_disk(device->vdisk); 2197 put_disk(device->vdisk);
2205 blk_cleanup_queue(device->rq_queue); 2198 blk_cleanup_queue(device->rq_queue);
2206 kfree(device->rs_plan_s); 2199 kfree(device->rs_plan_s);
2207 kfree(first_peer_device(device));
2208 kfree(device);
2209 2200
2210 for_each_connection(connection, resource) 2201 /* not for_each_connection(connection, resource):
2211 kref_put(&connection->kref, drbd_destroy_connection); 2202 * those may have been cleaned up and disassociated already.
2203 */
2204 for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2205 kref_put(&peer_device->connection->kref, drbd_destroy_connection);
2206 kfree(peer_device);
2207 }
2208 memset(device, 0xfd, sizeof(*device));
2209 kfree(device);
2212 kref_put(&resource->kref, drbd_destroy_resource); 2210 kref_put(&resource->kref, drbd_destroy_resource);
2213} 2211}
2214 2212
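
Note that the destroy path above now fills the device structure with 0xfd before freeing it (later hunks do the same with 0xfc for connections and 0xf2 for resources). Poisoning freed objects turns a silent use-after-free into an obviously bogus value. A minimal sketch of the idea, with a made-up structure:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct connection {
        int  dummy;
        char name[16];
};

/* fill the object with an unlikely byte pattern just before freeing it;
 * a later use-after-free then trips over garbage instead of reading
 * stale but plausible-looking data */
static void destroy_connection(struct connection *c)
{
        memset(c, 0xfd, sizeof(*c));
        free(c);
}

int main(void)
{
        struct connection *c = calloc(1, sizeof(*c));

        if (!c)
                return 1;
        snprintf(c->name, sizeof(c->name), "peer0");
        printf("destroying %s\n", c->name);
        destroy_connection(c);
        return 0;
}
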
@@ -2236,7 +2234,7 @@ static void do_retry(struct work_struct *ws)
2236 list_for_each_entry_safe(req, tmp, &writes, tl_requests) { 2234 list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
2237 struct drbd_device *device = req->device; 2235 struct drbd_device *device = req->device;
2238 struct bio *bio = req->master_bio; 2236 struct bio *bio = req->master_bio;
2239 unsigned long start_time = req->start_time; 2237 unsigned long start_jif = req->start_jif;
2240 bool expected; 2238 bool expected;
2241 2239
2242 expected = 2240 expected =
@@ -2271,10 +2269,12 @@ static void do_retry(struct work_struct *ws)
2271 /* We are not just doing generic_make_request(), 2269 /* We are not just doing generic_make_request(),
2272 * as we want to keep the start_time information. */ 2270 * as we want to keep the start_time information. */
2273 inc_ap_bio(device); 2271 inc_ap_bio(device);
2274 __drbd_make_request(device, bio, start_time); 2272 __drbd_make_request(device, bio, start_jif);
2275 } 2273 }
2276} 2274}
2277 2275
2276/* called via drbd_req_put_completion_ref(),
2277 * holds resource->req_lock */
2278void drbd_restart_request(struct drbd_request *req) 2278void drbd_restart_request(struct drbd_request *req)
2279{ 2279{
2280 unsigned long flags; 2280 unsigned long flags;
@@ -2298,6 +2298,7 @@ void drbd_destroy_resource(struct kref *kref)
2298 idr_destroy(&resource->devices); 2298 idr_destroy(&resource->devices);
2299 free_cpumask_var(resource->cpu_mask); 2299 free_cpumask_var(resource->cpu_mask);
2300 kfree(resource->name); 2300 kfree(resource->name);
2301 memset(resource, 0xf2, sizeof(*resource));
2301 kfree(resource); 2302 kfree(resource);
2302} 2303}
2303 2304
@@ -2307,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource)
2307 2308
2308 for_each_connection_safe(connection, tmp, resource) { 2309 for_each_connection_safe(connection, tmp, resource) {
2309 list_del(&connection->connections); 2310 list_del(&connection->connections);
2311 drbd_debugfs_connection_cleanup(connection);
2310 kref_put(&connection->kref, drbd_destroy_connection); 2312 kref_put(&connection->kref, drbd_destroy_connection);
2311 } 2313 }
2314 drbd_debugfs_resource_cleanup(resource);
2312 kref_put(&resource->kref, drbd_destroy_resource); 2315 kref_put(&resource->kref, drbd_destroy_resource);
2313} 2316}
2314 2317
@@ -2318,8 +2321,6 @@ static void drbd_cleanup(void)
2318 struct drbd_device *device; 2321 struct drbd_device *device;
2319 struct drbd_resource *resource, *tmp; 2322 struct drbd_resource *resource, *tmp;
2320 2323
2321 unregister_reboot_notifier(&drbd_notifier);
2322
2323 /* first remove proc, 2324 /* first remove proc,
2324	 * drbdsetup uses its presence to detect 2325	 * drbdsetup uses its presence to detect
2325 * whether DRBD is loaded. 2326 * whether DRBD is loaded.
@@ -2335,6 +2336,7 @@ static void drbd_cleanup(void)
2335 destroy_workqueue(retry.wq); 2336 destroy_workqueue(retry.wq);
2336 2337
2337 drbd_genl_unregister(); 2338 drbd_genl_unregister();
2339 drbd_debugfs_cleanup();
2338 2340
2339 idr_for_each_entry(&drbd_devices, device, i) 2341 idr_for_each_entry(&drbd_devices, device, i)
2340 drbd_delete_device(device); 2342 drbd_delete_device(device);
@@ -2350,7 +2352,7 @@ static void drbd_cleanup(void)
2350 2352
2351 idr_destroy(&drbd_devices); 2353 idr_destroy(&drbd_devices);
2352 2354
2353 printk(KERN_INFO "drbd: module cleanup done.\n"); 2355 pr_info("module cleanup done.\n");
2354} 2356}
2355 2357
2356/** 2358/**
@@ -2539,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
2539 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { 2541 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
2540 err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, 2542 err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
2541 cpumask_bits(new_cpu_mask), nr_cpu_ids); 2543 cpumask_bits(new_cpu_mask), nr_cpu_ids);
2544 if (err == -EOVERFLOW) {
2545 /* So what. mask it out. */
2546 cpumask_var_t tmp_cpu_mask;
2547 if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
2548 cpumask_setall(tmp_cpu_mask);
2549 cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
2550 drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
2551 res_opts->cpu_mask,
2552 strlen(res_opts->cpu_mask) > 12 ? "..." : "",
2553 nr_cpu_ids);
2554 free_cpumask_var(tmp_cpu_mask);
2555 err = 0;
2556 }
2557 }
2542 if (err) { 2558 if (err) {
2543 drbd_warn(resource, "bitmap_parse() failed with %d\n", err); 2559 drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
2544 /* retcode = ERR_CPU_MASK_PARSE; */ 2560 /* retcode = ERR_CPU_MASK_PARSE; */
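
The new -EOVERFLOW branch above accepts a CPU mask that names more CPUs than the system has and simply masks it down to nr_cpu_ids instead of rejecting the whole resource configuration. A userspace sketch of the same "truncate instead of fail" policy (parsing and limits simplified, names hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* accept an over-long CPU mask and truncate it to the CPUs that actually
 * exist, warning instead of failing the configuration */
static unsigned long parse_cpu_mask(const char *hex, unsigned int nr_cpus)
{
        unsigned long mask  = strtoul(hex, NULL, 16);
        unsigned long valid = (nr_cpus >= sizeof(mask) * 8)
                              ? ~0UL : (1UL << nr_cpus) - 1;

        if (mask & ~valid)
                fprintf(stderr, "warning: mask %s truncated to %u CPUs\n",
                        hex, nr_cpus);
        return mask & valid;
}

int main(void)
{
        /* 0xff0 requests CPUs 4..11, but only 8 CPUs exist */
        printf("%#lx\n", parse_cpu_mask("ff0", 8));   /* prints 0xf0 */
        return 0;
}
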
@@ -2579,10 +2595,12 @@ struct drbd_resource *drbd_create_resource(const char *name)
2579 kref_init(&resource->kref); 2595 kref_init(&resource->kref);
2580 idr_init(&resource->devices); 2596 idr_init(&resource->devices);
2581 INIT_LIST_HEAD(&resource->connections); 2597 INIT_LIST_HEAD(&resource->connections);
2598 resource->write_ordering = WO_bdev_flush;
2582 list_add_tail_rcu(&resource->resources, &drbd_resources); 2599 list_add_tail_rcu(&resource->resources, &drbd_resources);
2583 mutex_init(&resource->conf_update); 2600 mutex_init(&resource->conf_update);
2584 mutex_init(&resource->adm_mutex); 2601 mutex_init(&resource->adm_mutex);
2585 spin_lock_init(&resource->req_lock); 2602 spin_lock_init(&resource->req_lock);
2603 drbd_debugfs_resource_add(resource);
2586 return resource; 2604 return resource;
2587 2605
2588fail_free_name: 2606fail_free_name:
@@ -2593,7 +2611,7 @@ fail:
2593 return NULL; 2611 return NULL;
2594} 2612}
2595 2613
2596/* caller must be under genl_lock() */ 2614/* caller must be under adm_mutex */
2597struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) 2615struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2598{ 2616{
2599 struct drbd_resource *resource; 2617 struct drbd_resource *resource;
@@ -2617,7 +2635,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2617 INIT_LIST_HEAD(&connection->current_epoch->list); 2635 INIT_LIST_HEAD(&connection->current_epoch->list);
2618 connection->epochs = 1; 2636 connection->epochs = 1;
2619 spin_lock_init(&connection->epoch_lock); 2637 spin_lock_init(&connection->epoch_lock);
2620 connection->write_ordering = WO_bdev_flush;
2621 2638
2622 connection->send.seen_any_write_yet = false; 2639 connection->send.seen_any_write_yet = false;
2623 connection->send.current_epoch_nr = 0; 2640 connection->send.current_epoch_nr = 0;
@@ -2652,6 +2669,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2652 2669
2653 kref_get(&resource->kref); 2670 kref_get(&resource->kref);
2654 list_add_tail_rcu(&connection->connections, &resource->connections); 2671 list_add_tail_rcu(&connection->connections, &resource->connections);
2672 drbd_debugfs_connection_add(connection);
2655 return connection; 2673 return connection;
2656 2674
2657fail_resource: 2675fail_resource:
@@ -2680,6 +2698,7 @@ void drbd_destroy_connection(struct kref *kref)
2680 drbd_free_socket(&connection->data); 2698 drbd_free_socket(&connection->data);
2681 kfree(connection->int_dig_in); 2699 kfree(connection->int_dig_in);
2682 kfree(connection->int_dig_vv); 2700 kfree(connection->int_dig_vv);
2701 memset(connection, 0xfc, sizeof(*connection));
2683 kfree(connection); 2702 kfree(connection);
2684 kref_put(&resource->kref, drbd_destroy_resource); 2703 kref_put(&resource->kref, drbd_destroy_resource);
2685} 2704}
@@ -2694,7 +2713,6 @@ static int init_submitter(struct drbd_device *device)
2694 return -ENOMEM; 2713 return -ENOMEM;
2695 2714
2696 INIT_WORK(&device->submit.worker, do_submit); 2715 INIT_WORK(&device->submit.worker, do_submit);
2697 spin_lock_init(&device->submit.lock);
2698 INIT_LIST_HEAD(&device->submit.writes); 2716 INIT_LIST_HEAD(&device->submit.writes);
2699 return 0; 2717 return 0;
2700} 2718}
@@ -2764,8 +2782,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2764 blk_queue_merge_bvec(q, drbd_merge_bvec); 2782 blk_queue_merge_bvec(q, drbd_merge_bvec);
2765 q->queue_lock = &resource->req_lock; 2783 q->queue_lock = &resource->req_lock;
2766 2784
2767 device->md_io_page = alloc_page(GFP_KERNEL); 2785 device->md_io.page = alloc_page(GFP_KERNEL);
2768 if (!device->md_io_page) 2786 if (!device->md_io.page)
2769 goto out_no_io_page; 2787 goto out_no_io_page;
2770 2788
2771 if (drbd_bm_init(device)) 2789 if (drbd_bm_init(device))
@@ -2794,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2794 kref_get(&device->kref); 2812 kref_get(&device->kref);
2795 2813
2796 INIT_LIST_HEAD(&device->peer_devices); 2814 INIT_LIST_HEAD(&device->peer_devices);
2815 INIT_LIST_HEAD(&device->pending_bitmap_io);
2797 for_each_connection(connection, resource) { 2816 for_each_connection(connection, resource) {
2798 peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); 2817 peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
2799 if (!peer_device) 2818 if (!peer_device)
@@ -2829,7 +2848,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2829 for_each_peer_device(peer_device, device) 2848 for_each_peer_device(peer_device, device)
2830 drbd_connected(peer_device); 2849 drbd_connected(peer_device);
2831 } 2850 }
2832 2851 /* move to create_peer_device() */
2852 for_each_peer_device(peer_device, device)
2853 drbd_debugfs_peer_device_add(peer_device);
2854 drbd_debugfs_device_add(device);
2833 return NO_ERROR; 2855 return NO_ERROR;
2834 2856
2835out_idr_remove_vol: 2857out_idr_remove_vol:
@@ -2853,7 +2875,7 @@ out_idr_remove_minor:
2853out_no_minor_idr: 2875out_no_minor_idr:
2854 drbd_bm_cleanup(device); 2876 drbd_bm_cleanup(device);
2855out_no_bitmap: 2877out_no_bitmap:
2856 __free_page(device->md_io_page); 2878 __free_page(device->md_io.page);
2857out_no_io_page: 2879out_no_io_page:
2858 put_disk(disk); 2880 put_disk(disk);
2859out_no_disk: 2881out_no_disk:
@@ -2868,8 +2890,13 @@ void drbd_delete_device(struct drbd_device *device)
2868{ 2890{
2869 struct drbd_resource *resource = device->resource; 2891 struct drbd_resource *resource = device->resource;
2870 struct drbd_connection *connection; 2892 struct drbd_connection *connection;
2893 struct drbd_peer_device *peer_device;
2871 int refs = 3; 2894 int refs = 3;
2872 2895
2896 /* move to free_peer_device() */
2897 for_each_peer_device(peer_device, device)
2898 drbd_debugfs_peer_device_cleanup(peer_device);
2899 drbd_debugfs_device_cleanup(device);
2873 for_each_connection(connection, resource) { 2900 for_each_connection(connection, resource) {
2874 idr_remove(&connection->peer_devices, device->vnr); 2901 idr_remove(&connection->peer_devices, device->vnr);
2875 refs++; 2902 refs++;
@@ -2881,13 +2908,12 @@ void drbd_delete_device(struct drbd_device *device)
2881 kref_sub(&device->kref, refs, drbd_destroy_device); 2908 kref_sub(&device->kref, refs, drbd_destroy_device);
2882} 2909}
2883 2910
2884int __init drbd_init(void) 2911static int __init drbd_init(void)
2885{ 2912{
2886 int err; 2913 int err;
2887 2914
2888 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { 2915 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
2889 printk(KERN_ERR 2916 pr_err("invalid minor_count (%d)\n", minor_count);
2890 "drbd: invalid minor_count (%d)\n", minor_count);
2891#ifdef MODULE 2917#ifdef MODULE
2892 return -EINVAL; 2918 return -EINVAL;
2893#else 2919#else
@@ -2897,14 +2923,11 @@ int __init drbd_init(void)
2897 2923
2898 err = register_blkdev(DRBD_MAJOR, "drbd"); 2924 err = register_blkdev(DRBD_MAJOR, "drbd");
2899 if (err) { 2925 if (err) {
2900 printk(KERN_ERR 2926 pr_err("unable to register block device major %d\n",
2901 "drbd: unable to register block device major %d\n",
2902 DRBD_MAJOR); 2927 DRBD_MAJOR);
2903 return err; 2928 return err;
2904 } 2929 }
2905 2930
2906 register_reboot_notifier(&drbd_notifier);
2907
2908 /* 2931 /*
2909 * allocate all necessary structs 2932 * allocate all necessary structs
2910 */ 2933 */
@@ -2918,7 +2941,7 @@ int __init drbd_init(void)
2918 2941
2919 err = drbd_genl_register(); 2942 err = drbd_genl_register();
2920 if (err) { 2943 if (err) {
2921 printk(KERN_ERR "drbd: unable to register generic netlink family\n"); 2944 pr_err("unable to register generic netlink family\n");
2922 goto fail; 2945 goto fail;
2923 } 2946 }
2924 2947
@@ -2929,38 +2952,39 @@ int __init drbd_init(void)
2929 err = -ENOMEM; 2952 err = -ENOMEM;
2930 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); 2953 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
2931 if (!drbd_proc) { 2954 if (!drbd_proc) {
2932 printk(KERN_ERR "drbd: unable to register proc file\n"); 2955 pr_err("unable to register proc file\n");
2933 goto fail; 2956 goto fail;
2934 } 2957 }
2935 2958
2936 retry.wq = create_singlethread_workqueue("drbd-reissue"); 2959 retry.wq = create_singlethread_workqueue("drbd-reissue");
2937 if (!retry.wq) { 2960 if (!retry.wq) {
2938 printk(KERN_ERR "drbd: unable to create retry workqueue\n"); 2961 pr_err("unable to create retry workqueue\n");
2939 goto fail; 2962 goto fail;
2940 } 2963 }
2941 INIT_WORK(&retry.worker, do_retry); 2964 INIT_WORK(&retry.worker, do_retry);
2942 spin_lock_init(&retry.lock); 2965 spin_lock_init(&retry.lock);
2943 INIT_LIST_HEAD(&retry.writes); 2966 INIT_LIST_HEAD(&retry.writes);
2944 2967
2945 printk(KERN_INFO "drbd: initialized. " 2968 if (drbd_debugfs_init())
2969 pr_notice("failed to initialize debugfs -- will not be available\n");
2970
2971 pr_info("initialized. "
2946 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", 2972 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2947 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); 2973 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2948 printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); 2974 pr_info("%s\n", drbd_buildtag());
2949 printk(KERN_INFO "drbd: registered as block device major %d\n", 2975 pr_info("registered as block device major %d\n", DRBD_MAJOR);
2950 DRBD_MAJOR);
2951
2952 return 0; /* Success! */ 2976 return 0; /* Success! */
2953 2977
2954fail: 2978fail:
2955 drbd_cleanup(); 2979 drbd_cleanup();
2956 if (err == -ENOMEM) 2980 if (err == -ENOMEM)
2957 printk(KERN_ERR "drbd: ran out of memory\n"); 2981 pr_err("ran out of memory\n");
2958 else 2982 else
2959 printk(KERN_ERR "drbd: initialization failure\n"); 2983 pr_err("initialization failure\n");
2960 return err; 2984 return err;
2961} 2985}
2962 2986
2963void drbd_free_bc(struct drbd_backing_dev *ldev) 2987void drbd_free_ldev(struct drbd_backing_dev *ldev)
2964{ 2988{
2965 if (ldev == NULL) 2989 if (ldev == NULL)
2966 return; 2990 return;
@@ -2972,24 +2996,29 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
2972 kfree(ldev); 2996 kfree(ldev);
2973} 2997}
2974 2998
2975void drbd_free_sock(struct drbd_connection *connection) 2999static void drbd_free_one_sock(struct drbd_socket *ds)
2976{ 3000{
2977 if (connection->data.socket) { 3001 struct socket *s;
2978 mutex_lock(&connection->data.mutex); 3002 mutex_lock(&ds->mutex);
2979 kernel_sock_shutdown(connection->data.socket, SHUT_RDWR); 3003 s = ds->socket;
2980 sock_release(connection->data.socket); 3004 ds->socket = NULL;
2981 connection->data.socket = NULL; 3005 mutex_unlock(&ds->mutex);
2982 mutex_unlock(&connection->data.mutex); 3006 if (s) {
2983 } 3007 /* so debugfs does not need to mutex_lock() */
2984 if (connection->meta.socket) { 3008 synchronize_rcu();
2985 mutex_lock(&connection->meta.mutex); 3009 kernel_sock_shutdown(s, SHUT_RDWR);
2986 kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR); 3010 sock_release(s);
2987 sock_release(connection->meta.socket);
2988 connection->meta.socket = NULL;
2989 mutex_unlock(&connection->meta.mutex);
2990 } 3011 }
2991} 3012}
2992 3013
3014void drbd_free_sock(struct drbd_connection *connection)
3015{
3016 if (connection->data.socket)
3017 drbd_free_one_sock(&connection->data);
3018 if (connection->meta.socket)
3019 drbd_free_one_sock(&connection->meta);
3020}
3021
2993/* meta data management */ 3022/* meta data management */
2994 3023
2995void conn_md_sync(struct drbd_connection *connection) 3024void conn_md_sync(struct drbd_connection *connection)
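
drbd_free_one_sock() in the hunk above detaches the socket pointer while holding the mutex but performs the potentially slow shutdown and release after dropping it, so concurrent readers only ever see a valid pointer or NULL (the RCU grace period it also waits for is omitted here). A userspace sketch of that "detach under the lock, tear down outside it" pattern, with invented names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sock_slot {
        pthread_mutex_t mutex;
        char *socket;            /* stand-in for struct socket * */
};

static void free_one_sock(struct sock_slot *ds)
{
        char *s;

        pthread_mutex_lock(&ds->mutex);
        s = ds->socket;          /* detach while holding the lock */
        ds->socket = NULL;
        pthread_mutex_unlock(&ds->mutex);

        if (s) {                 /* slow teardown outside the lock */
                printf("shutting down %s\n", s);
                free(s);
        }
}

int main(void)
{
        struct sock_slot data = { PTHREAD_MUTEX_INITIALIZER, strdup("data-socket") };

        free_one_sock(&data);
        free_one_sock(&data);    /* second call is a harmless no-op */
        return 0;
}
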
@@ -3093,7 +3122,7 @@ void drbd_md_sync(struct drbd_device *device)
3093 if (!get_ldev_if_state(device, D_FAILED)) 3122 if (!get_ldev_if_state(device, D_FAILED))
3094 return; 3123 return;
3095 3124
3096 buffer = drbd_md_get_buffer(device); 3125 buffer = drbd_md_get_buffer(device, __func__);
3097 if (!buffer) 3126 if (!buffer)
3098 goto out; 3127 goto out;
3099 3128
@@ -3253,7 +3282,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
3253 if (device->state.disk != D_DISKLESS) 3282 if (device->state.disk != D_DISKLESS)
3254 return ERR_DISK_CONFIGURED; 3283 return ERR_DISK_CONFIGURED;
3255 3284
3256 buffer = drbd_md_get_buffer(device); 3285 buffer = drbd_md_get_buffer(device, __func__);
3257 if (!buffer) 3286 if (!buffer)
3258 return ERR_NOMEM; 3287 return ERR_NOMEM;
3259 3288
@@ -3466,23 +3495,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
3466 * 3495 *
3467 * Sets all bits in the bitmap and writes the whole bitmap to stable storage. 3496 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3468 */ 3497 */
3469int drbd_bmio_set_n_write(struct drbd_device *device) 3498int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
3470{ 3499{
3471 int rv = -EIO; 3500 int rv = -EIO;
3472 3501
3473 if (get_ldev_if_state(device, D_ATTACHING)) { 3502 drbd_md_set_flag(device, MDF_FULL_SYNC);
3474 drbd_md_set_flag(device, MDF_FULL_SYNC); 3503 drbd_md_sync(device);
3475 drbd_md_sync(device); 3504 drbd_bm_set_all(device);
3476 drbd_bm_set_all(device);
3477
3478 rv = drbd_bm_write(device);
3479 3505
3480 if (!rv) { 3506 rv = drbd_bm_write(device);
3481 drbd_md_clear_flag(device, MDF_FULL_SYNC);
3482 drbd_md_sync(device);
3483 }
3484 3507
3485 put_ldev(device); 3508 if (!rv) {
3509 drbd_md_clear_flag(device, MDF_FULL_SYNC);
3510 drbd_md_sync(device);
3486 } 3511 }
3487 3512
3488 return rv; 3513 return rv;
@@ -3494,18 +3519,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device)
3494 * 3519 *
3495 * Clears all bits in the bitmap and writes the whole bitmap to stable storage. 3520 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3496 */ 3521 */
3497int drbd_bmio_clear_n_write(struct drbd_device *device) 3522int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
3498{ 3523{
3499 int rv = -EIO;
3500
3501 drbd_resume_al(device); 3524 drbd_resume_al(device);
3502 if (get_ldev_if_state(device, D_ATTACHING)) { 3525 drbd_bm_clear_all(device);
3503 drbd_bm_clear_all(device); 3526 return drbd_bm_write(device);
3504 rv = drbd_bm_write(device);
3505 put_ldev(device);
3506 }
3507
3508 return rv;
3509} 3527}
3510 3528
3511static int w_bitmap_io(struct drbd_work *w, int unused) 3529static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3537,61 +3555,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
3537 return 0; 3555 return 0;
3538} 3556}
3539 3557
3540void drbd_ldev_destroy(struct drbd_device *device)
3541{
3542 lc_destroy(device->resync);
3543 device->resync = NULL;
3544 lc_destroy(device->act_log);
3545 device->act_log = NULL;
3546 __no_warn(local,
3547 drbd_free_bc(device->ldev);
3548 device->ldev = NULL;);
3549
3550 clear_bit(GO_DISKLESS, &device->flags);
3551}
3552
3553static int w_go_diskless(struct drbd_work *w, int unused)
3554{
3555 struct drbd_device *device =
3556 container_of(w, struct drbd_device, go_diskless);
3557
3558 D_ASSERT(device, device->state.disk == D_FAILED);
3559 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3560 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3561 * the protected members anymore, though, so once put_ldev reaches zero
3562 * again, it will be safe to free them. */
3563
3564 /* Try to write changed bitmap pages, read errors may have just
3565 * set some bits outside the area covered by the activity log.
3566 *
3567 * If we have an IO error during the bitmap writeout,
3568 * we will want a full sync next time, just in case.
3569 * (Do we want a specific meta data flag for this?)
3570 *
3571 * If that does not make it to stable storage either,
3572 * we cannot do anything about that anymore.
3573 *
3574 * We still need to check if both bitmap and ldev are present, we may
3575 * end up here after a failed attach, before ldev was even assigned.
3576 */
3577 if (device->bitmap && device->ldev) {
3578		/* An interrupted resync or similar is allowed to recount bits
3579 * while we detach.
3580 * Any modifications would not be expected anymore, though.
3581 */
3582 if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
3583 "detach", BM_LOCKED_TEST_ALLOWED)) {
3584 if (test_bit(WAS_READ_ERROR, &device->flags)) {
3585 drbd_md_set_flag(device, MDF_FULL_SYNC);
3586 drbd_md_sync(device);
3587 }
3588 }
3589 }
3590
3591 drbd_force_state(device, NS(disk, D_DISKLESS));
3592 return 0;
3593}
3594
3595/** 3558/**
3596 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3559 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3597 * @device: DRBD device. 3560 * @device: DRBD device.
@@ -3603,6 +3566,9 @@ static int w_go_diskless(struct drbd_work *w, int unused)
3603 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be 3566 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3604 * called from worker context. It MUST NOT be used while a previous such 3567 * called from worker context. It MUST NOT be used while a previous such
3605 * work is still pending! 3568 * work is still pending!
3569 *
3570 * Its worker function encloses the call to io_fn() with get_ldev() and
3571 * put_ldev().
3606 */ 3572 */
3607void drbd_queue_bitmap_io(struct drbd_device *device, 3573void drbd_queue_bitmap_io(struct drbd_device *device,
3608 int (*io_fn)(struct drbd_device *), 3574 int (*io_fn)(struct drbd_device *),
@@ -3685,25 +3651,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3685static void md_sync_timer_fn(unsigned long data) 3651static void md_sync_timer_fn(unsigned long data)
3686{ 3652{
3687 struct drbd_device *device = (struct drbd_device *) data; 3653 struct drbd_device *device = (struct drbd_device *) data;
3688 3654 drbd_device_post_work(device, MD_SYNC);
3689 /* must not double-queue! */
3690 if (list_empty(&device->md_sync_work.list))
3691 drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
3692 &device->md_sync_work);
3693}
3694
3695static int w_md_sync(struct drbd_work *w, int unused)
3696{
3697 struct drbd_device *device =
3698 container_of(w, struct drbd_device, md_sync_work);
3699
3700 drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3701#ifdef DEBUG
3702 drbd_warn(device, "last md_mark_dirty: %s:%u\n",
3703 device->last_md_mark_dirty.func, device->last_md_mark_dirty.line);
3704#endif
3705 drbd_md_sync(device);
3706 return 0;
3707} 3655}
3708 3656
3709const char *cmdname(enum drbd_packet cmd) 3657const char *cmdname(enum drbd_packet cmd)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 3f2e16738080..1cd47df44bda 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -23,6 +23,8 @@
23 23
24 */ 24 */
25 25
26#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
27
26#include <linux/module.h> 28#include <linux/module.h>
27#include <linux/drbd.h> 29#include <linux/drbd.h>
28#include <linux/in.h> 30#include <linux/in.h>
@@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
85{ 87{
86 genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); 88 genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
87 if (genlmsg_reply(skb, info)) 89 if (genlmsg_reply(skb, info))
88 printk(KERN_ERR "drbd: error sending genl reply\n"); 90 pr_err("error sending genl reply\n");
89} 91}
90 92
91/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only 93/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
@@ -558,8 +560,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection)
558} 560}
559 561
560enum drbd_state_rv 562enum drbd_state_rv
561drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) 563drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
562{ 564{
565 struct drbd_peer_device *const peer_device = first_peer_device(device);
566 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
563 const int max_tries = 4; 567 const int max_tries = 4;
564 enum drbd_state_rv rv = SS_UNKNOWN_ERROR; 568 enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
565 struct net_conf *nc; 569 struct net_conf *nc;
@@ -607,7 +611,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
607 device->state.disk == D_CONSISTENT && mask.pdsk == 0) { 611 device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
608 D_ASSERT(device, device->state.pdsk == D_UNKNOWN); 612 D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
609 613
610 if (conn_try_outdate_peer(first_peer_device(device)->connection)) { 614 if (conn_try_outdate_peer(connection)) {
611 val.disk = D_UP_TO_DATE; 615 val.disk = D_UP_TO_DATE;
612 mask.disk = D_MASK; 616 mask.disk = D_MASK;
613 } 617 }
@@ -617,7 +621,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
617 if (rv == SS_NOTHING_TO_DO) 621 if (rv == SS_NOTHING_TO_DO)
618 goto out; 622 goto out;
619 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { 623 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
620 if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) { 624 if (!conn_try_outdate_peer(connection) && force) {
621 drbd_warn(device, "Forced into split brain situation!\n"); 625 drbd_warn(device, "Forced into split brain situation!\n");
622 mask.pdsk = D_MASK; 626 mask.pdsk = D_MASK;
623 val.pdsk = D_OUTDATED; 627 val.pdsk = D_OUTDATED;
@@ -630,7 +634,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
630 retry at most once more in this case. */ 634 retry at most once more in this case. */
631 int timeo; 635 int timeo;
632 rcu_read_lock(); 636 rcu_read_lock();
633 nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 637 nc = rcu_dereference(connection->net_conf);
634 timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; 638 timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
635 rcu_read_unlock(); 639 rcu_read_unlock();
636 schedule_timeout_interruptible(timeo); 640 schedule_timeout_interruptible(timeo);
@@ -659,19 +663,17 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
659 /* FIXME also wait for all pending P_BARRIER_ACK? */ 663 /* FIXME also wait for all pending P_BARRIER_ACK? */
660 664
661 if (new_role == R_SECONDARY) { 665 if (new_role == R_SECONDARY) {
662 set_disk_ro(device->vdisk, true);
663 if (get_ldev(device)) { 666 if (get_ldev(device)) {
664 device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 667 device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
665 put_ldev(device); 668 put_ldev(device);
666 } 669 }
667 } else { 670 } else {
668 /* Called from drbd_adm_set_role only. 671 mutex_lock(&device->resource->conf_update);
669 * We are still holding the conf_update mutex. */ 672 nc = connection->net_conf;
670 nc = first_peer_device(device)->connection->net_conf;
671 if (nc) 673 if (nc)
672 nc->discard_my_data = 0; /* without copy; single bit op is atomic */ 674 nc->discard_my_data = 0; /* without copy; single bit op is atomic */
675 mutex_unlock(&device->resource->conf_update);
673 676
674 set_disk_ro(device->vdisk, false);
675 if (get_ldev(device)) { 677 if (get_ldev(device)) {
676 if (((device->state.conn < C_CONNECTED || 678 if (((device->state.conn < C_CONNECTED ||
677 device->state.pdsk <= D_FAILED) 679 device->state.pdsk <= D_FAILED)
@@ -689,12 +691,12 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
689 if (device->state.conn >= C_WF_REPORT_PARAMS) { 691 if (device->state.conn >= C_WF_REPORT_PARAMS) {
690 /* if this was forced, we should consider sync */ 692 /* if this was forced, we should consider sync */
691 if (forced) 693 if (forced)
692 drbd_send_uuids(first_peer_device(device)); 694 drbd_send_uuids(peer_device);
693 drbd_send_current_state(first_peer_device(device)); 695 drbd_send_current_state(peer_device);
694 } 696 }
695 697
696 drbd_md_sync(device); 698 drbd_md_sync(device);
697 699 set_disk_ro(device->vdisk, new_role == R_SECONDARY);
698 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); 700 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
699out: 701out:
700 mutex_unlock(device->state_mutex); 702 mutex_unlock(device->state_mutex);
@@ -891,7 +893,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
891 * still lock the act_log to not trigger ASSERTs there. 893 * still lock the act_log to not trigger ASSERTs there.
892 */ 894 */
893 drbd_suspend_io(device); 895 drbd_suspend_io(device);
894 buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */ 896 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
895 if (!buffer) { 897 if (!buffer) {
896 drbd_resume_io(device); 898 drbd_resume_io(device);
897 return DS_ERROR; 899 return DS_ERROR;
@@ -971,6 +973,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
971 if (la_size_changed || md_moved || rs) { 973 if (la_size_changed || md_moved || rs) {
972 u32 prev_flags; 974 u32 prev_flags;
973 975
976	/* We do some synchronous IO below, which may take some time.
977	 * Clear the timer to avoid scary "timer expired!" messages;
978	 * the "superblock" is written out at least twice below anyway. */
979 del_timer(&device->md_sync_timer);
974 drbd_al_shrink(device); /* All extents inactive. */ 980 drbd_al_shrink(device); /* All extents inactive. */
975 981
976 prev_flags = md->flags; 982 prev_flags = md->flags;
@@ -1116,15 +1122,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1116 return 0; 1122 return 0;
1117} 1123}
1118 1124
1119static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size) 1125static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
1126 unsigned int max_bio_size)
1120{ 1127{
1121 struct request_queue * const q = device->rq_queue; 1128 struct request_queue * const q = device->rq_queue;
1122 unsigned int max_hw_sectors = max_bio_size >> 9; 1129 unsigned int max_hw_sectors = max_bio_size >> 9;
1123 unsigned int max_segments = 0; 1130 unsigned int max_segments = 0;
1124 struct request_queue *b = NULL; 1131 struct request_queue *b = NULL;
1125 1132
1126 if (get_ldev_if_state(device, D_ATTACHING)) { 1133 if (bdev) {
1127 b = device->ldev->backing_bdev->bd_disk->queue; 1134 b = bdev->backing_bdev->bd_disk->queue;
1128 1135
1129 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 1136 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1130 rcu_read_lock(); 1137 rcu_read_lock();
@@ -1169,11 +1176,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
1169 b->backing_dev_info.ra_pages); 1176 b->backing_dev_info.ra_pages);
1170 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1177 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1171 } 1178 }
1172 put_ldev(device);
1173 } 1179 }
1174} 1180}
1175 1181
1176void drbd_reconsider_max_bio_size(struct drbd_device *device) 1182void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
1177{ 1183{
1178 unsigned int now, new, local, peer; 1184 unsigned int now, new, local, peer;
1179 1185
@@ -1181,10 +1187,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
1181 local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ 1187 local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
1182 peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ 1188 peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
1183 1189
1184 if (get_ldev_if_state(device, D_ATTACHING)) { 1190 if (bdev) {
1185 local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; 1191 local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
1186 device->local_max_bio_size = local; 1192 device->local_max_bio_size = local;
1187 put_ldev(device);
1188 } 1193 }
1189 local = min(local, DRBD_MAX_BIO_SIZE); 1194 local = min(local, DRBD_MAX_BIO_SIZE);
1190 1195
@@ -1217,7 +1222,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
1217 if (new != now) 1222 if (new != now)
1218 drbd_info(device, "max BIO size = %u\n", new); 1223 drbd_info(device, "max BIO size = %u\n", new);
1219 1224
1220 drbd_setup_queue_param(device, new); 1225 drbd_setup_queue_param(device, bdev, new);
1221} 1226}
1222 1227
1223/* Starts the worker thread */ 1228/* Starts the worker thread */
@@ -1299,6 +1304,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1299 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; 1304 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1300} 1305}
1301 1306
1307static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
1308{
1309 return a->disk_barrier != b->disk_barrier ||
1310 a->disk_flushes != b->disk_flushes ||
1311 a->disk_drain != b->disk_drain;
1312}
1313
1302int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1314int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1303{ 1315{
1304 struct drbd_config_context adm_ctx; 1316 struct drbd_config_context adm_ctx;
@@ -1405,7 +1417,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1405 else 1417 else
1406 set_bit(MD_NO_FUA, &device->flags); 1418 set_bit(MD_NO_FUA, &device->flags);
1407 1419
1408 drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); 1420 if (write_ordering_changed(old_disk_conf, new_disk_conf))
1421 drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
1409 1422
1410 drbd_md_sync(device); 1423 drbd_md_sync(device);
1411 1424
@@ -1440,6 +1453,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1440{ 1453{
1441 struct drbd_config_context adm_ctx; 1454 struct drbd_config_context adm_ctx;
1442 struct drbd_device *device; 1455 struct drbd_device *device;
1456 struct drbd_peer_device *peer_device;
1457 struct drbd_connection *connection;
1443 int err; 1458 int err;
1444 enum drbd_ret_code retcode; 1459 enum drbd_ret_code retcode;
1445 enum determine_dev_size dd; 1460 enum determine_dev_size dd;
@@ -1462,7 +1477,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1462 1477
1463 device = adm_ctx.device; 1478 device = adm_ctx.device;
1464 mutex_lock(&adm_ctx.resource->adm_mutex); 1479 mutex_lock(&adm_ctx.resource->adm_mutex);
1465 conn_reconfig_start(first_peer_device(device)->connection); 1480 peer_device = first_peer_device(device);
1481 connection = peer_device ? peer_device->connection : NULL;
1482 conn_reconfig_start(connection);
1466 1483
1467 /* if you want to reconfigure, please tear down first */ 1484 /* if you want to reconfigure, please tear down first */
1468 if (device->state.disk > D_DISKLESS) { 1485 if (device->state.disk > D_DISKLESS) {
@@ -1473,7 +1490,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1473 * drbd_ldev_destroy is done already, we may end up here very fast, 1490 * drbd_ldev_destroy is done already, we may end up here very fast,
1474 * e.g. if someone calls attach from the on-io-error handler, 1491 * e.g. if someone calls attach from the on-io-error handler,
1475 * to realize a "hot spare" feature (not that I'd recommend that) */ 1492 * to realize a "hot spare" feature (not that I'd recommend that) */
1476 wait_event(device->misc_wait, !atomic_read(&device->local_cnt)); 1493 wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
1477 1494
1478 /* make sure there is no leftover from previous force-detach attempts */ 1495 /* make sure there is no leftover from previous force-detach attempts */
1479 clear_bit(FORCE_DETACH, &device->flags); 1496 clear_bit(FORCE_DETACH, &device->flags);
@@ -1529,7 +1546,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1529 goto fail; 1546 goto fail;
1530 1547
1531 rcu_read_lock(); 1548 rcu_read_lock();
1532 nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 1549 nc = rcu_dereference(connection->net_conf);
1533 if (nc) { 1550 if (nc) {
1534 if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { 1551 if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
1535 rcu_read_unlock(); 1552 rcu_read_unlock();
@@ -1649,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1649 */ 1666 */
1650 wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); 1667 wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
1651 /* and for any other previously queued work */ 1668 /* and for any other previously queued work */
1652 drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); 1669 drbd_flush_workqueue(&connection->sender_work);
1653 1670
1654 rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); 1671 rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
1655 retcode = rv; /* FIXME: Type mismatch. */ 1672 retcode = rv; /* FIXME: Type mismatch. */
@@ -1710,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1710 new_disk_conf = NULL; 1727 new_disk_conf = NULL;
1711 new_plan = NULL; 1728 new_plan = NULL;
1712 1729
1713 drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); 1730 drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
1714 1731
1715 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) 1732 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1716 set_bit(CRASHED_PRIMARY, &device->flags); 1733 set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1726,7 +1743,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1726 device->read_cnt = 0; 1743 device->read_cnt = 0;
1727 device->writ_cnt = 0; 1744 device->writ_cnt = 0;
1728 1745
1729 drbd_reconsider_max_bio_size(device); 1746 drbd_reconsider_max_bio_size(device, device->ldev);
1730 1747
1731 /* If I am currently not R_PRIMARY, 1748 /* If I am currently not R_PRIMARY,
1732 * but meta data primary indicator is set, 1749 * but meta data primary indicator is set,
@@ -1845,7 +1862,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1845 1862
1846 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); 1863 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
1847 put_ldev(device); 1864 put_ldev(device);
1848 conn_reconfig_done(first_peer_device(device)->connection); 1865 conn_reconfig_done(connection);
1849 mutex_unlock(&adm_ctx.resource->adm_mutex); 1866 mutex_unlock(&adm_ctx.resource->adm_mutex);
1850 drbd_adm_finish(&adm_ctx, info, retcode); 1867 drbd_adm_finish(&adm_ctx, info, retcode);
1851 return 0; 1868 return 0;
@@ -1856,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1856 drbd_force_state(device, NS(disk, D_DISKLESS)); 1873 drbd_force_state(device, NS(disk, D_DISKLESS));
1857 drbd_md_sync(device); 1874 drbd_md_sync(device);
1858 fail: 1875 fail:
1859 conn_reconfig_done(first_peer_device(device)->connection); 1876 conn_reconfig_done(connection);
1860 if (nbc) { 1877 if (nbc) {
1861 if (nbc->backing_bdev) 1878 if (nbc->backing_bdev)
1862 blkdev_put(nbc->backing_bdev, 1879 blkdev_put(nbc->backing_bdev,
@@ -1888,7 +1905,7 @@ static int adm_detach(struct drbd_device *device, int force)
1888 } 1905 }
1889 1906
1890 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ 1907 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1891 drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */ 1908 drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
1892 retcode = drbd_request_state(device, NS(disk, D_FAILED)); 1909 retcode = drbd_request_state(device, NS(disk, D_FAILED));
1893 drbd_md_put_buffer(device); 1910 drbd_md_put_buffer(device);
1894 /* D_FAILED will transition to DISKLESS. */ 1911 /* D_FAILED will transition to DISKLESS. */
@@ -2654,8 +2671,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2654 if (retcode != NO_ERROR) 2671 if (retcode != NO_ERROR)
2655 goto out; 2672 goto out;
2656 2673
2657 mutex_lock(&adm_ctx.resource->adm_mutex);
2658 device = adm_ctx.device; 2674 device = adm_ctx.device;
2675 if (!get_ldev(device)) {
2676 retcode = ERR_NO_DISK;
2677 goto out;
2678 }
2679
2680 mutex_lock(&adm_ctx.resource->adm_mutex);
2659 2681
2660 /* If there is still bitmap IO pending, probably because of a previous 2682 /* If there is still bitmap IO pending, probably because of a previous
2661 * resync just being finished, wait for it before requesting a new resync. 2683 * resync just being finished, wait for it before requesting a new resync.
@@ -2679,6 +2701,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2679 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); 2701 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
2680 drbd_resume_io(device); 2702 drbd_resume_io(device);
2681 mutex_unlock(&adm_ctx.resource->adm_mutex); 2703 mutex_unlock(&adm_ctx.resource->adm_mutex);
2704 put_ldev(device);
2682out: 2705out:
2683 drbd_adm_finish(&adm_ctx, info, retcode); 2706 drbd_adm_finish(&adm_ctx, info, retcode);
2684 return 0; 2707 return 0;
@@ -2704,7 +2727,7 @@ out:
2704 return 0; 2727 return 0;
2705} 2728}
2706 2729
2707static int drbd_bmio_set_susp_al(struct drbd_device *device) 2730static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
2708{ 2731{
2709 int rv; 2732 int rv;
2710 2733
@@ -2725,8 +2748,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2725 if (retcode != NO_ERROR) 2748 if (retcode != NO_ERROR)
2726 goto out; 2749 goto out;
2727 2750
2728 mutex_lock(&adm_ctx.resource->adm_mutex);
2729 device = adm_ctx.device; 2751 device = adm_ctx.device;
2752 if (!get_ldev(device)) {
2753 retcode = ERR_NO_DISK;
2754 goto out;
2755 }
2756
2757 mutex_lock(&adm_ctx.resource->adm_mutex);
2730 2758
2731 /* If there is still bitmap IO pending, probably because of a previous 2759 /* If there is still bitmap IO pending, probably because of a previous
2732 * resync just being finished, wait for it before requesting a new resync. 2760 * resync just being finished, wait for it before requesting a new resync.
@@ -2753,6 +2781,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2753 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); 2781 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
2754 drbd_resume_io(device); 2782 drbd_resume_io(device);
2755 mutex_unlock(&adm_ctx.resource->adm_mutex); 2783 mutex_unlock(&adm_ctx.resource->adm_mutex);
2784 put_ldev(device);
2756out: 2785out:
2757 drbd_adm_finish(&adm_ctx, info, retcode); 2786 drbd_adm_finish(&adm_ctx, info, retcode);
2758 return 0; 2787 return 0;
@@ -2892,7 +2921,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc
2892 return list_first_entry(&resource->connections, struct drbd_connection, connections); 2921 return list_first_entry(&resource->connections, struct drbd_connection, connections);
2893} 2922}
2894 2923
2895int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, 2924static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
2896 const struct sib_info *sib) 2925 const struct sib_info *sib)
2897{ 2926{
2898 struct drbd_resource *resource = device->resource; 2927 struct drbd_resource *resource = device->resource;
@@ -3622,13 +3651,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3622 unsigned seq; 3651 unsigned seq;
3623 int err = -ENOMEM; 3652 int err = -ENOMEM;
3624 3653
3625 if (sib->sib_reason == SIB_SYNC_PROGRESS) {
3626 if (time_after(jiffies, device->rs_last_bcast + HZ))
3627 device->rs_last_bcast = jiffies;
3628 else
3629 return;
3630 }
3631
3632 seq = atomic_inc_return(&drbd_genl_seq); 3654 seq = atomic_inc_return(&drbd_genl_seq);
3633 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); 3655 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
3634 if (!msg) 3656 if (!msg)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 89736bdbbc70..06e6147c7601 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
60 seq_printf(seq, "%ld", v); 60 seq_printf(seq, "%ld", v);
61} 61}
62 62
63static void drbd_get_syncer_progress(struct drbd_device *device,
64 union drbd_dev_state state, unsigned long *rs_total,
65 unsigned long *bits_left, unsigned int *per_mil_done)
66{
67 /* this is to break it at compile time when we change that, in case we
68 * want to support more than (1<<32) bits on a 32bit arch. */
69 typecheck(unsigned long, device->rs_total);
70 *rs_total = device->rs_total;
71
72 /* note: both rs_total and rs_left are in bits, i.e. in
73 * units of BM_BLOCK_SIZE.
74 * for the percentage, we don't care. */
75
76 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
77 *bits_left = device->ov_left;
78 else
79 *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
80 /* >> 10 to prevent overflow,
81 * +1 to prevent division by zero */
82 if (*bits_left > *rs_total) {
83 /* D'oh. Maybe a logic bug somewhere. More likely just a race
84 * between state change and reset of rs_total.
85 */
86 *bits_left = *rs_total;
87 *per_mil_done = *rs_total ? 0 : 1000;
88 } else {
89 /* Make sure the division happens in long context.
90 * We allow up to one petabyte storage right now,
91 * at a granularity of 4k per bit that is 2**38 bits.
92 * After shift right and multiplication by 1000,
93 * this should still fit easily into a 32bit long,
94 * so we don't need a 64bit division on 32bit arch.
95 * Note: currently we don't support such large bitmaps on 32bit
96 * arch anyways, but no harm done to be prepared for it here.
97 */
98 unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
99 unsigned long left = *bits_left >> shift;
100 unsigned long total = 1UL + (*rs_total >> shift);
101 unsigned long tmp = 1000UL - left * 1000UL/total;
102 *per_mil_done = tmp;
103 }
104}
105
106
63/*lge 107/*lge
64 * progress bars shamelessly adapted from driver/md/md.c 108 * progress bars shamelessly adapted from driver/md/md.c
65 * output looks like 109 * output looks like
66 * [=====>..............] 33.5% (23456/123456) 110 * [=====>..............] 33.5% (23456/123456)
67 * finish: 2:20:20 speed: 6,345 (6,456) K/sec 111 * finish: 2:20:20 speed: 6,345 (6,456) K/sec
68 */ 112 */
69static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq) 113static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
114 union drbd_dev_state state)
70{ 115{
71 unsigned long db, dt, dbdt, rt, rs_left; 116 unsigned long db, dt, dbdt, rt, rs_total, rs_left;
72 unsigned int res; 117 unsigned int res;
73 int i, x, y; 118 int i, x, y;
74 int stalled = 0; 119 int stalled = 0;
75 120
76 drbd_get_syncer_progress(device, &rs_left, &res); 121 drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
77 122
78 x = res/50; 123 x = res/50;
79 y = 20-x; 124 y = 20-x;
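For reference, the per-mille arithmetic added in drbd_get_syncer_progress() above can be checked in isolation. The stand-alone sketch below (plain user-space C; all names are local to the example and not part of the patch) mirrors the shift-then-divide approach and the clamp used for racy snapshots of rs_total:

#include <limits.h>
#include <stdio.h>

/* Model of the per-mille calculation: shift both operands down first
 * (by 16 for huge bitmaps, by 10 otherwise) so that left * 1000 still
 * fits comfortably in an unsigned long, even on a 32bit arch. */
static unsigned int per_mil_done(unsigned long bits_left, unsigned long rs_total)
{
	unsigned int shift;
	unsigned long left, total;

	if (bits_left > rs_total)	/* racy snapshot: clamp like the driver does */
		return rs_total ? 0 : 1000;

	shift = rs_total > UINT_MAX ? 16 : 10;
	left = bits_left >> shift;
	total = 1UL + (rs_total >> shift);
	return 1000UL - left * 1000UL / total;
}

int main(void)
{
	/* 1 TiB at 4 KiB per bit is 1 << 28 bits; one third still out of sync */
	unsigned long total = 1UL << 28;
	unsigned long left = total / 3;

	printf("%u per mille done\n", per_mil_done(left, total));	/* prints 667 */
	return 0;
}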
@@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
85 seq_printf(seq, "."); 130 seq_printf(seq, ".");
86 seq_printf(seq, "] "); 131 seq_printf(seq, "] ");
87 132
88 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) 133 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
89 seq_printf(seq, "verified:"); 134 seq_printf(seq, "verified:");
90 else 135 else
91 seq_printf(seq, "sync'ed:"); 136 seq_printf(seq, "sync'ed:");
92 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); 137 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
93 138
94 /* if more than a few GB, display in MB */ 139 /* if more than a few GB, display in MB */
95 if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) 140 if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
96 seq_printf(seq, "(%lu/%lu)M", 141 seq_printf(seq, "(%lu/%lu)M",
97 (unsigned long) Bit2KB(rs_left >> 10), 142 (unsigned long) Bit2KB(rs_left >> 10),
98 (unsigned long) Bit2KB(device->rs_total >> 10)); 143 (unsigned long) Bit2KB(rs_total >> 10));
99 else 144 else
100 seq_printf(seq, "(%lu/%lu)K\n\t", 145 seq_printf(seq, "(%lu/%lu)K\n\t",
101 (unsigned long) Bit2KB(rs_left), 146 (unsigned long) Bit2KB(rs_left),
102 (unsigned long) Bit2KB(device->rs_total)); 147 (unsigned long) Bit2KB(rs_total));
103 148
104 /* see drivers/md/md.c 149 /* see drivers/md/md.c
105 * We do not want to overflow, so the order of operands and 150 * We do not want to overflow, so the order of operands and
@@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
150 dt = (jiffies - device->rs_start - device->rs_paused) / HZ; 195 dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
151 if (dt == 0) 196 if (dt == 0)
152 dt = 1; 197 dt = 1;
153 db = device->rs_total - rs_left; 198 db = rs_total - rs_left;
154 dbdt = Bit2KB(db/dt); 199 dbdt = Bit2KB(db/dt);
155 seq_printf_with_thousands_grouping(seq, dbdt); 200 seq_printf_with_thousands_grouping(seq, dbdt);
156 seq_printf(seq, ")"); 201 seq_printf(seq, ")");
157 202
158 if (device->state.conn == C_SYNC_TARGET || 203 if (state.conn == C_SYNC_TARGET ||
159 device->state.conn == C_VERIFY_S) { 204 state.conn == C_VERIFY_S) {
160 seq_printf(seq, " want: "); 205 seq_printf(seq, " want: ");
161 seq_printf_with_thousands_grouping(seq, device->c_sync_rate); 206 seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
162 } 207 }
@@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
168 unsigned long bm_bits = drbd_bm_bits(device); 213 unsigned long bm_bits = drbd_bm_bits(device);
169 unsigned long bit_pos; 214 unsigned long bit_pos;
170 unsigned long long stop_sector = 0; 215 unsigned long long stop_sector = 0;
171 if (device->state.conn == C_VERIFY_S || 216 if (state.conn == C_VERIFY_S ||
172 device->state.conn == C_VERIFY_T) { 217 state.conn == C_VERIFY_T) {
173 bit_pos = bm_bits - device->ov_left; 218 bit_pos = bm_bits - device->ov_left;
174 if (verify_can_do_stop_sector(device)) 219 if (verify_can_do_stop_sector(device))
175 stop_sector = device->ov_stop_sector; 220 stop_sector = device->ov_stop_sector;
@@ -188,22 +233,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
188 } 233 }
189} 234}
190 235
191static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
192{
193 struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
194
195 seq_printf(seq, "%5d %s %s\n", bme->rs_left,
196 bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
197 bme->flags & BME_LOCKED ? "LOCKED" : "------"
198 );
199}
200
201static int drbd_seq_show(struct seq_file *seq, void *v) 236static int drbd_seq_show(struct seq_file *seq, void *v)
202{ 237{
203 int i, prev_i = -1; 238 int i, prev_i = -1;
204 const char *sn; 239 const char *sn;
205 struct drbd_device *device; 240 struct drbd_device *device;
206 struct net_conf *nc; 241 struct net_conf *nc;
242 union drbd_dev_state state;
207 char wp; 243 char wp;
208 244
209 static char write_ordering_chars[] = { 245 static char write_ordering_chars[] = {
@@ -241,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
241 seq_printf(seq, "\n"); 277 seq_printf(seq, "\n");
242 prev_i = i; 278 prev_i = i;
243 279
244 sn = drbd_conn_str(device->state.conn); 280 state = device->state;
281 sn = drbd_conn_str(state.conn);
245 282
246 if (device->state.conn == C_STANDALONE && 283 if (state.conn == C_STANDALONE &&
247 device->state.disk == D_DISKLESS && 284 state.disk == D_DISKLESS &&
248 device->state.role == R_SECONDARY) { 285 state.role == R_SECONDARY) {
249 seq_printf(seq, "%2d: cs:Unconfigured\n", i); 286 seq_printf(seq, "%2d: cs:Unconfigured\n", i);
250 } else { 287 } else {
251 /* reset device->congestion_reason */ 288 /* reset device->congestion_reason */
@@ -258,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
258 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " 295 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
259 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", 296 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
260 i, sn, 297 i, sn,
261 drbd_role_str(device->state.role), 298 drbd_role_str(state.role),
262 drbd_role_str(device->state.peer), 299 drbd_role_str(state.peer),
263 drbd_disk_str(device->state.disk), 300 drbd_disk_str(state.disk),
264 drbd_disk_str(device->state.pdsk), 301 drbd_disk_str(state.pdsk),
265 wp, 302 wp,
266 drbd_suspended(device) ? 's' : 'r', 303 drbd_suspended(device) ? 's' : 'r',
267 device->state.aftr_isp ? 'a' : '-', 304 state.aftr_isp ? 'a' : '-',
268 device->state.peer_isp ? 'p' : '-', 305 state.peer_isp ? 'p' : '-',
269 device->state.user_isp ? 'u' : '-', 306 state.user_isp ? 'u' : '-',
270 device->congestion_reason ?: '-', 307 device->congestion_reason ?: '-',
271 test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-', 308 test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
272 device->send_cnt/2, 309 device->send_cnt/2,
@@ -281,17 +318,17 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
281 atomic_read(&device->unacked_cnt), 318 atomic_read(&device->unacked_cnt),
282 atomic_read(&device->ap_bio_cnt), 319 atomic_read(&device->ap_bio_cnt),
283 first_peer_device(device)->connection->epochs, 320 first_peer_device(device)->connection->epochs,
284 write_ordering_chars[first_peer_device(device)->connection->write_ordering] 321 write_ordering_chars[device->resource->write_ordering]
285 ); 322 );
286 seq_printf(seq, " oos:%llu\n", 323 seq_printf(seq, " oos:%llu\n",
287 Bit2KB((unsigned long long) 324 Bit2KB((unsigned long long)
288 drbd_bm_total_weight(device))); 325 drbd_bm_total_weight(device)));
289 } 326 }
290 if (device->state.conn == C_SYNC_SOURCE || 327 if (state.conn == C_SYNC_SOURCE ||
291 device->state.conn == C_SYNC_TARGET || 328 state.conn == C_SYNC_TARGET ||
292 device->state.conn == C_VERIFY_S || 329 state.conn == C_VERIFY_S ||
293 device->state.conn == C_VERIFY_T) 330 state.conn == C_VERIFY_T)
294 drbd_syncer_progress(device, seq); 331 drbd_syncer_progress(device, seq, state);
295 332
296 if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { 333 if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
297 lc_seq_printf_stats(seq, device->resync); 334 lc_seq_printf_stats(seq, device->resync);
@@ -299,12 +336,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
299 put_ldev(device); 336 put_ldev(device);
300 } 337 }
301 338
302 if (proc_details >= 2) { 339 if (proc_details >= 2)
303 if (device->resync) { 340 seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
304 lc_seq_dump_details(seq, device->resync, "rs_left",
305 resync_dump_detail);
306 }
307 }
308 } 341 }
309 rcu_read_unlock(); 342 rcu_read_unlock();
310 343
@@ -316,7 +349,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file)
316 int err; 349 int err;
317 350
318 if (try_module_get(THIS_MODULE)) { 351 if (try_module_get(THIS_MODULE)) {
319 err = single_open(file, drbd_seq_show, PDE_DATA(inode)); 352 err = single_open(file, drbd_seq_show, NULL);
320 if (err) 353 if (err)
321 module_put(THIS_MODULE); 354 module_put(THIS_MODULE);
322 return err; 355 return err;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5b17ec88ea05..9342b8da73ab 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
362 goto fail; 362 goto fail;
363 } 363 }
364 364
365 memset(peer_req, 0, sizeof(*peer_req));
366 INIT_LIST_HEAD(&peer_req->w.list);
365 drbd_clear_interval(&peer_req->i); 367 drbd_clear_interval(&peer_req->i);
366 peer_req->i.size = data_size; 368 peer_req->i.size = data_size;
367 peer_req->i.sector = sector; 369 peer_req->i.sector = sector;
368 peer_req->i.local = false; 370 peer_req->submit_jif = jiffies;
369 peer_req->i.waiting = false;
370
371 peer_req->epoch = NULL;
372 peer_req->peer_device = peer_device; 371 peer_req->peer_device = peer_device;
373 peer_req->pages = page; 372 peer_req->pages = page;
374 atomic_set(&peer_req->pending_bios, 0);
375 peer_req->flags = 0;
376 /* 373 /*
377 * The block_id is opaque to the receiver. It is not endianness 374 * The block_id is opaque to the receiver. It is not endianness
378 * converted, and sent back to the sender unchanged. 375 * converted, and sent back to the sender unchanged.
@@ -389,11 +386,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
389void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, 386void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
390 int is_net) 387 int is_net)
391{ 388{
389 might_sleep();
392 if (peer_req->flags & EE_HAS_DIGEST) 390 if (peer_req->flags & EE_HAS_DIGEST)
393 kfree(peer_req->digest); 391 kfree(peer_req->digest);
394 drbd_free_pages(device, peer_req->pages, is_net); 392 drbd_free_pages(device, peer_req->pages, is_net);
395 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); 393 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
396 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 394 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
395 if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
396 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
397 drbd_al_complete_io(device, &peer_req->i);
398 }
397 mempool_free(peer_req, drbd_ee_mempool); 399 mempool_free(peer_req, drbd_ee_mempool);
398} 400}
399 401
@@ -791,8 +793,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
791{ 793{
792 unsigned int header_size = drbd_header_size(connection); 794 unsigned int header_size = drbd_header_size(connection);
793 struct packet_info pi; 795 struct packet_info pi;
796 struct net_conf *nc;
794 int err; 797 int err;
795 798
799 rcu_read_lock();
800 nc = rcu_dereference(connection->net_conf);
801 if (!nc) {
802 rcu_read_unlock();
803 return -EIO;
804 }
805 sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
806 rcu_read_unlock();
807
796 err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); 808 err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
797 if (err != header_size) { 809 if (err != header_size) {
798 if (err >= 0) 810 if (err >= 0)
@@ -809,7 +821,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke
809 * drbd_socket_okay() - Free the socket if its connection is not okay 821 * drbd_socket_okay() - Free the socket if its connection is not okay
810 * @sock: pointer to the pointer to the socket. 822 * @sock: pointer to the pointer to the socket.
811 */ 823 */
812static int drbd_socket_okay(struct socket **sock) 824static bool drbd_socket_okay(struct socket **sock)
813{ 825{
814 int rr; 826 int rr;
815 char tb[4]; 827 char tb[4];
@@ -827,6 +839,30 @@ static int drbd_socket_okay(struct socket **sock)
827 return false; 839 return false;
828 } 840 }
829} 841}
842
843static bool connection_established(struct drbd_connection *connection,
844 struct socket **sock1,
845 struct socket **sock2)
846{
847 struct net_conf *nc;
848 int timeout;
849 bool ok;
850
851 if (!*sock1 || !*sock2)
852 return false;
853
854 rcu_read_lock();
855 nc = rcu_dereference(connection->net_conf);
856 timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
857 rcu_read_unlock();
858 schedule_timeout_interruptible(timeout);
859
860 ok = drbd_socket_okay(sock1);
861 ok = drbd_socket_okay(sock2) && ok;
862
863 return ok;
864}
865
830/* Gets called if a connection is established, or if a new minor gets created 866/* Gets called if a connection is established, or if a new minor gets created
831 in a connection */ 867 in a connection */
832int drbd_connected(struct drbd_peer_device *peer_device) 868int drbd_connected(struct drbd_peer_device *peer_device)
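A detail worth noting in the new connection_established() helper above: the two drbd_socket_okay() probes are kept as separate statements rather than one short-circuiting expression, because (per its kernel-doc) the probe frees a socket whose connection is not okay and therefore must run for both sockets regardless of the first result. The sketch below models that idiom in plain user-space C (illustrative names only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Model of drbd_socket_okay(): probe the "socket" and free it when dead.
 * Because of this side effect it must be called for BOTH sockets, so the
 * caller combines the results afterwards instead of short-circuiting. */
static bool socket_okay(char **sock)
{
	if (!*sock)
		return false;
	if ((*sock)[0] == 'x') {	/* 'x' marks a dead peer in this toy model */
		free(*sock);
		*sock = NULL;
		return false;
	}
	return true;
}

int main(void)
{
	char *data = strdup("x dead");
	char *meta = strdup("x dead as well");
	bool ok;

	ok = socket_okay(&data);
	ok = socket_okay(&meta) && ok;	/* meta is probed (and freed) either way */

	printf("ok=%d data=%p meta=%p\n", ok, (void *)data, (void *)meta);
	free(data);
	free(meta);
	return 0;
}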
@@ -868,8 +904,8 @@ static int conn_connect(struct drbd_connection *connection)
868 struct drbd_socket sock, msock; 904 struct drbd_socket sock, msock;
869 struct drbd_peer_device *peer_device; 905 struct drbd_peer_device *peer_device;
870 struct net_conf *nc; 906 struct net_conf *nc;
871 int vnr, timeout, h, ok; 907 int vnr, timeout, h;
872 bool discard_my_data; 908 bool discard_my_data, ok;
873 enum drbd_state_rv rv; 909 enum drbd_state_rv rv;
874 struct accept_wait_data ad = { 910 struct accept_wait_data ad = {
875 .connection = connection, 911 .connection = connection,
@@ -913,17 +949,8 @@ static int conn_connect(struct drbd_connection *connection)
913 } 949 }
914 } 950 }
915 951
916 if (sock.socket && msock.socket) { 952 if (connection_established(connection, &sock.socket, &msock.socket))
917 rcu_read_lock(); 953 break;
918 nc = rcu_dereference(connection->net_conf);
919 timeout = nc->ping_timeo * HZ / 10;
920 rcu_read_unlock();
921 schedule_timeout_interruptible(timeout);
922 ok = drbd_socket_okay(&sock.socket);
923 ok = drbd_socket_okay(&msock.socket) && ok;
924 if (ok)
925 break;
926 }
927 954
928retry: 955retry:
929 s = drbd_wait_for_connect(connection, &ad); 956 s = drbd_wait_for_connect(connection, &ad);
@@ -969,8 +996,7 @@ randomize:
969 goto out_release_sockets; 996 goto out_release_sockets;
970 } 997 }
971 998
972 ok = drbd_socket_okay(&sock.socket); 999 ok = connection_established(connection, &sock.socket, &msock.socket);
973 ok = drbd_socket_okay(&msock.socket) && ok;
974 } while (!ok); 1000 } while (!ok);
975 1001
976 if (ad.s_listen) 1002 if (ad.s_listen)
@@ -1151,7 +1177,7 @@ static void drbd_flush(struct drbd_connection *connection)
1151 struct drbd_peer_device *peer_device; 1177 struct drbd_peer_device *peer_device;
1152 int vnr; 1178 int vnr;
1153 1179
1154 if (connection->write_ordering >= WO_bdev_flush) { 1180 if (connection->resource->write_ordering >= WO_bdev_flush) {
1155 rcu_read_lock(); 1181 rcu_read_lock();
1156 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1182 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1157 struct drbd_device *device = peer_device->device; 1183 struct drbd_device *device = peer_device->device;
@@ -1161,14 +1187,22 @@ static void drbd_flush(struct drbd_connection *connection)
1161 kref_get(&device->kref); 1187 kref_get(&device->kref);
1162 rcu_read_unlock(); 1188 rcu_read_unlock();
1163 1189
1190 /* Right now, we have only this one synchronous code path
1191 * for flushes between request epochs.
1192 * We may want to make those asynchronous,
1193 * or at least parallelize the flushes to the volume devices.
1194 */
1195 device->flush_jif = jiffies;
1196 set_bit(FLUSH_PENDING, &device->flags);
1164 rv = blkdev_issue_flush(device->ldev->backing_bdev, 1197 rv = blkdev_issue_flush(device->ldev->backing_bdev,
1165 GFP_NOIO, NULL); 1198 GFP_NOIO, NULL);
1199 clear_bit(FLUSH_PENDING, &device->flags);
1166 if (rv) { 1200 if (rv) {
1167 drbd_info(device, "local disk flush failed with status %d\n", rv); 1201 drbd_info(device, "local disk flush failed with status %d\n", rv);
1168 /* would rather check on EOPNOTSUPP, but that is not reliable. 1202 /* would rather check on EOPNOTSUPP, but that is not reliable.
1169 * don't try again for ANY return value != 0 1203 * don't try again for ANY return value != 0
1170 * if (rv == -EOPNOTSUPP) */ 1204 * if (rv == -EOPNOTSUPP) */
1171 drbd_bump_write_ordering(connection, WO_drain_io); 1205 drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
1172 } 1206 }
1173 put_ldev(device); 1207 put_ldev(device);
1174 kref_put(&device->kref, drbd_destroy_device); 1208 kref_put(&device->kref, drbd_destroy_device);
@@ -1257,15 +1291,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
1257 return rv; 1291 return rv;
1258} 1292}
1259 1293
1294static enum write_ordering_e
1295max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1296{
1297 struct disk_conf *dc;
1298
1299 dc = rcu_dereference(bdev->disk_conf);
1300
1301 if (wo == WO_bdev_flush && !dc->disk_flushes)
1302 wo = WO_drain_io;
1303 if (wo == WO_drain_io && !dc->disk_drain)
1304 wo = WO_none;
1305
1306 return wo;
1307}
1308
1260/** 1309/**
1261 * drbd_bump_write_ordering() - Fall back to another write ordering method 1310 * drbd_bump_write_ordering() - Fall back to another write ordering method
1262 * @connection: DRBD connection. 1311 * @connection: DRBD connection.
1263 * @wo: Write ordering method to try. 1312 * @wo: Write ordering method to try.
1264 */ 1313 */
1265void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo) 1314void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1315 enum write_ordering_e wo)
1266{ 1316{
1267 struct disk_conf *dc; 1317 struct drbd_device *device;
1268 struct drbd_peer_device *peer_device;
1269 enum write_ordering_e pwo; 1318 enum write_ordering_e pwo;
1270 int vnr; 1319 int vnr;
1271 static char *write_ordering_str[] = { 1320 static char *write_ordering_str[] = {
@@ -1274,26 +1323,27 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord
1274 [WO_bdev_flush] = "flush", 1323 [WO_bdev_flush] = "flush",
1275 }; 1324 };
1276 1325
1277 pwo = connection->write_ordering; 1326 pwo = resource->write_ordering;
1278 wo = min(pwo, wo); 1327 if (wo != WO_bdev_flush)
1328 wo = min(pwo, wo);
1279 rcu_read_lock(); 1329 rcu_read_lock();
1280 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1330 idr_for_each_entry(&resource->devices, device, vnr) {
1281 struct drbd_device *device = peer_device->device; 1331 if (get_ldev(device)) {
1332 wo = max_allowed_wo(device->ldev, wo);
1333 if (device->ldev == bdev)
1334 bdev = NULL;
1335 put_ldev(device);
1336 }
1337 }
1282 1338
1283 if (!get_ldev_if_state(device, D_ATTACHING)) 1339 if (bdev)
1284 continue; 1340 wo = max_allowed_wo(bdev, wo);
1285 dc = rcu_dereference(device->ldev->disk_conf);
1286 1341
1287 if (wo == WO_bdev_flush && !dc->disk_flushes)
1288 wo = WO_drain_io;
1289 if (wo == WO_drain_io && !dc->disk_drain)
1290 wo = WO_none;
1291 put_ldev(device);
1292 }
1293 rcu_read_unlock(); 1342 rcu_read_unlock();
1294 connection->write_ordering = wo; 1343
1295 if (pwo != connection->write_ordering || wo == WO_bdev_flush) 1344 resource->write_ordering = wo;
1296 drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]); 1345 if (pwo != resource->write_ordering || wo == WO_bdev_flush)
1346 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1297} 1347}
1298 1348
1299/** 1349/**
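The reworked drbd_bump_write_ordering()/max_allowed_wo() pair above effectively computes the strictest write-ordering method that every attached backing device still permits, now walking all devices of the resource instead of one connection. The per-device fallback can be modelled stand-alone as below (plain C, illustrative names only; the real function additionally caps the result at the previously chosen method unless an explicit re-upgrade to flush is requested):

#include <stdbool.h>
#include <stdio.h>

enum write_ordering_e { WO_none, WO_drain_io, WO_bdev_flush };

struct disk_conf { bool disk_flushes; bool disk_drain; };

/* Mirror of max_allowed_wo(): degrade the requested method to whatever
 * this particular backing device is configured to support. */
static enum write_ordering_e
max_allowed_wo(const struct disk_conf *dc, enum write_ordering_e wo)
{
	if (wo == WO_bdev_flush && !dc->disk_flushes)
		wo = WO_drain_io;
	if (wo == WO_drain_io && !dc->disk_drain)
		wo = WO_none;
	return wo;
}

int main(void)
{
	static const char *name[] = { "none", "drain", "flush" };
	/* two volumes in one resource: one allows flushes, one only draining */
	struct disk_conf vols[] = { { true, true }, { false, true } };
	enum write_ordering_e wo = WO_bdev_flush;
	unsigned int i;

	for (i = 0; i < sizeof(vols) / sizeof(vols[0]); i++)
		wo = max_allowed_wo(&vols[i], wo);

	printf("resource-wide write ordering: %s\n", name[wo]);	/* prints "drain" */
	return 0;
}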
@@ -1330,6 +1380,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
1330 /* wait for all pending IO completions, before we start 1380 /* wait for all pending IO completions, before we start
1331 * zeroing things out. */ 1381 * zeroing things out. */
1332 conn_wait_active_ee_empty(first_peer_device(device)->connection); 1382 conn_wait_active_ee_empty(first_peer_device(device)->connection);
1383 /* add it to the active list now,
1384 * so we can find it to present it in debugfs */
1385 peer_req->submit_jif = jiffies;
1386 peer_req->flags |= EE_SUBMITTED;
1387 spin_lock_irq(&device->resource->req_lock);
1388 list_add_tail(&peer_req->w.list, &device->active_ee);
1389 spin_unlock_irq(&device->resource->req_lock);
1333 if (blkdev_issue_zeroout(device->ldev->backing_bdev, 1390 if (blkdev_issue_zeroout(device->ldev->backing_bdev,
1334 sector, ds >> 9, GFP_NOIO)) 1391 sector, ds >> 9, GFP_NOIO))
1335 peer_req->flags |= EE_WAS_ERROR; 1392 peer_req->flags |= EE_WAS_ERROR;
@@ -1398,6 +1455,9 @@ submit:
1398 D_ASSERT(device, page == NULL); 1455 D_ASSERT(device, page == NULL);
1399 1456
1400 atomic_set(&peer_req->pending_bios, n_bios); 1457 atomic_set(&peer_req->pending_bios, n_bios);
1458 /* for debugfs: update timestamp, mark as submitted */
1459 peer_req->submit_jif = jiffies;
1460 peer_req->flags |= EE_SUBMITTED;
1401 do { 1461 do {
1402 bio = bios; 1462 bio = bios;
1403 bios = bios->bi_next; 1463 bios = bios->bi_next;
@@ -1471,7 +1531,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1471 * R_PRIMARY crashes now. 1531 * R_PRIMARY crashes now.
1472 * Therefore we must send the barrier_ack after the barrier request was 1532 * Therefore we must send the barrier_ack after the barrier request was
1473 * completed. */ 1533 * completed. */
1474 switch (connection->write_ordering) { 1534 switch (connection->resource->write_ordering) {
1475 case WO_none: 1535 case WO_none:
1476 if (rv == FE_RECYCLED) 1536 if (rv == FE_RECYCLED)
1477 return 0; 1537 return 0;
@@ -1498,7 +1558,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1498 1558
1499 return 0; 1559 return 0;
1500 default: 1560 default:
1501 drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering); 1561 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1562 connection->resource->write_ordering);
1502 return -EIO; 1563 return -EIO;
1503 } 1564 }
1504 1565
@@ -1531,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1531 struct drbd_peer_request *peer_req; 1592 struct drbd_peer_request *peer_req;
1532 struct page *page; 1593 struct page *page;
1533 int dgs, ds, err; 1594 int dgs, ds, err;
1534 int data_size = pi->size; 1595 unsigned int data_size = pi->size;
1535 void *dig_in = peer_device->connection->int_dig_in; 1596 void *dig_in = peer_device->connection->int_dig_in;
1536 void *dig_vv = peer_device->connection->int_dig_vv; 1597 void *dig_vv = peer_device->connection->int_dig_vv;
1537 unsigned long *data; 1598 unsigned long *data;
@@ -1578,6 +1639,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1578 if (!peer_req) 1639 if (!peer_req)
1579 return NULL; 1640 return NULL;
1580 1641
1642 peer_req->flags |= EE_WRITE;
1581 if (trim) 1643 if (trim)
1582 return peer_req; 1644 return peer_req;
1583 1645
@@ -1734,9 +1796,10 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
1734 * respective _drbd_clear_done_ee */ 1796 * respective _drbd_clear_done_ee */
1735 1797
1736 peer_req->w.cb = e_end_resync_block; 1798 peer_req->w.cb = e_end_resync_block;
1799 peer_req->submit_jif = jiffies;
1737 1800
1738 spin_lock_irq(&device->resource->req_lock); 1801 spin_lock_irq(&device->resource->req_lock);
1739 list_add(&peer_req->w.list, &device->sync_ee); 1802 list_add_tail(&peer_req->w.list, &device->sync_ee);
1740 spin_unlock_irq(&device->resource->req_lock); 1803 spin_unlock_irq(&device->resource->req_lock);
1741 1804
1742 atomic_add(pi->size >> 9, &device->rs_sect_ev); 1805 atomic_add(pi->size >> 9, &device->rs_sect_ev);
@@ -1889,6 +1952,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
1889 } 1952 }
1890 dec_unacked(device); 1953 dec_unacked(device);
1891 } 1954 }
1955
1892 /* we delete from the conflict detection hash _after_ we sent out the 1956 /* we delete from the conflict detection hash _after_ we sent out the
1893 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ 1957 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1894 if (peer_req->flags & EE_IN_INTERVAL_TREE) { 1958 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
@@ -2115,6 +2179,8 @@ static int handle_write_conflicts(struct drbd_device *device,
2115 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2179 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2116 if (i == &peer_req->i) 2180 if (i == &peer_req->i)
2117 continue; 2181 continue;
2182 if (i->completed)
2183 continue;
2118 2184
2119 if (!i->local) { 2185 if (!i->local) {
2120 /* 2186 /*
@@ -2147,7 +2213,6 @@ static int handle_write_conflicts(struct drbd_device *device,
2147 (unsigned long long)sector, size, 2213 (unsigned long long)sector, size,
2148 superseded ? "local" : "remote"); 2214 superseded ? "local" : "remote");
2149 2215
2150 inc_unacked(device);
2151 peer_req->w.cb = superseded ? e_send_superseded : 2216 peer_req->w.cb = superseded ? e_send_superseded :
2152 e_send_retry_write; 2217 e_send_retry_write;
2153 list_add_tail(&peer_req->w.list, &device->done_ee); 2218 list_add_tail(&peer_req->w.list, &device->done_ee);
@@ -2206,6 +2271,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2206{ 2271{
2207 struct drbd_peer_device *peer_device; 2272 struct drbd_peer_device *peer_device;
2208 struct drbd_device *device; 2273 struct drbd_device *device;
2274 struct net_conf *nc;
2209 sector_t sector; 2275 sector_t sector;
2210 struct drbd_peer_request *peer_req; 2276 struct drbd_peer_request *peer_req;
2211 struct p_data *p = pi->data; 2277 struct p_data *p = pi->data;
@@ -2245,6 +2311,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2245 } 2311 }
2246 2312
2247 peer_req->w.cb = e_end_block; 2313 peer_req->w.cb = e_end_block;
2314 peer_req->submit_jif = jiffies;
2315 peer_req->flags |= EE_APPLICATION;
2248 2316
2249 dp_flags = be32_to_cpu(p->dp_flags); 2317 dp_flags = be32_to_cpu(p->dp_flags);
2250 rw |= wire_flags_to_bio(dp_flags); 2318 rw |= wire_flags_to_bio(dp_flags);
@@ -2271,9 +2339,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2271 spin_unlock(&connection->epoch_lock); 2339 spin_unlock(&connection->epoch_lock);
2272 2340
2273 rcu_read_lock(); 2341 rcu_read_lock();
2274 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; 2342 nc = rcu_dereference(peer_device->connection->net_conf);
2343 tp = nc->two_primaries;
2344 if (peer_device->connection->agreed_pro_version < 100) {
2345 switch (nc->wire_protocol) {
2346 case DRBD_PROT_C:
2347 dp_flags |= DP_SEND_WRITE_ACK;
2348 break;
2349 case DRBD_PROT_B:
2350 dp_flags |= DP_SEND_RECEIVE_ACK;
2351 break;
2352 }
2353 }
2275 rcu_read_unlock(); 2354 rcu_read_unlock();
2355
2356 if (dp_flags & DP_SEND_WRITE_ACK) {
2357 peer_req->flags |= EE_SEND_WRITE_ACK;
2358 inc_unacked(device);
2359 /* corresponding dec_unacked() in e_end_block()
2360 * respective _drbd_clear_done_ee */
2361 }
2362
2363 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2364 /* I really don't like it that the receiver thread
2365 * sends on the msock, but anyways */
2366 drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
2367 }
2368
2276 if (tp) { 2369 if (tp) {
2370 /* two primaries implies protocol C */
2371 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
2277 peer_req->flags |= EE_IN_INTERVAL_TREE; 2372 peer_req->flags |= EE_IN_INTERVAL_TREE;
2278 err = wait_for_and_update_peer_seq(peer_device, peer_seq); 2373 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2279 if (err) 2374 if (err)
@@ -2297,44 +2392,18 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2297 * active_ee to become empty in drbd_submit_peer_request(); 2392 * active_ee to become empty in drbd_submit_peer_request();
2298 * better not add ourselves here. */ 2393 * better not add ourselves here. */
2299 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) 2394 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
2300 list_add(&peer_req->w.list, &device->active_ee); 2395 list_add_tail(&peer_req->w.list, &device->active_ee);
2301 spin_unlock_irq(&device->resource->req_lock); 2396 spin_unlock_irq(&device->resource->req_lock);
2302 2397
2303 if (device->state.conn == C_SYNC_TARGET) 2398 if (device->state.conn == C_SYNC_TARGET)
2304 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); 2399 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2305 2400
2306 if (peer_device->connection->agreed_pro_version < 100) {
2307 rcu_read_lock();
2308 switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) {
2309 case DRBD_PROT_C:
2310 dp_flags |= DP_SEND_WRITE_ACK;
2311 break;
2312 case DRBD_PROT_B:
2313 dp_flags |= DP_SEND_RECEIVE_ACK;
2314 break;
2315 }
2316 rcu_read_unlock();
2317 }
2318
2319 if (dp_flags & DP_SEND_WRITE_ACK) {
2320 peer_req->flags |= EE_SEND_WRITE_ACK;
2321 inc_unacked(device);
2322 /* corresponding dec_unacked() in e_end_block()
2323 * respective _drbd_clear_done_ee */
2324 }
2325
2326 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2327 /* I really don't like it that the receiver thread
2328 * sends on the msock, but anyways */
2329 drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
2330 }
2331
2332 if (device->state.pdsk < D_INCONSISTENT) { 2401 if (device->state.pdsk < D_INCONSISTENT) {
2333 /* In case we have the only disk of the cluster, */ 2402 /* In case we have the only disk of the cluster, */
2334 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); 2403 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2335 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2336 peer_req->flags &= ~EE_MAY_SET_IN_SYNC; 2404 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2337 drbd_al_begin_io(device, &peer_req->i, true); 2405 drbd_al_begin_io(device, &peer_req->i);
2406 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2338 } 2407 }
2339 2408
2340 err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); 2409 err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2347,8 +2416,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2347 list_del(&peer_req->w.list); 2416 list_del(&peer_req->w.list);
2348 drbd_remove_epoch_entry_interval(device, peer_req); 2417 drbd_remove_epoch_entry_interval(device, peer_req);
2349 spin_unlock_irq(&device->resource->req_lock); 2418 spin_unlock_irq(&device->resource->req_lock);
2350 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) 2419 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2420 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
2351 drbd_al_complete_io(device, &peer_req->i); 2421 drbd_al_complete_io(device, &peer_req->i);
2422 }
2352 2423
2353out_interrupted: 2424out_interrupted:
2354 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); 2425 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
@@ -2368,13 +2439,14 @@ out_interrupted:
2368 * The current sync rate used here uses only the most recent two step marks, 2439 * The current sync rate used here uses only the most recent two step marks,
2369 * to have a short time average so we can react faster. 2440 * to have a short time average so we can react faster.
2370 */ 2441 */
2371bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) 2442bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2443 bool throttle_if_app_is_waiting)
2372{ 2444{
2373 struct lc_element *tmp; 2445 struct lc_element *tmp;
2374 bool throttle = true; 2446 bool throttle = drbd_rs_c_min_rate_throttle(device);
2375 2447
2376 if (!drbd_rs_c_min_rate_throttle(device)) 2448 if (!throttle || throttle_if_app_is_waiting)
2377 return false; 2449 return throttle;
2378 2450
2379 spin_lock_irq(&device->al_lock); 2451 spin_lock_irq(&device->al_lock);
2380 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); 2452 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
@@ -2382,7 +2454,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
2382 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2454 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2383 if (test_bit(BME_PRIORITY, &bm_ext->flags)) 2455 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2384 throttle = false; 2456 throttle = false;
2385 /* Do not slow down if app IO is already waiting for this extent */ 2457 /* Do not slow down if app IO is already waiting for this extent,
2458 * and our progress is necessary for application IO to complete. */
2386 } 2459 }
2387 spin_unlock_irq(&device->al_lock); 2460 spin_unlock_irq(&device->al_lock);
2388 2461
@@ -2407,7 +2480,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2407 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2480 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2408 (int)part_stat_read(&disk->part0, sectors[1]) - 2481 (int)part_stat_read(&disk->part0, sectors[1]) -
2409 atomic_read(&device->rs_sect_ev); 2482 atomic_read(&device->rs_sect_ev);
2410 if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { 2483
2484 if (atomic_read(&device->ap_actlog_cnt)
2485 || !device->rs_last_events || curr_events - device->rs_last_events > 64) {
2411 unsigned long rs_left; 2486 unsigned long rs_left;
2412 int i; 2487 int i;
2413 2488
@@ -2508,6 +2583,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2508 peer_req->w.cb = w_e_end_data_req; 2583 peer_req->w.cb = w_e_end_data_req;
2509 fault_type = DRBD_FAULT_DT_RD; 2584 fault_type = DRBD_FAULT_DT_RD;
2510 /* application IO, don't drbd_rs_begin_io */ 2585 /* application IO, don't drbd_rs_begin_io */
2586 peer_req->flags |= EE_APPLICATION;
2511 goto submit; 2587 goto submit;
2512 2588
2513 case P_RS_DATA_REQUEST: 2589 case P_RS_DATA_REQUEST:
@@ -2538,6 +2614,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2538 peer_req->w.cb = w_e_end_csum_rs_req; 2614 peer_req->w.cb = w_e_end_csum_rs_req;
2539 /* used in the sector offset progress display */ 2615 /* used in the sector offset progress display */
2540 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2616 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2617 /* remember to report stats in drbd_resync_finished */
2618 device->use_csums = true;
2541 } else if (pi->cmd == P_OV_REPLY) { 2619 } else if (pi->cmd == P_OV_REPLY) {
2542 /* track progress, we may need to throttle */ 2620 /* track progress, we may need to throttle */
2543 atomic_add(size >> 9, &device->rs_sect_in); 2621 atomic_add(size >> 9, &device->rs_sect_in);
@@ -2595,8 +2673,20 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2595 * we would also throttle its application reads. 2673 * we would also throttle its application reads.
2596 * In that case, throttling is done on the SyncTarget only. 2674 * In that case, throttling is done on the SyncTarget only.
2597 */ 2675 */
2598 if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector)) 2676
2677 /* Even though this may be a resync request, we do add to "read_ee";
2678 * "sync_ee" is only used for resync WRITEs.
2679 * Add to list early, so debugfs can find this request
2680 * even if we have to sleep below. */
2681 spin_lock_irq(&device->resource->req_lock);
2682 list_add_tail(&peer_req->w.list, &device->read_ee);
2683 spin_unlock_irq(&device->resource->req_lock);
2684
2685 update_receiver_timing_details(connection, drbd_rs_should_slow_down);
2686 if (device->state.peer != R_PRIMARY
2687 && drbd_rs_should_slow_down(device, sector, false))
2599 schedule_timeout_uninterruptible(HZ/10); 2688 schedule_timeout_uninterruptible(HZ/10);
2689 update_receiver_timing_details(connection, drbd_rs_begin_io);
2600 if (drbd_rs_begin_io(device, sector)) 2690 if (drbd_rs_begin_io(device, sector))
2601 goto out_free_e; 2691 goto out_free_e;
2602 2692
@@ -2604,22 +2694,20 @@ submit_for_resync:
2604 atomic_add(size >> 9, &device->rs_sect_ev); 2694 atomic_add(size >> 9, &device->rs_sect_ev);
2605 2695
2606submit: 2696submit:
2697 update_receiver_timing_details(connection, drbd_submit_peer_request);
2607 inc_unacked(device); 2698 inc_unacked(device);
2608 spin_lock_irq(&device->resource->req_lock);
2609 list_add_tail(&peer_req->w.list, &device->read_ee);
2610 spin_unlock_irq(&device->resource->req_lock);
2611
2612 if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) 2699 if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
2613 return 0; 2700 return 0;
2614 2701
2615 /* don't care for the reason here */ 2702 /* don't care for the reason here */
2616 drbd_err(device, "submit failed, triggering re-connect\n"); 2703 drbd_err(device, "submit failed, triggering re-connect\n");
2704
2705out_free_e:
2617 spin_lock_irq(&device->resource->req_lock); 2706 spin_lock_irq(&device->resource->req_lock);
2618 list_del(&peer_req->w.list); 2707 list_del(&peer_req->w.list);
2619 spin_unlock_irq(&device->resource->req_lock); 2708 spin_unlock_irq(&device->resource->req_lock);
2620 /* no drbd_rs_complete_io(), we are dropping the connection anyways */ 2709 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2621 2710
2622out_free_e:
2623 put_ldev(device); 2711 put_ldev(device);
2624 drbd_free_peer_req(device, peer_req); 2712 drbd_free_peer_req(device, peer_req);
2625 return -EIO; 2713 return -EIO;
@@ -2842,8 +2930,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
2842-1091 requires proto 91 2930-1091 requires proto 91
2843-1096 requires proto 96 2931-1096 requires proto 96
2844 */ 2932 */
2845static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local) 2933static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
2846{ 2934{
2935 struct drbd_peer_device *const peer_device = first_peer_device(device);
2936 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
2847 u64 self, peer; 2937 u64 self, peer;
2848 int i, j; 2938 int i, j;
2849 2939
@@ -2869,7 +2959,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
2869 2959
2870 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { 2960 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2871 2961
2872 if (first_peer_device(device)->connection->agreed_pro_version < 91) 2962 if (connection->agreed_pro_version < 91)
2873 return -1091; 2963 return -1091;
2874 2964
2875 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && 2965 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2892,7 +2982,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
2892 2982
2893 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { 2983 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
2894 2984
2895 if (first_peer_device(device)->connection->agreed_pro_version < 91) 2985 if (connection->agreed_pro_version < 91)
2896 return -1091; 2986 return -1091;
2897 2987
2898 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && 2988 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2925,7 +3015,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
2925 case 1: /* self_pri && !peer_pri */ return 1; 3015 case 1: /* self_pri && !peer_pri */ return 1;
2926 case 2: /* !self_pri && peer_pri */ return -1; 3016 case 2: /* !self_pri && peer_pri */ return -1;
2927 case 3: /* self_pri && peer_pri */ 3017 case 3: /* self_pri && peer_pri */
2928 dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); 3018 dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2929 return dc ? -1 : 1; 3019 return dc ? -1 : 1;
2930 } 3020 }
2931 } 3021 }
@@ -2938,14 +3028,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
2938 *rule_nr = 51; 3028 *rule_nr = 51;
2939 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); 3029 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
2940 if (self == peer) { 3030 if (self == peer) {
2941 if (first_peer_device(device)->connection->agreed_pro_version < 96 ? 3031 if (connection->agreed_pro_version < 96 ?
2942 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == 3032 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2943 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : 3033 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2944 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { 3034 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
2945 /* The last P_SYNC_UUID did not get through. Undo the last start of 3035 /* The last P_SYNC_UUID did not get through. Undo the last start of
2946 resync as sync source modifications of the peer's UUIDs. */ 3036 resync as sync source modifications of the peer's UUIDs. */
2947 3037
2948 if (first_peer_device(device)->connection->agreed_pro_version < 91) 3038 if (connection->agreed_pro_version < 91)
2949 return -1091; 3039 return -1091;
2950 3040
2951 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; 3041 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
@@ -2975,14 +3065,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho
2975 *rule_nr = 71; 3065 *rule_nr = 71;
2976 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 3066 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2977 if (self == peer) { 3067 if (self == peer) {
2978 if (first_peer_device(device)->connection->agreed_pro_version < 96 ? 3068 if (connection->agreed_pro_version < 96 ?
2979 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == 3069 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2980 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : 3070 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2981 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { 3071 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
2982 /* The last P_SYNC_UUID did not get through. Undo the last start of 3072 /* The last P_SYNC_UUID did not get through. Undo the last start of
2983 resync as sync source modifications of our UUIDs. */ 3073 resync as sync source modifications of our UUIDs. */
2984 3074
2985 if (first_peer_device(device)->connection->agreed_pro_version < 91) 3075 if (connection->agreed_pro_version < 91)
2986 return -1091; 3076 return -1091;
2987 3077
2988 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); 3078 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
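The comparisons in rules 51 and 71 above always strip bit 0 of each UUID (& ~((u64)1)) before testing for equality, because that bit serves as a flag and must not influence the match. A minimal user-space sketch of that masked comparison; the helper name and sample values are made up for illustration, only the masking idea is taken from the code above:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* assumption: only bit 0 is a flag bit, as in the masking above */
#define UUID_FLAG_MASK ((uint64_t)1)

static bool uuid_equal_ignoring_flag(uint64_t a, uint64_t b)
{
        return (a & ~UUID_FLAG_MASK) == (b & ~UUID_FLAG_MASK);
}

int main(void)
{
        uint64_t peer_history = 0x1234567890abcdefULL;
        uint64_t self_bitmap  = peer_history | 1;   /* differs only in the flag bit */

        /* prints 1: the flag bit does not break the match */
        printf("match: %d\n", uuid_equal_ignoring_flag(self_bitmap, peer_history));
        return 0;
}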
@@ -3352,8 +3442,7 @@ disconnect:
3352 * return: NULL (alg name was "") 3442 * return: NULL (alg name was "")
3353 * ERR_PTR(error) if something goes wrong 3443 * ERR_PTR(error) if something goes wrong
3354 * or the crypto hash ptr, if it worked out ok. */ 3444 * or the crypto hash ptr, if it worked out ok. */
3355static 3445static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3356struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3357 const char *alg, const char *name) 3446 const char *alg, const char *name)
3358{ 3447{
3359 struct crypto_hash *tfm; 3448 struct crypto_hash *tfm;
@@ -3639,7 +3728,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3639 struct drbd_device *device; 3728 struct drbd_device *device;
3640 struct p_sizes *p = pi->data; 3729 struct p_sizes *p = pi->data;
3641 enum determine_dev_size dd = DS_UNCHANGED; 3730 enum determine_dev_size dd = DS_UNCHANGED;
3642 sector_t p_size, p_usize, my_usize; 3731 sector_t p_size, p_usize, p_csize, my_usize;
3643 int ldsc = 0; /* local disk size changed */ 3732 int ldsc = 0; /* local disk size changed */
3644 enum dds_flags ddsf; 3733 enum dds_flags ddsf;
3645 3734
@@ -3650,6 +3739,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3650 3739
3651 p_size = be64_to_cpu(p->d_size); 3740 p_size = be64_to_cpu(p->d_size);
3652 p_usize = be64_to_cpu(p->u_size); 3741 p_usize = be64_to_cpu(p->u_size);
3742 p_csize = be64_to_cpu(p->c_size);
3653 3743
3654 /* just store the peer's disk size for now. 3744 /* just store the peer's disk size for now.
3655 * we still need to figure out whether we accept that. */ 3745 * we still need to figure out whether we accept that. */
@@ -3710,7 +3800,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3710 } 3800 }
3711 3801
3712 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 3802 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3713 drbd_reconsider_max_bio_size(device);
3714 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). 3803 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
3715 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 3804 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3716 drbd_reconsider_max_bio_size(), we can be sure that after 3805 drbd_reconsider_max_bio_size(), we can be sure that after
@@ -3718,14 +3807,28 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3718 3807
3719 ddsf = be16_to_cpu(p->dds_flags); 3808 ddsf = be16_to_cpu(p->dds_flags);
3720 if (get_ldev(device)) { 3809 if (get_ldev(device)) {
3810 drbd_reconsider_max_bio_size(device, device->ldev);
3721 dd = drbd_determine_dev_size(device, ddsf, NULL); 3811 dd = drbd_determine_dev_size(device, ddsf, NULL);
3722 put_ldev(device); 3812 put_ldev(device);
3723 if (dd == DS_ERROR) 3813 if (dd == DS_ERROR)
3724 return -EIO; 3814 return -EIO;
3725 drbd_md_sync(device); 3815 drbd_md_sync(device);
3726 } else { 3816 } else {
3727 /* I am diskless, need to accept the peer's size. */ 3817 /*
3728 drbd_set_my_capacity(device, p_size); 3818 * I am diskless, need to accept the peer's *current* size.
3819 * I must NOT accept the peers backing disk size,
3820 * it may have been larger than mine all along...
3821 *
3822 * At this point, the peer knows more about my disk, or at
3823 * least about what we last agreed upon, than myself.
3824 * So if his c_size is less than his d_size, the most likely
3825 * reason is that *my* d_size was smaller last time we checked.
3826 *
3827 * However, if he sends a zero current size,
3828 * take his (user-capped or) backing disk size anyways.
3829 */
3830 drbd_reconsider_max_bio_size(device, NULL);
3831 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
3729 } 3832 }
3730 3833
3731 if (get_ldev(device)) { 3834 if (get_ldev(device)) {
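On a diskless node the capacity is now taken from the peer's current size first, then from the user-configured size, and only then from the peer's backing-disk size; the GNU `?:` operator used above returns its left operand whenever that operand is non-zero. A standalone sketch of the same fallback chain, with invented values and a simplified stand-in for sector_t:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t_model;   /* simplified stand-in for the kernel's sector_t */

/* p_csize ?: p_usize ?: p_size, written out: first non-zero of
 * current size, user-capped size, backing-disk size */
static sector_t_model pick_capacity(sector_t_model c, sector_t_model u, sector_t_model d)
{
        return c ? c : (u ? u : d);
}

int main(void)
{
        /* no agreed current size yet, but a user cap of 1000 sectors */
        printf("%llu\n", (unsigned long long)pick_capacity(0, 1000, 2048));   /* 1000 */
        /* a current size is known: that one wins */
        printf("%llu\n", (unsigned long long)pick_capacity(512, 1000, 2048)); /* 512 */
        return 0;
}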
@@ -4501,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection)
4501 struct data_cmd *cmd; 4604 struct data_cmd *cmd;
4502 4605
4503 drbd_thread_current_set_cpu(&connection->receiver); 4606 drbd_thread_current_set_cpu(&connection->receiver);
4607 update_receiver_timing_details(connection, drbd_recv_header);
4504 if (drbd_recv_header(connection, &pi)) 4608 if (drbd_recv_header(connection, &pi))
4505 goto err_out; 4609 goto err_out;
4506 4610
@@ -4519,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection)
4519 } 4623 }
4520 4624
4521 if (shs) { 4625 if (shs) {
4626 update_receiver_timing_details(connection, drbd_recv_all_warn);
4522 err = drbd_recv_all_warn(connection, pi.data, shs); 4627 err = drbd_recv_all_warn(connection, pi.data, shs);
4523 if (err) 4628 if (err)
4524 goto err_out; 4629 goto err_out;
4525 pi.size -= shs; 4630 pi.size -= shs;
4526 } 4631 }
4527 4632
4633 update_receiver_timing_details(connection, cmd->fn);
4528 err = cmd->fn(connection, &pi); 4634 err = cmd->fn(connection, &pi);
4529 if (err) { 4635 if (err) {
4530 drbd_err(connection, "error receiving %s, e: %d l: %d!\n", 4636 drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
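The three update_receiver_timing_details() calls added above record which step the receiver is about to enter before each potentially blocking operation, so a stalled receiver can later be identified (the debugfs code added elsewhere in this series reads that record). The real helper is not shown in this hunk; the sketch below only models the general idea, and every name and field in it is an assumption made for illustration:

#include <stdio.h>
#include <time.h>

struct timing_details_model {
        const char *about_to_do;   /* e.g. "drbd_recv_header" */
        time_t since;              /* the kernel code would use jiffies instead */
};

static void record_step(struct timing_details_model *t, const char *what)
{
        t->about_to_do = what;
        t->since = time(NULL);
}

int main(void)
{
        struct timing_details_model t;

        record_step(&t, "drbd_recv_header");
        /* ... a blocking receive would happen here ... */
        printf("receiver busy in %s\n", t.about_to_do);
        return 0;
}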
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 09803d0d5207..c67717d572d1 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
52static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) 52static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
53{ 53{
54 int rw = bio_data_dir(req->master_bio); 54 int rw = bio_data_dir(req->master_bio);
55 unsigned long duration = jiffies - req->start_time; 55 unsigned long duration = jiffies - req->start_jif;
56 int cpu; 56 int cpu;
57 cpu = part_stat_lock(); 57 cpu = part_stat_lock();
58 part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); 58 part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration);
@@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
66{ 66{
67 struct drbd_request *req; 67 struct drbd_request *req;
68 68
69 req = mempool_alloc(drbd_request_mempool, GFP_NOIO); 69 req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO);
70 if (!req) 70 if (!req)
71 return NULL; 71 return NULL;
72 72
@@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
84 84
85 INIT_LIST_HEAD(&req->tl_requests); 85 INIT_LIST_HEAD(&req->tl_requests);
86 INIT_LIST_HEAD(&req->w.list); 86 INIT_LIST_HEAD(&req->w.list);
87 INIT_LIST_HEAD(&req->req_pending_master_completion);
88 INIT_LIST_HEAD(&req->req_pending_local);
87 89
88 /* one reference to be put by __drbd_make_request */ 90 /* one reference to be put by __drbd_make_request */
89 atomic_set(&req->completion_ref, 1); 91 atomic_set(&req->completion_ref, 1);
@@ -92,6 +94,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
92 return req; 94 return req;
93} 95}
94 96
97static void drbd_remove_request_interval(struct rb_root *root,
98 struct drbd_request *req)
99{
100 struct drbd_device *device = req->device;
101 struct drbd_interval *i = &req->i;
102
103 drbd_remove_interval(root, i);
104
105 /* Wake up any processes waiting for this request to complete. */
106 if (i->waiting)
107 wake_up(&device->misc_wait);
108}
109
95void drbd_req_destroy(struct kref *kref) 110void drbd_req_destroy(struct kref *kref)
96{ 111{
97 struct drbd_request *req = container_of(kref, struct drbd_request, kref); 112 struct drbd_request *req = container_of(kref, struct drbd_request, kref);
@@ -107,14 +122,30 @@ void drbd_req_destroy(struct kref *kref)
107 return; 122 return;
108 } 123 }
109 124
110 /* remove it from the transfer log. 125 /* If called from mod_rq_state (expected normal case) or
111 * well, only if it had been there in the first 126 * drbd_send_and_submit (the less likely normal path), this holds the
112 * place... if it had not (local only or conflicting 127 * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
113 * and never sent), it should still be "empty" as 128 * though it may be still empty (never added to the transfer log).
114 * initialized in drbd_req_new(), so we can list_del() it 129 *
115 * here unconditionally */ 130 * If called from do_retry(), we do NOT hold the req_lock, but we are
131 * still allowed to unconditionally list_del(&req->tl_requests),
132 * because it will be on a local on-stack list only. */
116 list_del_init(&req->tl_requests); 133 list_del_init(&req->tl_requests);
117 134
135 /* finally remove the request from the conflict detection
136 * respective block_id verification interval tree. */
137 if (!drbd_interval_empty(&req->i)) {
138 struct rb_root *root;
139
140 if (s & RQ_WRITE)
141 root = &device->write_requests;
142 else
143 root = &device->read_requests;
144 drbd_remove_request_interval(root, req);
145 } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
146 drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
147 s, (unsigned long long)req->i.sector, req->i.size);
148
118 /* if it was a write, we may have to set the corresponding 149 /* if it was a write, we may have to set the corresponding
119 * bit(s) out-of-sync first. If it had a local part, we need to 150 * bit(s) out-of-sync first. If it had a local part, we need to
120 * release the reference to the activity log. */ 151 * release the reference to the activity log. */
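The comment above relies on a property of the kernel list API: list_del_init() on a node that is already self-linked (freshly initialized, or deleted-and-reinitialized earlier) is a harmless no-op, which is why the call can be made unconditionally whether or not the request ever reached the transfer log. A minimal user-space re-implementation of the idiom, demonstrating the idempotence:

#include <stdio.h>

struct list_node { struct list_node *next, *prev; };

static void list_init(struct list_node *n) { n->next = n->prev = n; }

static void list_del_init(struct list_node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);                 /* node points at itself again */
}

int main(void)
{
        struct list_node n;

        list_init(&n);
        list_del_init(&n);            /* never linked anywhere: still fine */
        list_del_init(&n);            /* and it stays idempotent */
        printf("%d\n", n.next == &n); /* 1 */
        return 0;
}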
@@ -188,19 +219,6 @@ void complete_master_bio(struct drbd_device *device,
188} 219}
189 220
190 221
191static void drbd_remove_request_interval(struct rb_root *root,
192 struct drbd_request *req)
193{
194 struct drbd_device *device = req->device;
195 struct drbd_interval *i = &req->i;
196
197 drbd_remove_interval(root, i);
198
199 /* Wake up any processes waiting for this request to complete. */
200 if (i->waiting)
201 wake_up(&device->misc_wait);
202}
203
204/* Helper for __req_mod(). 222/* Helper for __req_mod().
205 * Set m->bio to the master bio, if it is fit to be completed, 223 * Set m->bio to the master bio, if it is fit to be completed,
206 * or leave it alone (it is initialized to NULL in __req_mod), 224 * or leave it alone (it is initialized to NULL in __req_mod),
@@ -254,18 +272,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
254 ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); 272 ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
255 error = PTR_ERR(req->private_bio); 273 error = PTR_ERR(req->private_bio);
256 274
257 /* remove the request from the conflict detection
258 * respective block_id verification hash */
259 if (!drbd_interval_empty(&req->i)) {
260 struct rb_root *root;
261
262 if (rw == WRITE)
263 root = &device->write_requests;
264 else
265 root = &device->read_requests;
266 drbd_remove_request_interval(root, req);
267 }
268
269 /* Before we can signal completion to the upper layers, 275 /* Before we can signal completion to the upper layers,
270 * we may need to close the current transfer log epoch. 276 * we may need to close the current transfer log epoch.
271 * We are within the request lock, so we can simply compare 277 * We are within the request lock, so we can simply compare
@@ -301,9 +307,24 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
301 m->error = ok ? 0 : (error ?: -EIO); 307 m->error = ok ? 0 : (error ?: -EIO);
302 m->bio = req->master_bio; 308 m->bio = req->master_bio;
303 req->master_bio = NULL; 309 req->master_bio = NULL;
310 /* We leave it in the tree, to be able to verify later
311 * write-acks in protocol != C during resync.
312 * But we mark it as "complete", so it won't be counted as
313 * conflict in a multi-primary setup. */
314 req->i.completed = true;
304 } 315 }
316
317 if (req->i.waiting)
318 wake_up(&device->misc_wait);
319
320 /* Either we are about to complete to upper layers,
321 * or we will restart this request.
322 * In either case, the request object will be destroyed soon,
323 * so better remove it from all lists. */
324 list_del_init(&req->req_pending_master_completion);
305} 325}
306 326
327/* still holds resource->req_lock */
307static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) 328static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
308{ 329{
309 struct drbd_device *device = req->device; 330 struct drbd_device *device = req->device;
@@ -324,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_
324 return 1; 345 return 1;
325} 346}
326 347
348static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
349{
350 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
351 if (!connection)
352 return;
353 if (connection->req_next == NULL)
354 connection->req_next = req;
355}
356
357static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
358{
359 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
360 if (!connection)
361 return;
362 if (connection->req_next != req)
363 return;
364 list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
365 const unsigned s = req->rq_state;
366 if (s & RQ_NET_QUEUED)
367 break;
368 }
369 if (&req->tl_requests == &connection->transfer_log)
370 req = NULL;
371 connection->req_next = req;
372}
373
374static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
375{
376 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
377 if (!connection)
378 return;
379 if (connection->req_ack_pending == NULL)
380 connection->req_ack_pending = req;
381}
382
383static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
384{
385 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
386 if (!connection)
387 return;
388 if (connection->req_ack_pending != req)
389 return;
390 list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
391 const unsigned s = req->rq_state;
392 if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
393 break;
394 }
395 if (&req->tl_requests == &connection->transfer_log)
396 req = NULL;
397 connection->req_ack_pending = req;
398}
399
400static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
401{
402 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
403 if (!connection)
404 return;
405 if (connection->req_not_net_done == NULL)
406 connection->req_not_net_done = req;
407}
408
409static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
410{
411 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
412 if (!connection)
413 return;
414 if (connection->req_not_net_done != req)
415 return;
416 list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
417 const unsigned s = req->rq_state;
418 if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
419 break;
420 }
421 if (&req->tl_requests == &connection->transfer_log)
422 req = NULL;
423 connection->req_not_net_done = req;
424}
425
327/* I'd like this to be the only place that manipulates 426/* I'd like this to be the only place that manipulates
328 * req->completion_ref and req->kref. */ 427 * req->completion_ref and req->kref. */
329static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, 428static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
330 int clear, int set) 429 int clear, int set)
331{ 430{
332 struct drbd_device *device = req->device; 431 struct drbd_device *device = req->device;
432 struct drbd_peer_device *peer_device = first_peer_device(device);
333 unsigned s = req->rq_state; 433 unsigned s = req->rq_state;
334 int c_put = 0; 434 int c_put = 0;
335 int k_put = 0; 435 int k_put = 0;
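The set_if_null_*/advance_* helpers above maintain per-connection pointers to the oldest request still in a given state (next to send, waiting for an ack, not yet done on the network), so the request timer and debugfs no longer have to walk the whole transfer log. The pattern is: remember the first candidate when it enters the state, and when exactly that request leaves the state, scan forward from it to the next candidate or NULL. A toy model of the same pattern over a plain array, with invented names:

#include <stdio.h>
#include <stdbool.h>

#define N 5
static bool pending[N] = { false, true, true, false, true };
static int cached = -1;               /* index of oldest pending entry, -1 if none */

static void set_if_null(int i)        /* cf. set_if_null_req_next() */
{
        if (cached == -1)
                cached = i;
}

static void advance(int i)            /* cf. advance_conn_req_next() */
{
        if (cached != i)
                return;
        for (cached = i + 1; cached < N; cached++)
                if (pending[cached])
                        return;
        cached = -1;                  /* nothing pending anymore */
}

int main(void)
{
        set_if_null(1);
        pending[1] = false;           /* the cached request leaves the state */
        advance(1);
        printf("oldest pending: %d\n", cached);   /* 2 */
        return 0;
}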
@@ -356,14 +456,23 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
356 atomic_inc(&req->completion_ref); 456 atomic_inc(&req->completion_ref);
357 } 457 }
358 458
359 if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) 459 if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
360 atomic_inc(&req->completion_ref); 460 atomic_inc(&req->completion_ref);
461 set_if_null_req_next(peer_device, req);
462 }
361 463
362 if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) 464 if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
363 kref_get(&req->kref); /* wait for the DONE */ 465 kref_get(&req->kref); /* wait for the DONE */
364 466
365 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) 467 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
366 atomic_add(req->i.size >> 9, &device->ap_in_flight); 468 /* potentially already completed in the asender thread */
469 if (!(s & RQ_NET_DONE)) {
470 atomic_add(req->i.size >> 9, &device->ap_in_flight);
471 set_if_null_req_not_net_done(peer_device, req);
472 }
473 if (s & RQ_NET_PENDING)
474 set_if_null_req_ack_pending(peer_device, req);
475 }
367 476
368 if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) 477 if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
369 atomic_inc(&req->completion_ref); 478 atomic_inc(&req->completion_ref);
@@ -386,20 +495,34 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
386 ++k_put; 495 ++k_put;
387 else 496 else
388 ++c_put; 497 ++c_put;
498 list_del_init(&req->req_pending_local);
389 } 499 }
390 500
391 if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { 501 if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
392 dec_ap_pending(device); 502 dec_ap_pending(device);
393 ++c_put; 503 ++c_put;
504 req->acked_jif = jiffies;
505 advance_conn_req_ack_pending(peer_device, req);
394 } 506 }
395 507
396 if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) 508 if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
397 ++c_put; 509 ++c_put;
510 advance_conn_req_next(peer_device, req);
511 }
398 512
399 if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { 513 if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
400 if (req->rq_state & RQ_NET_SENT) 514 if (s & RQ_NET_SENT)
401 atomic_sub(req->i.size >> 9, &device->ap_in_flight); 515 atomic_sub(req->i.size >> 9, &device->ap_in_flight);
402 ++k_put; 516 if (s & RQ_EXP_BARR_ACK)
517 ++k_put;
518 req->net_done_jif = jiffies;
519
520 /* in ahead/behind mode, or just in case,
521 * before we finally destroy this request,
522 * the caching pointers must not reference it anymore */
523 advance_conn_req_next(peer_device, req);
524 advance_conn_req_ack_pending(peer_device, req);
525 advance_conn_req_not_net_done(peer_device, req);
403 } 526 }
404 527
405 /* potentially complete and destroy */ 528 /* potentially complete and destroy */
@@ -439,6 +562,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
439 bdevname(device->ldev->backing_bdev, b)); 562 bdevname(device->ldev->backing_bdev, b));
440} 563}
441 564
565/* Helper for HANDED_OVER_TO_NETWORK.
566 * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
567 * Is it also still "PENDING"?
568 * --> If so, clear PENDING and set NET_OK below.
569 * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
570 * (and we must not set RQ_NET_OK) */
571static inline bool is_pending_write_protocol_A(struct drbd_request *req)
572{
573 return (req->rq_state &
574 (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
575 == (RQ_WRITE|RQ_NET_PENDING);
576}
577
442/* obviously this could be coded as many single functions 578/* obviously this could be coded as many single functions
443 * instead of one huge switch, 579 * instead of one huge switch,
444 * or by putting the code directly in the respective locations 580 * or by putting the code directly in the respective locations
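is_pending_write_protocol_A() folds the old two-step test (a write that expects neither a write-ack nor a receive-ack, and is still RQ_NET_PENDING) into a single mask-and-compare: select all four bits and require that exactly RQ_WRITE and RQ_NET_PENDING remain set. A small demonstration of that check; the numeric bit values below are made up and do not match drbd_req.h:

#include <stdio.h>

#define RQ_WRITE            (1u << 0)
#define RQ_NET_PENDING      (1u << 1)
#define RQ_EXP_WRITE_ACK    (1u << 2)
#define RQ_EXP_RECEIVE_ACK  (1u << 3)

static int is_pending_write_protocol_A(unsigned rq_state)
{
        return (rq_state &
                (RQ_WRITE | RQ_NET_PENDING | RQ_EXP_WRITE_ACK | RQ_EXP_RECEIVE_ACK))
               == (RQ_WRITE | RQ_NET_PENDING);
}

int main(void)
{
        printf("%d\n", is_pending_write_protocol_A(RQ_WRITE | RQ_NET_PENDING));                    /* 1 */
        printf("%d\n", is_pending_write_protocol_A(RQ_WRITE | RQ_NET_PENDING | RQ_EXP_WRITE_ACK)); /* 0: an ack is expected */
        printf("%d\n", is_pending_write_protocol_A(RQ_WRITE));                                     /* 0: neg-ack was faster */
        return 0;
}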
@@ -454,7 +590,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request
454int __req_mod(struct drbd_request *req, enum drbd_req_event what, 590int __req_mod(struct drbd_request *req, enum drbd_req_event what,
455 struct bio_and_error *m) 591 struct bio_and_error *m)
456{ 592{
457 struct drbd_device *device = req->device; 593 struct drbd_device *const device = req->device;
594 struct drbd_peer_device *const peer_device = first_peer_device(device);
595 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
458 struct net_conf *nc; 596 struct net_conf *nc;
459 int p, rv = 0; 597 int p, rv = 0;
460 598
@@ -477,7 +615,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
477 * and from w_read_retry_remote */ 615 * and from w_read_retry_remote */
478 D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); 616 D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
479 rcu_read_lock(); 617 rcu_read_lock();
480 nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 618 nc = rcu_dereference(connection->net_conf);
481 p = nc->wire_protocol; 619 p = nc->wire_protocol;
482 rcu_read_unlock(); 620 rcu_read_unlock();
483 req->rq_state |= 621 req->rq_state |=
@@ -549,7 +687,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
549 D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); 687 D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
550 mod_rq_state(req, m, 0, RQ_NET_QUEUED); 688 mod_rq_state(req, m, 0, RQ_NET_QUEUED);
551 req->w.cb = w_send_read_req; 689 req->w.cb = w_send_read_req;
552 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 690 drbd_queue_work(&connection->sender_work,
553 &req->w); 691 &req->w);
554 break; 692 break;
555 693
@@ -585,23 +723,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
585 D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 723 D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
586 mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); 724 mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
587 req->w.cb = w_send_dblock; 725 req->w.cb = w_send_dblock;
588 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 726 drbd_queue_work(&connection->sender_work,
589 &req->w); 727 &req->w);
590 728
591 /* close the epoch, in case it outgrew the limit */ 729 /* close the epoch, in case it outgrew the limit */
592 rcu_read_lock(); 730 rcu_read_lock();
593 nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 731 nc = rcu_dereference(connection->net_conf);
594 p = nc->max_epoch_size; 732 p = nc->max_epoch_size;
595 rcu_read_unlock(); 733 rcu_read_unlock();
596 if (first_peer_device(device)->connection->current_tle_writes >= p) 734 if (connection->current_tle_writes >= p)
597 start_new_tl_epoch(first_peer_device(device)->connection); 735 start_new_tl_epoch(connection);
598 736
599 break; 737 break;
600 738
601 case QUEUE_FOR_SEND_OOS: 739 case QUEUE_FOR_SEND_OOS:
602 mod_rq_state(req, m, 0, RQ_NET_QUEUED); 740 mod_rq_state(req, m, 0, RQ_NET_QUEUED);
603 req->w.cb = w_send_out_of_sync; 741 req->w.cb = w_send_out_of_sync;
604 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 742 drbd_queue_work(&connection->sender_work,
605 &req->w); 743 &req->w);
606 break; 744 break;
607 745
@@ -615,18 +753,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
615 753
616 case HANDED_OVER_TO_NETWORK: 754 case HANDED_OVER_TO_NETWORK:
617 /* assert something? */ 755 /* assert something? */
618 if (bio_data_dir(req->master_bio) == WRITE && 756 if (is_pending_write_protocol_A(req))
619 !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
620 /* this is what is dangerous about protocol A: 757 /* this is what is dangerous about protocol A:
621 * pretend it was successfully written on the peer. */ 758 * pretend it was successfully written on the peer. */
622 if (req->rq_state & RQ_NET_PENDING) 759 mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
623 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 760 RQ_NET_SENT|RQ_NET_OK);
624 /* else: neg-ack was faster... */ 761 else
625 /* it is still not yet RQ_NET_DONE until the 762 mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
626 * corresponding epoch barrier got acked as well, 763 /* It is still not yet RQ_NET_DONE until the
627 * so we know what to dirty on connection loss */ 764 * corresponding epoch barrier got acked as well,
628 } 765 * so we know what to dirty on connection loss. */
629 mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
630 break; 766 break;
631 767
632 case OOS_HANDED_TO_NETWORK: 768 case OOS_HANDED_TO_NETWORK:
@@ -658,12 +794,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
658 case WRITE_ACKED_BY_PEER_AND_SIS: 794 case WRITE_ACKED_BY_PEER_AND_SIS:
659 req->rq_state |= RQ_NET_SIS; 795 req->rq_state |= RQ_NET_SIS;
660 case WRITE_ACKED_BY_PEER: 796 case WRITE_ACKED_BY_PEER:
661 D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); 797 /* Normal operation protocol C: successfully written on peer.
662 /* protocol C; successfully written on peer. 798 * During resync, even in protocol != C,
799 * we requested an explicit write ack anyways.
800 * Which means we cannot even assert anything here.
663 * Nothing more to do here. 801 * Nothing more to do here.
664 * We want to keep the tl in place for all protocols, to cater 802 * We want to keep the tl in place for all protocols, to cater
665 * for volatile write-back caches on lower level devices. */ 803 * for volatile write-back caches on lower level devices. */
666
667 goto ack_common; 804 goto ack_common;
668 case RECV_ACKED_BY_PEER: 805 case RECV_ACKED_BY_PEER:
669 D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); 806 D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
@@ -671,7 +808,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
671 * see also notes above in HANDED_OVER_TO_NETWORK about 808 * see also notes above in HANDED_OVER_TO_NETWORK about
672 * protocol != C */ 809 * protocol != C */
673 ack_common: 810 ack_common:
674 D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
675 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 811 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
676 break; 812 break;
677 813
@@ -714,7 +850,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
714 850
715 get_ldev(device); /* always succeeds in this call path */ 851 get_ldev(device); /* always succeeds in this call path */
716 req->w.cb = w_restart_disk_io; 852 req->w.cb = w_restart_disk_io;
717 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 853 drbd_queue_work(&connection->sender_work,
718 &req->w); 854 &req->w);
719 break; 855 break;
720 856
@@ -736,7 +872,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
736 872
737 mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); 873 mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
738 if (req->w.cb) { 874 if (req->w.cb) {
739 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 875 /* w.cb expected to be w_send_dblock, or w_send_read_req */
876 drbd_queue_work(&connection->sender_work,
740 &req->w); 877 &req->w);
741 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; 878 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
742 } /* else: FIXME can this happen? */ 879 } /* else: FIXME can this happen? */
@@ -769,7 +906,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
769 break; 906 break;
770 907
771 case QUEUE_AS_DRBD_BARRIER: 908 case QUEUE_AS_DRBD_BARRIER:
772 start_new_tl_epoch(first_peer_device(device)->connection); 909 start_new_tl_epoch(connection);
773 mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); 910 mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
774 break; 911 break;
775 }; 912 };
@@ -886,6 +1023,9 @@ static void maybe_pull_ahead(struct drbd_device *device)
886 connection->agreed_pro_version < 96) 1023 connection->agreed_pro_version < 96)
887 return; 1024 return;
888 1025
1026 if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
1027 return; /* nothing to do ... */
1028
889 /* If I don't even have good local storage, we can not reasonably try 1029 /* If I don't even have good local storage, we can not reasonably try
890 * to pull ahead of the peer. We also need the local reference to make 1030 * to pull ahead of the peer. We also need the local reference to make
891 * sure device->act_log is there. 1031 * sure device->act_log is there.
@@ -1021,6 +1161,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1021 * stable storage, and this is a WRITE, we may not even submit 1161 * stable storage, and this is a WRITE, we may not even submit
1022 * this bio. */ 1162 * this bio. */
1023 if (get_ldev(device)) { 1163 if (get_ldev(device)) {
1164 req->pre_submit_jif = jiffies;
1024 if (drbd_insert_fault(device, 1165 if (drbd_insert_fault(device,
1025 rw == WRITE ? DRBD_FAULT_DT_WR 1166 rw == WRITE ? DRBD_FAULT_DT_WR
1026 : rw == READ ? DRBD_FAULT_DT_RD 1167 : rw == READ ? DRBD_FAULT_DT_RD
@@ -1035,10 +1176,14 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1035 1176
1036static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) 1177static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
1037{ 1178{
1038 spin_lock(&device->submit.lock); 1179 spin_lock_irq(&device->resource->req_lock);
1039 list_add_tail(&req->tl_requests, &device->submit.writes); 1180 list_add_tail(&req->tl_requests, &device->submit.writes);
1040 spin_unlock(&device->submit.lock); 1181 list_add_tail(&req->req_pending_master_completion,
1182 &device->pending_master_completion[1 /* WRITE */]);
1183 spin_unlock_irq(&device->resource->req_lock);
1041 queue_work(device->submit.wq, &device->submit.worker); 1184 queue_work(device->submit.wq, &device->submit.worker);
1185 /* do_submit() may sleep internally on al_wait, too */
1186 wake_up(&device->al_wait);
1042} 1187}
1043 1188
1044/* returns the new drbd_request pointer, if the caller is expected to 1189/* returns the new drbd_request pointer, if the caller is expected to
@@ -1047,7 +1192,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
1047 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. 1192 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
1048 */ 1193 */
1049static struct drbd_request * 1194static struct drbd_request *
1050drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time) 1195drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
1051{ 1196{
1052 const int rw = bio_data_dir(bio); 1197 const int rw = bio_data_dir(bio);
1053 struct drbd_request *req; 1198 struct drbd_request *req;
@@ -1062,7 +1207,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
1062 bio_endio(bio, -ENOMEM); 1207 bio_endio(bio, -ENOMEM);
1063 return ERR_PTR(-ENOMEM); 1208 return ERR_PTR(-ENOMEM);
1064 } 1209 }
1065 req->start_time = start_time; 1210 req->start_jif = start_jif;
1066 1211
1067 if (!get_ldev(device)) { 1212 if (!get_ldev(device)) {
1068 bio_put(req->private_bio); 1213 bio_put(req->private_bio);
@@ -1075,10 +1220,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
1075 if (rw == WRITE && req->private_bio && req->i.size 1220 if (rw == WRITE && req->private_bio && req->i.size
1076 && !test_bit(AL_SUSPENDED, &device->flags)) { 1221 && !test_bit(AL_SUSPENDED, &device->flags)) {
1077 if (!drbd_al_begin_io_fastpath(device, &req->i)) { 1222 if (!drbd_al_begin_io_fastpath(device, &req->i)) {
1223 atomic_inc(&device->ap_actlog_cnt);
1078 drbd_queue_write(device, req); 1224 drbd_queue_write(device, req);
1079 return NULL; 1225 return NULL;
1080 } 1226 }
1081 req->rq_state |= RQ_IN_ACT_LOG; 1227 req->rq_state |= RQ_IN_ACT_LOG;
1228 req->in_actlog_jif = jiffies;
1082 } 1229 }
1083 1230
1084 return req; 1231 return req;
@@ -1086,11 +1233,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
1086 1233
1087static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) 1234static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
1088{ 1235{
1236 struct drbd_resource *resource = device->resource;
1089 const int rw = bio_rw(req->master_bio); 1237 const int rw = bio_rw(req->master_bio);
1090 struct bio_and_error m = { NULL, }; 1238 struct bio_and_error m = { NULL, };
1091 bool no_remote = false; 1239 bool no_remote = false;
1240 bool submit_private_bio = false;
1092 1241
1093 spin_lock_irq(&device->resource->req_lock); 1242 spin_lock_irq(&resource->req_lock);
1094 if (rw == WRITE) { 1243 if (rw == WRITE) {
1095 /* This may temporarily give up the req_lock, 1244 /* This may temporarily give up the req_lock,
1096 * but will re-acquire it before it returns here. 1245 * but will re-acquire it before it returns here.
@@ -1148,13 +1297,18 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1148 no_remote = true; 1297 no_remote = true;
1149 } 1298 }
1150 1299
1300 /* If it took the fast path in drbd_request_prepare, add it here.
1301 * The slow path has added it already. */
1302 if (list_empty(&req->req_pending_master_completion))
1303 list_add_tail(&req->req_pending_master_completion,
1304 &device->pending_master_completion[rw == WRITE]);
1151 if (req->private_bio) { 1305 if (req->private_bio) {
1152 /* needs to be marked within the same spinlock */ 1306 /* needs to be marked within the same spinlock */
1307 list_add_tail(&req->req_pending_local,
1308 &device->pending_completion[rw == WRITE]);
1153 _req_mod(req, TO_BE_SUBMITTED); 1309 _req_mod(req, TO_BE_SUBMITTED);
1154 /* but we need to give up the spinlock to submit */ 1310 /* but we need to give up the spinlock to submit */
1155 spin_unlock_irq(&device->resource->req_lock); 1311 submit_private_bio = true;
1156 drbd_submit_req_private_bio(req);
1157 spin_lock_irq(&device->resource->req_lock);
1158 } else if (no_remote) { 1312 } else if (no_remote) {
1159nodata: 1313nodata:
1160 if (__ratelimit(&drbd_ratelimit_state)) 1314 if (__ratelimit(&drbd_ratelimit_state))
@@ -1167,15 +1321,23 @@ nodata:
1167out: 1321out:
1168 if (drbd_req_put_completion_ref(req, &m, 1)) 1322 if (drbd_req_put_completion_ref(req, &m, 1))
1169 kref_put(&req->kref, drbd_req_destroy); 1323 kref_put(&req->kref, drbd_req_destroy);
1170 spin_unlock_irq(&device->resource->req_lock); 1324 spin_unlock_irq(&resource->req_lock);
1171 1325
1326 /* Even though above is a kref_put(), this is safe.
1327 * As long as we still need to submit our private bio,
1328 * we hold a completion ref, and the request cannot disappear.
1329 * If however this request did not even have a private bio to submit
1330 * (e.g. remote read), req may already be invalid now.
1331 * That's why we cannot check on req->private_bio. */
1332 if (submit_private_bio)
1333 drbd_submit_req_private_bio(req);
1172 if (m.bio) 1334 if (m.bio)
1173 complete_master_bio(device, &m); 1335 complete_master_bio(device, &m);
1174} 1336}
1175 1337
1176void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time) 1338void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
1177{ 1339{
1178 struct drbd_request *req = drbd_request_prepare(device, bio, start_time); 1340 struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
1179 if (IS_ERR_OR_NULL(req)) 1341 if (IS_ERR_OR_NULL(req))
1180 return; 1342 return;
1181 drbd_send_and_submit(device, req); 1343 drbd_send_and_submit(device, req);
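drbd_send_and_submit() no longer drops and re-takes the req_lock around the disk submission; it records the decision in submit_private_bio while holding the lock and submits after the single unlock, relying on the completion reference to keep the request alive. A sketch of that "decide under the lock, act after the unlock" shape, with a pthread mutex standing in for the spinlock and invented function names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void submit_io(void)            /* stands in for the deferred bio submission */
{
        printf("submitting outside the lock\n");
}

static void send_and_submit(bool have_private_bio)
{
        bool submit_after_unlock = false;

        pthread_mutex_lock(&lock);
        /* ... state transitions that must stay under the lock ... */
        if (have_private_bio)
                submit_after_unlock = true;   /* only remember the decision here */
        pthread_mutex_unlock(&lock);

        if (submit_after_unlock)
                submit_io();                  /* the potentially blocking part */
}

int main(void)
{
        send_and_submit(true);
        return 0;
}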
@@ -1194,6 +1356,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
1194 continue; 1356 continue;
1195 1357
1196 req->rq_state |= RQ_IN_ACT_LOG; 1358 req->rq_state |= RQ_IN_ACT_LOG;
1359 req->in_actlog_jif = jiffies;
1360 atomic_dec(&device->ap_actlog_cnt);
1197 } 1361 }
1198 1362
1199 list_del_init(&req->tl_requests); 1363 list_del_init(&req->tl_requests);
@@ -1203,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom
1203 1367
1204static bool prepare_al_transaction_nonblock(struct drbd_device *device, 1368static bool prepare_al_transaction_nonblock(struct drbd_device *device,
1205 struct list_head *incoming, 1369 struct list_head *incoming,
1206 struct list_head *pending) 1370 struct list_head *pending,
1371 struct list_head *later)
1207{ 1372{
1208 struct drbd_request *req, *tmp; 1373 struct drbd_request *req, *tmp;
1209 int wake = 0; 1374 int wake = 0;
@@ -1212,45 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device,
1212 spin_lock_irq(&device->al_lock); 1377 spin_lock_irq(&device->al_lock);
1213 list_for_each_entry_safe(req, tmp, incoming, tl_requests) { 1378 list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
1214 err = drbd_al_begin_io_nonblock(device, &req->i); 1379 err = drbd_al_begin_io_nonblock(device, &req->i);
1380 if (err == -ENOBUFS)
1381 break;
1215 if (err == -EBUSY) 1382 if (err == -EBUSY)
1216 wake = 1; 1383 wake = 1;
1217 if (err) 1384 if (err)
1218 continue; 1385 list_move_tail(&req->tl_requests, later);
1219 req->rq_state |= RQ_IN_ACT_LOG; 1386 else
1220 list_move_tail(&req->tl_requests, pending); 1387 list_move_tail(&req->tl_requests, pending);
1221 } 1388 }
1222 spin_unlock_irq(&device->al_lock); 1389 spin_unlock_irq(&device->al_lock);
1223 if (wake) 1390 if (wake)
1224 wake_up(&device->al_wait); 1391 wake_up(&device->al_wait);
1225
1226 return !list_empty(pending); 1392 return !list_empty(pending);
1227} 1393}
1228 1394
1395void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
1396{
1397 struct drbd_request *req, *tmp;
1398
1399 list_for_each_entry_safe(req, tmp, pending, tl_requests) {
1400 req->rq_state |= RQ_IN_ACT_LOG;
1401 req->in_actlog_jif = jiffies;
1402 atomic_dec(&device->ap_actlog_cnt);
1403 list_del_init(&req->tl_requests);
1404 drbd_send_and_submit(device, req);
1405 }
1406}
1407
1229void do_submit(struct work_struct *ws) 1408void do_submit(struct work_struct *ws)
1230{ 1409{
1231 struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); 1410 struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
1232 LIST_HEAD(incoming); 1411 LIST_HEAD(incoming); /* from drbd_make_request() */
1233 LIST_HEAD(pending); 1412 LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */
1234 struct drbd_request *req, *tmp; 1413 LIST_HEAD(busy); /* blocked by resync requests */
1414
1415 /* grab new incoming requests */
1416 spin_lock_irq(&device->resource->req_lock);
1417 list_splice_tail_init(&device->submit.writes, &incoming);
1418 spin_unlock_irq(&device->resource->req_lock);
1235 1419
1236 for (;;) { 1420 for (;;) {
1237 spin_lock(&device->submit.lock); 1421 DEFINE_WAIT(wait);
1238 list_splice_tail_init(&device->submit.writes, &incoming);
1239 spin_unlock(&device->submit.lock);
1240 1422
1423 /* move used-to-be-busy back to front of incoming */
1424 list_splice_init(&busy, &incoming);
1241 submit_fast_path(device, &incoming); 1425 submit_fast_path(device, &incoming);
1242 if (list_empty(&incoming)) 1426 if (list_empty(&incoming))
1243 break; 1427 break;
1244 1428
1245skip_fast_path:
1246 wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
1247 /* Maybe more was queued, while we prepared the transaction?
1248 * Try to stuff them into this transaction as well.
1249 * Be strictly non-blocking here, no wait_event, we already
1250 * have something to commit.
1251 * Stop if we don't make any more progres.
1252 */
1253 for (;;) { 1429 for (;;) {
1430 prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
1431
1432 list_splice_init(&busy, &incoming);
1433 prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
1434 if (!list_empty(&pending))
1435 break;
1436
1437 schedule();
1438
1439 /* If all currently "hot" activity log extents are kept busy by
1440 * incoming requests, we still must not totally starve new
1441 * requests to "cold" extents.
1442 * Something left on &incoming means there had not been
1443 * enough update slots available, and the activity log
1444 * has been marked as "starving".
1445 *
1446 * Try again now, without looking for new requests,
1447 * effectively blocking all new requests until we made
1448 * at least _some_ progress with what we currently have.
1449 */
1450 if (!list_empty(&incoming))
1451 continue;
1452
1453 /* Nothing moved to pending, but nothing left
1454 * on incoming: all moved to busy!
1455 * Grab new and iterate. */
1456 spin_lock_irq(&device->resource->req_lock);
1457 list_splice_tail_init(&device->submit.writes, &incoming);
1458 spin_unlock_irq(&device->resource->req_lock);
1459 }
1460 finish_wait(&device->al_wait, &wait);
1461
1462 /* If the transaction was full, before all incoming requests
1463 * had been processed, skip ahead to commit, and iterate
1464 * without splicing in more incoming requests from upper layers.
1465 *
1466 * Else, if all incoming have been processed,
1467 * they have become either "pending" (to be submitted after
1468 * next transaction commit) or "busy" (blocked by resync).
1469 *
1470 * Maybe more was queued, while we prepared the transaction?
1471 * Try to stuff those into this transaction as well.
1472 * Be strictly non-blocking here,
1473 * we already have something to commit.
1474 *
1475 * Commit if we don't make any more progress.
1476 */
1477
1478 while (list_empty(&incoming)) {
1254 LIST_HEAD(more_pending); 1479 LIST_HEAD(more_pending);
1255 LIST_HEAD(more_incoming); 1480 LIST_HEAD(more_incoming);
1256 bool made_progress; 1481 bool made_progress;
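prepare_al_transaction_nonblock() now sorts each incoming request into one of three outcomes: it fits into the current activity-log transaction (moved to pending), its extent is busy, typically because of resync (moved to the caller's busy list via the later parameter), or the transaction itself is full (-ENOBUFS: stop and commit what we have). A compact control-flow sketch of that three-way dispatch; the classifier values stand in for drbd_al_begin_io_nonblock() and are not its real interface:

#include <stdio.h>

enum outcome {
        FITS             = 0,
        EXTENT_BUSY      = -16,    /* -EBUSY  */
        TRANSACTION_FULL = -105    /* -ENOBUFS */
};

static void sort_requests(const enum outcome *reqs, int n)
{
        int pending = 0, later = 0;

        for (int i = 0; i < n; i++) {
                if (reqs[i] == TRANSACTION_FULL)
                        break;                  /* commit what we have first */
                if (reqs[i] == EXTENT_BUSY)
                        later++;                /* blocked, retry after the commit */
                else
                        pending++;              /* submitted after the commit */
        }
        printf("pending=%d later=%d\n", pending, later);
}

int main(void)
{
        enum outcome reqs[] = { FITS, EXTENT_BUSY, FITS, TRANSACTION_FULL, FITS };

        sort_requests(reqs, 5);                 /* pending=2 later=1 */
        return 0;
}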
@@ -1260,55 +1485,32 @@ skip_fast_path:
1260 if (list_empty(&device->submit.writes)) 1485 if (list_empty(&device->submit.writes))
1261 break; 1486 break;
1262 1487
1263 spin_lock(&device->submit.lock); 1488 spin_lock_irq(&device->resource->req_lock);
1264 list_splice_tail_init(&device->submit.writes, &more_incoming); 1489 list_splice_tail_init(&device->submit.writes, &more_incoming);
1265 spin_unlock(&device->submit.lock); 1490 spin_unlock_irq(&device->resource->req_lock);
1266 1491
1267 if (list_empty(&more_incoming)) 1492 if (list_empty(&more_incoming))
1268 break; 1493 break;
1269 1494
1270 made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending); 1495 made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
1271 1496
1272 list_splice_tail_init(&more_pending, &pending); 1497 list_splice_tail_init(&more_pending, &pending);
1273 list_splice_tail_init(&more_incoming, &incoming); 1498 list_splice_tail_init(&more_incoming, &incoming);
1274
1275 if (!made_progress) 1499 if (!made_progress)
1276 break; 1500 break;
1277 } 1501 }
1278 drbd_al_begin_io_commit(device, false);
1279
1280 list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
1281 list_del_init(&req->tl_requests);
1282 drbd_send_and_submit(device, req);
1283 }
1284 1502
1285 /* If all currently hot activity log extents are kept busy by 1503 drbd_al_begin_io_commit(device);
1286 * incoming requests, we still must not totally starve new 1504 send_and_submit_pending(device, &pending);
1287 * requests to cold extents. In that case, prepare one request
1288 * in blocking mode. */
1289 list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
1290 list_del_init(&req->tl_requests);
1291 req->rq_state |= RQ_IN_ACT_LOG;
1292 if (!drbd_al_begin_io_prepare(device, &req->i)) {
1293 /* Corresponding extent was hot after all? */
1294 drbd_send_and_submit(device, req);
1295 } else {
1296 /* Found a request to a cold extent.
1297 * Put on "pending" list,
1298 * and try to cumulate with more. */
1299 list_add(&req->tl_requests, &pending);
1300 goto skip_fast_path;
1301 }
1302 }
1303 } 1505 }
1304} 1506}
1305 1507
1306void drbd_make_request(struct request_queue *q, struct bio *bio) 1508void drbd_make_request(struct request_queue *q, struct bio *bio)
1307{ 1509{
1308 struct drbd_device *device = (struct drbd_device *) q->queuedata; 1510 struct drbd_device *device = (struct drbd_device *) q->queuedata;
1309 unsigned long start_time; 1511 unsigned long start_jif;
1310 1512
1311 start_time = jiffies; 1513 start_jif = jiffies;
1312 1514
1313 /* 1515 /*
1314 * what we "blindly" assume: 1516 * what we "blindly" assume:
@@ -1316,7 +1518,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
1316 D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); 1518 D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
1317 1519
1318 inc_ap_bio(device); 1520 inc_ap_bio(device);
1319 __drbd_make_request(device, bio, start_time); 1521 __drbd_make_request(device, bio, start_jif);
1320} 1522}
1321 1523
1322/* This is called by bio_add_page(). 1524/* This is called by bio_add_page().
@@ -1353,36 +1555,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1353 return limit; 1555 return limit;
1354} 1556}
1355 1557
1356static void find_oldest_requests(
1357 struct drbd_connection *connection,
1358 struct drbd_device *device,
1359 struct drbd_request **oldest_req_waiting_for_peer,
1360 struct drbd_request **oldest_req_waiting_for_disk)
1361{
1362 struct drbd_request *r;
1363 *oldest_req_waiting_for_peer = NULL;
1364 *oldest_req_waiting_for_disk = NULL;
1365 list_for_each_entry(r, &connection->transfer_log, tl_requests) {
1366 const unsigned s = r->rq_state;
1367 if (!*oldest_req_waiting_for_peer
1368 && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
1369 *oldest_req_waiting_for_peer = r;
1370
1371 if (!*oldest_req_waiting_for_disk
1372 && (s & RQ_LOCAL_PENDING) && r->device == device)
1373 *oldest_req_waiting_for_disk = r;
1374
1375 if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
1376 break;
1377 }
1378}
1379
1380void request_timer_fn(unsigned long data) 1558void request_timer_fn(unsigned long data)
1381{ 1559{
1382 struct drbd_device *device = (struct drbd_device *) data; 1560 struct drbd_device *device = (struct drbd_device *) data;
1383 struct drbd_connection *connection = first_peer_device(device)->connection; 1561 struct drbd_connection *connection = first_peer_device(device)->connection;
1384 struct drbd_request *req_disk, *req_peer; /* oldest request */ 1562 struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
1385 struct net_conf *nc; 1563 struct net_conf *nc;
1564 unsigned long oldest_submit_jif;
1386 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 1565 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
1387 unsigned long now; 1566 unsigned long now;
1388 1567
@@ -1403,14 +1582,31 @@ void request_timer_fn(unsigned long data)
1403 return; /* Recurring timer stopped */ 1582 return; /* Recurring timer stopped */
1404 1583
1405 now = jiffies; 1584 now = jiffies;
1585 nt = now + et;
1406 1586
1407 spin_lock_irq(&device->resource->req_lock); 1587 spin_lock_irq(&device->resource->req_lock);
1408 find_oldest_requests(connection, device, &req_peer, &req_disk); 1588 req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
1409 if (req_peer == NULL && req_disk == NULL) { 1589 req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
1410 spin_unlock_irq(&device->resource->req_lock); 1590 req_peer = connection->req_not_net_done;
1411 mod_timer(&device->request_timer, now + et); 1591 /* maybe the oldest request waiting for the peer is in fact still
1412 return; 1592 * blocking in tcp sendmsg */
1413 } 1593 if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
1594 req_peer = connection->req_next;
1595
1596 /* evaluate the oldest peer request only in one timer! */
1597 if (req_peer && req_peer->device != device)
1598 req_peer = NULL;
1599
1600 /* do we have something to evaluate? */
1601 if (req_peer == NULL && req_write == NULL && req_read == NULL)
1602 goto out;
1603
1604 oldest_submit_jif =
1605 (req_write && req_read)
1606 ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
1607 ? req_write->pre_submit_jif : req_read->pre_submit_jif )
1608 : req_write ? req_write->pre_submit_jif
1609 : req_read ? req_read->pre_submit_jif : now;
1414 1610
1415 /* The request is considered timed out, if 1611 /* The request is considered timed out, if
1416 * - we have some effective timeout from the configuration, 1612 * - we have some effective timeout from the configuration,
@@ -1429,13 +1625,13 @@ void request_timer_fn(unsigned long data)
1429 * to expire twice (worst case) to become effective. Good enough. 1625 * to expire twice (worst case) to become effective. Good enough.
1430 */ 1626 */
1431 if (ent && req_peer && 1627 if (ent && req_peer &&
1432 time_after(now, req_peer->start_time + ent) && 1628 time_after(now, req_peer->pre_send_jif + ent) &&
1433 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { 1629 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
1434 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); 1630 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
1435 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); 1631 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
1436 } 1632 }
1437 if (dt && req_disk && 1633 if (dt && oldest_submit_jif != now &&
1438 time_after(now, req_disk->start_time + dt) && 1634 time_after(now, oldest_submit_jif + dt) &&
1439 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 1635 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
1440 drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); 1636 drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
1441 __drbd_chk_io_error(device, DRBD_FORCE_DETACH); 1637 __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
@@ -1443,11 +1639,12 @@ void request_timer_fn(unsigned long data)
1443 1639
1444 /* Reschedule timer for the nearest not already expired timeout. 1640 /* Reschedule timer for the nearest not already expired timeout.
1445 * Fallback to now + min(effective network timeout, disk timeout). */ 1641 * Fallback to now + min(effective network timeout, disk timeout). */
1446 ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) 1642 ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
1447 ? req_peer->start_time + ent : now + et; 1643 ? req_peer->pre_send_jif + ent : now + et;
1448 dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) 1644 dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
1449 ? req_disk->start_time + dt : now + et; 1645 ? oldest_submit_jif + dt : now + et;
1450 nt = time_before(ent, dt) ? ent : dt; 1646 nt = time_before(ent, dt) ? ent : dt;
1647out:
1451 spin_unlock_irq(&connection->resource->req_lock); 1648 spin_unlock_irq(&connection->resource->req_lock);
1452 mod_timer(&device->request_timer, nt); 1649 mod_timer(&device->request_timer, nt);
1453} 1650}
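request_timer_fn() now takes the oldest local submission time from the heads of the two pending_completion lists (reads and writes) and rearms the timer for whichever deadline, network or disk, comes first. A simplified sketch of that selection with plain integers in place of jiffies; the wraparound-safe time_before() arithmetic of the real code is deliberately omitted:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long read_submit = 900, write_submit = 850;   /* pre_submit_jif values */
        unsigned long disk_timeout = 300, net_deadline = 1200;

        unsigned long oldest_submit = min_ul(read_submit, write_submit);
        unsigned long disk_deadline = oldest_submit + disk_timeout;
        unsigned long next_timer    = min_ul(net_deadline, disk_deadline);

        /* oldest submit 850, rearm at 1150 */
        printf("oldest submit %lu, rearm at %lu\n", oldest_submit, next_timer);
        return 0;
}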
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 8566cd5866b4..9f6a04080e9f 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device,
288extern void request_timer_fn(unsigned long data); 288extern void request_timer_fn(unsigned long data);
289extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); 289extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
290extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); 290extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
291extern void tl_abort_disk_io(struct drbd_device *device);
291 292
292/* this is in drbd_main.c */ 293/* this is in drbd_main.c */
293extern void drbd_restart_request(struct drbd_request *req); 294extern void drbd_restart_request(struct drbd_request *req);
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index a5d8aae00e04..c35c0f001bb7 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
410 return rv; 410 return rv;
411} 411}
412 412
413static void print_st(struct drbd_device *device, char *name, union drbd_state ns) 413static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
414{ 414{
415 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", 415 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
416 name, 416 name,
@@ -952,11 +952,12 @@ enum drbd_state_rv
952__drbd_set_state(struct drbd_device *device, union drbd_state ns, 952__drbd_set_state(struct drbd_device *device, union drbd_state ns,
953 enum chg_state_flags flags, struct completion *done) 953 enum chg_state_flags flags, struct completion *done)
954{ 954{
955 struct drbd_peer_device *peer_device = first_peer_device(device);
956 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
955 union drbd_state os; 957 union drbd_state os;
956 enum drbd_state_rv rv = SS_SUCCESS; 958 enum drbd_state_rv rv = SS_SUCCESS;
957 enum sanitize_state_warnings ssw; 959 enum sanitize_state_warnings ssw;
958 struct after_state_chg_work *ascw; 960 struct after_state_chg_work *ascw;
959 bool did_remote, should_do_remote;
960 961
961 os = drbd_read_state(device); 962 os = drbd_read_state(device);
962 963
@@ -978,9 +979,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
978 this happen...*/ 979 this happen...*/
979 980
980 if (is_valid_state(device, os) == rv) 981 if (is_valid_state(device, os) == rv)
981 rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); 982 rv = is_valid_soft_transition(os, ns, connection);
982 } else 983 } else
983 rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); 984 rv = is_valid_soft_transition(os, ns, connection);
984 } 985 }
985 986
986 if (rv < SS_SUCCESS) { 987 if (rv < SS_SUCCESS) {
@@ -997,7 +998,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
997 sanitize_state(). Only display it here if we were not called from 998 sanitize_state(). Only display it here if we were not called from
998 _conn_request_state() */ 999 _conn_request_state() */
999 if (!(flags & CS_DC_SUSP)) 1000 if (!(flags & CS_DC_SUSP))
1000 conn_pr_state_change(first_peer_device(device)->connection, os, ns, 1001 conn_pr_state_change(connection, os, ns,
1001 (flags & ~CS_DC_MASK) | CS_DC_SUSP); 1002 (flags & ~CS_DC_MASK) | CS_DC_SUSP);
1002 1003
1003 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference 1004 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
@@ -1008,28 +1009,35 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1008 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) 1009 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1009 atomic_inc(&device->local_cnt); 1010 atomic_inc(&device->local_cnt);
1010 1011
1011 did_remote = drbd_should_do_remote(device->state); 1012 if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
1013 clear_bit(RS_DONE, &device->flags);
1014
1015 /* changes to local_cnt and device flags should be visible before
1016 * changes to state, which again should be visible before anything else
1017 * depending on that change happens. */
1018 smp_wmb();
1012 device->state.i = ns.i; 1019 device->state.i = ns.i;
1013 should_do_remote = drbd_should_do_remote(device->state);
1014 device->resource->susp = ns.susp; 1020 device->resource->susp = ns.susp;
1015 device->resource->susp_nod = ns.susp_nod; 1021 device->resource->susp_nod = ns.susp_nod;
1016 device->resource->susp_fen = ns.susp_fen; 1022 device->resource->susp_fen = ns.susp_fen;
1023 smp_wmb();
1017 1024
1018 /* put replicated vs not-replicated requests in separate epochs */ 1025 /* put replicated vs not-replicated requests in separate epochs */
1019 if (did_remote != should_do_remote) 1026 if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
1020 start_new_tl_epoch(first_peer_device(device)->connection); 1027 drbd_should_do_remote((union drbd_dev_state)ns.i))
1028 start_new_tl_epoch(connection);
1021 1029
1022 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) 1030 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1023 drbd_print_uuids(device, "attached to UUIDs"); 1031 drbd_print_uuids(device, "attached to UUIDs");
1024 1032
1025 /* Wake up role changes, that were delayed because of connection establishing */ 1033 /* Wake up role changes, that were delayed because of connection establishing */
1026 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && 1034 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
1027 no_peer_wf_report_params(first_peer_device(device)->connection)) 1035 no_peer_wf_report_params(connection))
1028 clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags); 1036 clear_bit(STATE_SENT, &connection->flags);
1029 1037
1030 wake_up(&device->misc_wait); 1038 wake_up(&device->misc_wait);
1031 wake_up(&device->state_wait); 1039 wake_up(&device->state_wait);
1032 wake_up(&first_peer_device(device)->connection->ping_wait); 1040 wake_up(&connection->ping_wait);
1033 1041
1034 /* Aborted verify run, or we reached the stop sector. 1042 /* Aborted verify run, or we reached the stop sector.
1035 * Log the last position, unless end-of-device. */ 1043 * Log the last position, unless end-of-device. */
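
The hunk above does two things: it routes through the cached connection pointer instead of repeated first_peer_device() calls, and it adds smp_wmb() so that flag and counter updates become visible before the new state word does. A rough userspace analogue of that ordering, using C11 fences instead of the kernel barrier API (the variable names below are invented for illustration):

/* Publish auxiliary data before the word that readers test, and pair the
 * release fence with an acquire fence on the reader side.  This is only an
 * analogy for the smp_wmb() ordering above; it is not the kernel API. */
#include <stdatomic.h>
#include <stdio.h>

static int local_cnt;                   /* plain data, written first            */
static atomic_uint state;               /* the word other threads actually read */

static void writer(void)
{
        local_cnt++;                                  /* like atomic_inc(&device->local_cnt) */
        atomic_thread_fence(memory_order_release);    /* like smp_wmb()                      */
        atomic_store_explicit(&state, 1, memory_order_relaxed);
}

static void reader(void)
{
        if (atomic_load_explicit(&state, memory_order_relaxed)) {
                atomic_thread_fence(memory_order_acquire);   /* pairs with the release fence */
                printf("local_cnt=%d\n", local_cnt);         /* sees the writer's update     */
        }
}

int main(void)
{
        writer();
        reader();
        return 0;
}
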
@@ -1118,21 +1126,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1118 1126
1119 /* Receiver should clean up itself */ 1127 /* Receiver should clean up itself */
1120 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) 1128 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1121 drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); 1129 drbd_thread_stop_nowait(&connection->receiver);
1122 1130
1123 /* Now the receiver finished cleaning up itself, it should die */ 1131 /* Now the receiver finished cleaning up itself, it should die */
1124 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) 1132 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1125 drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); 1133 drbd_thread_stop_nowait(&connection->receiver);
1126 1134
1127 /* Upon network failure, we need to restart the receiver. */ 1135 /* Upon network failure, we need to restart the receiver. */
1128 if (os.conn > C_WF_CONNECTION && 1136 if (os.conn > C_WF_CONNECTION &&
1129 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1137 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1130 drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver); 1138 drbd_thread_restart_nowait(&connection->receiver);
1131 1139
1132 /* Resume AL writing if we get a connection */ 1140 /* Resume AL writing if we get a connection */
1133 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { 1141 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1134 drbd_resume_al(device); 1142 drbd_resume_al(device);
1135 first_peer_device(device)->connection->connect_cnt++; 1143 connection->connect_cnt++;
1136 } 1144 }
1137 1145
1138 /* remember last attach time so request_timer_fn() won't 1146 /* remember last attach time so request_timer_fn() won't
@@ -1150,7 +1158,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1150 ascw->w.cb = w_after_state_ch; 1158 ascw->w.cb = w_after_state_ch;
1151 ascw->device = device; 1159 ascw->device = device;
1152 ascw->done = done; 1160 ascw->done = done;
1153 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 1161 drbd_queue_work(&connection->sender_work,
1154 &ascw->w); 1162 &ascw->w);
1155 } else { 1163 } else {
1156 drbd_err(device, "Could not kmalloc an ascw\n"); 1164 drbd_err(device, "Could not kmalloc an ascw\n");
@@ -1222,13 +1230,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1222 union drbd_state ns, enum chg_state_flags flags) 1230 union drbd_state ns, enum chg_state_flags flags)
1223{ 1231{
1224 struct drbd_resource *resource = device->resource; 1232 struct drbd_resource *resource = device->resource;
1233 struct drbd_peer_device *peer_device = first_peer_device(device);
1234 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1225 struct sib_info sib; 1235 struct sib_info sib;
1226 1236
1227 sib.sib_reason = SIB_STATE_CHANGE; 1237 sib.sib_reason = SIB_STATE_CHANGE;
1228 sib.os = os; 1238 sib.os = os;
1229 sib.ns = ns; 1239 sib.ns = ns;
1230 1240
1231 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { 1241 if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
1242 && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
1232 clear_bit(CRASHED_PRIMARY, &device->flags); 1243 clear_bit(CRASHED_PRIMARY, &device->flags);
1233 if (device->p_uuid) 1244 if (device->p_uuid)
1234 device->p_uuid[UI_FLAGS] &= ~((u64)2); 1245 device->p_uuid[UI_FLAGS] &= ~((u64)2);
@@ -1245,7 +1256,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1245 state change. This function might sleep */ 1256 state change. This function might sleep */
1246 1257
1247 if (ns.susp_nod) { 1258 if (ns.susp_nod) {
1248 struct drbd_connection *connection = first_peer_device(device)->connection;
1249 enum drbd_req_event what = NOTHING; 1259 enum drbd_req_event what = NOTHING;
1250 1260
1251 spin_lock_irq(&device->resource->req_lock); 1261 spin_lock_irq(&device->resource->req_lock);
@@ -1267,8 +1277,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1267 } 1277 }
1268 1278
1269 if (ns.susp_fen) { 1279 if (ns.susp_fen) {
1270 struct drbd_connection *connection = first_peer_device(device)->connection;
1271
1272 spin_lock_irq(&device->resource->req_lock); 1280 spin_lock_irq(&device->resource->req_lock);
1273 if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { 1281 if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
1274 /* case2: The connection was established again: */ 1282 /* case2: The connection was established again: */
@@ -1294,8 +1302,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1294 * which is unexpected. */ 1302 * which is unexpected. */
1295 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && 1303 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1296 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && 1304 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1297 first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) { 1305 connection->agreed_pro_version >= 96 && get_ldev(device)) {
1298 drbd_gen_and_send_sync_uuid(first_peer_device(device)); 1306 drbd_gen_and_send_sync_uuid(peer_device);
1299 put_ldev(device); 1307 put_ldev(device);
1300 } 1308 }
1301 1309
@@ -1309,8 +1317,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1309 atomic_set(&device->rs_pending_cnt, 0); 1317 atomic_set(&device->rs_pending_cnt, 0);
1310 drbd_rs_cancel_all(device); 1318 drbd_rs_cancel_all(device);
1311 1319
1312 drbd_send_uuids(first_peer_device(device)); 1320 drbd_send_uuids(peer_device);
1313 drbd_send_state(first_peer_device(device), ns); 1321 drbd_send_state(peer_device, ns);
1314 } 1322 }
1315 /* No point in queuing send_bitmap if we don't have a connection 1323 /* No point in queuing send_bitmap if we don't have a connection
1316 * anymore, so check also the _current_ state, not only the new state 1324 * anymore, so check also the _current_ state, not only the new state
@@ -1335,7 +1343,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1335 set_bit(NEW_CUR_UUID, &device->flags); 1343 set_bit(NEW_CUR_UUID, &device->flags);
1336 } else { 1344 } else {
1337 drbd_uuid_new_current(device); 1345 drbd_uuid_new_current(device);
1338 drbd_send_uuids(first_peer_device(device)); 1346 drbd_send_uuids(peer_device);
1339 } 1347 }
1340 } 1348 }
1341 put_ldev(device); 1349 put_ldev(device);
@@ -1346,7 +1354,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1346 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && 1354 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1347 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1355 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1348 drbd_uuid_new_current(device); 1356 drbd_uuid_new_current(device);
1349 drbd_send_uuids(first_peer_device(device)); 1357 drbd_send_uuids(peer_device);
1350 } 1358 }
1351 /* D_DISKLESS Peer becomes secondary */ 1359 /* D_DISKLESS Peer becomes secondary */
1352 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1360 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1373,16 +1381,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1373 /* Last part of the attaching process ... */ 1381 /* Last part of the attaching process ... */
1374 if (ns.conn >= C_CONNECTED && 1382 if (ns.conn >= C_CONNECTED &&
1375 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1383 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1376 drbd_send_sizes(first_peer_device(device), 0, 0); /* to start sync... */ 1384 drbd_send_sizes(peer_device, 0, 0); /* to start sync... */
1377 drbd_send_uuids(first_peer_device(device)); 1385 drbd_send_uuids(peer_device);
1378 drbd_send_state(first_peer_device(device), ns); 1386 drbd_send_state(peer_device, ns);
1379 } 1387 }
1380 1388
1381 /* We want to pause/continue resync, tell peer. */ 1389 /* We want to pause/continue resync, tell peer. */
1382 if (ns.conn >= C_CONNECTED && 1390 if (ns.conn >= C_CONNECTED &&
1383 ((os.aftr_isp != ns.aftr_isp) || 1391 ((os.aftr_isp != ns.aftr_isp) ||
1384 (os.user_isp != ns.user_isp))) 1392 (os.user_isp != ns.user_isp)))
1385 drbd_send_state(first_peer_device(device), ns); 1393 drbd_send_state(peer_device, ns);
1386 1394
1387 /* In case one of the isp bits got set, suspend other devices. */ 1395 /* In case one of the isp bits got set, suspend other devices. */
1388 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && 1396 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1392,10 +1400,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1392 /* Make sure the peer gets informed about eventual state 1400 /* Make sure the peer gets informed about eventual state
1393 changes (ISP bits) while we were in WFReportParams. */ 1401 changes (ISP bits) while we were in WFReportParams. */
1394 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1402 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1395 drbd_send_state(first_peer_device(device), ns); 1403 drbd_send_state(peer_device, ns);
1396 1404
1397 if (os.conn != C_AHEAD && ns.conn == C_AHEAD) 1405 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1398 drbd_send_state(first_peer_device(device), ns); 1406 drbd_send_state(peer_device, ns);
1399 1407
1400 /* We are in the progress to start a full sync... */ 1408 /* We are in the progress to start a full sync... */
1401 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || 1409 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1449,7 +1457,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1449 drbd_disk_str(device->state.disk)); 1457 drbd_disk_str(device->state.disk));
1450 1458
1451 if (ns.conn >= C_CONNECTED) 1459 if (ns.conn >= C_CONNECTED)
1452 drbd_send_state(first_peer_device(device), ns); 1460 drbd_send_state(peer_device, ns);
1453 1461
1454 drbd_rs_cancel_all(device); 1462 drbd_rs_cancel_all(device);
1455 1463
@@ -1473,7 +1481,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1473 drbd_disk_str(device->state.disk)); 1481 drbd_disk_str(device->state.disk));
1474 1482
1475 if (ns.conn >= C_CONNECTED) 1483 if (ns.conn >= C_CONNECTED)
1476 drbd_send_state(first_peer_device(device), ns); 1484 drbd_send_state(peer_device, ns);
1477 /* corresponding get_ldev in __drbd_set_state 1485 /* corresponding get_ldev in __drbd_set_state
1478 * this may finally trigger drbd_ldev_destroy. */ 1486 * this may finally trigger drbd_ldev_destroy. */
1479 put_ldev(device); 1487 put_ldev(device);
@@ -1481,7 +1489,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1481 1489
1482 /* Notify peer that I had a local IO error, and did not detach. */ 1490 /* Notify peer that I had a local IO error, and did not detach. */
1483 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) 1491 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
1484 drbd_send_state(first_peer_device(device), ns); 1492 drbd_send_state(peer_device, ns);
1485 1493
1486 /* Disks got bigger while they were detached */ 1494 /* Disks got bigger while they were detached */
1487 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && 1495 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1499,14 +1507,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1499 /* sync target done with resync. Explicitly notify peer, even though 1507 /* sync target done with resync. Explicitly notify peer, even though
1500 * it should (at least for non-empty resyncs) already know itself. */ 1508 * it should (at least for non-empty resyncs) already know itself. */
1501 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) 1509 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1502 drbd_send_state(first_peer_device(device), ns); 1510 drbd_send_state(peer_device, ns);
1503 1511
1504 /* Verify finished, or reached stop sector. Peer did not know about 1512 /* Verify finished, or reached stop sector. Peer did not know about
1505 * the stop sector, and we may even have changed the stop sector during 1513 * the stop sector, and we may even have changed the stop sector during
1506 * verify to interrupt/stop early. Send the new state. */ 1514 * verify to interrupt/stop early. Send the new state. */
1507 if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED 1515 if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
1508 && verify_can_do_stop_sector(device)) 1516 && verify_can_do_stop_sector(device))
1509 drbd_send_state(first_peer_device(device), ns); 1517 drbd_send_state(peer_device, ns);
1510 1518
1511 /* This triggers bitmap writeout of potentially still unwritten pages 1519 /* This triggers bitmap writeout of potentially still unwritten pages
1512 * if the resync finished cleanly, or aborted because of peer disk 1520 * if the resync finished cleanly, or aborted because of peer disk
@@ -1563,7 +1571,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1563 old_conf = connection->net_conf; 1571 old_conf = connection->net_conf;
1564 connection->my_addr_len = 0; 1572 connection->my_addr_len = 0;
1565 connection->peer_addr_len = 0; 1573 connection->peer_addr_len = 0;
1566 rcu_assign_pointer(connection->net_conf, NULL); 1574 RCU_INIT_POINTER(connection->net_conf, NULL);
1567 conn_free_crypto(connection); 1575 conn_free_crypto(connection);
1568 mutex_unlock(&connection->resource->conf_update); 1576 mutex_unlock(&connection->resource->conf_update);
1569 1577
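
Clearing connection->net_conf switches from rcu_assign_pointer() to RCU_INIT_POINTER(): when the value being published is NULL there is no pointed-to data whose initialization has to be ordered before the pointer store, so the release barrier can be dropped. A hedged userspace analogue with C11 atomics (struct and field names invented for illustration):

/* Publishing a real object needs a release store; publishing NULL does not,
 * because readers that see NULL never dereference anything.  Analogy only,
 * not the kernel RCU API. */
#include <stdatomic.h>
#include <stdlib.h>

struct conf { int timeout; };

static _Atomic(struct conf *) conf_ptr;

static void publish(struct conf *c)
{
        c->timeout = 60;
        /* readers must see ->timeout once they see the pointer */
        atomic_store_explicit(&conf_ptr, c, memory_order_release);
}

static void clear(void)
{
        /* nothing behind NULL to order against */
        atomic_store_explicit(&conf_ptr, NULL, memory_order_relaxed);
}

int main(void)
{
        struct conf *c = malloc(sizeof(*c));

        if (!c)
                return 1;
        publish(c);
        clear();
        free(c);
        return 0;
}
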
@@ -1599,7 +1607,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1599 return 0; 1607 return 0;
1600} 1608}
1601 1609
1602void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) 1610static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
1603{ 1611{
1604 enum chg_state_flags flags = ~0; 1612 enum chg_state_flags flags = ~0;
1605 struct drbd_peer_device *peer_device; 1613 struct drbd_peer_device *peer_device;
@@ -1688,7 +1696,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
1688 return rv; 1696 return rv;
1689} 1697}
1690 1698
1691void 1699static void
1692conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, 1700conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
1693 union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) 1701 union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
1694{ 1702{
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d8f57b6305cd..50776b362828 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -67,13 +67,10 @@ rwlock_t global_state_lock;
67 */ 67 */
68void drbd_md_io_complete(struct bio *bio, int error) 68void drbd_md_io_complete(struct bio *bio, int error)
69{ 69{
70 struct drbd_md_io *md_io;
71 struct drbd_device *device; 70 struct drbd_device *device;
72 71
73 md_io = (struct drbd_md_io *)bio->bi_private; 72 device = bio->bi_private;
74 device = container_of(md_io, struct drbd_device, md_io); 73 device->md_io.error = error;
75
76 md_io->error = error;
77 74
78 /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able 75 /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
79 * to timeout on the lower level device, and eventually detach from it. 76 * to timeout on the lower level device, and eventually detach from it.
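
The completion handler no longer recovers the device with container_of() on an embedded struct drbd_md_io; bio->bi_private now points at the device directly, and the fields live in device->md_io. For readers unfamiliar with the removed idiom, a simplified standalone sketch of container_of() follows (not the kernel macro verbatim, and the struct names are illustrative):

/* Given a pointer to a member embedded in a larger struct, step back to the
 * enclosing struct.  This is what the removed container_of() call did. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct md_io { int error; int done; };

struct device_ctx {
        const char *name;
        struct md_io md_io;             /* embedded, like device->md_io */
};

static void complete(struct md_io *io, int error)
{
        struct device_ctx *dev = container_of(io, struct device_ctx, md_io);

        io->error = error;
        io->done = 1;
        printf("%s: md io done, error=%d\n", dev->name, io->error);
}

int main(void)
{
        struct device_ctx dev = { .name = "drbd0" };

        complete(&dev.md_io, 0);
        return 0;
}
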
@@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error)
87 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. 84 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
88 */ 85 */
89 drbd_md_put_buffer(device); 86 drbd_md_put_buffer(device);
90 md_io->done = 1; 87 device->md_io.done = 1;
91 wake_up(&device->misc_wait); 88 wake_up(&device->misc_wait);
92 bio_put(bio); 89 bio_put(bio);
93 if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ 90 if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
@@ -135,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
135 i = peer_req->i; 132 i = peer_req->i;
136 do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; 133 do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
137 block_id = peer_req->block_id; 134 block_id = peer_req->block_id;
135 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
138 136
139 spin_lock_irqsave(&device->resource->req_lock, flags); 137 spin_lock_irqsave(&device->resource->req_lock, flags);
140 device->writ_cnt += peer_req->i.size >> 9; 138 device->writ_cnt += peer_req->i.size >> 9;
@@ -398,9 +396,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
398 if (!get_ldev(device)) 396 if (!get_ldev(device))
399 return -EIO; 397 return -EIO;
400 398
401 if (drbd_rs_should_slow_down(device, sector))
402 goto defer;
403
404 /* GFP_TRY, because if there is no memory available right now, this may 399 /* GFP_TRY, because if there is no memory available right now, this may
405 * be rescheduled for later. It is "only" background resync, after all. */ 400 * be rescheduled for later. It is "only" background resync, after all. */
406 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, 401 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
@@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
410 405
411 peer_req->w.cb = w_e_send_csum; 406 peer_req->w.cb = w_e_send_csum;
412 spin_lock_irq(&device->resource->req_lock); 407 spin_lock_irq(&device->resource->req_lock);
413 list_add(&peer_req->w.list, &device->read_ee); 408 list_add_tail(&peer_req->w.list, &device->read_ee);
414 spin_unlock_irq(&device->resource->req_lock); 409 spin_unlock_irq(&device->resource->req_lock);
415 410
416 atomic_add(size >> 9, &device->rs_sect_ev); 411 atomic_add(size >> 9, &device->rs_sect_ev);
@@ -452,9 +447,9 @@ void resync_timer_fn(unsigned long data)
452{ 447{
453 struct drbd_device *device = (struct drbd_device *) data; 448 struct drbd_device *device = (struct drbd_device *) data;
454 449
455 if (list_empty(&device->resync_work.list)) 450 drbd_queue_work_if_unqueued(
456 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 451 &first_peer_device(device)->connection->sender_work,
457 &device->resync_work); 452 &device->resync_work);
458} 453}
459 454
460static void fifo_set(struct fifo_buffer *fb, int value) 455static void fifo_set(struct fifo_buffer *fb, int value)
@@ -504,9 +499,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
504static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) 499static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
505{ 500{
506 struct disk_conf *dc; 501 struct disk_conf *dc;
507 unsigned int want; /* The number of sectors we want in the proxy */ 502 unsigned int want; /* The number of sectors we want in-flight */
508 int req_sect; /* Number of sectors to request in this turn */ 503 int req_sect; /* Number of sectors to request in this turn */
509 int correction; /* Number of sectors more we need in the proxy*/ 504 int correction; /* Number of sectors more we need in-flight */
510 int cps; /* correction per invocation of drbd_rs_controller() */ 505 int cps; /* correction per invocation of drbd_rs_controller() */
511 int steps; /* Number of time steps to plan ahead */ 506 int steps; /* Number of time steps to plan ahead */
512 int curr_corr; 507 int curr_corr;
@@ -577,20 +572,27 @@ static int drbd_rs_number_requests(struct drbd_device *device)
577 * potentially causing a distributed deadlock on congestion during 572 * potentially causing a distributed deadlock on congestion during
578 * online-verify or (checksum-based) resync, if max-buffers, 573 * online-verify or (checksum-based) resync, if max-buffers,
579 * socket buffer sizes and resync rate settings are mis-configured. */ 574 * socket buffer sizes and resync rate settings are mis-configured. */
580 if (mxb - device->rs_in_flight < number) 575
581 number = mxb - device->rs_in_flight; 576 /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
577 * mxb (as used here, and in drbd_alloc_pages on the peer) is
578 * "number of pages" (typically also 4k),
579 * but "rs_in_flight" is in "sectors" (512 Byte). */
580 if (mxb - device->rs_in_flight/8 < number)
581 number = mxb - device->rs_in_flight/8;
582 582
583 return number; 583 return number;
584} 584}
585 585
586static int make_resync_request(struct drbd_device *device, int cancel) 586static int make_resync_request(struct drbd_device *const device, int cancel)
587{ 587{
588 struct drbd_peer_device *const peer_device = first_peer_device(device);
589 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
588 unsigned long bit; 590 unsigned long bit;
589 sector_t sector; 591 sector_t sector;
590 const sector_t capacity = drbd_get_capacity(device->this_bdev); 592 const sector_t capacity = drbd_get_capacity(device->this_bdev);
591 int max_bio_size; 593 int max_bio_size;
592 int number, rollback_i, size; 594 int number, rollback_i, size;
593 int align, queued, sndbuf; 595 int align, requeue = 0;
594 int i = 0; 596 int i = 0;
595 597
596 if (unlikely(cancel)) 598 if (unlikely(cancel))
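
The new comment in drbd_rs_number_requests() spells out the unit mismatch behind the division by 8: "number" and mxb count 4 KiB blocks while rs_in_flight counts 512-byte sectors. A standalone restatement of that clamp, with made-up values (this illustrates the arithmetic only, not the kernel function itself):

/* Clamp the number of new resync requests so that, together with what is
 * already in flight, we never exceed what the peer can buffer (mxb pages). */
#include <stdio.h>

#define SECTORS_PER_4K_BLOCK 8          /* 4096 / 512 */

static int clamp_resync_requests(int number, int mxb, int rs_in_flight_sectors)
{
        int in_flight_blocks = rs_in_flight_sectors / SECTORS_PER_4K_BLOCK;

        if (mxb - in_flight_blocks < number)
                number = mxb - in_flight_blocks;
        return number;
}

int main(void)
{
        /* 2048 sectors in flight = 256 blocks; mxb = 1000 pages */
        printf("%d\n", clamp_resync_requests(500, 1000, 2048));  /* 500            */
        printf("%d\n", clamp_resync_requests(900, 1000, 2048));  /* clamped to 744 */
        return 0;
}
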
@@ -617,17 +619,22 @@ static int make_resync_request(struct drbd_device *device, int cancel)
617 goto requeue; 619 goto requeue;
618 620
619 for (i = 0; i < number; i++) { 621 for (i = 0; i < number; i++) {
620 /* Stop generating RS requests, when half of the send buffer is filled */ 622 /* Stop generating RS requests when half of the send buffer is filled,
621 mutex_lock(&first_peer_device(device)->connection->data.mutex); 623 * but notify TCP that we'd like to have more space. */
622 if (first_peer_device(device)->connection->data.socket) { 624 mutex_lock(&connection->data.mutex);
623 queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued; 625 if (connection->data.socket) {
624 sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf; 626 struct sock *sk = connection->data.socket->sk;
625 } else { 627 int queued = sk->sk_wmem_queued;
626 queued = 1; 628 int sndbuf = sk->sk_sndbuf;
627 sndbuf = 0; 629 if (queued > sndbuf / 2) {
628 } 630 requeue = 1;
629 mutex_unlock(&first_peer_device(device)->connection->data.mutex); 631 if (sk->sk_socket)
630 if (queued > sndbuf / 2) 632 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
633 }
634 } else
635 requeue = 1;
636 mutex_unlock(&connection->data.mutex);
637 if (requeue)
631 goto requeue; 638 goto requeue;
632 639
633next_sector: 640next_sector:
@@ -642,8 +649,7 @@ next_sector:
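
The rewritten throttle above stops issuing resync requests once more than half of the TCP send buffer is queued, and sets SOCK_NOSPACE so the stack signals when space frees up again. A userspace sketch of the same half-full test, using SO_SNDBUF and the Linux-specific SIOCOUTQ ioctl as stand-ins for sk_sndbuf and sk_wmem_queued (an analogy only, not the DRBD code path):

/* Return nonzero when more than half of the socket send buffer is occupied
 * by data that has not been sent yet. */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>

static int send_buffer_half_full(int fd)
{
        int sndbuf = 0, queued = 0;
        socklen_t len = sizeof(sndbuf);

        if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &len) < 0)
                return 1;               /* be conservative on error: back off */
        if (ioctl(fd, SIOCOUTQ, &queued) < 0)
                return 1;

        return queued > sndbuf / 2;     /* same threshold as the loop above */
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        printf("back off: %d\n", send_buffer_half_full(fd));
        close(fd);
        return 0;
}
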
642 649
643 sector = BM_BIT_TO_SECT(bit); 650 sector = BM_BIT_TO_SECT(bit);
644 651
645 if (drbd_rs_should_slow_down(device, sector) || 652 if (drbd_try_rs_begin_io(device, sector)) {
646 drbd_try_rs_begin_io(device, sector)) {
647 device->bm_resync_fo = bit; 653 device->bm_resync_fo = bit;
648 goto requeue; 654 goto requeue;
649 } 655 }
@@ -696,9 +702,9 @@ next_sector:
696 /* adjust very last sectors, in case we are oddly sized */ 702 /* adjust very last sectors, in case we are oddly sized */
697 if (sector + (size>>9) > capacity) 703 if (sector + (size>>9) > capacity)
698 size = (capacity-sector)<<9; 704 size = (capacity-sector)<<9;
699 if (first_peer_device(device)->connection->agreed_pro_version >= 89 && 705
700 first_peer_device(device)->connection->csums_tfm) { 706 if (device->use_csums) {
701 switch (read_for_csum(first_peer_device(device), sector, size)) { 707 switch (read_for_csum(peer_device, sector, size)) {
702 case -EIO: /* Disk failure */ 708 case -EIO: /* Disk failure */
703 put_ldev(device); 709 put_ldev(device);
704 return -EIO; 710 return -EIO;
@@ -717,7 +723,7 @@ next_sector:
717 int err; 723 int err;
718 724
719 inc_rs_pending(device); 725 inc_rs_pending(device);
720 err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST, 726 err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
721 sector, size, ID_SYNCER); 727 sector, size, ID_SYNCER);
722 if (err) { 728 if (err) {
723 drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); 729 drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -774,8 +780,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
774 780
775 size = BM_BLOCK_SIZE; 781 size = BM_BLOCK_SIZE;
776 782
777 if (drbd_rs_should_slow_down(device, sector) || 783 if (drbd_try_rs_begin_io(device, sector)) {
778 drbd_try_rs_begin_io(device, sector)) {
779 device->ov_position = sector; 784 device->ov_position = sector;
780 goto requeue; 785 goto requeue;
781 } 786 }
@@ -911,7 +916,7 @@ int drbd_resync_finished(struct drbd_device *device)
911 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) 916 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
912 khelper_cmd = "after-resync-target"; 917 khelper_cmd = "after-resync-target";
913 918
914 if (first_peer_device(device)->connection->csums_tfm && device->rs_total) { 919 if (device->use_csums && device->rs_total) {
915 const unsigned long s = device->rs_same_csum; 920 const unsigned long s = device->rs_same_csum;
916 const unsigned long t = device->rs_total; 921 const unsigned long t = device->rs_total;
917 const int ratio = 922 const int ratio =
@@ -1351,13 +1356,15 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
1351{ 1356{
1352 struct drbd_request *req = container_of(w, struct drbd_request, w); 1357 struct drbd_request *req = container_of(w, struct drbd_request, w);
1353 struct drbd_device *device = req->device; 1358 struct drbd_device *device = req->device;
1354 struct drbd_connection *connection = first_peer_device(device)->connection; 1359 struct drbd_peer_device *const peer_device = first_peer_device(device);
1360 struct drbd_connection *const connection = peer_device->connection;
1355 int err; 1361 int err;
1356 1362
1357 if (unlikely(cancel)) { 1363 if (unlikely(cancel)) {
1358 req_mod(req, SEND_CANCELED); 1364 req_mod(req, SEND_CANCELED);
1359 return 0; 1365 return 0;
1360 } 1366 }
1367 req->pre_send_jif = jiffies;
1361 1368
1362 /* this time, no connection->send.current_epoch_writes++; 1369 /* this time, no connection->send.current_epoch_writes++;
1363 * If it was sent, it was the closing barrier for the last 1370 * If it was sent, it was the closing barrier for the last
@@ -1365,7 +1372,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel)
1365 * No more barriers will be sent, until we leave AHEAD mode again. */ 1372 * No more barriers will be sent, until we leave AHEAD mode again. */
1366 maybe_send_barrier(connection, req->epoch); 1373 maybe_send_barrier(connection, req->epoch);
1367 1374
1368 err = drbd_send_out_of_sync(first_peer_device(device), req); 1375 err = drbd_send_out_of_sync(peer_device, req);
1369 req_mod(req, OOS_HANDED_TO_NETWORK); 1376 req_mod(req, OOS_HANDED_TO_NETWORK);
1370 1377
1371 return err; 1378 return err;
@@ -1380,19 +1387,21 @@ int w_send_dblock(struct drbd_work *w, int cancel)
1380{ 1387{
1381 struct drbd_request *req = container_of(w, struct drbd_request, w); 1388 struct drbd_request *req = container_of(w, struct drbd_request, w);
1382 struct drbd_device *device = req->device; 1389 struct drbd_device *device = req->device;
1383 struct drbd_connection *connection = first_peer_device(device)->connection; 1390 struct drbd_peer_device *const peer_device = first_peer_device(device);
1391 struct drbd_connection *connection = peer_device->connection;
1384 int err; 1392 int err;
1385 1393
1386 if (unlikely(cancel)) { 1394 if (unlikely(cancel)) {
1387 req_mod(req, SEND_CANCELED); 1395 req_mod(req, SEND_CANCELED);
1388 return 0; 1396 return 0;
1389 } 1397 }
1398 req->pre_send_jif = jiffies;
1390 1399
1391 re_init_if_first_write(connection, req->epoch); 1400 re_init_if_first_write(connection, req->epoch);
1392 maybe_send_barrier(connection, req->epoch); 1401 maybe_send_barrier(connection, req->epoch);
1393 connection->send.current_epoch_writes++; 1402 connection->send.current_epoch_writes++;
1394 1403
1395 err = drbd_send_dblock(first_peer_device(device), req); 1404 err = drbd_send_dblock(peer_device, req);
1396 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); 1405 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1397 1406
1398 return err; 1407 return err;
@@ -1407,19 +1416,21 @@ int w_send_read_req(struct drbd_work *w, int cancel)
1407{ 1416{
1408 struct drbd_request *req = container_of(w, struct drbd_request, w); 1417 struct drbd_request *req = container_of(w, struct drbd_request, w);
1409 struct drbd_device *device = req->device; 1418 struct drbd_device *device = req->device;
1410 struct drbd_connection *connection = first_peer_device(device)->connection; 1419 struct drbd_peer_device *const peer_device = first_peer_device(device);
1420 struct drbd_connection *connection = peer_device->connection;
1411 int err; 1421 int err;
1412 1422
1413 if (unlikely(cancel)) { 1423 if (unlikely(cancel)) {
1414 req_mod(req, SEND_CANCELED); 1424 req_mod(req, SEND_CANCELED);
1415 return 0; 1425 return 0;
1416 } 1426 }
1427 req->pre_send_jif = jiffies;
1417 1428
1418 /* Even read requests may close a write epoch, 1429 /* Even read requests may close a write epoch,
1419 * if there was any yet. */ 1430 * if there was any yet. */
1420 maybe_send_barrier(connection, req->epoch); 1431 maybe_send_barrier(connection, req->epoch);
1421 1432
1422 err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size, 1433 err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
1423 (unsigned long)req); 1434 (unsigned long)req);
1424 1435
1425 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); 1436 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
@@ -1433,7 +1444,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
1433 struct drbd_device *device = req->device; 1444 struct drbd_device *device = req->device;
1434 1445
1435 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) 1446 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1436 drbd_al_begin_io(device, &req->i, false); 1447 drbd_al_begin_io(device, &req->i);
1437 1448
1438 drbd_req_make_private_bio(req, req->master_bio); 1449 drbd_req_make_private_bio(req, req->master_bio);
1439 req->private_bio->bi_bdev = device->ldev->backing_bdev; 1450 req->private_bio->bi_bdev = device->ldev->backing_bdev;
@@ -1601,26 +1612,32 @@ void drbd_rs_controller_reset(struct drbd_device *device)
1601void start_resync_timer_fn(unsigned long data) 1612void start_resync_timer_fn(unsigned long data)
1602{ 1613{
1603 struct drbd_device *device = (struct drbd_device *) data; 1614 struct drbd_device *device = (struct drbd_device *) data;
1604 1615 drbd_device_post_work(device, RS_START);
1605 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
1606 &device->start_resync_work);
1607} 1616}
1608 1617
1609int w_start_resync(struct drbd_work *w, int cancel) 1618static void do_start_resync(struct drbd_device *device)
1610{ 1619{
1611 struct drbd_device *device =
1612 container_of(w, struct drbd_device, start_resync_work);
1613
1614 if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { 1620 if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
1615 drbd_warn(device, "w_start_resync later...\n"); 1621 drbd_warn(device, "postponing start_resync ...\n");
1616 device->start_resync_timer.expires = jiffies + HZ/10; 1622 device->start_resync_timer.expires = jiffies + HZ/10;
1617 add_timer(&device->start_resync_timer); 1623 add_timer(&device->start_resync_timer);
1618 return 0; 1624 return;
1619 } 1625 }
1620 1626
1621 drbd_start_resync(device, C_SYNC_SOURCE); 1627 drbd_start_resync(device, C_SYNC_SOURCE);
1622 clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); 1628 clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
1623 return 0; 1629}
1630
1631static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
1632{
1633 bool csums_after_crash_only;
1634 rcu_read_lock();
1635 csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
1636 rcu_read_unlock();
1637 return connection->agreed_pro_version >= 89 && /* supported? */
1638 connection->csums_tfm && /* configured? */
1639 (csums_after_crash_only == 0 /* use for each resync? */
1640 || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
1624} 1641}
1625 1642
1626/** 1643/**
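
use_checksum_based_resync() folds three conditions (peer protocol support, a configured digest transform, and the csums-after-crash-only knob) into one predicate that is evaluated once when a resync starts and cached further down in device->use_csums. Restated as a standalone sketch with illustrative field names:

/* Decide whether a resync should verify blocks by checksum before
 * transferring them.  Field names only loosely mirror the kernel ones. */
#include <stdbool.h>
#include <stdio.h>

struct conn_cfg {
        int agreed_pro_version;
        bool csums_tfm_configured;
        bool csums_after_crash_only;
};

static bool use_checksum_based_resync(const struct conn_cfg *c, bool crashed_primary)
{
        return c->agreed_pro_version >= 89 &&   /* peer supports it            */
               c->csums_tfm_configured &&       /* admin configured a digest   */
               (!c->csums_after_crash_only ||   /* use it for every resync ... */
                crashed_primary);               /* ... or only after a crash   */
}

int main(void)
{
        struct conn_cfg c = { 101, true, true };

        printf("%d %d\n",
               use_checksum_based_resync(&c, false),   /* 0: crash-only, no crash */
               use_checksum_based_resync(&c, true));   /* 1                       */
        return 0;
}
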
@@ -1633,6 +1650,8 @@ int w_start_resync(struct drbd_work *w, int cancel)
1633 */ 1650 */
1634void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) 1651void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1635{ 1652{
1653 struct drbd_peer_device *peer_device = first_peer_device(device);
1654 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1636 union drbd_state ns; 1655 union drbd_state ns;
1637 int r; 1656 int r;
1638 1657
@@ -1651,7 +1670,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1651 if (r > 0) { 1670 if (r > 0) {
1652 drbd_info(device, "before-resync-target handler returned %d, " 1671 drbd_info(device, "before-resync-target handler returned %d, "
1653 "dropping connection.\n", r); 1672 "dropping connection.\n", r);
1654 conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD); 1673 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
1655 return; 1674 return;
1656 } 1675 }
1657 } else /* C_SYNC_SOURCE */ { 1676 } else /* C_SYNC_SOURCE */ {
@@ -1664,7 +1683,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1664 } else { 1683 } else {
1665 drbd_info(device, "before-resync-source handler returned %d, " 1684 drbd_info(device, "before-resync-source handler returned %d, "
1666 "dropping connection.\n", r); 1685 "dropping connection.\n", r);
1667 conn_request_state(first_peer_device(device)->connection, 1686 conn_request_state(connection,
1668 NS(conn, C_DISCONNECTING), CS_HARD); 1687 NS(conn, C_DISCONNECTING), CS_HARD);
1669 return; 1688 return;
1670 } 1689 }
@@ -1672,7 +1691,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1672 } 1691 }
1673 } 1692 }
1674 1693
1675 if (current == first_peer_device(device)->connection->worker.task) { 1694 if (current == connection->worker.task) {
1676 /* The worker should not sleep waiting for state_mutex, 1695 /* The worker should not sleep waiting for state_mutex,
1677 that can take long */ 1696 that can take long */
1678 if (!mutex_trylock(device->state_mutex)) { 1697 if (!mutex_trylock(device->state_mutex)) {
@@ -1733,11 +1752,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1733 device->rs_mark_time[i] = now; 1752 device->rs_mark_time[i] = now;
1734 } 1753 }
1735 _drbd_pause_after(device); 1754 _drbd_pause_after(device);
1755 /* Forget potentially stale cached per resync extent bit-counts.
1756 * Open coded drbd_rs_cancel_all(device), we already have IRQs
1757 * disabled, and know the disk state is ok. */
1758 spin_lock(&device->al_lock);
1759 lc_reset(device->resync);
1760 device->resync_locked = 0;
1761 device->resync_wenr = LC_FREE;
1762 spin_unlock(&device->al_lock);
1736 } 1763 }
1737 write_unlock(&global_state_lock); 1764 write_unlock(&global_state_lock);
1738 spin_unlock_irq(&device->resource->req_lock); 1765 spin_unlock_irq(&device->resource->req_lock);
1739 1766
1740 if (r == SS_SUCCESS) { 1767 if (r == SS_SUCCESS) {
1768 wake_up(&device->al_wait); /* for lc_reset() above */
1741 /* reset rs_last_bcast when a resync or verify is started, 1769 /* reset rs_last_bcast when a resync or verify is started,
1742 * to deal with potential jiffies wrap. */ 1770 * to deal with potential jiffies wrap. */
1743 device->rs_last_bcast = jiffies - HZ; 1771 device->rs_last_bcast = jiffies - HZ;
@@ -1746,8 +1774,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1746 drbd_conn_str(ns.conn), 1774 drbd_conn_str(ns.conn),
1747 (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), 1775 (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1748 (unsigned long) device->rs_total); 1776 (unsigned long) device->rs_total);
1749 if (side == C_SYNC_TARGET) 1777 if (side == C_SYNC_TARGET) {
1750 device->bm_resync_fo = 0; 1778 device->bm_resync_fo = 0;
1779 device->use_csums = use_checksum_based_resync(connection, device);
1780 } else {
1781 device->use_csums = 0;
1782 }
1751 1783
1752 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid 1784 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1753 * with w_send_oos, or the sync target will get confused as to 1785 * with w_send_oos, or the sync target will get confused as to
@@ -1756,12 +1788,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1756 * drbd_resync_finished from here in that case. 1788 * drbd_resync_finished from here in that case.
1757 * We drbd_gen_and_send_sync_uuid here for protocol < 96, 1789 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1758 * and from after_state_ch otherwise. */ 1790 * and from after_state_ch otherwise. */
1759 if (side == C_SYNC_SOURCE && 1791 if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
1760 first_peer_device(device)->connection->agreed_pro_version < 96) 1792 drbd_gen_and_send_sync_uuid(peer_device);
1761 drbd_gen_and_send_sync_uuid(first_peer_device(device));
1762 1793
1763 if (first_peer_device(device)->connection->agreed_pro_version < 95 && 1794 if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
1764 device->rs_total == 0) {
1765 /* This still has a race (about when exactly the peers 1795 /* This still has a race (about when exactly the peers
1766 * detect connection loss) that can lead to a full sync 1796 * detect connection loss) that can lead to a full sync
1767 * on next handshake. In 8.3.9 we fixed this with explicit 1797 * on next handshake. In 8.3.9 we fixed this with explicit
@@ -1777,7 +1807,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1777 int timeo; 1807 int timeo;
1778 1808
1779 rcu_read_lock(); 1809 rcu_read_lock();
1780 nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 1810 nc = rcu_dereference(connection->net_conf);
1781 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; 1811 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1782 rcu_read_unlock(); 1812 rcu_read_unlock();
1783 schedule_timeout_interruptible(timeo); 1813 schedule_timeout_interruptible(timeo);
@@ -1799,10 +1829,165 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1799 mutex_unlock(device->state_mutex); 1829 mutex_unlock(device->state_mutex);
1800} 1830}
1801 1831
1832static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
1833{
1834 struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
1835 device->rs_last_bcast = jiffies;
1836
1837 if (!get_ldev(device))
1838 return;
1839
1840 drbd_bm_write_lazy(device, 0);
1841 if (resync_done && is_sync_state(device->state.conn))
1842 drbd_resync_finished(device);
1843
1844 drbd_bcast_event(device, &sib);
1845 /* update timestamp, in case it took a while to write out stuff */
1846 device->rs_last_bcast = jiffies;
1847 put_ldev(device);
1848}
1849
1850static void drbd_ldev_destroy(struct drbd_device *device)
1851{
1852 lc_destroy(device->resync);
1853 device->resync = NULL;
1854 lc_destroy(device->act_log);
1855 device->act_log = NULL;
1856 __no_warn(local,
1857 drbd_free_ldev(device->ldev);
1858 device->ldev = NULL;);
1859 clear_bit(GOING_DISKLESS, &device->flags);
1860 wake_up(&device->misc_wait);
1861}
1862
1863static void go_diskless(struct drbd_device *device)
1864{
1865 D_ASSERT(device, device->state.disk == D_FAILED);
1866 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
1867 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
1868 * the protected members anymore, though, so once put_ldev reaches zero
1869 * again, it will be safe to free them. */
1870
1871 /* Try to write changed bitmap pages, read errors may have just
1872 * set some bits outside the area covered by the activity log.
1873 *
1874 * If we have an IO error during the bitmap writeout,
1875 * we will want a full sync next time, just in case.
1876 * (Do we want a specific meta data flag for this?)
1877 *
1878 * If that does not make it to stable storage either,
1879 * we cannot do anything about that anymore.
1880 *
1881 * We still need to check if both bitmap and ldev are present, we may
1882 * end up here after a failed attach, before ldev was even assigned.
1883 */
1884 if (device->bitmap && device->ldev) {
1885 /* An interrupted resync or similar is allowed to recount bits
1886 * while we detach.
1887 * Any modifications would not be expected anymore, though.
1888 */
1889 if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
1890 "detach", BM_LOCKED_TEST_ALLOWED)) {
1891 if (test_bit(WAS_READ_ERROR, &device->flags)) {
1892 drbd_md_set_flag(device, MDF_FULL_SYNC);
1893 drbd_md_sync(device);
1894 }
1895 }
1896 }
1897
1898 drbd_force_state(device, NS(disk, D_DISKLESS));
1899}
1900
1901static int do_md_sync(struct drbd_device *device)
1902{
1903 drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
1904 drbd_md_sync(device);
1905 return 0;
1906}
1907
1908/* only called from drbd_worker thread, no locking */
1909void __update_timing_details(
1910 struct drbd_thread_timing_details *tdp,
1911 unsigned int *cb_nr,
1912 void *cb,
1913 const char *fn, const unsigned int line)
1914{
1915 unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
1916 struct drbd_thread_timing_details *td = tdp + i;
1917
1918 td->start_jif = jiffies;
1919 td->cb_addr = cb;
1920 td->caller_fn = fn;
1921 td->line = line;
1922 td->cb_nr = *cb_nr;
1923
1924 i = (i+1) % DRBD_THREAD_DETAILS_HIST;
1925 td = tdp + i;
1926 memset(td, 0, sizeof(*td));
1927
1928 ++(*cb_nr);
1929}
1930
1931#define WORK_PENDING(work_bit, todo) (todo & (1UL << work_bit))
1932static void do_device_work(struct drbd_device *device, const unsigned long todo)
1933{
1934 if (WORK_PENDING(MD_SYNC, todo))
1935 do_md_sync(device);
1936 if (WORK_PENDING(RS_DONE, todo) ||
1937 WORK_PENDING(RS_PROGRESS, todo))
1938 update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo));
1939 if (WORK_PENDING(GO_DISKLESS, todo))
1940 go_diskless(device);
1941 if (WORK_PENDING(DESTROY_DISK, todo))
1942 drbd_ldev_destroy(device);
1943 if (WORK_PENDING(RS_START, todo))
1944 do_start_resync(device);
1945}
1946
1947#define DRBD_DEVICE_WORK_MASK \
1948 ((1UL << GO_DISKLESS) \
1949 |(1UL << DESTROY_DISK) \
1950 |(1UL << MD_SYNC) \
1951 |(1UL << RS_START) \
1952 |(1UL << RS_PROGRESS) \
1953 |(1UL << RS_DONE) \
1954 )
1955
1956static unsigned long get_work_bits(unsigned long *flags)
1957{
1958 unsigned long old, new;
1959 do {
1960 old = *flags;
1961 new = old & ~DRBD_DEVICE_WORK_MASK;
1962 } while (cmpxchg(flags, old, new) != old);
1963 return old & DRBD_DEVICE_WORK_MASK;
1964}
1965
1966static void do_unqueued_work(struct drbd_connection *connection)
1967{
1968 struct drbd_peer_device *peer_device;
1969 int vnr;
1970
1971 rcu_read_lock();
1972 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1973 struct drbd_device *device = peer_device->device;
1974 unsigned long todo = get_work_bits(&device->flags);
1975 if (!todo)
1976 continue;
1977
1978 kref_get(&device->kref);
1979 rcu_read_unlock();
1980 do_device_work(device, todo);
1981 kref_put(&device->kref, drbd_destroy_device);
1982 rcu_read_lock();
1983 }
1984 rcu_read_unlock();
1985}
1986
1802static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) 1987static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1803{ 1988{
1804 spin_lock_irq(&queue->q_lock); 1989 spin_lock_irq(&queue->q_lock);
1805 list_splice_init(&queue->q, work_list); 1990 list_splice_tail_init(&queue->q, work_list);
1806 spin_unlock_irq(&queue->q_lock); 1991 spin_unlock_irq(&queue->q_lock);
1807 return !list_empty(work_list); 1992 return !list_empty(work_list);
1808} 1993}
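
Among the helpers added above, get_work_bits() harvests the pending device-work bits from device->flags with a cmpxchg() loop, so bits set concurrently by other contexts are either returned in this pass or left in place for the next one, never lost. A userspace analogue using C11 compare-exchange (the bit names are reused purely for illustration):

/* Atomically fetch-and-clear a mask of "work pending" bits. */
#include <stdatomic.h>
#include <stdio.h>

#define GO_DISKLESS     (1UL << 0)
#define MD_SYNC         (1UL << 1)
#define RS_START        (1UL << 2)
#define WORK_MASK       (GO_DISKLESS | MD_SYNC | RS_START)

static atomic_ulong device_flags;

static unsigned long get_work_bits(atomic_ulong *flags)
{
        unsigned long old = atomic_load(flags), new;

        do {
                new = old & ~WORK_MASK;                 /* clear only the work bits */
        } while (!atomic_compare_exchange_weak(flags, &old, new));
        return old & WORK_MASK;                         /* report what was pending  */
}

int main(void)
{
        atomic_fetch_or(&device_flags, MD_SYNC | RS_START);
        printf("todo = %#lx\n", get_work_bits(&device_flags));  /* 0x6 */
        printf("left = %#lx\n", atomic_load(&device_flags));    /* 0   */
        return 0;
}
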
@@ -1851,7 +2036,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
1851 /* dequeue single item only, 2036 /* dequeue single item only,
1852 * we still use drbd_queue_work_front() in some places */ 2037 * we still use drbd_queue_work_front() in some places */
1853 if (!list_empty(&connection->sender_work.q)) 2038 if (!list_empty(&connection->sender_work.q))
1854 list_move(connection->sender_work.q.next, work_list); 2039 list_splice_tail_init(&connection->sender_work.q, work_list);
1855 spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ 2040 spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
1856 if (!list_empty(work_list) || signal_pending(current)) { 2041 if (!list_empty(work_list) || signal_pending(current)) {
1857 spin_unlock_irq(&connection->resource->req_lock); 2042 spin_unlock_irq(&connection->resource->req_lock);
@@ -1873,6 +2058,14 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
1873 if (send_barrier) 2058 if (send_barrier)
1874 maybe_send_barrier(connection, 2059 maybe_send_barrier(connection,
1875 connection->send.current_epoch_nr + 1); 2060 connection->send.current_epoch_nr + 1);
2061
2062 if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
2063 break;
2064
2065 /* drbd_send() may have called flush_signals() */
2066 if (get_t_state(&connection->worker) != RUNNING)
2067 break;
2068
1876 schedule(); 2069 schedule();
1877 /* may be woken up for other things but new work, too, 2070 /* may be woken up for other things but new work, too,
1878 * e.g. if the current epoch got closed. 2071 * e.g. if the current epoch got closed.
@@ -1906,10 +2099,15 @@ int drbd_worker(struct drbd_thread *thi)
1906 while (get_t_state(thi) == RUNNING) { 2099 while (get_t_state(thi) == RUNNING) {
1907 drbd_thread_current_set_cpu(thi); 2100 drbd_thread_current_set_cpu(thi);
1908 2101
1909 /* as long as we use drbd_queue_work_front(), 2102 if (list_empty(&work_list)) {
1910 * we may only dequeue single work items here, not batches. */ 2103 update_worker_timing_details(connection, wait_for_work);
1911 if (list_empty(&work_list))
1912 wait_for_work(connection, &work_list); 2104 wait_for_work(connection, &work_list);
2105 }
2106
2107 if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2108 update_worker_timing_details(connection, do_unqueued_work);
2109 do_unqueued_work(connection);
2110 }
1913 2111
1914 if (signal_pending(current)) { 2112 if (signal_pending(current)) {
1915 flush_signals(current); 2113 flush_signals(current);
@@ -1926,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi)
1926 while (!list_empty(&work_list)) { 2124 while (!list_empty(&work_list)) {
1927 w = list_first_entry(&work_list, struct drbd_work, list); 2125 w = list_first_entry(&work_list, struct drbd_work, list);
1928 list_del_init(&w->list); 2126 list_del_init(&w->list);
2127 update_worker_timing_details(connection, w->cb);
1929 if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) 2128 if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
1930 continue; 2129 continue;
1931 if (connection->cstate >= C_WF_REPORT_PARAMS) 2130 if (connection->cstate >= C_WF_REPORT_PARAMS)
@@ -1934,13 +2133,18 @@ int drbd_worker(struct drbd_thread *thi)
1934 } 2133 }
1935 2134
1936 do { 2135 do {
2136 if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
2137 update_worker_timing_details(connection, do_unqueued_work);
2138 do_unqueued_work(connection);
2139 }
1937 while (!list_empty(&work_list)) { 2140 while (!list_empty(&work_list)) {
1938 w = list_first_entry(&work_list, struct drbd_work, list); 2141 w = list_first_entry(&work_list, struct drbd_work, list);
1939 list_del_init(&w->list); 2142 list_del_init(&w->list);
2143 update_worker_timing_details(connection, w->cb);
1940 w->cb(w, 1); 2144 w->cb(w, 1);
1941 } 2145 }
1942 dequeue_work_batch(&connection->sender_work, &work_list); 2146 dequeue_work_batch(&connection->sender_work, &work_list);
1943 } while (!list_empty(&work_list)); 2147 } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
1944 2148
1945 rcu_read_lock(); 2149 rcu_read_lock();
1946 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 2150 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f63d358f3d93..0a581400de0f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -15,17 +15,22 @@
15#include <linux/numa.h> 15#include <linux/numa.h>
16 16
17#define PART_BITS 4 17#define PART_BITS 4
18#define VQ_NAME_LEN 16
18 19
19static int major; 20static int major;
20static DEFINE_IDA(vd_index_ida); 21static DEFINE_IDA(vd_index_ida);
21 22
22static struct workqueue_struct *virtblk_wq; 23static struct workqueue_struct *virtblk_wq;
23 24
25struct virtio_blk_vq {
26 struct virtqueue *vq;
27 spinlock_t lock;
28 char name[VQ_NAME_LEN];
29} ____cacheline_aligned_in_smp;
30
24struct virtio_blk 31struct virtio_blk
25{ 32{
26 struct virtio_device *vdev; 33 struct virtio_device *vdev;
27 struct virtqueue *vq;
28 spinlock_t vq_lock;
29 34
30 /* The disk structure for the kernel. */ 35 /* The disk structure for the kernel. */
31 struct gendisk *disk; 36 struct gendisk *disk;
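
The new struct virtio_blk_vq keeps each queue's lock, virtqueue pointer and name together and pads the whole structure to a cache line (____cacheline_aligned_in_smp) so that locks belonging to different queues, taken on different CPUs, do not false-share. A C11 sketch of the layout idea, assuming a 64-byte line (the kernel derives the real size from L1_CACHE_BYTES):

/* Per-queue data padded to its own cache line.  "int" stands in for
 * spinlock_t and "void *" for struct virtqueue *; sizes are illustrative. */
#include <stdalign.h>
#include <stdio.h>

#define VQ_NAME_LEN     16
#define CACHE_LINE      64

struct blk_vq {
        alignas(CACHE_LINE) int lock;
        void *vq;
        char name[VQ_NAME_LEN];
};      /* sizeof() is rounded up to a multiple of CACHE_LINE */

int main(void)
{
        struct blk_vq vqs[2];

        printf("sizeof=%zu align=%zu delta=%td\n",
               sizeof(struct blk_vq), alignof(struct blk_vq),
               (char *)&vqs[1] - (char *)&vqs[0]);
        return 0;
}
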
@@ -47,6 +52,10 @@ struct virtio_blk
47 52
48 /* Ida index - used to track minor number allocations. */ 53 /* Ida index - used to track minor number allocations. */
49 int index; 54 int index;
55
56 /* num of vqs */
57 int num_vqs;
58 struct virtio_blk_vq *vqs;
50}; 59};
51 60
52struct virtblk_req 61struct virtblk_req
@@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq)
133{ 142{
134 struct virtio_blk *vblk = vq->vdev->priv; 143 struct virtio_blk *vblk = vq->vdev->priv;
135 bool req_done = false; 144 bool req_done = false;
145 int qid = vq->index;
136 struct virtblk_req *vbr; 146 struct virtblk_req *vbr;
137 unsigned long flags; 147 unsigned long flags;
138 unsigned int len; 148 unsigned int len;
139 149
140 spin_lock_irqsave(&vblk->vq_lock, flags); 150 spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
141 do { 151 do {
142 virtqueue_disable_cb(vq); 152 virtqueue_disable_cb(vq);
143 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 153 while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
144 blk_mq_complete_request(vbr->req); 154 blk_mq_complete_request(vbr->req);
145 req_done = true; 155 req_done = true;
146 } 156 }
@@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq)
151 /* In case queue is stopped waiting for more buffers. */ 161 /* In case queue is stopped waiting for more buffers. */
152 if (req_done) 162 if (req_done)
153 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); 163 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
154 spin_unlock_irqrestore(&vblk->vq_lock, flags); 164 spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
155} 165}
156 166
157static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 167static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
160 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); 170 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
161 unsigned long flags; 171 unsigned long flags;
162 unsigned int num; 172 unsigned int num;
173 int qid = hctx->queue_num;
163 const bool last = (req->cmd_flags & REQ_END) != 0; 174 const bool last = (req->cmd_flags & REQ_END) != 0;
164 int err; 175 int err;
165 bool notify = false; 176 bool notify = false;
@@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
202 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 213 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
203 } 214 }
204 215
205 spin_lock_irqsave(&vblk->vq_lock, flags); 216 spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
206 err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); 217 err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
207 if (err) { 218 if (err) {
208 virtqueue_kick(vblk->vq); 219 virtqueue_kick(vblk->vqs[qid].vq);
209 blk_mq_stop_hw_queue(hctx); 220 blk_mq_stop_hw_queue(hctx);
210 spin_unlock_irqrestore(&vblk->vq_lock, flags); 221 spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
211 /* Out of mem doesn't actually happen, since we fall back 222 /* Out of mem doesn't actually happen, since we fall back
212 * to direct descriptors */ 223 * to direct descriptors */
213 if (err == -ENOMEM || err == -ENOSPC) 224 if (err == -ENOMEM || err == -ENOSPC)
@@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
-	if (last && virtqueue_kick_prepare(vblk->vq))
+	if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
 		notify = true;
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
 	if (notify)
-		virtqueue_notify(vblk->vq);
+		virtqueue_notify(vblk->vqs[qid].vq);
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
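
Note: the submission path above indexes the queue array with hctx->queue_num, while the completion path in virtblk_done() uses vq->index. The patch relies on those two numberings naming the same vblk->vqs[] slot, which holds because init_vq() (next hunk) creates the virtqueues in array order and the tag set later advertises exactly num_vqs hardware queues. A hypothetical helper, not part of the patch, makes the shared indexing explicit:

	/* Hypothetical accessor: both hctx->queue_num (submission) and
	 * vq->index (completion) identify the same per-queue state. */
	static inline struct virtio_blk_vq *virtblk_vq(struct virtio_blk *vblk,
						       unsigned int qid)
	{
		return &vblk->vqs[qid];
	}
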
@@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 static int init_vq(struct virtio_blk *vblk)
 {
 	int err = 0;
+	int i;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+	unsigned short num_vqs;
+	struct virtio_device *vdev = vblk->vdev;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+				   struct virtio_blk_config, num_queues,
+				   &num_vqs);
+	if (err)
+		num_vqs = 1;
+
+	vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+	if (!vblk->vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+	if (!callbacks)
+		goto err_callbacks;
+
+	vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+	if (!vqs)
+		goto err_vqs;
 
-	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
-	if (IS_ERR(vblk->vq))
-		err = PTR_ERR(vblk->vq);
+	for (i = 0; i < num_vqs; i++) {
+		callbacks[i] = virtblk_done;
+		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+		names[i] = vblk->vqs[i].name;
+	}
+
+	/* Discover virtqueues and write information to configuration. */
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+	if (err)
+		goto err_find_vqs;
 
+	for (i = 0; i < num_vqs; i++) {
+		spin_lock_init(&vblk->vqs[i].lock);
+		vblk->vqs[i].vq = vqs[i];
+	}
+	vblk->num_vqs = num_vqs;
+
+err_find_vqs:
+	kfree(vqs);
+err_vqs:
+	kfree(callbacks);
+err_callbacks:
+	kfree(names);
+err_names:
+	if (err)
+		kfree(vblk->vqs);
+out:
 	return err;
 }
 
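
Note: virtio_cread_feature() only reads the num_queues config field when VIRTIO_BLK_F_MQ was negotiated; otherwise it returns an error and the driver silently falls back to a single queue, so the patch stays compatible with devices that know nothing about multiqueue. The parallel arrays handed to find_vqs() have one slot per queue: callbacks[i] is the completion handler and names[i] labels the queue, and on success vqs[i] holds the i-th virtqueue. A minimal sketch of the same calling convention for a hypothetical two-queue device (names and error handling simplified):

	/* Sketch only: two virtqueues, both completed by virtblk_done(). */
	struct virtqueue *vqs[2];
	vq_callback_t *callbacks[2] = { virtblk_done, virtblk_done };
	const char *names[2] = { "req.0", "req.1" };

	err = vdev->config->find_vqs(vdev, 2, vqs, callbacks, names);
	if (err)
		return err;	/* no virtqueues were set up */

Also note that on success the code falls through the err_find_vqs label, so the temporary vqs/callbacks/names arrays are always freed; vblk->vqs is kept unless err is set.
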
@@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 	err = init_vq(vblk);
 	if (err)
 		goto out_free_vblk;
-	spin_lock_init(&vblk->vq_lock);
 
 	/* FIXME: How many partitions? How long is a piece of string? */
 	vblk->disk = alloc_disk(1 << PART_BITS);
@@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	/* Default queue sizing is to fill the ring. */
 	if (!virtblk_queue_depth) {
-		virtblk_queue_depth = vblk->vq->num_free;
+		virtblk_queue_depth = vblk->vqs[0].vq->num_free;
 		/* ... but without indirect descs, we use 2 descs per req */
 		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
 			virtblk_queue_depth /= 2;
@@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 
 	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
 	vblk->tag_set.ops = &virtio_mq_ops;
-	vblk->tag_set.nr_hw_queues = 1;
 	vblk->tag_set.queue_depth = virtblk_queue_depth;
 	vblk->tag_set.numa_node = NUMA_NO_NODE;
 	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
@@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
 	vblk->tag_set.driver_data = vblk;
+	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 
 	err = blk_mq_alloc_tag_set(&vblk->tag_set);
 	if (err)
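
Note: nr_hw_queues can no longer be hard-coded to 1; it is now taken from vblk->num_vqs, which is only known after init_vq() has negotiated the queue count, so the assignment moves here after the other tag_set fields. blk-mq then creates one hardware context per virtqueue, and hctx->queue_num in virtio_queue_rq() selects the matching vblk->vqs[] entry. A condensed view of the fields involved (illustrative ordering, values as set by this patch):

	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;	/* one hctx per virtqueue */
	err = blk_mq_alloc_tag_set(&vblk->tag_set);
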
@@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev)
 	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
 	put_disk(vblk->disk);
 	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
 	kfree(vblk);
 
 	/* Only free device id if we don't have any users */
@@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
+	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_MQ,
 };
 
 static struct virtio_driver virtio_blk = {
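
Note: listing VIRTIO_BLK_F_MQ in the feature table is what allows the virtio core to negotiate the bit with the device at probe time; without it, the virtio_cread_feature() call in init_vq() would always fail and the driver would stay on the single-queue path even on a device that offers multiple queues. The trailing comma added to the previous line is only a style change so that future additions stay one-line diffs. Paraphrased effect of the table entry (a sketch of what the feature-gated config read amounts to, not driver code):

	/* Only trust the num_queues config field if the feature was negotiated. */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_MQ))
		virtio_cread(vdev, struct virtio_blk_config, num_queues, &num_vqs);
	else
		num_vqs = 1;
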