author	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-08 14:51:05 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-08 14:51:05 -0400
commit	ebb37277796269da36a8bc5d72ed1e8e1fb7d34b (patch)
tree	0ded627a62a5cec70b18d12825dd858855c135d3 /drivers/block/drbd
parent	4de13d7aa8f4d02f4dc99d4609575659f92b3c5a (diff)
parent	f50efd2fdbd9b35b11f5778ed85beb764184bda9 (diff)
Merge branch 'for-3.10/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "It might look big in volume, but when categorized, not a lot of
  drivers are touched.  The pull request contains:

   - mtip32xx fixes from Micron.
   - A slew of drbd updates, this time in a nicer series.
   - bcache, a flash/ssd caching framework from Kent.
   - Fixes for cciss"

* 'for-3.10/drivers' of git://git.kernel.dk/linux-block: (66 commits)
  bcache: Use bd_link_disk_holder()
  bcache: Allocator cleanup/fixes
  cciss: bug fix to prevent cciss from loading in kdump crash kernel
  cciss: add cciss_allow_hpsa module parameter
  drivers/block/mg_disk.c: add CONFIG_PM_SLEEP to suspend/resume functions
  mtip32xx: Workaround for unaligned writes
  bcache: Make sure blocksize isn't smaller than device blocksize
  bcache: Fix merge_bvec_fn usage for when it modifies the bvm
  bcache: Correctly check against BIO_MAX_PAGES
  bcache: Hack around stuff that clones up to bi_max_vecs
  bcache: Set ra_pages based on backing device's ra_pages
  bcache: Take data offset from the bdev superblock.
  mtip32xx: mtip32xx: Disable TRIM support
  mtip32xx: fix a smatch warning
  bcache: Disable broken btree fuzz tester
  bcache: Fix a format string overflow
  bcache: Fix a minor memory leak on device teardown
  bcache: Documentation updates
  bcache: Use WARN_ONCE() instead of __WARN()
  bcache: Add missing #include <linux/prefetch.h>
  ...
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--	drivers/block/drbd/drbd_actlog.c	246
-rw-r--r--	drivers/block/drbd/drbd_bitmap.c	13
-rw-r--r--	drivers/block/drbd/drbd_int.h	179
-rw-r--r--	drivers/block/drbd/drbd_main.c	251
-rw-r--r--	drivers/block/drbd/drbd_nl.c	200
-rw-r--r--	drivers/block/drbd/drbd_proc.c	10
-rw-r--r--	drivers/block/drbd/drbd_receiver.c	16
-rw-r--r--	drivers/block/drbd/drbd_req.c	192
-rw-r--r--	drivers/block/drbd/drbd_req.h	8
-rw-r--r--	drivers/block/drbd/drbd_state.c	28
-rw-r--r--	drivers/block/drbd/drbd_strings.c	1
-rw-r--r--	drivers/block/drbd/drbd_worker.c	24
12 files changed, 845 insertions, 323 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 92510f8ad013..6608076dc39e 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -104,7 +104,6 @@ struct update_al_work {
104 int err; 104 int err;
105}; 105};
106 106
107static int al_write_transaction(struct drbd_conf *mdev);
108 107
109void *drbd_md_get_buffer(struct drbd_conf *mdev) 108void *drbd_md_get_buffer(struct drbd_conf *mdev)
110{ 109{
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
168 bio->bi_end_io = drbd_md_io_complete; 167 bio->bi_end_io = drbd_md_io_complete;
169 bio->bi_rw = rw; 168 bio->bi_rw = rw;
170 169
171 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ 170 if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
171 /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
172 ;
173 else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
174 /* Corresponding put_ldev in drbd_md_io_complete() */
172 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 175 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
173 err = -ENODEV; 176 err = -ENODEV;
174 goto out; 177 goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
199 202
200 BUG_ON(!bdev->md_bdev); 203 BUG_ON(!bdev->md_bdev);
201 204
202 dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", 205 dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
203 current->comm, current->pid, __func__, 206 current->comm, current->pid, __func__,
204 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 207 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
208 (void*)_RET_IP_ );
205 209
206 if (sector < drbd_md_first_sector(bdev) || 210 if (sector < drbd_md_first_sector(bdev) ||
207 sector + 7 > drbd_md_last_sector(bdev)) 211 sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
209 current->comm, current->pid, __func__, 213 current->comm, current->pid, __func__,
210 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 214 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
211 215
212 err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); 216 /* we do all our meta data IO in aligned 4k blocks. */
217 err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
213 if (err) { 218 if (err) {
214 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 219 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
215 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); 220 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
217 return err; 222 return err;
218} 223}
219 224
220static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) 225static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
221{ 226{
222 struct lc_element *al_ext;
223 struct lc_element *tmp; 227 struct lc_element *tmp;
224 int wake;
225
226 spin_lock_irq(&mdev->al_lock);
227 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 228 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
228 if (unlikely(tmp != NULL)) { 229 if (unlikely(tmp != NULL)) {
229 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 230 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
230 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 231 if (test_bit(BME_NO_WRITES, &bm_ext->flags))
231 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 232 return bm_ext;
232 spin_unlock_irq(&mdev->al_lock); 233 }
233 if (wake) 234 return NULL;
234 wake_up(&mdev->al_wait); 235}
235 return NULL; 236
236 } 237static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
238{
239 struct lc_element *al_ext;
240 struct bm_extent *bm_ext;
241 int wake;
242
243 spin_lock_irq(&mdev->al_lock);
244 bm_ext = find_active_resync_extent(mdev, enr);
245 if (bm_ext) {
246 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
247 spin_unlock_irq(&mdev->al_lock);
248 if (wake)
249 wake_up(&mdev->al_wait);
250 return NULL;
237 } 251 }
238 al_ext = lc_get(mdev->act_log, enr); 252 if (nonblock)
253 al_ext = lc_try_get(mdev->act_log, enr);
254 else
255 al_ext = lc_get(mdev->act_log, enr);
239 spin_unlock_irq(&mdev->al_lock); 256 spin_unlock_irq(&mdev->al_lock);
240 return al_ext; 257 return al_ext;
241} 258}
242 259
243void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) 260bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
244{ 261{
245 /* for bios crossing activity log extent boundaries, 262 /* for bios crossing activity log extent boundaries,
246 * we may need to activate two extents in one go */ 263 * we may need to activate two extents in one go */
247 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 264 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
248 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 265 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
249 unsigned enr;
250 bool locked = false;
251 266
267 D_ASSERT((unsigned)(last - first) <= 1);
268 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
269
270 /* FIXME figure out a fast path for bios crossing AL extent boundaries */
271 if (first != last)
272 return false;
273
274 return _al_get(mdev, first, true);
275}
276
277bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
278{
279 /* for bios crossing activity log extent boundaries,
280 * we may need to activate two extents in one go */
281 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
282 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
283 unsigned enr;
284 bool need_transaction = false;
252 285
253 D_ASSERT(first <= last); 286 D_ASSERT(first <= last);
254 D_ASSERT(atomic_read(&mdev->local_cnt) > 0); 287 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
255 288
256 for (enr = first; enr <= last; enr++) 289 for (enr = first; enr <= last; enr++) {
257 wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); 290 struct lc_element *al_ext;
291 wait_event(mdev->al_wait,
292 (al_ext = _al_get(mdev, enr, false)) != NULL);
293 if (al_ext->lc_number != enr)
294 need_transaction = true;
295 }
296 return need_transaction;
297}
298
299static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
300
301/* When called through generic_make_request(), we must delegate
302 * activity log I/O to the worker thread: a further request
303 * submitted via generic_make_request() within the same task
304 * would be queued on current->bio_list, and would only start
305 * after this function returns (see generic_make_request()).
306 *
307 * However, if we *are* the worker, we must not delegate to ourselves.
308 */
309
310/*
311 * @delegate: delegate activity log I/O to the worker thread
312 */
313void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
314{
315 bool locked = false;
316
317 BUG_ON(delegate && current == mdev->tconn->worker.task);
258 318
259 /* Serialize multiple transactions. 319 /* Serialize multiple transactions.
260 * This uses test_and_set_bit, memory barrier is implicit. 320 * This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
264 (locked = lc_try_lock_for_transaction(mdev->act_log))); 324 (locked = lc_try_lock_for_transaction(mdev->act_log)));
265 325
266 if (locked) { 326 if (locked) {
267 /* drbd_al_write_transaction(mdev,al_ext,enr);
268 * recurses into generic_make_request(), which
269 * disallows recursion, bios being serialized on the
270 * current->bio_tail list now.
271 * we have to delegate updates to the activity log
272 * to the worker thread. */
273
274 /* Double check: it may have been committed by someone else, 327 /* Double check: it may have been committed by someone else,
275 * while we have been waiting for the lock. */ 328 * while we have been waiting for the lock. */
276 if (mdev->act_log->pending_changes) { 329 if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
280 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; 333 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
281 rcu_read_unlock(); 334 rcu_read_unlock();
282 335
283 if (write_al_updates) { 336 if (write_al_updates)
284 al_write_transaction(mdev); 337 al_write_transaction(mdev, delegate);
285 mdev->al_writ_cnt++;
286 }
287
288 spin_lock_irq(&mdev->al_lock); 338 spin_lock_irq(&mdev->al_lock);
289 /* FIXME 339 /* FIXME
290 if (err) 340 if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
298 } 348 }
299} 349}
300 350
351/*
352 * @delegate: delegate activity log I/O to the worker thread
353 */
354void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
355{
356 BUG_ON(delegate && current == mdev->tconn->worker.task);
357
358 if (drbd_al_begin_io_prepare(mdev, i))
359 drbd_al_begin_io_commit(mdev, delegate);
360}
361
362int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
363{
364 struct lru_cache *al = mdev->act_log;
365 /* for bios crossing activity log extent boundaries,
366 * we may need to activate two extents in one go */
367 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
368 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
369 unsigned nr_al_extents;
370 unsigned available_update_slots;
371 unsigned enr;
372
373 D_ASSERT(first <= last);
374
375 nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
376 available_update_slots = min(al->nr_elements - al->used,
377 al->max_pending_changes - al->pending_changes);
378
379 /* We want all necessary updates for a given request within the same transaction
380 * We could first check how many updates are *actually* needed,
381 * and use that instead of the worst-case nr_al_extents */
382 if (available_update_slots < nr_al_extents)
383 return -EWOULDBLOCK;
384
385 /* Is resync active in this area? */
386 for (enr = first; enr <= last; enr++) {
387 struct lc_element *tmp;
388 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
389 if (unlikely(tmp != NULL)) {
390 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
391 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
392 if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
393 return -EBUSY;
394 return -EWOULDBLOCK;
395 }
396 }
397 }
398
399 /* Checkout the refcounts.
400 * Given that we checked for available elements and update slots above,
401 * this has to be successful. */
402 for (enr = first; enr <= last; enr++) {
403 struct lc_element *al_ext;
404 al_ext = lc_get_cumulative(mdev->act_log, enr);
405 if (!al_ext)
406 dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
407 }
408 return 0;
409}
410
301void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) 411void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
302{ 412{
303 /* for bios crossing activity log extent boundaries, 413 /* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
350 (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); 460 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
351} 461}
352 462
463static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
464{
465 const unsigned int stripes = mdev->ldev->md.al_stripes;
466 const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
467
468 /* transaction number, modulo on-disk ring buffer wrap around */
469 unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
470
471 /* ... to aligned 4k on disk block */
472 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
473
474 /* ... to 512 byte sector in activity log */
475 t *= 8;
476
477 /* ... plus offset to the on disk position */
478 return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
479}
480
353static int 481static int
354_al_write_transaction(struct drbd_conf *mdev) 482_al_write_transaction(struct drbd_conf *mdev)
355{ 483{
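As a side note on the striped activity log introduced in the hunk above: a minimal, standalone sketch of the transaction-number to on-disk-sector mapping. All concrete values here (al_stripes = 2, al_stripe_size_4k = 8, md_offset = 0, al_offset = 8) are assumptions for illustration only, not values taken from this patch.

/* Standalone demo of the mapping done by al_tr_number_to_on_disk_sector().
 * All parameters below are assumed for illustration. */
#include <stdio.h>

int main(void)
{
	const unsigned int stripes = 2;           /* assumed md.al_stripes */
	const unsigned int stripe_size_4kB = 8;   /* assumed md.al_stripe_size_4k */
	const unsigned int al_size_4k = stripes * stripe_size_4kB;
	const unsigned long long md_offset = 0;   /* assumed superblock position */
	const unsigned long long al_offset = 8;   /* assumed AL offset, in sectors */
	unsigned int al_tr_number;

	for (al_tr_number = 0; al_tr_number < 4; al_tr_number++) {
		/* transaction number, modulo on-disk ring buffer wrap around */
		unsigned int t = al_tr_number % al_size_4k;
		/* ... to aligned 4k on disk block */
		t = ((t % stripes) * stripe_size_4kB) + t / stripes;
		/* ... to 512 byte sector, plus offset to the on disk position */
		printf("transaction %u -> sector %llu\n", al_tr_number,
		       md_offset + al_offset + (unsigned long long)t * 8);
	}
	return 0;
}

With these assumed numbers the output is sectors 8, 72, 16, 80, ...: consecutive transactions alternate between the two stripes, which is the point of the striped ring buffer.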
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
432 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) 560 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
433 mdev->al_tr_cycle = 0; 561 mdev->al_tr_cycle = 0;
434 562
435 sector = mdev->ldev->md.md_offset 563 sector = al_tr_number_to_on_disk_sector(mdev);
436 + mdev->ldev->md.al_offset
437 + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
438 564
439 crc = crc32c(0, buffer, 4096); 565 crc = crc32c(0, buffer, 4096);
440 buffer->crc32c = cpu_to_be32(crc); 566 buffer->crc32c = cpu_to_be32(crc);
441 567
442 if (drbd_bm_write_hinted(mdev)) 568 if (drbd_bm_write_hinted(mdev))
443 err = -EIO; 569 err = -EIO;
444 /* drbd_chk_io_error done already */ 570 else {
445 else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 571 bool write_al_updates;
446 err = -EIO; 572 rcu_read_lock();
447 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 573 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
448 } else { 574 rcu_read_unlock();
449 /* advance ringbuffer position and transaction counter */ 575 if (write_al_updates) {
450 mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); 576 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
451 mdev->al_tr_number++; 577 err = -EIO;
578 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
579 } else {
580 mdev->al_tr_number++;
581 mdev->al_writ_cnt++;
582 }
583 }
452 } 584 }
453 585
454 drbd_md_put_buffer(mdev); 586 drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
474/* Calls from worker context (see w_restart_disk_io()) need to write the 606/* Calls from worker context (see w_restart_disk_io()) need to write the
475 transaction directly. Others came through generic_make_request(), 607 transaction directly. Others came through generic_make_request(),
476 those need to delegate it to the worker. */ 608 those need to delegate it to the worker. */
477static int al_write_transaction(struct drbd_conf *mdev) 609static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
478{ 610{
479 struct update_al_work al_work; 611 if (delegate) {
480 612 struct update_al_work al_work;
481 if (current == mdev->tconn->worker.task) 613 init_completion(&al_work.event);
614 al_work.w.cb = w_al_write_transaction;
615 al_work.w.mdev = mdev;
616 drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
617 wait_for_completion(&al_work.event);
618 return al_work.err;
619 } else
482 return _al_write_transaction(mdev); 620 return _al_write_transaction(mdev);
483
484 init_completion(&al_work.event);
485 al_work.w.cb = w_al_write_transaction;
486 al_work.w.mdev = mdev;
487 drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
488 wait_for_completion(&al_work.event);
489
490 return al_work.err;
491} 621}
492 622
493static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) 623static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8dc29502dc08..64fbb8385cdc 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
612 } 612 }
613} 613}
614 614
615/* For the layout, see comment above drbd_md_set_sector_offsets(). */
616static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
617{
618 u64 bitmap_sectors;
619 if (ldev->md.al_offset == 8)
620 bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
621 else
622 bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
623 return bitmap_sectors << (9 + 3);
624}
625
615/* 626/*
616 * make sure the bitmap has enough room for the attached storage, 627 * make sure the bitmap has enough room for the attached storage,
617 * if necessary, resize. 628 * if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
668 words = ALIGN(bits, 64) >> LN2_BPL; 679 words = ALIGN(bits, 64) >> LN2_BPL;
669 680
670 if (get_ldev(mdev)) { 681 if (get_ldev(mdev)) {
671 u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; 682 u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
672 put_ldev(mdev); 683 put_ldev(mdev);
673 if (bits > bits_on_disk) { 684 if (bits > bits_on_disk) {
674 dev_info(DEV, "bits = %lu\n", bits); 685 dev_info(DEV, "bits = %lu\n", bits);
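A quick sanity check of the new helper (illustrative numbers, not from this patch): drbd_md_on_disk_bits() converts bitmap sectors to bits with << (9 + 3), i.e. 512 bytes per sector times 8 bits per byte. Assuming a bitmap area of 4096 sectors, that yields 4096 << 12 = 16,777,216 bits; at BM_BLOCK_SIZE = 4 kiB per bit this covers 64 GiB of backing storage.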
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 6b51afa1aae1..f943aacfdad8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -753,13 +753,16 @@ struct drbd_md {
753 u32 flags; 753 u32 flags;
754 u32 md_size_sect; 754 u32 md_size_sect;
755 755
756 s32 al_offset; /* signed relative sector offset to al area */ 756 s32 al_offset; /* signed relative sector offset to activity log */
757 s32 bm_offset; /* signed relative sector offset to bitmap */ 757 s32 bm_offset; /* signed relative sector offset to bitmap */
758 758
759 /* u32 al_nr_extents; important for restoring the AL 759 /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
760 * is stored into ldev->dc.al_extents, which in turn 760 s32 meta_dev_idx;
761 * gets applied to act_log->nr_elements 761
762 */ 762 /* see al_tr_number_to_on_disk_sector() */
763 u32 al_stripes;
764 u32 al_stripe_size_4k;
765 u32 al_size_4k; /* cached product of the above */
763}; 766};
764 767
765struct drbd_backing_dev { 768struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */
891 } send; 894 } send;
892}; 895};
893 896
897struct submit_worker {
898 struct workqueue_struct *wq;
899 struct work_struct worker;
900
901 spinlock_t lock;
902 struct list_head writes;
903};
904
894struct drbd_conf { 905struct drbd_conf {
895 struct drbd_tconn *tconn; 906 struct drbd_tconn *tconn;
896 int vnr; /* volume number within the connection */ 907 int vnr; /* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
1009 struct lru_cache *act_log; /* activity log */ 1020 struct lru_cache *act_log; /* activity log */
1010 unsigned int al_tr_number; 1021 unsigned int al_tr_number;
1011 int al_tr_cycle; 1022 int al_tr_cycle;
1012 int al_tr_pos; /* position of the next transaction in the journal */
1013 wait_queue_head_t seq_wait; 1023 wait_queue_head_t seq_wait;
1014 atomic_t packet_seq; 1024 atomic_t packet_seq;
1015 unsigned int peer_seq; 1025 unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
1032 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ 1042 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1033 unsigned int peer_max_bio_size; 1043 unsigned int peer_max_bio_size;
1034 unsigned int local_max_bio_size; 1044 unsigned int local_max_bio_size;
1045
1046 /* any requests that would block in drbd_make_request()
1047 * are deferred to this single-threaded work queue */
1048 struct submit_worker submit;
1035}; 1049};
1036 1050
1037static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1051static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1148 char *why, enum bm_flag flags); 1162 char *why, enum bm_flag flags);
1149extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); 1163extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1150extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1164extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1151extern void drbd_go_diskless(struct drbd_conf *mdev);
1152extern void drbd_ldev_destroy(struct drbd_conf *mdev); 1165extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1153 1166
1154/* Meta data layout 1167/* Meta data layout
1155 We reserve a 128MB Block (4k aligned) 1168 *
1156 * either at the end of the backing device 1169 * We currently have two possible layouts.
1157 * or on a separate meta data device. */ 1170 * Offsets in (512 byte) sectors.
1171 * external:
1172 * |----------- md_size_sect ------------------|
1173 * [ 4k superblock ][ activity log ][ Bitmap ]
1174 * | al_offset == 8 |
1175 * | bm_offset = al_offset + X |
1176 * ==> bitmap sectors = md_size_sect - bm_offset
1177 *
1178 * Variants:
1179 * old, indexed fixed size meta data:
1180 *
1181 * internal:
1182 * |----------- md_size_sect ------------------|
1183 * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
1184 * | al_offset < 0 |
1185 * | bm_offset = al_offset - Y |
1186 * ==> bitmap sectors = Y = al_offset - bm_offset
1187 *
1188 * [padding*] are zero or up to 7 unused 512 Byte sectors to the
1189 * end of the device, so that the [4k superblock] will be 4k aligned.
1190 *
1191 * The activity log consists of 4k transaction blocks,
1192 * which are written in a ring-buffer, or striped ring-buffer like fashion,
1193 * whose size used to be fixed at 32kB,
1194 * but is about to become configurable.
1195 */
1158 1196
1159/* The following numbers are sectors */ 1197/* Our old fixed size meta data layout
1160/* Allows up to about 3.8TB, so if you want more, 1198 * allows up to about 3.8TB, so if you want more,
1161 * you need to use the "flexible" meta data format. */ 1199 * you need to use the "flexible" meta data format. */
1162#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ 1200#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
1163#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ 1201#define MD_4kB_SECT 8
1164#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ 1202#define MD_32kB_SECT 64
1165#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
1166
1167/* we do all meta data IO in 4k blocks */
1168#define MD_BLOCK_SHIFT 12
1169#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
1170 1203
1171/* One activity log extent represents 4M of storage */ 1204/* One activity log extent represents 4M of storage */
1172#define AL_EXTENT_SHIFT 22 1205#define AL_EXTENT_SHIFT 22
@@ -1256,7 +1289,6 @@ struct bm_extent {
1256 1289
1257/* in one sector of the bitmap, we have this many activity_log extents. */ 1290/* in one sector of the bitmap, we have this many activity_log extents. */
1258#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1291#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1259#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
1260 1292
1261#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) 1293#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1262#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) 1294#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1276,16 +1308,18 @@ struct bm_extent {
1276 */ 1308 */
1277 1309
1278#define DRBD_MAX_SECTORS_32 (0xffffffffLU) 1310#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
1279#define DRBD_MAX_SECTORS_BM \ 1311/* we have a certain meta data variant that has a fixed on-disk size of 128
1280 ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) 1312 * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
1281#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 1313 * log, leaving this many sectors for the bitmap.
1282#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1314 */
1283#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM 1315
1284#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 1316#define DRBD_MAX_SECTORS_FIXED_BM \
1317 ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
1318#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
1285#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 1319#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1286#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 1320#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1287#else 1321#else
1288#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1322#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
1289/* 16 TB in units of sectors */ 1323/* 16 TB in units of sectors */
1290#if BITS_PER_LONG == 32 1324#if BITS_PER_LONG == 32
1291/* adjust by one page worth of bitmap, 1325/* adjust by one page worth of bitmap,
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
1418extern int proc_details; 1452extern int proc_details;
1419 1453
1420/* drbd_req */ 1454/* drbd_req */
1455extern void do_submit(struct work_struct *ws);
1421extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); 1456extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
1422extern void drbd_make_request(struct request_queue *q, struct bio *bio); 1457extern void drbd_make_request(struct request_queue *q, struct bio *bio);
1423extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); 1458extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
1576extern const char *drbd_role_str(enum drbd_role s); 1611extern const char *drbd_role_str(enum drbd_role s);
1577 1612
1578/* drbd_actlog.c */ 1613/* drbd_actlog.c */
1579extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); 1614extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
1615extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
1616extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
1617extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
1580extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); 1618extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
1581extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); 1619extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1582extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); 1620extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1755 * BTW, for internal meta data, this happens to be the maximum capacity 1793 * BTW, for internal meta data, this happens to be the maximum capacity
1756 * we could agree upon with our peer node. 1794 * we could agree upon with our peer node.
1757 */ 1795 */
1758static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) 1796static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1759{ 1797{
1760 switch (meta_dev_idx) { 1798 switch (bdev->md.meta_dev_idx) {
1761 case DRBD_MD_INDEX_INTERNAL: 1799 case DRBD_MD_INDEX_INTERNAL:
1762 case DRBD_MD_INDEX_FLEX_INT: 1800 case DRBD_MD_INDEX_FLEX_INT:
1763 return bdev->md.md_offset + bdev->md.bm_offset; 1801 return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
1767 } 1805 }
1768} 1806}
1769 1807
1770static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1771{
1772 int meta_dev_idx;
1773
1774 rcu_read_lock();
1775 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1776 rcu_read_unlock();
1777
1778 return _drbd_md_first_sector(meta_dev_idx, bdev);
1779}
1780
1781/** 1808/**
1782 * drbd_md_last_sector() - Return the last sector number of the meta data area 1809 * drbd_md_last_sector() - Return the last sector number of the meta data area
1783 * @bdev: Meta data block device. 1810 * @bdev: Meta data block device.
1784 */ 1811 */
1785static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) 1812static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1786{ 1813{
1787 int meta_dev_idx; 1814 switch (bdev->md.meta_dev_idx) {
1788
1789 rcu_read_lock();
1790 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1791 rcu_read_unlock();
1792
1793 switch (meta_dev_idx) {
1794 case DRBD_MD_INDEX_INTERNAL: 1815 case DRBD_MD_INDEX_INTERNAL:
1795 case DRBD_MD_INDEX_FLEX_INT: 1816 case DRBD_MD_INDEX_FLEX_INT:
1796 return bdev->md.md_offset + MD_AL_OFFSET - 1; 1817 return bdev->md.md_offset + MD_4kB_SECT -1;
1797 case DRBD_MD_INDEX_FLEX_EXT: 1818 case DRBD_MD_INDEX_FLEX_EXT:
1798 default: 1819 default:
1799 return bdev->md.md_offset + bdev->md.md_size_sect; 1820 return bdev->md.md_offset + bdev->md.md_size_sect -1;
1800 } 1821 }
1801} 1822}
1802 1823
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
1818static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) 1839static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1819{ 1840{
1820 sector_t s; 1841 sector_t s;
1821 int meta_dev_idx;
1822 1842
1823 rcu_read_lock(); 1843 switch (bdev->md.meta_dev_idx) {
1824 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1825 rcu_read_unlock();
1826
1827 switch (meta_dev_idx) {
1828 case DRBD_MD_INDEX_INTERNAL: 1844 case DRBD_MD_INDEX_INTERNAL:
1829 case DRBD_MD_INDEX_FLEX_INT: 1845 case DRBD_MD_INDEX_FLEX_INT:
1830 s = drbd_get_capacity(bdev->backing_bdev) 1846 s = drbd_get_capacity(bdev->backing_bdev)
1831 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, 1847 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1832 _drbd_md_first_sector(meta_dev_idx, bdev)) 1848 drbd_md_first_sector(bdev))
1833 : 0; 1849 : 0;
1834 break; 1850 break;
1835 case DRBD_MD_INDEX_FLEX_EXT: 1851 case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1848} 1864}
1849 1865
1850/** 1866/**
1851 * drbd_md_ss__() - Return the sector number of our meta data super block 1867 * drbd_md_ss() - Return the sector number of our meta data super block
1852 * @mdev: DRBD device.
1853 * @bdev: Meta data block device. 1868 * @bdev: Meta data block device.
1854 */ 1869 */
1855static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, 1870static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
1856 struct drbd_backing_dev *bdev)
1857{ 1871{
1858 int meta_dev_idx; 1872 const int meta_dev_idx = bdev->md.meta_dev_idx;
1859 1873
1860 rcu_read_lock(); 1874 if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
1861 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1862 rcu_read_unlock();
1863
1864 switch (meta_dev_idx) {
1865 default: /* external, some index */
1866 return MD_RESERVED_SECT * meta_dev_idx;
1867 case DRBD_MD_INDEX_INTERNAL:
1868 /* with drbd08, internal meta data is always "flexible" */
1869 case DRBD_MD_INDEX_FLEX_INT:
1870 /* sizeof(struct md_on_disk_07) == 4k
1871 * position: last 4k aligned block of 4k size */
1872 if (!bdev->backing_bdev) {
1873 if (__ratelimit(&drbd_ratelimit_state)) {
1874 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1875 dump_stack();
1876 }
1877 return 0;
1878 }
1879 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1880 - MD_AL_OFFSET;
1881 case DRBD_MD_INDEX_FLEX_EXT:
1882 return 0; 1875 return 0;
1883 } 1876
1877 /* Since drbd08, internal meta data is always "flexible".
1878 * position: last 4k aligned block of 4k size */
1879 if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1880 meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
1881 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
1882
1883 /* external, some index; this is the old fixed size layout */
1884 return MD_128MB_SECT * bdev->md.meta_dev_idx;
1884} 1885}
1885 1886
1886static inline void 1887static inline void
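Worked example for the internal/flex-internal branch of drbd_md_ss() (assumed capacity, for illustration only): with a backing device of 2,000,005 sectors, (2000005 & ~7ULL) = 2,000,000, and subtracting 8 puts the 4k superblock at sector 1,999,992, occupying sectors 1,999,992..1,999,999; the remaining 5 sectors are the "[padding*]" mentioned in the layout comment above.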
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
2053 if (mdev->state.disk == D_DISKLESS) 2054 if (mdev->state.disk == D_DISKLESS)
2054 /* even internal references gone, safe to destroy */ 2055 /* even internal references gone, safe to destroy */
2055 drbd_ldev_destroy(mdev); 2056 drbd_ldev_destroy(mdev);
2056 if (mdev->state.disk == D_FAILED) 2057 if (mdev->state.disk == D_FAILED) {
2057 /* all application IO references gone. */ 2058 /* all application IO references gone. */
2058 drbd_go_diskless(mdev); 2059 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
2060 drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
2061 }
2059 wake_up(&mdev->misc_wait); 2062 wake_up(&mdev->misc_wait);
2060 } 2063 }
2061} 2064}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 298b868910dc..a5dca6affcbb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -45,7 +45,7 @@
45#include <linux/reboot.h> 45#include <linux/reboot.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48 48#include <linux/workqueue.h>
49#define __KERNEL_SYSCALLS__ 49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h> 50#include <linux/unistd.h>
51#include <linux/vmalloc.h> 51#include <linux/vmalloc.h>
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void)
2299 idr_for_each_entry(&minors, mdev, i) { 2299 idr_for_each_entry(&minors, mdev, i) {
2300 idr_remove(&minors, mdev_to_minor(mdev)); 2300 idr_remove(&minors, mdev_to_minor(mdev));
2301 idr_remove(&mdev->tconn->volumes, mdev->vnr); 2301 idr_remove(&mdev->tconn->volumes, mdev->vnr);
2302 destroy_workqueue(mdev->submit.wq);
2302 del_gendisk(mdev->vdisk); 2303 del_gendisk(mdev->vdisk);
2303 /* synchronize_rcu(); No other threads running at this point */ 2304 /* synchronize_rcu(); No other threads running at this point */
2304 kref_put(&mdev->kref, &drbd_minor_destroy); 2305 kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref)
2588 kfree(tconn); 2589 kfree(tconn);
2589} 2590}
2590 2591
2592int init_submitter(struct drbd_conf *mdev)
2593{
2594 /* opencoded create_singlethread_workqueue(),
2595 * to be able to say "drbd%d", ..., minor */
2596 mdev->submit.wq = alloc_workqueue("drbd%u_submit",
2597 WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
2598 if (!mdev->submit.wq)
2599 return -ENOMEM;
2600
2601 INIT_WORK(&mdev->submit.worker, do_submit);
2602 spin_lock_init(&mdev->submit.lock);
2603 INIT_LIST_HEAD(&mdev->submit.writes);
2604 return 0;
2605}
2606
2591enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) 2607enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
2592{ 2608{
2593 struct drbd_conf *mdev; 2609 struct drbd_conf *mdev;
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
2677 goto out_idr_remove_minor; 2693 goto out_idr_remove_minor;
2678 } 2694 }
2679 2695
2696 if (init_submitter(mdev)) {
2697 err = ERR_NOMEM;
2698 drbd_msg_put_info("unable to create submit workqueue");
2699 goto out_idr_remove_vol;
2700 }
2701
2680 add_disk(disk); 2702 add_disk(disk);
2681 kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ 2703 kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */
2682 2704
@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
2687 2709
2688 return NO_ERROR; 2710 return NO_ERROR;
2689 2711
2712out_idr_remove_vol:
2713 idr_remove(&tconn->volumes, vnr_got);
2690out_idr_remove_minor: 2714out_idr_remove_minor:
2691 idr_remove(&minors, minor_got); 2715 idr_remove(&minors, minor_got);
2692 synchronize_rcu(); 2716 synchronize_rcu();
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
2794 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2818 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2795 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2819 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2796 2820
2821 kfree(ldev->disk_conf);
2797 kfree(ldev); 2822 kfree(ldev);
2798} 2823}
2799 2824
@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
2833 rcu_read_unlock(); 2858 rcu_read_unlock();
2834} 2859}
2835 2860
2861/* aligned 4kByte */
2836struct meta_data_on_disk { 2862struct meta_data_on_disk {
2837 u64 la_size; /* last agreed size. */ 2863 u64 la_size_sect; /* last agreed size. */
2838 u64 uuid[UI_SIZE]; /* UUIDs. */ 2864 u64 uuid[UI_SIZE]; /* UUIDs. */
2839 u64 device_uuid; 2865 u64 device_uuid;
2840 u64 reserved_u64_1; 2866 u64 reserved_u64_1;
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk {
2842 u32 magic; 2868 u32 magic;
2843 u32 md_size_sect; 2869 u32 md_size_sect;
2844 u32 al_offset; /* offset to this block */ 2870 u32 al_offset; /* offset to this block */
2845 u32 al_nr_extents; /* important for restoring the AL */ 2871 u32 al_nr_extents; /* important for restoring the AL (userspace) */
2846 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ 2872 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
2847 u32 bm_offset; /* offset to the bitmap, from here */ 2873 u32 bm_offset; /* offset to the bitmap, from here */
2848 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 2874 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
2849 u32 la_peer_max_bio_size; /* last peer max_bio_size */ 2875 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2850 u32 reserved_u32[3];
2851 2876
2877 /* see al_tr_number_to_on_disk_sector() */
2878 u32 al_stripes;
2879 u32 al_stripe_size_4k;
2880
2881 u8 reserved_u8[4096 - (7*8 + 10*4)];
2852} __packed; 2882} __packed;
2853 2883
2854/** 2884/**
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
2861 sector_t sector; 2891 sector_t sector;
2862 int i; 2892 int i;
2863 2893
2894 /* Don't accidentally change the DRBD meta data layout. */
2895 BUILD_BUG_ON(UI_SIZE != 4);
2896 BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
2897
2864 del_timer(&mdev->md_sync_timer); 2898 del_timer(&mdev->md_sync_timer);
2865 /* timer may be rearmed by drbd_md_mark_dirty() now. */ 2899 /* timer may be rearmed by drbd_md_mark_dirty() now. */
2866 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 2900 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
2875 if (!buffer) 2909 if (!buffer)
2876 goto out; 2910 goto out;
2877 2911
2878 memset(buffer, 0, 512); 2912 memset(buffer, 0, sizeof(*buffer));
2879 2913
2880 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 2914 buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2881 for (i = UI_CURRENT; i < UI_SIZE; i++) 2915 for (i = UI_CURRENT; i < UI_SIZE; i++)
2882 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); 2916 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2883 buffer->flags = cpu_to_be32(mdev->ldev->md.flags); 2917 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
2892 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 2926 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
2893 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); 2927 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
2894 2928
2895 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 2929 buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
2930 buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);
2931
2932 D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
2896 sector = mdev->ldev->md.md_offset; 2933 sector = mdev->ldev->md.md_offset;
2897 2934
2898 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 2935 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2910,13 +2947,141 @@ out:
2910 put_ldev(mdev); 2947 put_ldev(mdev);
2911} 2948}
2912 2949
2950static int check_activity_log_stripe_size(struct drbd_conf *mdev,
2951 struct meta_data_on_disk *on_disk,
2952 struct drbd_md *in_core)
2953{
2954 u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
2955 u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
2956 u64 al_size_4k;
2957
2958 /* both not set: default to old fixed size activity log */
2959 if (al_stripes == 0 && al_stripe_size_4k == 0) {
2960 al_stripes = 1;
2961 al_stripe_size_4k = MD_32kB_SECT/8;
2962 }
2963
2964 /* some paranoia plausibility checks */
2965
2966 /* we need both values to be set */
2967 if (al_stripes == 0 || al_stripe_size_4k == 0)
2968 goto err;
2969
2970 al_size_4k = (u64)al_stripes * al_stripe_size_4k;
2971
2972 /* Upper limit of activity log area, to avoid potential overflow
2973 * problems in al_tr_number_to_on_disk_sector(). As right now, more
2974 * than 72 * 4k blocks total only increases the amount of history,
2975 * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
2976 if (al_size_4k > (16 * 1024 * 1024/4))
2977 goto err;
2978
2979 /* Lower limit: we need at least 8 transaction slots (32kB)
2980 * to not break existing setups */
2981 if (al_size_4k < MD_32kB_SECT/8)
2982 goto err;
2983
2984 in_core->al_stripe_size_4k = al_stripe_size_4k;
2985 in_core->al_stripes = al_stripes;
2986 in_core->al_size_4k = al_size_4k;
2987
2988 return 0;
2989err:
2990 dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
2991 al_stripes, al_stripe_size_4k);
2992 return -EINVAL;
2993}
2994
2995static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2996{
2997 sector_t capacity = drbd_get_capacity(bdev->md_bdev);
2998 struct drbd_md *in_core = &bdev->md;
2999 s32 on_disk_al_sect;
3000 s32 on_disk_bm_sect;
3001
3002 /* The on-disk size of the activity log, calculated from offsets, and
3003 * the size of the activity log calculated from the stripe settings,
3004 * should match.
3005 * Though we could relax this a bit: it is ok, if the striped activity log
3006 * fits in the available on-disk activity log size.
3007 * Right now, that would break how resize is implemented.
3008 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
3009 * of possible unused padding space in the on disk layout. */
3010 if (in_core->al_offset < 0) {
3011 if (in_core->bm_offset > in_core->al_offset)
3012 goto err;
3013 on_disk_al_sect = -in_core->al_offset;
3014 on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
3015 } else {
3016 if (in_core->al_offset != MD_4kB_SECT)
3017 goto err;
3018 if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
3019 goto err;
3020
3021 on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
3022 on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
3023 }
3024
3025 /* old fixed size meta data is exactly that: fixed. */
3026 if (in_core->meta_dev_idx >= 0) {
3027 if (in_core->md_size_sect != MD_128MB_SECT
3028 || in_core->al_offset != MD_4kB_SECT
3029 || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
3030 || in_core->al_stripes != 1
3031 || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
3032 goto err;
3033 }
3034
3035 if (capacity < in_core->md_size_sect)
3036 goto err;
3037 if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
3038 goto err;
3039
3040 /* should be aligned, and at least 32k */
3041 if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
3042 goto err;
3043
3044 /* should fit (for now: exactly) into the available on-disk space;
3045 * overflow prevention is in check_activity_log_stripe_size() above. */
3046 if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
3047 goto err;
3048
3049 /* again, should be aligned */
3050 if (in_core->bm_offset & 7)
3051 goto err;
3052
3053 /* FIXME check for device grow with flex external meta data? */
3054
3055 /* can the available bitmap space cover the last agreed device size? */
3056 if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
3057 goto err;
3058
3059 return 0;
3060
3061err:
3062 dev_err(DEV, "meta data offsets don't make sense: idx=%d "
3063 "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
3064 "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
3065 in_core->meta_dev_idx,
3066 in_core->al_stripes, in_core->al_stripe_size_4k,
3067 in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
3068 (unsigned long long)in_core->la_size_sect,
3069 (unsigned long long)capacity);
3070
3071 return -EINVAL;
3072}
3073
3074
2913/** 3075/**
2914 * drbd_md_read() - Reads in the meta data super block 3076 * drbd_md_read() - Reads in the meta data super block
2915 * @mdev: DRBD device. 3077 * @mdev: DRBD device.
2916 * @bdev: Device from which the meta data should be read in. 3078 * @bdev: Device from which the meta data should be read in.
2917 * 3079 *
2918 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case 3080 * Return NO_ERROR on success, and an enum drbd_ret_code in case
2919 * something goes wrong. 3081 * something goes wrong.
3082 *
3083 * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
3084 * even before @bdev is assigned to @mdev->ldev.
2920 */ 3085 */
2921int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 3086int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2922{ 3087{
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2924 u32 magic, flags; 3089 u32 magic, flags;
2925 int i, rv = NO_ERROR; 3090 int i, rv = NO_ERROR;
2926 3091
2927 if (!get_ldev_if_state(mdev, D_ATTACHING)) 3092 if (mdev->state.disk != D_DISKLESS)
2928 return ERR_IO_MD_DISK; 3093 return ERR_DISK_CONFIGURED;
2929 3094
2930 buffer = drbd_md_get_buffer(mdev); 3095 buffer = drbd_md_get_buffer(mdev);
2931 if (!buffer) 3096 if (!buffer)
2932 goto out; 3097 return ERR_NOMEM;
3098
3099 /* First, figure out where our meta data superblock is located,
3100 * and read it. */
3101 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3102 bdev->md.md_offset = drbd_md_ss(bdev);
2933 3103
2934 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { 3104 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
2935 /* NOTE: can't do normal error processing here as this is 3105 /* NOTE: can't do normal error processing here as this is
@@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2948 rv = ERR_MD_UNCLEAN; 3118 rv = ERR_MD_UNCLEAN;
2949 goto err; 3119 goto err;
2950 } 3120 }
3121
3122 rv = ERR_MD_INVALID;
2951 if (magic != DRBD_MD_MAGIC_08) { 3123 if (magic != DRBD_MD_MAGIC_08) {
2952 if (magic == DRBD_MD_MAGIC_07) 3124 if (magic == DRBD_MD_MAGIC_07)
2953 dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); 3125 dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
2954 else 3126 else
2955 dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); 3127 dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
2956 rv = ERR_MD_INVALID;
2957 goto err; 3128 goto err;
2958 } 3129 }
2959 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { 3130
2960 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", 3131 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2961 be32_to_cpu(buffer->al_offset), bdev->md.al_offset); 3132 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2962 rv = ERR_MD_INVALID; 3133 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2963 goto err; 3134 goto err;
2964 } 3135 }
3136
3137
3138 /* convert to in_core endian */
3139 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
3140 for (i = UI_CURRENT; i < UI_SIZE; i++)
3141 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3142 bdev->md.flags = be32_to_cpu(buffer->flags);
3143 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3144
3145 bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
3146 bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
3147 bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
3148
3149 if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
3150 goto err;
3151 if (check_offsets_and_sizes(mdev, bdev))
3152 goto err;
3153
2965 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { 3154 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2966 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", 3155 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2967 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); 3156 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2968 rv = ERR_MD_INVALID;
2969 goto err; 3157 goto err;
2970 } 3158 }
2971 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { 3159 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2972 dev_err(DEV, "unexpected md_size: %u (expected %u)\n", 3160 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2973 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); 3161 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2974 rv = ERR_MD_INVALID;
2975 goto err; 3162 goto err;
2976 } 3163 }
2977 3164
2978 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { 3165 rv = NO_ERROR;
2979 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2980 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2981 rv = ERR_MD_INVALID;
2982 goto err;
2983 }
2984
2985 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2986 for (i = UI_CURRENT; i < UI_SIZE; i++)
2987 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2988 bdev->md.flags = be32_to_cpu(buffer->flags);
2989 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2990 3166
2991 spin_lock_irq(&mdev->tconn->req_lock); 3167 spin_lock_irq(&mdev->tconn->req_lock);
2992 if (mdev->state.conn < C_CONNECTED) { 3168 if (mdev->state.conn < C_CONNECTED) {
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2999 3175
3000 err: 3176 err:
3001 drbd_md_put_buffer(mdev); 3177 drbd_md_put_buffer(mdev);
3002 out:
3003 put_ldev(mdev);
3004 3178
3005 return rv; 3179 return rv;
3006} 3180}
@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused)
3238 * end up here after a failed attach, before ldev was even assigned. 3412 * end up here after a failed attach, before ldev was even assigned.
3239 */ 3413 */
3240 if (mdev->bitmap && mdev->ldev) { 3414 if (mdev->bitmap && mdev->ldev) {
3415 /* An interrupted resync or similar is allowed to recounts bits
3416 * while we detach.
3417 * Any modifications would not be expected anymore, though.
3418 */
3241 if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, 3419 if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
3242 "detach", BM_LOCKED_MASK)) { 3420 "detach", BM_LOCKED_TEST_ALLOWED)) {
3243 if (test_bit(WAS_READ_ERROR, &mdev->flags)) { 3421 if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
3244 drbd_md_set_flag(mdev, MDF_FULL_SYNC); 3422 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3245 drbd_md_sync(mdev); 3423 drbd_md_sync(mdev);
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
3251 return 0; 3429 return 0;
3252} 3430}
3253 3431
3254void drbd_go_diskless(struct drbd_conf *mdev)
3255{
3256 D_ASSERT(mdev->state.disk == D_FAILED);
3257 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3258 drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
3259}
3260
3261/** 3432/**
3262 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3433 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3263 * @mdev: DRBD device. 3434 * @mdev: DRBD device.
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2af26fc95280..9e3f441e7e84 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -696,37 +696,52 @@ out:
696 return 0; 696 return 0;
697} 697}
698 698
699/* initializes the md.*_offset members, so we are able to find 699/* Initializes the md.*_offset members, so we are able to find
700 * the on disk meta data */ 700 * the on disk meta data.
701 *
702 * We currently have two possible layouts:
703 * external:
704 * |----------- md_size_sect ------------------|
705 * [ 4k superblock ][ activity log ][ Bitmap ]
706 * | al_offset == 8 |
707 * | bm_offset = al_offset + X |
708 * ==> bitmap sectors = md_size_sect - bm_offset
709 *
710 * internal:
711 * |----------- md_size_sect ------------------|
712 * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
713 * | al_offset < 0 |
714 * | bm_offset = al_offset - Y |
715 * ==> bitmap sectors = Y = al_offset - bm_offset
716 *
717 * Activity log size used to be fixed 32kB,
718 * but is about to become configurable.
719 */
701static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, 720static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
702 struct drbd_backing_dev *bdev) 721 struct drbd_backing_dev *bdev)
703{ 722{
704 sector_t md_size_sect = 0; 723 sector_t md_size_sect = 0;
705 int meta_dev_idx; 724 unsigned int al_size_sect = bdev->md.al_size_4k * 8;
706 725
707 rcu_read_lock(); 726 bdev->md.md_offset = drbd_md_ss(bdev);
708 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
709 727
710 switch (meta_dev_idx) { 728 switch (bdev->md.meta_dev_idx) {
711 default: 729 default:
712 /* v07 style fixed size indexed meta data */ 730 /* v07 style fixed size indexed meta data */
713 bdev->md.md_size_sect = MD_RESERVED_SECT; 731 bdev->md.md_size_sect = MD_128MB_SECT;
714 bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 732 bdev->md.al_offset = MD_4kB_SECT;
715 bdev->md.al_offset = MD_AL_OFFSET; 733 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
716 bdev->md.bm_offset = MD_BM_OFFSET;
717 break; 734 break;
718 case DRBD_MD_INDEX_FLEX_EXT: 735 case DRBD_MD_INDEX_FLEX_EXT:
719 /* just occupy the full device; unit: sectors */ 736 /* just occupy the full device; unit: sectors */
720 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); 737 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
721 bdev->md.md_offset = 0; 738 bdev->md.al_offset = MD_4kB_SECT;
722 bdev->md.al_offset = MD_AL_OFFSET; 739 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
723 bdev->md.bm_offset = MD_BM_OFFSET;
724 break; 740 break;
725 case DRBD_MD_INDEX_INTERNAL: 741 case DRBD_MD_INDEX_INTERNAL:
726 case DRBD_MD_INDEX_FLEX_INT: 742 case DRBD_MD_INDEX_FLEX_INT:
727 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
728 /* al size is still fixed */ 743 /* al size is still fixed */
729 bdev->md.al_offset = -MD_AL_SECTORS; 744 bdev->md.al_offset = -al_size_sect;
730	/* we need (slightly less than) ~ this many bitmap sectors: */ 745	/* we need (slightly less than) ~ this many bitmap sectors: */
731 md_size_sect = drbd_get_capacity(bdev->backing_bdev); 746 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
732 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); 747 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
735 750
736 /* plus the "drbd meta data super block", 751 /* plus the "drbd meta data super block",
737 * and the activity log; */ 752 * and the activity log; */
738 md_size_sect += MD_BM_OFFSET; 753 md_size_sect += MD_4kB_SECT + al_size_sect;
739 754
740 bdev->md.md_size_sect = md_size_sect; 755 bdev->md.md_size_sect = md_size_sect;
741 /* bitmap offset is adjusted by 'super' block size */ 756 /* bitmap offset is adjusted by 'super' block size */
742 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; 757 bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
743 break; 758 break;
744 } 759 }
745 rcu_read_unlock();
746} 760}
747 761
748/* input size is expected to be in KB */ 762/* input size is expected to be in KB */
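To make the internal-metadata layout above concrete, here is a small, self-contained sketch (not part of the patch) that plugs assumed numbers into the DRBD_MD_INDEX_INTERNAL branch: the 4 kB superblock is MD_4kB_SECT = 8 sectors, the historical 32 kB activity log gives al_size_4k = 8 and therefore al_size_sect = 64, and bitmap_sectors is a purely hypothetical placeholder, since the real value is derived from the backing device capacity.

#include <stdio.h>

int main(void)
{
	const long MD_4kB_SECT    = 8;    /* 4 kB superblock, in 512-byte sectors    */
	const long al_size_sect   = 64;   /* 32 kB activity log: al_size_4k (8) * 8  */
	const long bitmap_sectors = 4096; /* assumed; really depends on device size  */

	/* same arithmetic as the DRBD_MD_INDEX_INTERNAL case above */
	long md_size_sect = bitmap_sectors + MD_4kB_SECT + al_size_sect;
	long al_offset    = -al_size_sect;               /* AL sits just below the SB     */
	long bm_offset    = -md_size_sect + MD_4kB_SECT; /* == al_offset - bitmap_sectors */

	printf("md_size_sect=%ld al_offset=%ld bm_offset=%ld\n",
	       md_size_sect, al_offset, bm_offset);
	return 0;
}

With these numbers bm_offset works out to -(al_size_sect + bitmap_sectors), i.e. exactly the "bm_offset = al_offset - Y" relation from the ASCII diagram in the comment.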
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
805enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 819enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
806{ 820{
807 sector_t prev_first_sect, prev_size; /* previous meta location */ 821 sector_t prev_first_sect, prev_size; /* previous meta location */
808 sector_t la_size, u_size; 822 sector_t la_size_sect, u_size;
809 sector_t size; 823 sector_t size;
810 char ppb[10]; 824 char ppb[10];
811 825
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
828 842
829 prev_first_sect = drbd_md_first_sector(mdev->ldev); 843 prev_first_sect = drbd_md_first_sector(mdev->ldev);
830 prev_size = mdev->ldev->md.md_size_sect; 844 prev_size = mdev->ldev->md.md_size_sect;
831 la_size = mdev->ldev->md.la_size_sect; 845 la_size_sect = mdev->ldev->md.la_size_sect;
832 846
833 /* TODO: should only be some assert here, not (re)init... */ 847 /* TODO: should only be some assert here, not (re)init... */
834 drbd_md_set_sector_offsets(mdev, mdev->ldev); 848 drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
864 if (rv == dev_size_error) 878 if (rv == dev_size_error)
865 goto out; 879 goto out;
866 880
867 la_size_changed = (la_size != mdev->ldev->md.la_size_sect); 881 la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
868 882
869 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) 883 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
870 || prev_size != mdev->ldev->md.md_size_sect; 884 || prev_size != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
886 drbd_md_mark_dirty(mdev); 900 drbd_md_mark_dirty(mdev);
887 } 901 }
888 902
889 if (size > la_size) 903 if (size > la_size_sect)
890 rv = grew; 904 rv = grew;
891 if (size < la_size) 905 if (size < la_size_sect)
892 rv = shrunk; 906 rv = shrunk;
893out: 907out:
894 lc_unlock(mdev->act_log); 908 lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
903 sector_t u_size, int assume_peer_has_space) 917 sector_t u_size, int assume_peer_has_space)
904{ 918{
905 sector_t p_size = mdev->p_size; /* partner's disk size. */ 919 sector_t p_size = mdev->p_size; /* partner's disk size. */
906 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ 920 sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
907 sector_t m_size; /* my size */ 921 sector_t m_size; /* my size */
908 sector_t size = 0; 922 sector_t size = 0;
909 923
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
917 if (p_size && m_size) { 931 if (p_size && m_size) {
918 size = min_t(sector_t, p_size, m_size); 932 size = min_t(sector_t, p_size, m_size);
919 } else { 933 } else {
920 if (la_size) { 934 if (la_size_sect) {
921 size = la_size; 935 size = la_size_sect;
922 if (m_size && m_size < size) 936 if (m_size && m_size < size)
923 size = m_size; 937 size = m_size;
924 if (p_size && p_size < size) 938 if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
1127 return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); 1141 return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1128} 1142}
1129 1143
1130static void enforce_disk_conf_limits(struct disk_conf *dc) 1144static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1131{ 1145{
1132 if (dc->al_extents < DRBD_AL_EXTENTS_MIN) 1146 /* This is limited by 16 bit "slot" numbers,
1133 dc->al_extents = DRBD_AL_EXTENTS_MIN; 1147 * and by available on-disk context storage.
1134 if (dc->al_extents > DRBD_AL_EXTENTS_MAX) 1148 *
1135 dc->al_extents = DRBD_AL_EXTENTS_MAX; 1149 * Also (u16)~0 is special (denotes a "free" extent).
1150 *
1151 * One transaction occupies one 4kB on-disk block,
1152 * we have n such blocks in the on disk ring buffer,
1153 * the "current" transaction may fail (n-1),
1154	 * and there are 919 slot numbers of context information per transaction.
1155 *
1156 * 72 transaction blocks amounts to more than 2**16 context slots,
1157 * so cap there first.
1158 */
1159 const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1160 const unsigned int sufficient_on_disk =
1161		(max_al_nr + AL_CONTEXT_PER_TRANSACTION - 1)
1162		/ AL_CONTEXT_PER_TRANSACTION;
1163
1164 unsigned int al_size_4k = bdev->md.al_size_4k;
1165
1166 if (al_size_4k > sufficient_on_disk)
1167 return max_al_nr;
1136 1168
1137 if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) 1169 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1138 dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1139} 1170}
1140 1171
1141int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1172int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
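As a cross-check of the drbd_al_extents_max() reasoning above, the sketch below (an assumed userspace model, not kernel code) redoes the arithmetic with AL_CONTEXT_PER_TRANSACTION = 919 taken from the comment and an assumed 16-bit cap of max_al_nr = 65534, (u16)~0 being reserved for "free" extents.

#include <stdio.h>

static unsigned int al_extents_max(unsigned int al_size_4k)
{
	const unsigned int AL_CONTEXT_PER_TRANSACTION = 919;  /* per the comment above */
	const unsigned int max_al_nr = 65534;                 /* assumed: (u16)~0 - 1  */
	const unsigned int sufficient_on_disk =
		(max_al_nr + AL_CONTEXT_PER_TRANSACTION - 1) / AL_CONTEXT_PER_TRANSACTION;

	if (al_size_4k > sufficient_on_disk)
		return max_al_nr;
	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}

int main(void)
{
	printf("al_size_4k=8  -> %u extents\n", al_extents_max(8));  /* 7 * 919 = 6433      */
	printf("al_size_4k=72 -> %u extents\n", al_extents_max(72)); /* 71 * 919 = 65249    */
	printf("al_size_4k=73 -> %u extents\n", al_extents_max(73)); /* capped at max_al_nr */
	return 0;
}

sufficient_on_disk evaluates to 72 here, matching the "72 transaction blocks amounts to more than 2**16 context slots" remark; since one block is always held back for the in-flight transaction, only with 73 or more on-disk blocks does the 16-bit slot-number limit become the binding constraint.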
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1182 if (!expect(new_disk_conf->resync_rate >= 1)) 1213 if (!expect(new_disk_conf->resync_rate >= 1))
1183 new_disk_conf->resync_rate = 1; 1214 new_disk_conf->resync_rate = 1;
1184 1215
1185 enforce_disk_conf_limits(new_disk_conf); 1216 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1217 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1218 if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
1219 new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
1220
1221 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1222 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1186 1223
1187 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; 1224 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1188 if (fifo_size != mdev->rs_plan_s->size) { 1225 if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1330 goto fail; 1367 goto fail;
1331 } 1368 }
1332 1369
1333 enforce_disk_conf_limits(new_disk_conf); 1370 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1371 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1334 1372
1335 new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); 1373 new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1336 if (!new_plan) { 1374 if (!new_plan) {
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1343 goto fail; 1381 goto fail;
1344 } 1382 }
1345 1383
1384 write_lock_irq(&global_state_lock);
1385 retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
1386 write_unlock_irq(&global_state_lock);
1387 if (retcode != NO_ERROR)
1388 goto fail;
1389
1346 rcu_read_lock(); 1390 rcu_read_lock();
1347 nc = rcu_dereference(mdev->tconn->net_conf); 1391 nc = rcu_dereference(mdev->tconn->net_conf);
1348 if (nc) { 1392 if (nc) {
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1399 goto fail; 1443 goto fail;
1400 } 1444 }
1401 1445
1402 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ 1446 /* Read our meta data super block early.
1403 drbd_md_set_sector_offsets(mdev, nbc); 1447 * This also sets other on-disk offsets. */
1448 retcode = drbd_md_read(mdev, nbc);
1449 if (retcode != NO_ERROR)
1450 goto fail;
1451
1452 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1453 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1454 if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1455 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1404 1456
1405 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { 1457 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1406 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", 1458 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1416 min_md_device_sectors = (2<<10); 1468 min_md_device_sectors = (2<<10);
1417 } else { 1469 } else {
1418 max_possible_sectors = DRBD_MAX_SECTORS; 1470 max_possible_sectors = DRBD_MAX_SECTORS;
1419 min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); 1471 min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1420 } 1472 }
1421 1473
1422 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { 1474 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1467 if (!get_ldev_if_state(mdev, D_ATTACHING)) 1519 if (!get_ldev_if_state(mdev, D_ATTACHING))
1468 goto force_diskless; 1520 goto force_diskless;
1469 1521
1470 drbd_md_set_sector_offsets(mdev, nbc);
1471
1472 if (!mdev->bitmap) { 1522 if (!mdev->bitmap) {
1473 if (drbd_bm_init(mdev)) { 1523 if (drbd_bm_init(mdev)) {
1474 retcode = ERR_NOMEM; 1524 retcode = ERR_NOMEM;
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1476 } 1526 }
1477 } 1527 }
1478 1528
1479 retcode = drbd_md_read(mdev, nbc);
1480 if (retcode != NO_ERROR)
1481 goto force_diskless_dec;
1482
1483 if (mdev->state.conn < C_CONNECTED && 1529 if (mdev->state.conn < C_CONNECTED &&
1484 mdev->state.role == R_PRIMARY && 1530 mdev->state.role == R_PRIMARY &&
1485 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { 1531 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
2158 return SS_SUCCESS; 2204 return SS_SUCCESS;
2159 case SS_PRIMARY_NOP: 2205 case SS_PRIMARY_NOP:
2160 /* Our state checking code wants to see the peer outdated. */ 2206 /* Our state checking code wants to see the peer outdated. */
2161 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, 2207 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
2162 pdsk, D_OUTDATED), CS_VERBOSE); 2208
2209 if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
2210 rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);
2211
2163 break; 2212 break;
2164 case SS_CW_FAILED_BY_PEER: 2213 case SS_CW_FAILED_BY_PEER:
2165 /* The peer probably wants to see us outdated. */ 2214 /* The peer probably wants to see us outdated. */
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2406 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2455 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2407 drbd_flush_workqueue(mdev); 2456 drbd_flush_workqueue(mdev);
2408 2457
2409 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 2458 /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2410 2459 * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
2411 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) 2460 * try to start a resync handshake as sync target for full sync.
2412 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 2461 */
2413 2462 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
2414 while (retcode == SS_NEED_CONNECTION) { 2463 retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
2415 spin_lock_irq(&mdev->tconn->req_lock); 2464 if (retcode >= SS_SUCCESS) {
2416 if (mdev->state.conn < C_CONNECTED) 2465 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
2417 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); 2466 "set_n_write from invalidate", BM_LOCKED_MASK))
2418 spin_unlock_irq(&mdev->tconn->req_lock); 2467 retcode = ERR_IO_MD_DISK;
2419 2468 }
2420 if (retcode != SS_NEED_CONNECTION) 2469 } else
2421 break;
2422
2423 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 2470 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
2424 }
2425 drbd_resume_io(mdev); 2471 drbd_resume_io(mdev);
2426 2472
2427out: 2473out:
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2475 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2521 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2476 drbd_flush_workqueue(mdev); 2522 drbd_flush_workqueue(mdev);
2477 2523
2478 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); 2524 /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2479 if (retcode < SS_SUCCESS) { 2525 * in the bitmap. Otherwise, try to start a resync handshake
2480 if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { 2526 * as sync source for full sync.
2481 /* The peer will get a resync upon connect anyways. 2527 */
2482 * Just make that into a full resync. */ 2528 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
2483 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); 2529 /* The peer will get a resync upon connect anyways. Just make that
2484 if (retcode >= SS_SUCCESS) { 2530 into a full resync. */
2485 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, 2531 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
2486 "set_n_write from invalidate_peer", 2532 if (retcode >= SS_SUCCESS) {
2487 BM_LOCKED_SET_ALLOWED)) 2533 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
2488 retcode = ERR_IO_MD_DISK; 2534 "set_n_write from invalidate_peer",
2489 } 2535 BM_LOCKED_SET_ALLOWED))
2490 } else 2536 retcode = ERR_IO_MD_DISK;
2491 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 2537 }
2492 } 2538 } else
2539 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
2493 drbd_resume_io(mdev); 2540 drbd_resume_io(mdev);
2494 2541
2495out: 2542out:
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
3162 CS_VERBOSE + CS_WAIT_COMPLETE); 3209 CS_VERBOSE + CS_WAIT_COMPLETE);
3163 idr_remove(&mdev->tconn->volumes, mdev->vnr); 3210 idr_remove(&mdev->tconn->volumes, mdev->vnr);
3164 idr_remove(&minors, mdev_to_minor(mdev)); 3211 idr_remove(&minors, mdev_to_minor(mdev));
3212 destroy_workqueue(mdev->submit.wq);
3165 del_gendisk(mdev->vdisk); 3213 del_gendisk(mdev->vdisk);
3166 synchronize_rcu(); 3214 synchronize_rcu();
3167 kref_put(&mdev->kref, &drbd_minor_destroy); 3215 kref_put(&mdev->kref, &drbd_minor_destroy);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 928adb815b09..bf31d41dbaad 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
313 313
314static int drbd_proc_open(struct inode *inode, struct file *file) 314static int drbd_proc_open(struct inode *inode, struct file *file)
315{ 315{
316 if (try_module_get(THIS_MODULE)) 316 int err;
317 return single_open(file, drbd_seq_show, PDE_DATA(inode)); 317
318 if (try_module_get(THIS_MODULE)) {
319 err = single_open(file, drbd_seq_show, PDE_DATA(inode));
320 if (err)
321 module_put(THIS_MODULE);
322 return err;
323 }
318 return -ENODEV; 324 return -ENODEV;
319} 325}
320 326
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 83c5ae0ed56b..4222affff488 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev)
850 err = drbd_send_current_state(mdev); 850 err = drbd_send_current_state(mdev);
851 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 851 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
852 clear_bit(RESIZE_PENDING, &mdev->flags); 852 clear_bit(RESIZE_PENDING, &mdev->flags);
853 atomic_set(&mdev->ap_in_flight, 0);
853 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ 854 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
854 return err; 855 return err;
855} 856}
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
2266 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); 2267 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2267 peer_req->flags |= EE_CALL_AL_COMPLETE_IO; 2268 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2268 peer_req->flags &= ~EE_MAY_SET_IN_SYNC; 2269 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2269 drbd_al_begin_io(mdev, &peer_req->i); 2270 drbd_al_begin_io(mdev, &peer_req->i, true);
2270 } 2271 }
2271 2272
2272 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); 2273 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2662 if (hg == -1 && mdev->state.role == R_PRIMARY) { 2663 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2663 enum drbd_state_rv rv2; 2664 enum drbd_state_rv rv2;
2664 2665
2665 drbd_set_role(mdev, R_SECONDARY, 0);
2666 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2666 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2667 * we might be here in C_WF_REPORT_PARAMS which is transient. 2667 * we might be here in C_WF_REPORT_PARAMS which is transient.
2668 * we do not need to wait for the after state change work either. */ 2668 * we do not need to wait for the after state change work either. */
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
3993 3993
3994 clear_bit(DISCARD_MY_DATA, &mdev->flags); 3994 clear_bit(DISCARD_MY_DATA, &mdev->flags);
3995 3995
3996 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ 3996 drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */
3997 3997
3998 return 0; 3998 return 0;
3999} 3999}
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn)
4660#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) 4660#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4661static int drbd_do_auth(struct drbd_tconn *tconn) 4661static int drbd_do_auth(struct drbd_tconn *tconn)
4662{ 4662{
4663	dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n"); 4663	conn_err(tconn, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
4664 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); 4664 conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
4665 return -1; 4665 return -1;
4666} 4666}
4667#else 4667#else
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi)
5258 bool ping_timeout_active = false; 5258 bool ping_timeout_active = false;
5259 struct net_conf *nc; 5259 struct net_conf *nc;
5260 int ping_timeo, tcp_cork, ping_int; 5260 int ping_timeo, tcp_cork, ping_int;
5261 struct sched_param param = { .sched_priority = 2 };
5261 5262
5262 current->policy = SCHED_RR; /* Make this a realtime task! */ 5263 rv = sched_setscheduler(current, SCHED_RR, &param);
5263 current->rt_priority = 2; /* more important than all other tasks */ 5264 if (rv < 0)
5265 conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv);
5264 5266
5265 while (get_t_state(thi) == RUNNING) { 5267 while (get_t_state(thi) == RUNNING) {
5266 drbd_thread_current_set_cpu(thi); 5268 drbd_thread_current_set_cpu(thi);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2b8303ad63c9..c24379ffd4e3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -34,14 +34,14 @@
34static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); 34static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
35 35
36/* Update disk stats at start of I/O request */ 36/* Update disk stats at start of I/O request */
37static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) 37static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
38{ 38{
39 const int rw = bio_data_dir(bio); 39 const int rw = bio_data_dir(req->master_bio);
40 int cpu; 40 int cpu;
41 cpu = part_stat_lock(); 41 cpu = part_stat_lock();
42 part_round_stats(cpu, &mdev->vdisk->part0); 42 part_round_stats(cpu, &mdev->vdisk->part0);
43 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); 43 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
44 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); 44 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
45 (void) cpu; /* The macro invocations above want the cpu argument, I do not like 45 (void) cpu; /* The macro invocations above want the cpu argument, I do not like
46 the compiler warning about cpu only assigned but never used... */ 46 the compiler warning about cpu only assigned but never used... */
47 part_inc_in_flight(&mdev->vdisk->part0, rw); 47 part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
263 else 263 else
264 root = &mdev->read_requests; 264 root = &mdev->read_requests;
265 drbd_remove_request_interval(root, req); 265 drbd_remove_request_interval(root, req);
266 } else if (!(s & RQ_POSTPONED)) 266 }
267 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
268 267
269 /* Before we can signal completion to the upper layers, 268 /* Before we can signal completion to the upper layers,
270 * we may need to close the current transfer log epoch. 269 * we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
755 D_ASSERT(req->rq_state & RQ_NET_PENDING); 754 D_ASSERT(req->rq_state & RQ_NET_PENDING);
756 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); 755 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
757 break; 756 break;
757
758 case QUEUE_AS_DRBD_BARRIER:
759 start_new_tl_epoch(mdev->tconn);
760 mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
761 break;
758 }; 762 };
759 763
760 return rv; 764 return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
861 bool congested = false; 865 bool congested = false;
862 enum drbd_on_congestion on_congestion; 866 enum drbd_on_congestion on_congestion;
863 867
868 rcu_read_lock();
864 nc = rcu_dereference(tconn->net_conf); 869 nc = rcu_dereference(tconn->net_conf);
865 on_congestion = nc ? nc->on_congestion : OC_BLOCK; 870 on_congestion = nc ? nc->on_congestion : OC_BLOCK;
871 rcu_read_unlock();
866 if (on_congestion == OC_BLOCK || 872 if (on_congestion == OC_BLOCK ||
867 tconn->agreed_pro_version < 96) 873 tconn->agreed_pro_version < 96)
868 return; 874 return;
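The two added lines above restore the standard RCU read-side idiom around rcu_dereference(): the pointer may only be dereferenced, and its target only used, inside the rcu_read_lock()/rcu_read_unlock() section. A generic reminder of the pattern, deliberately using made-up names rather than DRBD's:

#include <linux/rcupdate.h>

struct cfg {
	int on_congestion;
};

static struct cfg __rcu *active_cfg;  /* updated elsewhere via rcu_assign_pointer() */

static int read_on_congestion(int fallback)
{
	struct cfg *c;
	int val;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);  /* only valid inside the read-side section */
	val = c ? c->on_congestion : fallback;
	rcu_read_unlock();                /* c must not be used past this point */

	return val;
}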
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
956 struct drbd_conf *mdev = req->w.mdev; 962 struct drbd_conf *mdev = req->w.mdev;
957 int remote, send_oos; 963 int remote, send_oos;
958 964
959 rcu_read_lock();
960 remote = drbd_should_do_remote(mdev->state); 965 remote = drbd_should_do_remote(mdev->state);
961 if (remote) {
962 maybe_pull_ahead(mdev);
963 remote = drbd_should_do_remote(mdev->state);
964 }
965 send_oos = drbd_should_send_out_of_sync(mdev->state); 966 send_oos = drbd_should_send_out_of_sync(mdev->state);
966 rcu_read_unlock();
967 967
968 /* Need to replicate writes. Unless it is an empty flush, 968 /* Need to replicate writes. Unless it is an empty flush,
969 * which is better mapped to a DRBD P_BARRIER packet, 969 * which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
975 /* The only size==0 bios we expect are empty flushes. */ 975 /* The only size==0 bios we expect are empty flushes. */
976 D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); 976 D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
977 if (remote) 977 if (remote)
978 start_new_tl_epoch(mdev->tconn); 978 _req_mod(req, QUEUE_AS_DRBD_BARRIER);
979 return 0; 979 return remote;
980 } 980 }
981 981
982 if (!remote && !send_oos) 982 if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1020 bio_endio(bio, -EIO); 1020 bio_endio(bio, -EIO);
1021} 1021}
1022 1022
1023void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) 1023static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
1024{ 1024{
1025 const int rw = bio_rw(bio); 1025 spin_lock(&mdev->submit.lock);
1026 struct bio_and_error m = { NULL, }; 1026 list_add_tail(&req->tl_requests, &mdev->submit.writes);
1027 spin_unlock(&mdev->submit.lock);
1028 queue_work(mdev->submit.wq, &mdev->submit.worker);
1029}
1030
1031/* returns the new drbd_request pointer, if the caller is expected to
1032 * drbd_send_and_submit() it (to save latency), or NULL if we queued the
1033 * request on the submitter thread.
1034 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
1035 */
1036struct drbd_request *
1037drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
1038{
1039 const int rw = bio_data_dir(bio);
1027 struct drbd_request *req; 1040 struct drbd_request *req;
1028 bool no_remote = false;
1029 1041
1030 /* allocate outside of all locks; */ 1042 /* allocate outside of all locks; */
1031 req = drbd_req_new(mdev, bio); 1043 req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1035 * if user cannot handle io errors, that's not our business. */ 1047 * if user cannot handle io errors, that's not our business. */
1036 dev_err(DEV, "could not kmalloc() req\n"); 1048 dev_err(DEV, "could not kmalloc() req\n");
1037 bio_endio(bio, -ENOMEM); 1049 bio_endio(bio, -ENOMEM);
1038 return; 1050 return ERR_PTR(-ENOMEM);
1039 } 1051 }
1040 req->start_time = start_time; 1052 req->start_time = start_time;
1041 1053
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1044 req->private_bio = NULL; 1056 req->private_bio = NULL;
1045 } 1057 }
1046 1058
1047 /* For WRITES going to the local disk, grab a reference on the target 1059 /* Update disk stats */
1048 * extent. This waits for any resync activity in the corresponding 1060 _drbd_start_io_acct(mdev, req);
1049 * resync extent to finish, and, if necessary, pulls in the target 1061
1050 * extent into the activity log, which involves further disk io because
1051 * of transactional on-disk meta data updates.
1052 * Empty flushes don't need to go into the activity log, they can only
1053 * flush data for pending writes which are already in there. */
1054 if (rw == WRITE && req->private_bio && req->i.size 1062 if (rw == WRITE && req->private_bio && req->i.size
1055 && !test_bit(AL_SUSPENDED, &mdev->flags)) { 1063 && !test_bit(AL_SUSPENDED, &mdev->flags)) {
1064 if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
1065 drbd_queue_write(mdev, req);
1066 return NULL;
1067 }
1056 req->rq_state |= RQ_IN_ACT_LOG; 1068 req->rq_state |= RQ_IN_ACT_LOG;
1057 drbd_al_begin_io(mdev, &req->i);
1058 } 1069 }
1059 1070
1071 return req;
1072}
1073
1074static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
1075{
1076 const int rw = bio_rw(req->master_bio);
1077 struct bio_and_error m = { NULL, };
1078 bool no_remote = false;
1079
1060 spin_lock_irq(&mdev->tconn->req_lock); 1080 spin_lock_irq(&mdev->tconn->req_lock);
1061 if (rw == WRITE) { 1081 if (rw == WRITE) {
1062 /* This may temporarily give up the req_lock, 1082 /* This may temporarily give up the req_lock,
1063	 * but will re-acquire it before it returns here. 1083	 * but will re-acquire it before it returns here.
1064 * Needs to be before the check on drbd_suspended() */ 1084 * Needs to be before the check on drbd_suspended() */
1065 complete_conflicting_writes(req); 1085 complete_conflicting_writes(req);
1086 /* no more giving up req_lock from now on! */
1087
1088 /* check for congestion, and potentially stop sending
1089 * full data updates, but start sending "dirty bits" only. */
1090 maybe_pull_ahead(mdev);
1066 } 1091 }
1067 1092
1068 /* no more giving up req_lock from now on! */
1069 1093
1070 if (drbd_suspended(mdev)) { 1094 if (drbd_suspended(mdev)) {
1071 /* push back and retry: */ 1095 /* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1078 goto out; 1102 goto out;
1079 } 1103 }
1080 1104
1081 /* Update disk stats */
1082 _drbd_start_io_acct(mdev, req, bio);
1083
1084 /* We fail READ/READA early, if we can not serve it. 1105 /* We fail READ/READA early, if we can not serve it.
1085 * We must do this before req is registered on any lists. 1106 * We must do this before req is registered on any lists.
1086 * Otherwise, drbd_req_complete() will queue failed READ for retry. */ 1107 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:
1137 1158
1138 if (m.bio) 1159 if (m.bio)
1139 complete_master_bio(mdev, &m); 1160 complete_master_bio(mdev, &m);
1140 return; 1161}
1162
1163void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
1164{
1165 struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
1166 if (IS_ERR_OR_NULL(req))
1167 return;
1168 drbd_send_and_submit(mdev, req);
1169}
1170
1171static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
1172{
1173 struct drbd_request *req, *tmp;
1174 list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
1175 const int rw = bio_data_dir(req->master_bio);
1176
1177 if (rw == WRITE /* rw != WRITE should not even end up here! */
1178 && req->private_bio && req->i.size
1179 && !test_bit(AL_SUSPENDED, &mdev->flags)) {
1180 if (!drbd_al_begin_io_fastpath(mdev, &req->i))
1181 continue;
1182
1183 req->rq_state |= RQ_IN_ACT_LOG;
1184 }
1185
1186 list_del_init(&req->tl_requests);
1187 drbd_send_and_submit(mdev, req);
1188 }
1189}
1190
1191static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
1192 struct list_head *incoming,
1193 struct list_head *pending)
1194{
1195 struct drbd_request *req, *tmp;
1196 int wake = 0;
1197 int err;
1198
1199 spin_lock_irq(&mdev->al_lock);
1200 list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
1201 err = drbd_al_begin_io_nonblock(mdev, &req->i);
1202 if (err == -EBUSY)
1203 wake = 1;
1204 if (err)
1205 continue;
1206 req->rq_state |= RQ_IN_ACT_LOG;
1207 list_move_tail(&req->tl_requests, pending);
1208 }
1209 spin_unlock_irq(&mdev->al_lock);
1210 if (wake)
1211 wake_up(&mdev->al_wait);
1212
1213 return !list_empty(pending);
1214}
1215
1216void do_submit(struct work_struct *ws)
1217{
1218 struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
1219 LIST_HEAD(incoming);
1220 LIST_HEAD(pending);
1221 struct drbd_request *req, *tmp;
1222
1223 for (;;) {
1224 spin_lock(&mdev->submit.lock);
1225 list_splice_tail_init(&mdev->submit.writes, &incoming);
1226 spin_unlock(&mdev->submit.lock);
1227
1228 submit_fast_path(mdev, &incoming);
1229 if (list_empty(&incoming))
1230 break;
1231
1232 wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
1233 /* Maybe more was queued, while we prepared the transaction?
1234 * Try to stuff them into this transaction as well.
1235 * Be strictly non-blocking here, no wait_event, we already
1236 * have something to commit.
1237	 * Stop if we don't make any more progress.
1238 */
1239 for (;;) {
1240 LIST_HEAD(more_pending);
1241 LIST_HEAD(more_incoming);
1242 bool made_progress;
1243
1244 /* It is ok to look outside the lock,
1245 * it's only an optimization anyways */
1246 if (list_empty(&mdev->submit.writes))
1247 break;
1248
1249 spin_lock(&mdev->submit.lock);
1250 list_splice_tail_init(&mdev->submit.writes, &more_incoming);
1251 spin_unlock(&mdev->submit.lock);
1252
1253 if (list_empty(&more_incoming))
1254 break;
1255
1256 made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
1257
1258 list_splice_tail_init(&more_pending, &pending);
1259 list_splice_tail_init(&more_incoming, &incoming);
1260
1261 if (!made_progress)
1262 break;
1263 }
1264 drbd_al_begin_io_commit(mdev, false);
1265
1266 list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
1267 list_del_init(&req->tl_requests);
1268 drbd_send_and_submit(mdev, req);
1269 }
1270 }
1141} 1271}
1142 1272
1143void drbd_make_request(struct request_queue *q, struct bio *bio) 1273void drbd_make_request(struct request_queue *q, struct bio *bio)
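Stepping back from the drbd_req.c changes: the new submitter infrastructure (drbd_queue_write(), do_submit() and friends) follows a common producer/worker batching pattern. The generic sketch below is not DRBD code and uses hypothetical names (struct batcher, batcher_queue, batcher_work); it only illustrates the shape: producers append to a spinlock-protected list and kick a workqueue, and the worker splices the whole list in one go, which is what lets do_submit() merge many queued writes into a single activity-log transaction.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct batcher {
	spinlock_t lock;
	struct list_head incoming;
	struct workqueue_struct *wq;
	struct work_struct worker;
};

/* producer side: cheap under the lock, then poke the worker */
static void batcher_queue(struct batcher *b, struct list_head *item)
{
	spin_lock(&b->lock);
	list_add_tail(item, &b->incoming);
	spin_unlock(&b->lock);
	queue_work(b->wq, &b->worker);
}

/* worker side: grab everything queued so far with one splice */
static void batcher_work(struct work_struct *ws)
{
	struct batcher *b = container_of(ws, struct batcher, worker);
	LIST_HEAD(todo);

	spin_lock(&b->lock);
	list_splice_tail_init(&b->incoming, &todo);
	spin_unlock(&b->lock);

	/* process "todo" as one batch here */
}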
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index c08d22964d06..978cb1addc98 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -88,6 +88,14 @@ enum drbd_req_event {
88 QUEUE_FOR_NET_READ, 88 QUEUE_FOR_NET_READ,
89 QUEUE_FOR_SEND_OOS, 89 QUEUE_FOR_SEND_OOS,
90 90
91 /* An empty flush is queued as P_BARRIER,
92 * which will cause it to complete "successfully",
93 * even if the local disk flush failed.
94 *
95 * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
96 * only see an error if neither local nor remote data is reachable. */
97 QUEUE_AS_DRBD_BARRIER,
98
91 SEND_CANCELED, 99 SEND_CANCELED,
92 SEND_FAILED, 100 SEND_FAILED,
93 HANDED_OVER_TO_NETWORK, 101 HANDED_OVER_TO_NETWORK,
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 0fe220cfb9e9..90c5be2b1d30 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
570 mdev->tconn->agreed_pro_version < 88) 570 mdev->tconn->agreed_pro_version < 88)
571 rv = SS_NOT_SUPPORTED; 571 rv = SS_NOT_SUPPORTED;
572 572
573 else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
574 rv = SS_NO_UP_TO_DATE_DISK;
575
576 else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
577 ns.pdsk == D_UNKNOWN)
578 rv = SS_NEED_CONNECTION;
579
573 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) 580 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
574 rv = SS_CONNECTED_OUTDATES; 581 rv = SS_CONNECTED_OUTDATES;
575 582
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t
635 && os.conn < C_WF_REPORT_PARAMS) 642 && os.conn < C_WF_REPORT_PARAMS)
636 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ 643 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
637 644
645 if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
646 os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
647 rv = SS_OUTDATE_WO_CONN;
648
638 return rv; 649 return rv;
639} 650}
640 651
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1377 &drbd_bmio_set_n_write, &abw_start_sync, 1388 &drbd_bmio_set_n_write, &abw_start_sync,
1378 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); 1389 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1379 1390
1380 /* We are invalidating our self... */
1381 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1382 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1383 /* other bitmap operation expected during this phase */
1384 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1385 "set_n_write from invalidate", BM_LOCKED_MASK);
1386
1387 /* first half of local IO error, failure to attach, 1391 /* first half of local IO error, failure to attach,
1388 * or administrative detach */ 1392 * or administrative detach */
1389 if (os.disk != D_FAILED && ns.disk == D_FAILED) { 1393 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state
1748 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) 1752 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
1749 return SS_CW_FAILED_BY_PEER; 1753 return SS_CW_FAILED_BY_PEER;
1750 1754
1751 rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; 1755 rv = conn_is_valid_transition(tconn, mask, val, 0);
1752 1756 if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS)
1753 if (rv == SS_UNKNOWN_ERROR) 1757 rv = SS_UNKNOWN_ERROR; /* continue waiting */
1754 rv = conn_is_valid_transition(tconn, mask, val, 0);
1755
1756 if (rv == SS_SUCCESS)
1757 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
1758 1758
1759 return rv; 1759 return rv;
1760} 1760}
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 9a664bd27404..58e08ff2b2ce 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
89 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", 89 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
90 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", 90 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
91 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", 91 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
92 [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
92 [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", 93 [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
93}; 94};
94 95
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 424dc7bdf9b7..891c0ecaa292 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
89 md_io->done = 1; 89 md_io->done = 1;
90 wake_up(&mdev->misc_wait); 90 wake_up(&mdev->misc_wait);
91 bio_put(bio); 91 bio_put(bio);
92 put_ldev(mdev); 92 if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
93 put_ldev(mdev);
93} 94}
94 95
95/* reads on behalf of the partner, 96/* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
1410 struct drbd_conf *mdev = w->mdev; 1411 struct drbd_conf *mdev = w->mdev;
1411 1412
1412 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) 1413 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1413 drbd_al_begin_io(mdev, &req->i); 1414 drbd_al_begin_io(mdev, &req->i, false);
1414 1415
1415 drbd_req_make_private_bio(req, req->master_bio); 1416 drbd_req_make_private_bio(req, req->master_bio);
1416 req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 1417 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
1425 int resync_after; 1426 int resync_after;
1426 1427
1427 while (1) { 1428 while (1) {
1428 if (!odev->ldev) 1429 if (!odev->ldev || odev->state.disk == D_DISKLESS)
1429 return 1; 1430 return 1;
1430 rcu_read_lock(); 1431 rcu_read_lock();
1431 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; 1432 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
1433 if (resync_after == -1) 1434 if (resync_after == -1)
1434 return 1; 1435 return 1;
1435 odev = minor_to_mdev(resync_after); 1436 odev = minor_to_mdev(resync_after);
1436 if (!expect(odev)) 1437 if (!odev)
1437 return 1; 1438 return 1;
1438 if ((odev->state.conn >= C_SYNC_SOURCE && 1439 if ((odev->state.conn >= C_SYNC_SOURCE &&
1439 odev->state.conn <= C_PAUSED_SYNC_T) || 1440 odev->state.conn <= C_PAUSED_SYNC_T) ||
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
1515 1516
1516 if (o_minor == -1) 1517 if (o_minor == -1)
1517 return NO_ERROR; 1518 return NO_ERROR;
1518 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) 1519 if (o_minor < -1 || o_minor > MINORMASK)
1519 return ERR_RESYNC_AFTER; 1520 return ERR_RESYNC_AFTER;
1520 1521
1521 /* check for loops */ 1522 /* check for loops */
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
1524 if (odev == mdev) 1525 if (odev == mdev)
1525 return ERR_RESYNC_AFTER_CYCLE; 1526 return ERR_RESYNC_AFTER_CYCLE;
1526 1527
1528 /* You are free to depend on diskless, non-existing,
1529 * or not yet/no longer existing minors.
1530 * We only reject dependency loops.
1531 * We cannot follow the dependency chain beyond a detached or
1532 * missing minor.
1533 */
1534 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1535 return NO_ERROR;
1536
1527 rcu_read_lock(); 1537 rcu_read_lock();
1528 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; 1538 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1529 rcu_read_unlock(); 1539 rcu_read_unlock();
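To make the resync-after dependency rules above concrete, here is a tiny userspace model (assumed, not DRBD code) of the chain walk: each minor records the minor it wants to resync after (-1 for none); following the chain back to the starting minor means a cycle (ERR_RESYNC_AFTER_CYCLE), while a missing or diskless dependency simply ends the walk, which is the new NO_ERROR case.

#include <stdio.h>

#define NMINORS 4
#define NONE (-1)

/* resync_after[i] = minor that minor i depends on, or NONE */
static const int resync_after[NMINORS] = { NONE, 0, 1, NONE };

static int valid_resync_after(int minor, int depends_on)
{
	int cur = depends_on;

	while (cur != NONE) {
		if (cur == minor)
			return 0;               /* cycle: ERR_RESYNC_AFTER_CYCLE       */
		if (cur < 0 || cur >= NMINORS)
			return 1;               /* missing minor: chain ends, NO_ERROR */
		cur = resync_after[cur];
	}
	return 1;                               /* end of chain reached */
}

int main(void)
{
	printf("minor 3 after 2: %s\n", valid_resync_after(3, 2) ? "ok" : "cycle");
	printf("minor 0 after 2: %s\n", valid_resync_after(0, 2) ? "ok" : "cycle");
	return 0;
}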
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1652 clear_bit(B_RS_H_DONE, &mdev->flags); 1662 clear_bit(B_RS_H_DONE, &mdev->flags);
1653 1663
1654 write_lock_irq(&global_state_lock); 1664 write_lock_irq(&global_state_lock);
1655 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { 1665 /* Did some connection breakage or IO error race with us? */
1666 if (mdev->state.conn < C_CONNECTED
1667 || !get_ldev_if_state(mdev, D_NEGOTIATING)) {
1656 write_unlock_irq(&global_state_lock); 1668 write_unlock_irq(&global_state_lock);
1657 mutex_unlock(mdev->state_mutex); 1669 mutex_unlock(mdev->state_mutex);
1658 return; 1670 return;