author     Linus Torvalds <torvalds@linux-foundation.org>   2016-01-21 21:19:38 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-01-21 21:19:38 -0500
commit     641203549a21ba6a701aecd05c3dfc969ec670cc (patch)
tree       5e3d177c380ed811b5bf37e0bf9b8098416a9bc6
parent     404a47410c26a115123885977053e9a1a4460929 (diff)
parent     e93d12ae3be91d18b2a46deebb90a3f516db3d3c (diff)
Merge branch 'for-4.5/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
 "This is the block driver pull request for 4.5, with the exception of
  NVMe, which is in a separate branch and will be posted after this one.

  This pull request contains:

   - A set of bcache stability fixes, which have been acked by Kent.
     These have been used and tested for more than a year by the
     community, so it's about time that they got in.

   - A set of drbd updates from the drbd team (Andreas, Lars, Philipp)
     and Markus Elfring, Oleg Drokin.

   - A set of fixes for xen blkback/front from the usual suspects, (Bob,
     Konrad) as well as community based fixes from Kiri, Julien, and
     Peng.

   - A 2038 time fix for sx8 from Shraddha, with a fix from me.

   - A small mtip32xx cleanup from Zhu Yanjun.

   - A null_blk division fix from Arnd"

* 'for-4.5/drivers' of git://git.kernel.dk/linux-block: (71 commits)
  null_blk: use sector_div instead of do_div
  mtip32xx: restrict variables visible in current code module
  xen/blkfront: Fix crash if backend doesn't follow the right states.
  xen/blkback: Fix two memory leaks.
  xen/blkback: make st_ statistics per ring
  xen/blkfront: Handle non-indirect grant with 64KB pages
  xen-blkfront: Introduce blkif_ring_get_request
  xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()
  xen/blkback: Free resources if connect_ring failed.
  xen/blocks: Return -EXX instead of -1
  xen/blkback: make pool of persistent grants and free pages per-queue
  xen/blkback: get the number of hardware queues/rings from blkfront
  xen/blkback: pseudo support for multi hardware queues/rings
  xen/blkback: separate ring information out of struct xen_blkif
  xen/blkfront: correct setting for xen_blkif_max_ring_order
  xen/blkfront: make persistent grants pool per-queue
  xen/blkfront: Remove duplicate setting of ->xbdev.
  xen/blkfront: Cleanup of comments, fix unaligned variables, and syntax errors.
  xen/blkfront: negotiate number of queues/rings to be used with backend
  xen/blkfront: split per device io_lock
  ...
-rw-r--r--  MAINTAINERS                              |   11
-rw-r--r--  drivers/block/drbd/drbd_actlog.c         |  323
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c         |   22
-rw-r--r--  drivers/block/drbd/drbd_debugfs.c        |   10
-rw-r--r--  drivers/block/drbd/drbd_int.h            |  111
-rw-r--r--  drivers/block/drbd/drbd_main.c           |   74
-rw-r--r--  drivers/block/drbd/drbd_nl.c             | 1361
-rw-r--r--  drivers/block/drbd/drbd_proc.c           |    6
-rw-r--r--  drivers/block/drbd/drbd_protocol.h       |    2
-rw-r--r--  drivers/block/drbd/drbd_receiver.c       |  254
-rw-r--r--  drivers/block/drbd/drbd_req.c            |  147
-rw-r--r--  drivers/block/drbd/drbd_req.h            |   17
-rw-r--r--  drivers/block/drbd/drbd_state.c          |  428
-rw-r--r--  drivers/block/drbd/drbd_state.h          |    6
-rw-r--r--  drivers/block/drbd/drbd_state_change.h   |   63
-rw-r--r--  drivers/block/drbd/drbd_worker.c         |  105
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c        |    6
-rw-r--r--  drivers/block/null_blk.c                 |    8
-rw-r--r--  drivers/block/sx8.c                      |    7
-rw-r--r--  drivers/block/xen-blkback/blkback.c      |  391
-rw-r--r--  drivers/block/xen-blkback/common.h       |   86
-rw-r--r--  drivers/block/xen-blkback/xenbus.c       |  416
-rw-r--r--  drivers/block/xen-blkfront.c             | 1061
-rw-r--r--  drivers/md/bcache/btree.c                |    5
-rw-r--r--  drivers/md/bcache/super.c                |   16
-rw-r--r--  drivers/md/bcache/writeback.c            |   37
-rw-r--r--  drivers/md/bcache/writeback.h            |    3
-rw-r--r--  include/linux/drbd.h                     |   26
-rw-r--r--  include/linux/drbd_genl.h                |  149
-rw-r--r--  include/linux/idr.h                      |   14
-rw-r--r--  include/linux/lru_cache.h                |    2
-rw-r--r--  include/xen/interface/io/blkif.h         |   48
-rw-r--r--  lib/lru_cache.c                          |    4
33 files changed, 3893 insertions, 1326 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 45d2717760fc..b8a717c4f863 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt*
 F: drivers/scsi/dpt/
 
 DRBD DRIVER
-P: Philipp Reisner
-P: Lars Ellenberg
-M: drbd-dev@lists.linbit.com
-L: drbd-user@lists.linbit.com
+M: Philipp Reisner <philipp.reisner@linbit.com>
+M: Lars Ellenberg <lars.ellenberg@linbit.com>
+L: drbd-dev@lists.linbit.com
 W: http://www.drbd.org
-T: git git://git.drbd.org/linux-2.6-drbd.git drbd
-T: git git://git.drbd.org/drbd-8.3.git
+T: git git://git.linbit.com/linux-drbd.git
+T: git git://git.linbit.com/drbd-8.4.git
 S: Supported
 F: drivers/block/drbd/
 F: lib/lru_cache.c
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index b3868e7a1ffd..10459a145062 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
288 return need_transaction; 288 return need_transaction;
289} 289}
290 290
291static int al_write_transaction(struct drbd_device *device); 291#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
292/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
293 * are still coupled, or assume too much about their relation.
294 * Code below will not work if this is violated.
295 * Will be cleaned up with some followup patch.
296 */
297# error FIXME
298#endif
299
300static unsigned int al_extent_to_bm_page(unsigned int al_enr)
301{
302 return al_enr >>
303 /* bit to page */
304 ((PAGE_SHIFT + 3) -
305 /* al extent number to bit */
306 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
307}
308
309static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
310{
311 const unsigned int stripes = device->ldev->md.al_stripes;
312 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
313
314 /* transaction number, modulo on-disk ring buffer wrap around */
315 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
316
317 /* ... to aligned 4k on disk block */
318 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
319
320 /* ... to 512 byte sector in activity log */
321 t *= 8;
322
323 /* ... plus offset to the on disk position */
324 return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
325}
326
327static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
328{
329 struct lc_element *e;
330 sector_t sector;
331 int i, mx;
332 unsigned extent_nr;
333 unsigned crc = 0;
334 int err = 0;
335
336 memset(buffer, 0, sizeof(*buffer));
337 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
338 buffer->tr_number = cpu_to_be32(device->al_tr_number);
339
340 i = 0;
341
342 /* Even though no one can start to change this list
343 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
344 * lc_try_lock_for_transaction() --, someone may still
345 * be in the process of changing it. */
346 spin_lock_irq(&device->al_lock);
347 list_for_each_entry(e, &device->act_log->to_be_changed, list) {
348 if (i == AL_UPDATES_PER_TRANSACTION) {
349 i++;
350 break;
351 }
352 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
353 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
354 if (e->lc_number != LC_FREE)
355 drbd_bm_mark_for_writeout(device,
356 al_extent_to_bm_page(e->lc_number));
357 i++;
358 }
359 spin_unlock_irq(&device->al_lock);
360 BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
361
362 buffer->n_updates = cpu_to_be16(i);
363 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
364 buffer->update_slot_nr[i] = cpu_to_be16(-1);
365 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
366 }
367
368 buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
369 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
370
371 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
372 device->act_log->nr_elements - device->al_tr_cycle);
373 for (i = 0; i < mx; i++) {
374 unsigned idx = device->al_tr_cycle + i;
375 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
376 buffer->context[i] = cpu_to_be32(extent_nr);
377 }
378 for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
379 buffer->context[i] = cpu_to_be32(LC_FREE);
380
381 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
382 if (device->al_tr_cycle >= device->act_log->nr_elements)
383 device->al_tr_cycle = 0;
384
385 sector = al_tr_number_to_on_disk_sector(device);
386
387 crc = crc32c(0, buffer, 4096);
388 buffer->crc32c = cpu_to_be32(crc);
389
390 if (drbd_bm_write_hinted(device))
391 err = -EIO;
392 else {
393 bool write_al_updates;
394 rcu_read_lock();
395 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
396 rcu_read_unlock();
397 if (write_al_updates) {
398 if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
399 err = -EIO;
400 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
401 } else {
402 device->al_tr_number++;
403 device->al_writ_cnt++;
404 }
405 }
406 }
407
408 return err;
409}
410
411static int al_write_transaction(struct drbd_device *device)
412{
413 struct al_transaction_on_disk *buffer;
414 int err;
415
416 if (!get_ldev(device)) {
417 drbd_err(device, "disk is %s, cannot start al transaction\n",
418 drbd_disk_str(device->state.disk));
419 return -EIO;
420 }
421
422 /* The bitmap write may have failed, causing a state change. */
423 if (device->state.disk < D_INCONSISTENT) {
424 drbd_err(device,
425 "disk is %s, cannot write al transaction\n",
426 drbd_disk_str(device->state.disk));
427 put_ldev(device);
428 return -EIO;
429 }
430
431 /* protects md_io_buffer, al_tr_cycle, ... */
432 buffer = drbd_md_get_buffer(device, __func__);
433 if (!buffer) {
434 drbd_err(device, "disk failed while waiting for md_io buffer\n");
435 put_ldev(device);
436 return -ENODEV;
437 }
438
439 err = __al_write_transaction(device, buffer);
440
441 drbd_md_put_buffer(device);
442 put_ldev(device);
443
444 return err;
445}
446
292 447
293void drbd_al_begin_io_commit(struct drbd_device *device) 448void drbd_al_begin_io_commit(struct drbd_device *device)
294{ 449{
@@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
420 wake_up(&device->al_wait); 575 wake_up(&device->al_wait);
421} 576}
422 577
423#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
424/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
425 * are still coupled, or assume too much about their relation.
426 * Code below will not work if this is violated.
427 * Will be cleaned up with some followup patch.
428 */
429# error FIXME
430#endif
431
432static unsigned int al_extent_to_bm_page(unsigned int al_enr)
433{
434 return al_enr >>
435 /* bit to page */
436 ((PAGE_SHIFT + 3) -
437 /* al extent number to bit */
438 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
439}
440
441static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
442{
443 const unsigned int stripes = device->ldev->md.al_stripes;
444 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
445
446 /* transaction number, modulo on-disk ring buffer wrap around */
447 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
448
449 /* ... to aligned 4k on disk block */
450 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
451
452 /* ... to 512 byte sector in activity log */
453 t *= 8;
454
455 /* ... plus offset to the on disk position */
456 return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
457}
458
459int al_write_transaction(struct drbd_device *device)
460{
461 struct al_transaction_on_disk *buffer;
462 struct lc_element *e;
463 sector_t sector;
464 int i, mx;
465 unsigned extent_nr;
466 unsigned crc = 0;
467 int err = 0;
468
469 if (!get_ldev(device)) {
470 drbd_err(device, "disk is %s, cannot start al transaction\n",
471 drbd_disk_str(device->state.disk));
472 return -EIO;
473 }
474
475 /* The bitmap write may have failed, causing a state change. */
476 if (device->state.disk < D_INCONSISTENT) {
477 drbd_err(device,
478 "disk is %s, cannot write al transaction\n",
479 drbd_disk_str(device->state.disk));
480 put_ldev(device);
481 return -EIO;
482 }
483
484 /* protects md_io_buffer, al_tr_cycle, ... */
485 buffer = drbd_md_get_buffer(device, __func__);
486 if (!buffer) {
487 drbd_err(device, "disk failed while waiting for md_io buffer\n");
488 put_ldev(device);
489 return -ENODEV;
490 }
491
492 memset(buffer, 0, sizeof(*buffer));
493 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
494 buffer->tr_number = cpu_to_be32(device->al_tr_number);
495
496 i = 0;
497
498 /* Even though no one can start to change this list
499 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
500 * lc_try_lock_for_transaction() --, someone may still
501 * be in the process of changing it. */
502 spin_lock_irq(&device->al_lock);
503 list_for_each_entry(e, &device->act_log->to_be_changed, list) {
504 if (i == AL_UPDATES_PER_TRANSACTION) {
505 i++;
506 break;
507 }
508 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
509 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
510 if (e->lc_number != LC_FREE)
511 drbd_bm_mark_for_writeout(device,
512 al_extent_to_bm_page(e->lc_number));
513 i++;
514 }
515 spin_unlock_irq(&device->al_lock);
516 BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
517
518 buffer->n_updates = cpu_to_be16(i);
519 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
520 buffer->update_slot_nr[i] = cpu_to_be16(-1);
521 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
522 }
523
524 buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
525 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
526
527 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
528 device->act_log->nr_elements - device->al_tr_cycle);
529 for (i = 0; i < mx; i++) {
530 unsigned idx = device->al_tr_cycle + i;
531 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
532 buffer->context[i] = cpu_to_be32(extent_nr);
533 }
534 for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
535 buffer->context[i] = cpu_to_be32(LC_FREE);
536
537 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
538 if (device->al_tr_cycle >= device->act_log->nr_elements)
539 device->al_tr_cycle = 0;
540
541 sector = al_tr_number_to_on_disk_sector(device);
542
543 crc = crc32c(0, buffer, 4096);
544 buffer->crc32c = cpu_to_be32(crc);
545
546 if (drbd_bm_write_hinted(device))
547 err = -EIO;
548 else {
549 bool write_al_updates;
550 rcu_read_lock();
551 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
552 rcu_read_unlock();
553 if (write_al_updates) {
554 if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
555 err = -EIO;
556 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
557 } else {
558 device->al_tr_number++;
559 device->al_writ_cnt++;
560 }
561 }
562 }
563
564 drbd_md_put_buffer(device);
565 put_ldev(device);
566
567 return err;
568}
569
570static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 578static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
571{ 579{
572 int rv; 580 int rv;
@@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
606 wake_up(&device->al_wait); 614 wake_up(&device->al_wait);
607} 615}
608 616
609int drbd_initialize_al(struct drbd_device *device, void *buffer) 617int drbd_al_initialize(struct drbd_device *device, void *buffer)
610{ 618{
611 struct al_transaction_on_disk *al = buffer; 619 struct al_transaction_on_disk *al = buffer;
612 struct drbd_md *md = &device->ldev->md; 620 struct drbd_md *md = &device->ldev->md;
613 sector_t al_base = md->md_offset + md->al_offset;
614 int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 621 int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
615 int i; 622 int i;
616 623
617 memset(al, 0, 4096); 624 __al_write_transaction(device, al);
618 al->magic = cpu_to_be32(DRBD_AL_MAGIC); 625 /* There may or may not have been a pending transaction. */
619 al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); 626 spin_lock_irq(&device->al_lock);
620 al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); 627 lc_committed(device->act_log);
628 spin_unlock_irq(&device->al_lock);
621 629
622 for (i = 0; i < al_size_4k; i++) { 630 /* The rest of the transactions will have an empty "updates" list, and
623 int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); 631 * are written out only to provide the context, and to initialize the
632 * on-disk ring buffer. */
633 for (i = 1; i < al_size_4k; i++) {
634 int err = __al_write_transaction(device, al);
624 if (err) 635 if (err)
625 return err; 636 return err;
626 } 637 }
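
The new al_tr_number_to_on_disk_sector() above maps a transaction number onto the striped on-disk activity-log ring buffer. The arithmetic is self-contained and easy to sanity-check outside the kernel; here is a minimal userspace sketch of the same placement math, using made-up geometry values (4 stripes of 16 4k blocks, arbitrary offsets) rather than anything read from DRBD meta data:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the placement math in al_tr_number_to_on_disk_sector(),
 * with invented geometry: 4 stripes of 16 4k-blocks each (64 slots),
 * and arbitrary metadata/AL offsets in 512-byte sectors. */
static uint64_t al_slot_to_sector(unsigned int tr_number,
				  unsigned int stripes,
				  unsigned int stripe_size_4k,
				  uint64_t md_offset, int64_t al_offset)
{
	unsigned int al_size_4k = stripes * stripe_size_4k;
	unsigned int t = tr_number % al_size_4k;          /* ring buffer wrap */

	t = (t % stripes) * stripe_size_4k + t / stripes; /* round-robin over stripes */
	return md_offset + al_offset + (uint64_t)t * 8;   /* 4k block -> 512b sector */
}

int main(void)
{
	/* consecutive transactions land on different stripes */
	for (unsigned int tr = 0; tr < 6; tr++)
		printf("transaction %u -> sector %llu\n", tr,
		       (unsigned long long)al_slot_to_sector(tr, 4, 16, 1024, 8));
	return 0;
}
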
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 9462d2752850..0dabc9b93725 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -24,7 +24,7 @@
24 24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26 26
27#include <linux/bitops.h> 27#include <linux/bitmap.h>
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/drbd.h> 30#include <linux/drbd.h>
@@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device)
479 * this masks out the remaining bits. 479 * this masks out the remaining bits.
480 * Returns the number of bits cleared. 480 * Returns the number of bits cleared.
481 */ 481 */
482#ifndef BITS_PER_PAGE
482#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) 483#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
483#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) 484#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
485#else
486# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
487# error "ambiguous BITS_PER_PAGE"
488# endif
489#endif
484#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) 490#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
485static int bm_clear_surplus(struct drbd_bitmap *b) 491static int bm_clear_surplus(struct drbd_bitmap *b)
486{ 492{
@@ -559,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
559 unsigned long *p_addr; 565 unsigned long *p_addr;
560 unsigned long bits = 0; 566 unsigned long bits = 0;
561 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; 567 unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
562 int idx, i, last_word; 568 int idx, last_word;
563 569
564 /* all but last page */ 570 /* all but last page */
565 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { 571 for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
566 p_addr = __bm_map_pidx(b, idx); 572 p_addr = __bm_map_pidx(b, idx);
567 for (i = 0; i < LWPP; i++) 573 bits += bitmap_weight(p_addr, BITS_PER_PAGE);
568 bits += hweight_long(p_addr[i]);
569 __bm_unmap(p_addr); 574 __bm_unmap(p_addr);
570 cond_resched(); 575 cond_resched();
571 } 576 }
572 /* last (or only) page */ 577 /* last (or only) page */
573 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; 578 last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
574 p_addr = __bm_map_pidx(b, idx); 579 p_addr = __bm_map_pidx(b, idx);
575 for (i = 0; i < last_word; i++) 580 bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
576 bits += hweight_long(p_addr[i]);
577 p_addr[last_word] &= cpu_to_lel(mask); 581 p_addr[last_word] &= cpu_to_lel(mask);
578 bits += hweight_long(p_addr[last_word]); 582 bits += hweight_long(p_addr[last_word]);
579 /* 32bit arch, may have an unused padding long */ 583 /* 32bit arch, may have an unused padding long */
@@ -1419,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1419 int bits; 1423 int bits;
1420 int changed = 0; 1424 int changed = 0;
1421 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); 1425 unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
1426
1427 /* I think it is more cache line friendly to hweight_long then set to ~0UL,
1428 * than to first bitmap_weight() all words, then bitmap_fill() all words */
1422 for (i = first_word; i < last_word; i++) { 1429 for (i = first_word; i < last_word; i++) {
1423 bits = hweight_long(paddr[i]); 1430 bits = hweight_long(paddr[i]);
1424 paddr[i] = ~0UL; 1431 paddr[i] = ~0UL;
@@ -1628,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
1628 int n = e-s; 1635 int n = e-s;
1629 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); 1636 p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1630 bm = p_addr + MLPP(s); 1637 bm = p_addr + MLPP(s);
1631 while (n--) 1638 count += bitmap_weight(bm, n * BITS_PER_LONG);
1632 count += hweight_long(*bm++);
1633 bm_unmap(p_addr); 1639 bm_unmap(p_addr);
1634 } else { 1640 } else {
1635 drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); 1641 drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
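
The bm_count_bits() and drbd_bm_e_weight() changes above replace open-coded hweight_long() loops with bitmap_weight(), which counts set bits over a region given in bits rather than words. A rough userspace equivalent of that population count, using a compiler builtin instead of the kernel helper (illustrative only, not the kernel implementation):

#include <stdio.h>

/* Count set bits in the first nbits of an unsigned-long array,
 * roughly what bitmap_weight() does in the kernel. */
static unsigned long weight(const unsigned long *map, unsigned int nbits)
{
	const unsigned int bpl = 8 * sizeof(unsigned long);
	unsigned long bits = 0;
	unsigned int i;

	for (i = 0; i < nbits / bpl; i++)
		bits += (unsigned long)__builtin_popcountl(map[i]);
	if (nbits % bpl)	/* partial last word: mask off the unused high bits */
		bits += (unsigned long)__builtin_popcountl(
			map[i] & ((1UL << (nbits % bpl)) - 1));
	return bits;
}

int main(void)
{
	unsigned long map[3] = { 0xF0F0F0F0UL, 0x3UL, 0UL };
	/* prints 18 for both 32-bit and 64-bit unsigned long */
	printf("%lu bits set in the first 66 bits\n", weight(map, 66));
	return 0;
}
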
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 6b88a35fb048..96a0107a72ea 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored)
771 return 0; 771 return 0;
772} 772}
773 773
774static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
775{
776 struct drbd_device *device = m->private;
777 seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
778 return 0;
779}
780
774#define drbd_debugfs_device_attr(name) \ 781#define drbd_debugfs_device_attr(name) \
775static int device_ ## name ## _open(struct inode *inode, struct file *file) \ 782static int device_ ## name ## _open(struct inode *inode, struct file *file) \
776{ \ 783{ \
@@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests)
796drbd_debugfs_device_attr(act_log_extents) 803drbd_debugfs_device_attr(act_log_extents)
797drbd_debugfs_device_attr(resync_extents) 804drbd_debugfs_device_attr(resync_extents)
798drbd_debugfs_device_attr(data_gen_id) 805drbd_debugfs_device_attr(data_gen_id)
806drbd_debugfs_device_attr(ed_gen_id)
799 807
800void drbd_debugfs_device_add(struct drbd_device *device) 808void drbd_debugfs_device_add(struct drbd_device *device)
801{ 809{
@@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
839 DCF(act_log_extents); 847 DCF(act_log_extents);
840 DCF(resync_extents); 848 DCF(resync_extents);
841 DCF(data_gen_id); 849 DCF(data_gen_id);
850 DCF(ed_gen_id);
842#undef DCF 851#undef DCF
843 return; 852 return;
844 853
@@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device)
854 drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); 863 drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
855 drbd_debugfs_remove(&device->debugfs_vol_resync_extents); 864 drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
856 drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); 865 drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
866 drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
857 drbd_debugfs_remove(&device->debugfs_vol); 867 drbd_debugfs_remove(&device->debugfs_vol);
858} 868}
859 869
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e66d453a5f2b..b6844feb9f9b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -77,13 +77,6 @@ extern int fault_devs;
77extern char usermode_helper[]; 77extern char usermode_helper[];
78 78
79 79
80/* I don't remember why XCPU ...
81 * This is used to wake the asender,
82 * and to interrupt sending the sending task
83 * on disconnect.
84 */
85#define DRBD_SIG SIGXCPU
86
87/* This is used to stop/restart our threads. 80/* This is used to stop/restart our threads.
88 * Cannot use SIGTERM nor SIGKILL, since these 81 * Cannot use SIGTERM nor SIGKILL, since these
89 * are sent out by init on runlevel changes 82 * are sent out by init on runlevel changes
@@ -292,6 +285,9 @@ struct drbd_device_work {
292 285
293extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); 286extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
294 287
288extern void lock_all_resources(void);
289extern void unlock_all_resources(void);
290
295struct drbd_request { 291struct drbd_request {
296 struct drbd_work w; 292 struct drbd_work w;
297 struct drbd_device *device; 293 struct drbd_device *device;
@@ -504,7 +500,6 @@ enum {
504 500
505 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 501 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
506 502
507 SUSPEND_IO, /* suspend application io */
508 BITMAP_IO, /* suspend application io; 503 BITMAP_IO, /* suspend application io;
509 once no more io in flight, start bitmap io */ 504 once no more io in flight, start bitmap io */
510 BITMAP_IO_QUEUED, /* Started bitmap IO */ 505 BITMAP_IO_QUEUED, /* Started bitmap IO */
@@ -632,12 +627,6 @@ struct bm_io_work {
632 void (*done)(struct drbd_device *device, int rv); 627 void (*done)(struct drbd_device *device, int rv);
633}; 628};
634 629
635enum write_ordering_e {
636 WO_none,
637 WO_drain_io,
638 WO_bdev_flush,
639};
640
641struct fifo_buffer { 630struct fifo_buffer {
642 unsigned int head_index; 631 unsigned int head_index;
643 unsigned int size; 632 unsigned int size;
@@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size);
650enum { 639enum {
651 NET_CONGESTED, /* The data socket is congested */ 640 NET_CONGESTED, /* The data socket is congested */
652 RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ 641 RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
653 SEND_PING, /* whether asender should send a ping asap */ 642 SEND_PING,
654 SIGNAL_ASENDER, /* whether asender wants to be interrupted */
655 GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ 643 GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
656 CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ 644 CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
657 CONN_WD_ST_CHG_OKAY, 645 CONN_WD_ST_CHG_OKAY,
@@ -670,6 +658,8 @@ enum {
670 DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ 658 DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
671}; 659};
672 660
661enum which_state { NOW, OLD = NOW, NEW };
662
673struct drbd_resource { 663struct drbd_resource {
674 char *name; 664 char *name;
675#ifdef CONFIG_DEBUG_FS 665#ifdef CONFIG_DEBUG_FS
@@ -755,7 +745,8 @@ struct drbd_connection {
755 unsigned long last_reconnect_jif; 745 unsigned long last_reconnect_jif;
756 struct drbd_thread receiver; 746 struct drbd_thread receiver;
757 struct drbd_thread worker; 747 struct drbd_thread worker;
758 struct drbd_thread asender; 748 struct drbd_thread ack_receiver;
749 struct workqueue_struct *ack_sender;
759 750
760 /* cached pointers, 751 /* cached pointers,
761 * so we can look up the oldest pending requests more quickly. 752 * so we can look up the oldest pending requests more quickly.
@@ -774,6 +765,8 @@ struct drbd_connection {
774 struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; 765 struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
775 766
776 struct { 767 struct {
768 unsigned long last_sent_barrier_jif;
769
777 /* whether this sender thread 770 /* whether this sender thread
778 * has processed a single write yet. */ 771 * has processed a single write yet. */
779 bool seen_any_write_yet; 772 bool seen_any_write_yet;
@@ -788,6 +781,17 @@ struct drbd_connection {
788 } send; 781 } send;
789}; 782};
790 783
784static inline bool has_net_conf(struct drbd_connection *connection)
785{
786 bool has_net_conf;
787
788 rcu_read_lock();
789 has_net_conf = rcu_dereference(connection->net_conf);
790 rcu_read_unlock();
791
792 return has_net_conf;
793}
794
791void __update_timing_details( 795void __update_timing_details(
792 struct drbd_thread_timing_details *tdp, 796 struct drbd_thread_timing_details *tdp,
793 unsigned int *cb_nr, 797 unsigned int *cb_nr,
@@ -811,6 +815,7 @@ struct drbd_peer_device {
811 struct list_head peer_devices; 815 struct list_head peer_devices;
812 struct drbd_device *device; 816 struct drbd_device *device;
813 struct drbd_connection *connection; 817 struct drbd_connection *connection;
818 struct work_struct send_acks_work;
814#ifdef CONFIG_DEBUG_FS 819#ifdef CONFIG_DEBUG_FS
815 struct dentry *debugfs_peer_dev; 820 struct dentry *debugfs_peer_dev;
816#endif 821#endif
@@ -829,6 +834,7 @@ struct drbd_device {
829 struct dentry *debugfs_vol_act_log_extents; 834 struct dentry *debugfs_vol_act_log_extents;
830 struct dentry *debugfs_vol_resync_extents; 835 struct dentry *debugfs_vol_resync_extents;
831 struct dentry *debugfs_vol_data_gen_id; 836 struct dentry *debugfs_vol_data_gen_id;
837 struct dentry *debugfs_vol_ed_gen_id;
832#endif 838#endif
833 839
834 unsigned int vnr; /* volume number within the connection */ 840 unsigned int vnr; /* volume number within the connection */
@@ -873,6 +879,7 @@ struct drbd_device {
873 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 879 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
874 atomic_t unacked_cnt; /* Need to send replies for */ 880 atomic_t unacked_cnt; /* Need to send replies for */
875 atomic_t local_cnt; /* Waiting for local completion */ 881 atomic_t local_cnt; /* Waiting for local completion */
882 atomic_t suspend_cnt;
876 883
877 /* Interval tree of pending local requests */ 884 /* Interval tree of pending local requests */
878 struct rb_root read_requests; 885 struct rb_root read_requests;
@@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev
1020 return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); 1027 return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
1021} 1028}
1022 1029
1030static inline struct drbd_peer_device *
1031conn_peer_device(struct drbd_connection *connection, int volume_number)
1032{
1033 return idr_find(&connection->peer_devices, volume_number);
1034}
1035
1023#define for_each_resource(resource, _resources) \ 1036#define for_each_resource(resource, _resources) \
1024 list_for_each_entry(resource, _resources, resources) 1037 list_for_each_entry(resource, _resources, resources)
1025 1038
@@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
1113extern int drbd_send_bitmap(struct drbd_device *device); 1126extern int drbd_send_bitmap(struct drbd_device *device);
1114extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); 1127extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
1115extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); 1128extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
1116extern void drbd_free_ldev(struct drbd_backing_dev *ldev); 1129extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
1117extern void drbd_device_cleanup(struct drbd_device *device); 1130extern void drbd_device_cleanup(struct drbd_device *device);
1118void drbd_print_uuids(struct drbd_device *device, const char *text); 1131void drbd_print_uuids(struct drbd_device *device, const char *text);
1119 1132
@@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set;
1424/* to allocate from that set */ 1437/* to allocate from that set */
1425extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); 1438extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
1426 1439
1427extern rwlock_t global_state_lock; 1440extern struct mutex resources_mutex;
1428 1441
1429extern int conn_lowest_minor(struct drbd_connection *connection); 1442extern int conn_lowest_minor(struct drbd_connection *connection);
1430extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); 1443extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
@@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1454 1467
1455 1468
1456/* drbd_nl.c */ 1469/* drbd_nl.c */
1470
1471extern struct mutex notification_mutex;
1472
1457extern void drbd_suspend_io(struct drbd_device *device); 1473extern void drbd_suspend_io(struct drbd_device *device);
1458extern void drbd_resume_io(struct drbd_device *device); 1474extern void drbd_resume_io(struct drbd_device *device);
1459extern char *ppsize(char *buf, unsigned long long size); 1475extern char *ppsize(char *buf, unsigned long long size);
@@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
1536 1552
1537/* drbd_receiver.c */ 1553/* drbd_receiver.c */
1538extern int drbd_receiver(struct drbd_thread *thi); 1554extern int drbd_receiver(struct drbd_thread *thi);
1539extern int drbd_asender(struct drbd_thread *thi); 1555extern int drbd_ack_receiver(struct drbd_thread *thi);
1556extern void drbd_send_ping_wf(struct work_struct *ws);
1557extern void drbd_send_acks_wf(struct work_struct *ws);
1540extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1558extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
1541extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 1559extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
1542 bool throttle_if_app_is_waiting); 1560 bool throttle_if_app_is_waiting);
@@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
1649#define drbd_rs_failed_io(device, sector, size) \ 1667#define drbd_rs_failed_io(device, sector, size) \
1650 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) 1668 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
1651extern void drbd_al_shrink(struct drbd_device *device); 1669extern void drbd_al_shrink(struct drbd_device *device);
1652extern int drbd_initialize_al(struct drbd_device *, void *); 1670extern int drbd_al_initialize(struct drbd_device *, void *);
1653 1671
1654/* drbd_nl.c */ 1672/* drbd_nl.c */
1655/* state info broadcast */ 1673/* state info broadcast */
@@ -1668,6 +1686,29 @@ struct sib_info {
1668}; 1686};
1669void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); 1687void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
1670 1688
1689extern void notify_resource_state(struct sk_buff *,
1690 unsigned int,
1691 struct drbd_resource *,
1692 struct resource_info *,
1693 enum drbd_notification_type);
1694extern void notify_device_state(struct sk_buff *,
1695 unsigned int,
1696 struct drbd_device *,
1697 struct device_info *,
1698 enum drbd_notification_type);
1699extern void notify_connection_state(struct sk_buff *,
1700 unsigned int,
1701 struct drbd_connection *,
1702 struct connection_info *,
1703 enum drbd_notification_type);
1704extern void notify_peer_device_state(struct sk_buff *,
1705 unsigned int,
1706 struct drbd_peer_device *,
1707 struct peer_device_info *,
1708 enum drbd_notification_type);
1709extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
1710 struct drbd_connection *, const char *, int);
1711
1671/* 1712/*
1672 * inline helper functions 1713 * inline helper functions
1673 *************************/ 1714 *************************/
@@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r
1694 return 0; 1735 return 0;
1695} 1736}
1696 1737
1697static inline enum drbd_state_rv
1698_drbd_set_state(struct drbd_device *device, union drbd_state ns,
1699 enum chg_state_flags flags, struct completion *done)
1700{
1701 enum drbd_state_rv rv;
1702
1703 read_lock(&global_state_lock);
1704 rv = __drbd_set_state(device, ns, flags, done);
1705 read_unlock(&global_state_lock);
1706
1707 return rv;
1708}
1709
1710static inline union drbd_state drbd_read_state(struct drbd_device *device) 1738static inline union drbd_state drbd_read_state(struct drbd_device *device)
1711{ 1739{
1712 struct drbd_resource *resource = device->resource; 1740 struct drbd_resource *resource = device->resource;
@@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit)
1937 1965
1938extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); 1966extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
1939 1967
1940static inline void wake_asender(struct drbd_connection *connection) 1968/* To get the ack_receiver out of the blocking network stack,
1969 * so it can change its sk_rcvtimeo from idle- to ping-timeout,
1970 * and send a ping, we need to send a signal.
1971 * Which signal we send is irrelevant. */
1972static inline void wake_ack_receiver(struct drbd_connection *connection)
1941{ 1973{
1942 if (test_bit(SIGNAL_ASENDER, &connection->flags)) 1974 struct task_struct *task = connection->ack_receiver.task;
1943 force_sig(DRBD_SIG, connection->asender.task); 1975 if (task && get_t_state(&connection->ack_receiver) == RUNNING)
1976 force_sig(SIGXCPU, task);
1944} 1977}
1945 1978
1946static inline void request_ping(struct drbd_connection *connection) 1979static inline void request_ping(struct drbd_connection *connection)
1947{ 1980{
1948 set_bit(SEND_PING, &connection->flags); 1981 set_bit(SEND_PING, &connection->flags);
1949 wake_asender(connection); 1982 wake_ack_receiver(connection);
1950} 1983}
1951 1984
1952extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); 1985extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
@@ -2230,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
2230 2263
2231 if (drbd_suspended(device)) 2264 if (drbd_suspended(device))
2232 return false; 2265 return false;
2233 if (test_bit(SUSPEND_IO, &device->flags)) 2266 if (atomic_read(&device->suspend_cnt))
2234 return false; 2267 return false;
2235 2268
2236 /* to avoid potential deadlock or bitmap corruption, 2269 /* to avoid potential deadlock or bitmap corruption,
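
Several hunks above replace the single SUSPEND_IO flag with an atomic suspend_cnt, so that multiple threads can independently suspend and resume IO without one resume clearing another caller's suspension (the drbd_nl.c hunk further below adds a comment to that effect). The counting pattern itself, sketched with C11 atomics in userspace as an analogue, not the DRBD code:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int suspend_cnt;

static void suspend_io(void)  { atomic_fetch_add(&suspend_cnt, 1); }

/* Only the final resume actually re-enables IO (the kernel code also
 * wakes waiters at that point). */
static bool resume_io(void)   { return atomic_fetch_sub(&suspend_cnt, 1) == 1; }

static bool may_start_io(void) { return atomic_load(&suspend_cnt) == 0; }

int main(void)
{
	suspend_io();                                   /* e.g. a resize in progress */
	suspend_io();                                   /* e.g. a detach in progress */
	printf("io allowed: %d\n", may_start_io());     /* 0 */
	resume_io();
	printf("io allowed: %d\n", may_start_io());     /* still 0 */
	printf("last resume: %d\n", resume_io());       /* 1: IO may continue */
	return 0;
}
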
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 74d97f4bac34..5b43dfb79819 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
117 */ 117 */
118struct idr drbd_devices; 118struct idr drbd_devices;
119struct list_head drbd_resources; 119struct list_head drbd_resources;
120struct mutex resources_mutex;
120 121
121struct kmem_cache *drbd_request_cache; 122struct kmem_cache *drbd_request_cache;
122struct kmem_cache *drbd_ee_cache; /* peer requests */ 123struct kmem_cache *drbd_ee_cache; /* peer requests */
@@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str
1435 /* long elapsed = (long)(jiffies - device->last_received); */ 1436 /* long elapsed = (long)(jiffies - device->last_received); */
1436 1437
1437 drop_it = connection->meta.socket == sock 1438 drop_it = connection->meta.socket == sock
1438 || !connection->asender.task 1439 || !connection->ack_receiver.task
1439 || get_t_state(&connection->asender) != RUNNING 1440 || get_t_state(&connection->ack_receiver) != RUNNING
1440 || connection->cstate < C_WF_REPORT_PARAMS; 1441 || connection->cstate < C_WF_REPORT_PARAMS;
1441 1442
1442 if (drop_it) 1443 if (drop_it)
@@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock,
1793 drbd_update_congested(connection); 1794 drbd_update_congested(connection);
1794 } 1795 }
1795 do { 1796 do {
1796 /* STRANGE
1797 * tcp_sendmsg does _not_ use its size parameter at all ?
1798 *
1799 * -EAGAIN on timeout, -EINTR on signal.
1800 */
1801/* THINK
1802 * do we need to block DRBD_SIG if sock == &meta.socket ??
1803 * otherwise wake_asender() might interrupt some send_*Ack !
1804 */
1805 rv = kernel_sendmsg(sock, &msg, &iov, 1, size); 1797 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1806 if (rv == -EAGAIN) { 1798 if (rv == -EAGAIN) {
1807 if (we_should_drop_the_connection(connection, sock)) 1799 if (we_should_drop_the_connection(connection, sock))
@@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
2000 drbd_bm_cleanup(device); 1992 drbd_bm_cleanup(device);
2001 } 1993 }
2002 1994
2003 drbd_free_ldev(device->ldev); 1995 drbd_backing_dev_free(device, device->ldev);
2004 device->ldev = NULL; 1996 device->ldev = NULL;
2005 1997
2006 clear_bit(AL_SUSPENDED, &device->flags); 1998 clear_bit(AL_SUSPENDED, &device->flags);
@@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref)
2179 if (device->this_bdev) 2171 if (device->this_bdev)
2180 bdput(device->this_bdev); 2172 bdput(device->this_bdev);
2181 2173
2182 drbd_free_ldev(device->ldev); 2174 drbd_backing_dev_free(device, device->ldev);
2183 device->ldev = NULL; 2175 device->ldev = NULL;
2184 2176
2185 drbd_release_all_peer_reqs(device); 2177 drbd_release_all_peer_reqs(device);
@@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
2563 cpumask_copy(resource->cpu_mask, new_cpu_mask); 2555 cpumask_copy(resource->cpu_mask, new_cpu_mask);
2564 for_each_connection_rcu(connection, resource) { 2556 for_each_connection_rcu(connection, resource) {
2565 connection->receiver.reset_cpu_mask = 1; 2557 connection->receiver.reset_cpu_mask = 1;
2566 connection->asender.reset_cpu_mask = 1; 2558 connection->ack_receiver.reset_cpu_mask = 1;
2567 connection->worker.reset_cpu_mask = 1; 2559 connection->worker.reset_cpu_mask = 1;
2568 } 2560 }
2569 } 2561 }
@@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
2590 kref_init(&resource->kref); 2582 kref_init(&resource->kref);
2591 idr_init(&resource->devices); 2583 idr_init(&resource->devices);
2592 INIT_LIST_HEAD(&resource->connections); 2584 INIT_LIST_HEAD(&resource->connections);
2593 resource->write_ordering = WO_bdev_flush; 2585 resource->write_ordering = WO_BDEV_FLUSH;
2594 list_add_tail_rcu(&resource->resources, &drbd_resources); 2586 list_add_tail_rcu(&resource->resources, &drbd_resources);
2595 mutex_init(&resource->conf_update); 2587 mutex_init(&resource->conf_update);
2596 mutex_init(&resource->adm_mutex); 2588 mutex_init(&resource->adm_mutex);
@@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2652 connection->receiver.connection = connection; 2644 connection->receiver.connection = connection;
2653 drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); 2645 drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
2654 connection->worker.connection = connection; 2646 connection->worker.connection = connection;
2655 drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); 2647 drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
2656 connection->asender.connection = connection; 2648 connection->ack_receiver.connection = connection;
2657 2649
2658 kref_init(&connection->kref); 2650 kref_init(&connection->kref);
2659 2651
@@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device)
2702{ 2694{
2703 /* opencoded create_singlethread_workqueue(), 2695 /* opencoded create_singlethread_workqueue(),
2704 * to be able to say "drbd%d", ..., minor */ 2696 * to be able to say "drbd%d", ..., minor */
2705 device->submit.wq = alloc_workqueue("drbd%u_submit", 2697 device->submit.wq =
2706 WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); 2698 alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
2707 if (!device->submit.wq) 2699 if (!device->submit.wq)
2708 return -ENOMEM; 2700 return -ENOMEM;
2709 2701
@@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2820 goto out_idr_remove_from_resource; 2812 goto out_idr_remove_from_resource;
2821 } 2813 }
2822 kref_get(&connection->kref); 2814 kref_get(&connection->kref);
2815 INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
2823 } 2816 }
2824 2817
2825 if (init_submitter(device)) { 2818 if (init_submitter(device)) {
@@ -2923,7 +2916,7 @@ static int __init drbd_init(void)
2923 drbd_proc = NULL; /* play safe for drbd_cleanup */ 2916 drbd_proc = NULL; /* play safe for drbd_cleanup */
2924 idr_init(&drbd_devices); 2917 idr_init(&drbd_devices);
2925 2918
2926 rwlock_init(&global_state_lock); 2919 mutex_init(&resources_mutex);
2927 INIT_LIST_HEAD(&drbd_resources); 2920 INIT_LIST_HEAD(&drbd_resources);
2928 2921
2929 err = drbd_genl_register(); 2922 err = drbd_genl_register();
@@ -2971,18 +2964,6 @@ fail:
2971 return err; 2964 return err;
2972} 2965}
2973 2966
2974void drbd_free_ldev(struct drbd_backing_dev *ldev)
2975{
2976 if (ldev == NULL)
2977 return;
2978
2979 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2980 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2981
2982 kfree(ldev->disk_conf);
2983 kfree(ldev);
2984}
2985
2986static void drbd_free_one_sock(struct drbd_socket *ds) 2967static void drbd_free_one_sock(struct drbd_socket *ds)
2987{ 2968{
2988 struct socket *s; 2969 struct socket *s;
@@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
3277 * and read it. */ 3258 * and read it. */
3278 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; 3259 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3279 bdev->md.md_offset = drbd_md_ss(bdev); 3260 bdev->md.md_offset = drbd_md_ss(bdev);
3261 /* Even for (flexible or indexed) external meta data,
3262 * initially restrict us to the 4k superblock for now.
3263 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
3264 bdev->md.md_size_sect = 8;
3280 3265
3281 if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { 3266 if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
3282 /* NOTE: can't do normal error processing here as this is 3267 /* NOTE: can't do normal error processing here as this is
@@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
3578 3563
3579 spin_lock_irq(&device->resource->req_lock); 3564 spin_lock_irq(&device->resource->req_lock);
3580 set_bit(BITMAP_IO, &device->flags); 3565 set_bit(BITMAP_IO, &device->flags);
3581 if (atomic_read(&device->ap_bio_cnt) == 0) { 3566 /* don't wait for pending application IO if the caller indicates that
3567 * application IO does not conflict anyways. */
3568 if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
3582 if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) 3569 if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
3583 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 3570 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
3584 &device->bm_io_work.w); 3571 &device->bm_io_work.w);
@@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
3746 return 0; 3733 return 0;
3747} 3734}
3748 3735
3736void lock_all_resources(void)
3737{
3738 struct drbd_resource *resource;
3739 int __maybe_unused i = 0;
3740
3741 mutex_lock(&resources_mutex);
3742 local_irq_disable();
3743 for_each_resource(resource, &drbd_resources)
3744 spin_lock_nested(&resource->req_lock, i++);
3745}
3746
3747void unlock_all_resources(void)
3748{
3749 struct drbd_resource *resource;
3750
3751 for_each_resource(resource, &drbd_resources)
3752 spin_unlock(&resource->req_lock);
3753 local_irq_enable();
3754 mutex_unlock(&resources_mutex);
3755}
3756
3749#ifdef CONFIG_DRBD_FAULT_INJECTION 3757#ifdef CONFIG_DRBD_FAULT_INJECTION
3750/* Fault insertion support including random number generator shamelessly 3758/* Fault insertion support including random number generator shamelessly
3751 * stolen from kernel/rcutorture.c */ 3759 * stolen from kernel/rcutorture.c */
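
The lock_all_resources()/unlock_all_resources() helpers added above take every resource's req_lock in list order while holding resources_mutex (with interrupts disabled and lockdep nesting annotations); acquiring the locks in one fixed order under an outer mutex is what keeps the bulk acquisition deadlock-free. The same idea in a userspace sketch with pthreads, using a hypothetical fixed-size resource array rather than the kernel primitives:

#include <pthread.h>
#include <stdio.h>

#define NR_RESOURCES 3

static pthread_mutex_t resources_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t req_lock[NR_RESOURCES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Take all per-resource locks in one fixed (list) order, guarded by an
 * outer mutex so two "lock all" callers cannot interleave. */
static void lock_all_resources(void)
{
	pthread_mutex_lock(&resources_mutex);
	for (int i = 0; i < NR_RESOURCES; i++)
		pthread_mutex_lock(&req_lock[i]);
}

static void unlock_all_resources(void)
{
	for (int i = 0; i < NR_RESOURCES; i++)
		pthread_mutex_unlock(&req_lock[i]);
	pthread_mutex_unlock(&resources_mutex);
}

int main(void)
{
	lock_all_resources();
	printf("holding all %d resource locks\n", NR_RESOURCES);
	unlock_all_resources();
	return 0;
}
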
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e80cbefbc2b5..c055c5e12f24 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -36,6 +36,7 @@
36#include "drbd_int.h" 36#include "drbd_int.h"
37#include "drbd_protocol.h" 37#include "drbd_protocol.h"
38#include "drbd_req.h" 38#include "drbd_req.h"
39#include "drbd_state_change.h"
39#include <asm/unaligned.h> 40#include <asm/unaligned.h>
40#include <linux/drbd_limits.h> 41#include <linux/drbd_limits.h>
41#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
75int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); 76int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
76/* .dumpit */ 77/* .dumpit */
77int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); 78int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
79int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
80int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
81int drbd_adm_dump_devices_done(struct netlink_callback *cb);
82int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
83int drbd_adm_dump_connections_done(struct netlink_callback *cb);
84int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
85int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
86int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
78 87
79#include <linux/drbd_genl_api.h> 88#include <linux/drbd_genl_api.h>
80#include "drbd_nla.h" 89#include "drbd_nla.h"
81#include <linux/genl_magic_func.h> 90#include <linux/genl_magic_func.h>
82 91
92static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
93static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
94
95DEFINE_MUTEX(notification_mutex);
96
83/* used blkdev_get_by_path, to claim our meta data device(s) */ 97/* used blkdev_get_by_path, to claim our meta data device(s) */
84static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; 98static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
85 99
@@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
349 sib.sib_reason = SIB_HELPER_PRE; 363 sib.sib_reason = SIB_HELPER_PRE;
350 sib.helper_name = cmd; 364 sib.helper_name = cmd;
351 drbd_bcast_event(device, &sib); 365 drbd_bcast_event(device, &sib);
366 notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
352 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); 367 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
353 if (ret) 368 if (ret)
354 drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", 369 drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
361 sib.sib_reason = SIB_HELPER_POST; 376 sib.sib_reason = SIB_HELPER_POST;
362 sib.helper_exit_code = ret; 377 sib.helper_exit_code = ret;
363 drbd_bcast_event(device, &sib); 378 drbd_bcast_event(device, &sib);
379 notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
364 380
365 if (current == connection->worker.task) 381 if (current == connection->worker.task)
366 clear_bit(CALLBACK_PENDING, &connection->flags); 382 clear_bit(CALLBACK_PENDING, &connection->flags);
@@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
388 404
389 drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); 405 drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
390 /* TODO: conn_bcast_event() ?? */ 406 /* TODO: conn_bcast_event() ?? */
407 notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
391 408
392 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); 409 ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
393 if (ret) 410 if (ret)
@@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
399 usermode_helper, cmd, resource_name, 416 usermode_helper, cmd, resource_name,
400 (ret >> 8) & 0xff, ret); 417 (ret >> 8) & 0xff, ret);
401 /* TODO: conn_bcast_event() ?? */ 418 /* TODO: conn_bcast_event() ?? */
419 notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
402 420
403 if (ret < 0) /* Ignore any ERRNOs we got. */ 421 if (ret < 0) /* Ignore any ERRNOs we got. */
404 ret = 0; 422 ret = 0;
@@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size)
847 * and can be long lived. 865 * and can be long lived.
848 * This changes an device->flag, is triggered by drbd internals, 866 * This changes an device->flag, is triggered by drbd internals,
849 * and should be short-lived. */ 867 * and should be short-lived. */
868/* It needs to be a counter, since multiple threads might
869 independently suspend and resume IO. */
850void drbd_suspend_io(struct drbd_device *device) 870void drbd_suspend_io(struct drbd_device *device)
851{ 871{
852 set_bit(SUSPEND_IO, &device->flags); 872 atomic_inc(&device->suspend_cnt);
853 if (drbd_suspended(device)) 873 if (drbd_suspended(device))
854 return; 874 return;
855 wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); 875 wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
@@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device)
857 877
858void drbd_resume_io(struct drbd_device *device) 878void drbd_resume_io(struct drbd_device *device)
859{ 879{
860 clear_bit(SUSPEND_IO, &device->flags); 880 if (atomic_dec_and_test(&device->suspend_cnt))
861 wake_up(&device->misc_wait); 881 wake_up(&device->misc_wait);
862} 882}
863 883
864/** 884/**
@@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device)
871enum determine_dev_size 891enum determine_dev_size
872drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) 892drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
873{ 893{
874 sector_t prev_first_sect, prev_size; /* previous meta location */ 894 struct md_offsets_and_sizes {
875 sector_t la_size_sect, u_size; 895 u64 last_agreed_sect;
896 u64 md_offset;
897 s32 al_offset;
898 s32 bm_offset;
899 u32 md_size_sect;
900
901 u32 al_stripes;
902 u32 al_stripe_size_4k;
903 } prev;
904 sector_t u_size, size;
876 struct drbd_md *md = &device->ldev->md; 905 struct drbd_md *md = &device->ldev->md;
877 u32 prev_al_stripe_size_4k;
878 u32 prev_al_stripes;
879 sector_t size;
880 char ppb[10]; 906 char ppb[10];
881 void *buffer; 907 void *buffer;
882 908
883 int md_moved, la_size_changed; 909 int md_moved, la_size_changed;
884 enum determine_dev_size rv = DS_UNCHANGED; 910 enum determine_dev_size rv = DS_UNCHANGED;
885 911
886 /* race: 912 /* We may change the on-disk offsets of our meta data below. Lock out
887 * application request passes inc_ap_bio, 913 * anything that may cause meta data IO, to avoid acting on incomplete
888 * but then cannot get an AL-reference. 914 * layout changes or scribbling over meta data that is in the process
889 * this function later may wait on ap_bio_cnt == 0. -> deadlock. 915 * of being moved.
890 * 916 *
891 * to avoid that: 917 * Move is not exactly correct, btw, currently we have all our meta
892 * Suspend IO right here. 918 * data in core memory, to "move" it we just write it all out, there
893 * still lock the act_log to not trigger ASSERTs there. 919 * are no reads. */
894 */
895 drbd_suspend_io(device); 920 drbd_suspend_io(device);
896 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ 921 buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
897 if (!buffer) { 922 if (!buffer) {
@@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
899 return DS_ERROR; 924 return DS_ERROR;
900 } 925 }
901 926
902 /* no wait necessary anymore, actually we could assert that */ 927 /* remember current offset and sizes */
903 wait_event(device->al_wait, lc_try_lock(device->act_log)); 928 prev.last_agreed_sect = md->la_size_sect;
904 929 prev.md_offset = md->md_offset;
905 prev_first_sect = drbd_md_first_sector(device->ldev); 930 prev.al_offset = md->al_offset;
906 prev_size = device->ldev->md.md_size_sect; 931 prev.bm_offset = md->bm_offset;
907 la_size_sect = device->ldev->md.la_size_sect; 932 prev.md_size_sect = md->md_size_sect;
933 prev.al_stripes = md->al_stripes;
934 prev.al_stripe_size_4k = md->al_stripe_size_4k;
908 935
909 if (rs) { 936 if (rs) {
910 /* rs is non NULL if we should change the AL layout only */ 937 /* rs is non NULL if we should change the AL layout only */
911
912 prev_al_stripes = md->al_stripes;
913 prev_al_stripe_size_4k = md->al_stripe_size_4k;
914
915 md->al_stripes = rs->al_stripes; 938 md->al_stripes = rs->al_stripes;
916 md->al_stripe_size_4k = rs->al_stripe_size / 4; 939 md->al_stripe_size_4k = rs->al_stripe_size / 4;
917 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; 940 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
@@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
924 rcu_read_unlock(); 947 rcu_read_unlock();
925 size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); 948 size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
926 949
927 if (size < la_size_sect) { 950 if (size < prev.last_agreed_sect) {
928 if (rs && u_size == 0) { 951 if (rs && u_size == 0) {
929 /* Remove "rs &&" later. This check should always be active, but 952 /* Remove "rs &&" later. This check should always be active, but
930 right now the receiver expects the permissive behavior */ 953 right now the receiver expects the permissive behavior */
@@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
945 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); 968 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
946 if (unlikely(err)) { 969 if (unlikely(err)) {
947 /* currently there is only one error: ENOMEM! */ 970 /* currently there is only one error: ENOMEM! */
948 size = drbd_bm_capacity(device)>>1; 971 size = drbd_bm_capacity(device);
949 if (size == 0) { 972 if (size == 0) {
950 drbd_err(device, "OUT OF MEMORY! " 973 drbd_err(device, "OUT OF MEMORY! "
951 "Could not allocate bitmap!\n"); 974 "Could not allocate bitmap!\n");
952 } else { 975 } else {
953 drbd_err(device, "BM resizing failed. " 976 drbd_err(device, "BM resizing failed. "
954 "Leaving size unchanged at size = %lu KB\n", 977 "Leaving size unchanged\n");
955 (unsigned long)size);
956 } 978 }
957 rv = DS_ERROR; 979 rv = DS_ERROR;
958 } 980 }
959 /* racy, see comments above. */ 981 /* racy, see comments above. */
960 drbd_set_my_capacity(device, size); 982 drbd_set_my_capacity(device, size);
961 device->ldev->md.la_size_sect = size; 983 md->la_size_sect = size;
962 drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), 984 drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
963 (unsigned long long)size>>1); 985 (unsigned long long)size>>1);
964 } 986 }
965 if (rv <= DS_ERROR) 987 if (rv <= DS_ERROR)
966 goto err_out; 988 goto err_out;
967 989
968 la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); 990 la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
969 991
970 md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) 992 md_moved = prev.md_offset != md->md_offset
971 || prev_size != device->ldev->md.md_size_sect; 993 || prev.md_size_sect != md->md_size_sect;
972 994
973 if (la_size_changed || md_moved || rs) { 995 if (la_size_changed || md_moved || rs) {
974 u32 prev_flags; 996 u32 prev_flags;
@@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
977 * Clear the timer, to avoid scary "timer expired!" messages, 999 * Clear the timer, to avoid scary "timer expired!" messages,
978 * "Superblock" is written out at least twice below, anyways. */ 1000 * "Superblock" is written out at least twice below, anyways. */
979 del_timer(&device->md_sync_timer); 1001 del_timer(&device->md_sync_timer);
980 drbd_al_shrink(device); /* All extents inactive. */
981 1002
1003 /* We won't change the "al-extents" setting, we just may need
1004 * to move the on-disk location of the activity log ringbuffer.
1005 * Lock for transaction is good enough, it may well be "dirty"
1006 * or even "starving". */
1007 wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
1008
1009 /* mark current on-disk bitmap and activity log as unreliable */
982 prev_flags = md->flags; 1010 prev_flags = md->flags;
983 md->flags &= ~MDF_PRIMARY_IND; 1011 md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
984 drbd_md_write(device, buffer); 1012 drbd_md_write(device, buffer);
985 1013
1014 drbd_al_initialize(device, buffer);
1015
986 drbd_info(device, "Writing the whole bitmap, %s\n", 1016 drbd_info(device, "Writing the whole bitmap, %s\n",
987 la_size_changed && md_moved ? "size changed and md moved" : 1017 la_size_changed && md_moved ? "size changed and md moved" :
988 la_size_changed ? "size changed" : "md moved"); 1018 la_size_changed ? "size changed" : "md moved");
989 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ 1019 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
990 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 1020 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
991 "size changed", BM_LOCKED_MASK); 1021 "size changed", BM_LOCKED_MASK);
992 drbd_initialize_al(device, buffer);
993 1022
1023 /* on-disk bitmap and activity log is authoritative again
1024 * (unless there was an IO error meanwhile...) */
994 md->flags = prev_flags; 1025 md->flags = prev_flags;
995 drbd_md_write(device, buffer); 1026 drbd_md_write(device, buffer);
996 1027
@@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
999 md->al_stripes, md->al_stripe_size_4k * 4); 1030 md->al_stripes, md->al_stripe_size_4k * 4);
1000 } 1031 }
1001 1032
1002 if (size > la_size_sect) 1033 if (size > prev.last_agreed_sect)
1003 rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; 1034 rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
1004 if (size < la_size_sect) 1035 if (size < prev.last_agreed_sect)
1005 rv = DS_SHRUNK; 1036 rv = DS_SHRUNK;
1006 1037
1007 if (0) { 1038 if (0) {
1008 err_out: 1039 err_out:
1009 if (rs) { 1040 /* restore previous offset and sizes */
1010 md->al_stripes = prev_al_stripes; 1041 md->la_size_sect = prev.last_agreed_sect;
1011 md->al_stripe_size_4k = prev_al_stripe_size_4k; 1042 md->md_offset = prev.md_offset;
1012 md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; 1043 md->al_offset = prev.al_offset;
1013 1044 md->bm_offset = prev.bm_offset;
1014 drbd_md_set_sector_offsets(device, device->ldev); 1045 md->md_size_sect = prev.md_size_sect;
1015 } 1046 md->al_stripes = prev.al_stripes;
1047 md->al_stripe_size_4k = prev.al_stripe_size_4k;
1048 md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
1016 } 1049 }
1017 lc_unlock(device->act_log); 1050 lc_unlock(device->act_log);
1018 wake_up(&device->al_wait); 1051 wake_up(&device->al_wait);
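
The new prev struct at the top of drbd_determine_dev_size() exists so that every field drbd_md_set_sector_offsets() may rewrite can be restored in one assignment on the err_out path. A stripped-down sketch of that snapshot/rollback idiom (field names follow the diff, the surrounding code is illustrative only):

#include <stdint.h>
#include <stdio.h>

struct md_layout {
	uint64_t last_agreed_sect, md_offset;
	int32_t  al_offset, bm_offset;
	uint32_t md_size_sect, al_stripes, al_stripe_size_4k;
};

static int resize(struct md_layout *md, int fail)
{
	struct md_layout prev = *md;	/* remember current offsets and sizes */

	md->md_offset += 4096;		/* pretend the layout moved */
	md->al_stripes = 4;
	if (fail) {
		*md = prev;		/* restore previous offsets and sizes */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct md_layout md = { .md_offset = 8, .al_stripes = 1 };

	resize(&md, 1);
	printf("md_offset after failed resize: %llu\n",
	       (unsigned long long)md.md_offset);	/* still 8 */
	return 0;
}
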
@@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1115 lc_destroy(n); 1148 lc_destroy(n);
1116 return -EBUSY; 1149 return -EBUSY;
1117 } else { 1150 } else {
1118 if (t) 1151 lc_destroy(t);
1119 lc_destroy(t);
1120 } 1152 }
1121 drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ 1153 drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
1122 return 0; 1154 return 0;
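
The drbd_check_al_size() hunk drops the "if (t)" guard because lc_destroy(), like kfree(), is a no-op on a NULL pointer, so callers need not check. A userspace analogue of that convention (types and names are invented):

#include <stdlib.h>

struct lru_cache { void *elements; };

static void lc_destroy(struct lru_cache *lc)
{
	if (!lc)	/* tolerate NULL, precisely so callers don't have to */
		return;
	free(lc->elements);
	free(lc);
}

int main(void)
{
	lc_destroy(NULL);	/* safe: no-op */
	return 0;
}
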
@@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1151 if (b) { 1183 if (b) {
1152 struct drbd_connection *connection = first_peer_device(device)->connection; 1184 struct drbd_connection *connection = first_peer_device(device)->connection;
1153 1185
1186 blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
1187
1154 if (blk_queue_discard(b) && 1188 if (blk_queue_discard(b) &&
1155 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { 1189 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
1156 /* For now, don't allow more than one activity log extent worth of data 1190 /* We don't care, stacking below should fix it for the local device.
1157 * to be discarded in one go. We may need to rework drbd_al_begin_io() 1191 * Whether or not it is a suitable granularity on the remote device
1158 * to allow for even larger discard ranges */ 1192 * is not our problem, really. If you care, you need to
1159 blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); 1193 * use devices with similar topology on all peers. */
1160 1194 q->limits.discard_granularity = 512;
1161 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1195 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1162 /* REALLY? Is stacking secdiscard "legal"? */
1163 if (blk_queue_secdiscard(b))
1164 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
1165 } else { 1196 } else {
1166 blk_queue_max_discard_sectors(q, 0); 1197 blk_queue_max_discard_sectors(q, 0);
1167 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 1198 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1168 queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); 1199 q->limits.discard_granularity = 0;
1169 } 1200 }
1170 1201
1171 blk_queue_stack_limits(q, b); 1202 blk_queue_stack_limits(q, b);
@@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1177 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1208 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1178 } 1209 }
1179 } 1210 }
1211 /* To avoid confusion, if this queue does not support discard, clear
1212 * max_discard_sectors, which is what lsblk -D reports to the user. */
1213 if (!blk_queue_discard(q)) {
1214 blk_queue_max_discard_sectors(q, 0);
1215 q->limits.discard_granularity = 0;
1216 }
1180} 1217}
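
A compact model of the decision drbd_setup_queue_param() now makes: advertise discard on the DRBD queue only when the backing queue can discard and the peer either is not connected yet or negotiated FF_TRIM; otherwise zero both limits so "lsblk -D" does not report stale values. The struct and the maximum below are simplified stand-ins, not the real block-layer API:

#include <stdbool.h>
#include <stdio.h>

#define MAX_DISCARD_SECTORS 2048U	/* placeholder, not the driver constant */

struct queue_limits { unsigned int max_discard_sectors, discard_granularity; };

static void setup_discard(struct queue_limits *q, bool backing_can_discard,
			  bool connected, bool peer_has_trim)
{
	if (backing_can_discard && (!connected || peer_has_trim)) {
		q->max_discard_sectors = MAX_DISCARD_SECTORS;
		q->discard_granularity = 512;
	} else {
		q->max_discard_sectors = 0;
		q->discard_granularity = 0;
	}
}

int main(void)
{
	struct queue_limits q;

	setup_discard(&q, true, true, false);	/* connected peer without TRIM */
	printf("max_discard_sectors=%u granularity=%u\n",
	       q.max_discard_sectors, q.discard_granularity);
	return 0;
}
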
1181 1218
1182void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) 1219void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
@@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection)
1241 connection->cstate == C_STANDALONE; 1278 connection->cstate == C_STANDALONE;
1242 spin_unlock_irq(&connection->resource->req_lock); 1279 spin_unlock_irq(&connection->resource->req_lock);
1243 if (stop_threads) { 1280 if (stop_threads) {
1244 /* asender is implicitly stopped by receiver 1281 /* ack_receiver thread and ack_sender workqueue are implicitly
1245 * in conn_disconnect() */ 1282 * stopped by receiver in conn_disconnect() */
1246 drbd_thread_stop(&connection->receiver); 1283 drbd_thread_stop(&connection->receiver);
1247 drbd_thread_stop(&connection->worker); 1284 drbd_thread_stop(&connection->worker);
1248 } 1285 }
@@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1389 goto fail_unlock; 1426 goto fail_unlock;
1390 } 1427 }
1391 1428
1392 write_lock_irq(&global_state_lock); 1429 lock_all_resources();
1393 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); 1430 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1394 if (retcode == NO_ERROR) { 1431 if (retcode == NO_ERROR) {
1395 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); 1432 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
1396 drbd_resync_after_changed(device); 1433 drbd_resync_after_changed(device);
1397 } 1434 }
1398 write_unlock_irq(&global_state_lock); 1435 unlock_all_resources();
1399 1436
1400 if (retcode != NO_ERROR) 1437 if (retcode != NO_ERROR)
1401 goto fail_unlock; 1438 goto fail_unlock;
@@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1418 set_bit(MD_NO_FUA, &device->flags); 1455 set_bit(MD_NO_FUA, &device->flags);
1419 1456
1420 if (write_ordering_changed(old_disk_conf, new_disk_conf)) 1457 if (write_ordering_changed(old_disk_conf, new_disk_conf))
1421 drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); 1458 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
1422 1459
1423 drbd_md_sync(device); 1460 drbd_md_sync(device);
1424 1461
@@ -1449,6 +1486,88 @@ success:
1449 return 0; 1486 return 0;
1450} 1487}
1451 1488
1489static struct block_device *open_backing_dev(struct drbd_device *device,
1490 const char *bdev_path, void *claim_ptr, bool do_bd_link)
1491{
1492 struct block_device *bdev;
1493 int err = 0;
1494
1495 bdev = blkdev_get_by_path(bdev_path,
1496 FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
1497 if (IS_ERR(bdev)) {
1498 drbd_err(device, "open(\"%s\") failed with %ld\n",
1499 bdev_path, PTR_ERR(bdev));
1500 return bdev;
1501 }
1502
1503 if (!do_bd_link)
1504 return bdev;
1505
1506 err = bd_link_disk_holder(bdev, device->vdisk);
1507 if (err) {
1508 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1509 drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
1510 bdev_path, err);
1511 bdev = ERR_PTR(err);
1512 }
1513 return bdev;
1514}
1515
1516static int open_backing_devices(struct drbd_device *device,
1517 struct disk_conf *new_disk_conf,
1518 struct drbd_backing_dev *nbc)
1519{
1520 struct block_device *bdev;
1521
1522 bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
1523 if (IS_ERR(bdev))
1524 return ERR_OPEN_DISK;
1525 nbc->backing_bdev = bdev;
1526
1527 /*
1528 * meta_dev_idx >= 0: external fixed size, possibly multiple
1529 * drbd sharing one meta device. TODO in that case, paranoia
1530 * check that [md_bdev, meta_dev_idx] is not yet used by some
1531 * other drbd minor! (if you use drbd.conf + drbdadm, that
1532 * should check it for you already; but if you don't, or
1533 * someone fooled it, we need to double check here)
1534 */
1535 bdev = open_backing_dev(device, new_disk_conf->meta_dev,
1536 /* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
1537 * if potentially shared with other drbd minors */
1538 (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
1539 /* avoid double bd_claim_by_disk() for the same (source,target) tuple,
1540 * as would happen with internal metadata. */
1541 (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
1542 new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
1543 if (IS_ERR(bdev))
1544 return ERR_OPEN_MD_DISK;
1545 nbc->md_bdev = bdev;
1546 return NO_ERROR;
1547}
1548
1549static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
1550 bool do_bd_unlink)
1551{
1552 if (!bdev)
1553 return;
1554 if (do_bd_unlink)
1555 bd_unlink_disk_holder(bdev, device->vdisk);
1556 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1557}
1558
1559void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
1560{
1561 if (ldev == NULL)
1562 return;
1563
1564 close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
1565 close_backing_dev(device, ldev->backing_bdev, true);
1566
1567 kfree(ldev->disk_conf);
1568 kfree(ldev);
1569}
1570
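
The open_backing_dev()/close_backing_dev() helpers above centralize what drbd_adm_attach() used to open-code: open the data and meta-data devices, and guarantee that a partially built drbd_backing_dev can always be torn down with one call. A userspace model of that acquire/release pairing, with file descriptors standing in for block_device handles (paths and error values are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct backing_dev { int backing_fd, md_fd; };

static int open_backing_devices(struct backing_dev *nbc,
				const char *data_path, const char *md_path)
{
	nbc->backing_fd = open(data_path, O_RDWR);
	if (nbc->backing_fd < 0)
		return -1;		/* analogous to ERR_OPEN_DISK */
	nbc->md_fd = open(md_path, O_RDWR);
	if (nbc->md_fd < 0)
		return -2;		/* analogous to ERR_OPEN_MD_DISK */
	return 0;
}

static void backing_dev_free(struct backing_dev *nbc)
{
	/* tolerate a partially opened state, mirroring close_backing_dev() */
	if (nbc->md_fd >= 0 && nbc->md_fd != nbc->backing_fd)
		close(nbc->md_fd);
	if (nbc->backing_fd >= 0)
		close(nbc->backing_fd);
}

int main(void)
{
	struct backing_dev nbc = { .backing_fd = -1, .md_fd = -1 };

	if (open_backing_devices(&nbc, "/dev/null", "/dev/null") < 0)
		fprintf(stderr, "attach failed\n");
	backing_dev_free(&nbc);	/* safe in either case */
	return 0;
}
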
1452int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) 1571int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1453{ 1572{
1454 struct drbd_config_context adm_ctx; 1573 struct drbd_config_context adm_ctx;
@@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1462 sector_t min_md_device_sectors; 1581 sector_t min_md_device_sectors;
1463 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ 1582 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1464 struct disk_conf *new_disk_conf = NULL; 1583 struct disk_conf *new_disk_conf = NULL;
1465 struct block_device *bdev;
1466 struct lru_cache *resync_lru = NULL; 1584 struct lru_cache *resync_lru = NULL;
1467 struct fifo_buffer *new_plan = NULL; 1585 struct fifo_buffer *new_plan = NULL;
1468 union drbd_state ns, os; 1586 union drbd_state ns, os;
@@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1478 device = adm_ctx.device; 1596 device = adm_ctx.device;
1479 mutex_lock(&adm_ctx.resource->adm_mutex); 1597 mutex_lock(&adm_ctx.resource->adm_mutex);
1480 peer_device = first_peer_device(device); 1598 peer_device = first_peer_device(device);
1481 connection = peer_device ? peer_device->connection : NULL; 1599 connection = peer_device->connection;
1482 conn_reconfig_start(connection); 1600 conn_reconfig_start(connection);
1483 1601
1484 /* if you want to reconfigure, please tear down first */ 1602 /* if you want to reconfigure, please tear down first */
@@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1539 goto fail; 1657 goto fail;
1540 } 1658 }
1541 1659
1542 write_lock_irq(&global_state_lock);
1543 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1544 write_unlock_irq(&global_state_lock);
1545 if (retcode != NO_ERROR)
1546 goto fail;
1547
1548 rcu_read_lock(); 1660 rcu_read_lock();
1549 nc = rcu_dereference(connection->net_conf); 1661 nc = rcu_dereference(connection->net_conf);
1550 if (nc) { 1662 if (nc) {
@@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1556 } 1668 }
1557 rcu_read_unlock(); 1669 rcu_read_unlock();
1558 1670
1559 bdev = blkdev_get_by_path(new_disk_conf->backing_dev, 1671 retcode = open_backing_devices(device, new_disk_conf, nbc);
1560 FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); 1672 if (retcode != NO_ERROR)
1561 if (IS_ERR(bdev)) {
1562 drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1563 PTR_ERR(bdev));
1564 retcode = ERR_OPEN_DISK;
1565 goto fail;
1566 }
1567 nbc->backing_bdev = bdev;
1568
1569 /*
1570 * meta_dev_idx >= 0: external fixed size, possibly multiple
1571 * drbd sharing one meta device. TODO in that case, paranoia
1572 * check that [md_bdev, meta_dev_idx] is not yet used by some
1573 * other drbd minor! (if you use drbd.conf + drbdadm, that
1574 * should check it for you already; but if you don't, or
1575 * someone fooled it, we need to double check here)
1576 */
1577 bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1578 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1579 (new_disk_conf->meta_dev_idx < 0) ?
1580 (void *)device : (void *)drbd_m_holder);
1581 if (IS_ERR(bdev)) {
1582 drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1583 PTR_ERR(bdev));
1584 retcode = ERR_OPEN_MD_DISK;
1585 goto fail; 1673 goto fail;
1586 }
1587 nbc->md_bdev = bdev;
1588 1674
1589 if ((nbc->backing_bdev == nbc->md_bdev) != 1675 if ((nbc->backing_bdev == nbc->md_bdev) !=
1590 (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || 1676 (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
@@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1707 goto force_diskless_dec; 1793 goto force_diskless_dec;
1708 } 1794 }
1709 1795
1796 lock_all_resources();
1797 retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1798 if (retcode != NO_ERROR) {
1799 unlock_all_resources();
1800 goto force_diskless_dec;
1801 }
1802
1710 /* Reset the "barriers don't work" bits here, then force meta data to 1803 /* Reset the "barriers don't work" bits here, then force meta data to
1711 * be written, to ensure we determine if barriers are supported. */ 1804 * be written, to ensure we determine if barriers are supported. */
1712 if (new_disk_conf->md_flushes) 1805 if (new_disk_conf->md_flushes)
@@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1727 new_disk_conf = NULL; 1820 new_disk_conf = NULL;
1728 new_plan = NULL; 1821 new_plan = NULL;
1729 1822
1730 drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); 1823 drbd_resync_after_changed(device);
1824 drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
1825 unlock_all_resources();
1731 1826
1732 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) 1827 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1733 set_bit(CRASHED_PRIMARY, &device->flags); 1828 set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1875 fail: 1970 fail:
1876 conn_reconfig_done(connection); 1971 conn_reconfig_done(connection);
1877 if (nbc) { 1972 if (nbc) {
1878 if (nbc->backing_bdev) 1973 close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
1879 blkdev_put(nbc->backing_bdev, 1974 close_backing_dev(device, nbc->backing_bdev, true);
1880 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1881 if (nbc->md_bdev)
1882 blkdev_put(nbc->md_bdev,
1883 FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1884 kfree(nbc); 1975 kfree(nbc);
1885 } 1976 }
1886 kfree(new_disk_conf); 1977 kfree(new_disk_conf);
@@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1895static int adm_detach(struct drbd_device *device, int force) 1986static int adm_detach(struct drbd_device *device, int force)
1896{ 1987{
1897 enum drbd_state_rv retcode; 1988 enum drbd_state_rv retcode;
1989 void *buffer;
1898 int ret; 1990 int ret;
1899 1991
1900 if (force) { 1992 if (force) {
@@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force)
1905 } 1997 }
1906 1998
1907 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ 1999 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1908 drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ 2000 buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
1909 retcode = drbd_request_state(device, NS(disk, D_FAILED)); 2001 if (buffer) {
1910 drbd_md_put_buffer(device); 2002 retcode = drbd_request_state(device, NS(disk, D_FAILED));
2003 drbd_md_put_buffer(device);
2004 } else /* already <= D_FAILED */
2005 retcode = SS_NOTHING_TO_DO;
1911 /* D_FAILED will transition to DISKLESS. */ 2006 /* D_FAILED will transition to DISKLESS. */
2007 drbd_resume_io(device);
1912 ret = wait_event_interruptible(device->misc_wait, 2008 ret = wait_event_interruptible(device->misc_wait,
1913 device->state.disk != D_FAILED); 2009 device->state.disk != D_FAILED);
1914 drbd_resume_io(device);
1915 if ((int)retcode == (int)SS_IS_DISKLESS) 2010 if ((int)retcode == (int)SS_IS_DISKLESS)
1916 retcode = SS_NOTHING_TO_DO; 2011 retcode = SS_NOTHING_TO_DO;
1917 if (ret) 2012 if (ret)
@@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2245 return 0; 2340 return 0;
2246} 2341}
2247 2342
2343static void connection_to_info(struct connection_info *info,
2344 struct drbd_connection *connection)
2345{
2346 info->conn_connection_state = connection->cstate;
2347 info->conn_role = conn_highest_peer(connection);
2348}
2349
2350static void peer_device_to_info(struct peer_device_info *info,
2351 struct drbd_peer_device *peer_device)
2352{
2353 struct drbd_device *device = peer_device->device;
2354
2355 info->peer_repl_state =
2356 max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
2357 info->peer_disk_state = device->state.pdsk;
2358 info->peer_resync_susp_user = device->state.user_isp;
2359 info->peer_resync_susp_peer = device->state.peer_isp;
2360 info->peer_resync_susp_dependency = device->state.aftr_isp;
2361}
2362
2248int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) 2363int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2249{ 2364{
2365 struct connection_info connection_info;
2366 enum drbd_notification_type flags;
2367 unsigned int peer_devices = 0;
2250 struct drbd_config_context adm_ctx; 2368 struct drbd_config_context adm_ctx;
2251 struct drbd_peer_device *peer_device; 2369 struct drbd_peer_device *peer_device;
2252 struct net_conf *old_net_conf, *new_net_conf = NULL; 2370 struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2347 connection->peer_addr_len = nla_len(adm_ctx.peer_addr); 2465 connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
2348 memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); 2466 memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
2349 2467
2468 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
2469 peer_devices++;
2470 }
2471
2472 connection_to_info(&connection_info, connection);
2473 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
2474 mutex_lock(&notification_mutex);
2475 notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
2476 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
2477 struct peer_device_info peer_device_info;
2478
2479 peer_device_to_info(&peer_device_info, peer_device);
2480 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
2481 notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
2482 }
2483 mutex_unlock(&notification_mutex);
2350 mutex_unlock(&adm_ctx.resource->conf_update); 2484 mutex_unlock(&adm_ctx.resource->conf_update);
2351 2485
2352 rcu_read_lock(); 2486 rcu_read_lock();
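
The notification block added to drbd_adm_connect() counts the peer devices first, then emits one NOTIFY_CREATE event per object with a continues flag on every event except the last, so a listener can tell when the burst is complete. A tiny model of that flag chaining (the numeric flag values here are invented):

#include <stdio.h>

#define NOTIFY_CREATE    0x1
#define NOTIFY_CONTINUES 0x8000

int main(void)
{
	unsigned int peer_devices = 3;	/* one connection event + 3 peer-device events */
	unsigned int flags;
	int i;

	flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
	printf("connection event: flags=%#x\n", NOTIFY_CREATE | flags);

	for (i = 0; i < 3; i++) {
		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
		printf("peer-device %d:  flags=%#x\n", i, NOTIFY_CREATE | flags);
	}
	/* the last event is emitted without NOTIFY_CONTINUES */
	return 0;
}
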
@@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
2428 drbd_err(connection, 2562 drbd_err(connection,
2429 "unexpected rv2=%d in conn_try_disconnect()\n", 2563 "unexpected rv2=%d in conn_try_disconnect()\n",
2430 rv2); 2564 rv2);
2565 /* Unlike in DRBD 9, the state engine has generated
2566 * NOTIFY_DESTROY events before clearing connection->net_conf. */
2431 } 2567 }
2432 return rv; 2568 return rv;
2433} 2569}
@@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2585 mutex_unlock(&device->resource->conf_update); 2721 mutex_unlock(&device->resource->conf_update);
2586 synchronize_rcu(); 2722 synchronize_rcu();
2587 kfree(old_disk_conf); 2723 kfree(old_disk_conf);
2724 new_disk_conf = NULL;
2588 } 2725 }
2589 2726
2590 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 2727 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
@@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2618 2755
2619 fail_ldev: 2756 fail_ldev:
2620 put_ldev(device); 2757 put_ldev(device);
2758 kfree(new_disk_conf);
2621 goto fail; 2759 goto fail;
2622} 2760}
2623 2761
@@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2855 mutex_lock(&adm_ctx.resource->adm_mutex); 2993 mutex_lock(&adm_ctx.resource->adm_mutex);
2856 device = adm_ctx.device; 2994 device = adm_ctx.device;
2857 if (test_bit(NEW_CUR_UUID, &device->flags)) { 2995 if (test_bit(NEW_CUR_UUID, &device->flags)) {
2858 drbd_uuid_new_current(device); 2996 if (get_ldev_if_state(device, D_ATTACHING)) {
2997 drbd_uuid_new_current(device);
2998 put_ldev(device);
2999 } else {
3000 /* This is effectively a multi-stage "forced down".
3001 * The NEW_CUR_UUID bit is supposedly only set, if we
3002 * lost the replication connection, and are configured
3003 * to freeze IO and wait for some fence-peer handler.
3004 * So we still don't have a replication connection.
3005 * And now we don't have a local disk either. After
3006 * resume, we will fail all pending and new IO, because
3007 * we don't have any data anymore. Which means we will
3008 * eventually be able to terminate all users of this
3009 * device, and then take it down. By bumping the
3010 * "effective" data uuid, we make sure that you really
3011 * need to tear down before you reconfigure, we will
3012 * the refuse to re-connect or re-attach (because no
3013 * matching real data uuid exists).
3014 */
3015 u64 val;
3016 get_random_bytes(&val, sizeof(u64));
3017 drbd_set_ed_uuid(device, val);
3018 drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
3019 }
2859 clear_bit(NEW_CUR_UUID, &device->flags); 3020 clear_bit(NEW_CUR_UUID, &device->flags);
2860 } 3021 }
2861 drbd_suspend_io(device); 3022 drbd_suspend_io(device);
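
When resume finds neither a replication link nor a local disk, the hunk above bumps only the "effective" data UUID with get_random_bytes(), so stale peers can never present a matching data generation again. A userspace sketch of drawing such a 64-bit value, with /dev/urandom standing in for the in-kernel RNG:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t val = 0;
	FILE *f = fopen("/dev/urandom", "rb");

	if (!f)
		return 1;
	if (fread(&val, sizeof(val), 1, f) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("new effective data uuid: %016llx\n", (unsigned long long)val);
	return 0;
}
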
@@ -2910,6 +3071,486 @@ nla_put_failure:
2910} 3071}
2911 3072
2912/* 3073/*
3074 * The generic netlink dump callbacks are called outside the genl_lock(), so
3075 * they cannot use the simple attribute parsing code which uses global
3076 * attribute tables.
3077 */
3078static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
3079{
3080 const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3081 const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3082 struct nlattr *nla;
3083
3084 nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
3085 DRBD_NLA_CFG_CONTEXT);
3086 if (!nla)
3087 return NULL;
3088 return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
3089}
3090
3091static void resource_to_info(struct resource_info *, struct drbd_resource *);
3092
3093int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
3094{
3095 struct drbd_genlmsghdr *dh;
3096 struct drbd_resource *resource;
3097 struct resource_info resource_info;
3098 struct resource_statistics resource_statistics;
3099 int err;
3100
3101 rcu_read_lock();
3102 if (cb->args[0]) {
3103 for_each_resource_rcu(resource, &drbd_resources)
3104 if (resource == (struct drbd_resource *)cb->args[0])
3105 goto found_resource;
3106 err = 0; /* resource was probably deleted */
3107 goto out;
3108 }
3109 resource = list_entry(&drbd_resources,
3110 struct drbd_resource, resources);
3111
3112found_resource:
3113 list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
3114 goto put_result;
3115 }
3116 err = 0;
3117 goto out;
3118
3119put_result:
3120 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3121 cb->nlh->nlmsg_seq, &drbd_genl_family,
3122 NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
3123 err = -ENOMEM;
3124 if (!dh)
3125 goto out;
3126 dh->minor = -1U;
3127 dh->ret_code = NO_ERROR;
3128 err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
3129 if (err)
3130 goto out;
3131 err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
3132 if (err)
3133 goto out;
3134 resource_to_info(&resource_info, resource);
3135 err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
3136 if (err)
3137 goto out;
3138 resource_statistics.res_stat_write_ordering = resource->write_ordering;
3139 err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
3140 if (err)
3141 goto out;
3142 cb->args[0] = (long)resource;
3143 genlmsg_end(skb, dh);
3144 err = 0;
3145
3146out:
3147 rcu_read_unlock();
3148 if (err)
3149 return err;
3150 return skb->len;
3151}
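
drbd_adm_dump_resources() is a netlink dump callback: it emits at most one resource per invocation and parks its position in cb->args[0] so the next invocation resumes where it stopped. The real code stores a resource pointer there; the model below stores an index, and every name is invented:

#include <stdio.h>

struct dump_cb { long args[3]; };

static const char *resources[] = { "r0", "r1", "r2" };

/* return 1 if an item was emitted, 0 when the dump is finished */
static int dump_one(struct dump_cb *cb)
{
	long idx = cb->args[0];

	if (idx >= (long)(sizeof(resources) / sizeof(resources[0])))
		return 0;
	printf("emit %s\n", resources[idx]);
	cb->args[0] = idx + 1;	/* remember where to resume */
	return 1;
}

int main(void)
{
	struct dump_cb cb = { { 0 } };

	while (dump_one(&cb))	/* the netlink core re-calls until done */
		;
	return 0;
}
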
3152
3153static void device_to_statistics(struct device_statistics *s,
3154 struct drbd_device *device)
3155{
3156 memset(s, 0, sizeof(*s));
3157 s->dev_upper_blocked = !may_inc_ap_bio(device);
3158 if (get_ldev(device)) {
3159 struct drbd_md *md = &device->ldev->md;
3160 u64 *history_uuids = (u64 *)s->history_uuids;
3161 struct request_queue *q;
3162 int n;
3163
3164 spin_lock_irq(&md->uuid_lock);
3165 s->dev_current_uuid = md->uuid[UI_CURRENT];
3166 BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
3167 for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
3168 history_uuids[n] = md->uuid[UI_HISTORY_START + n];
3169 for (; n < HISTORY_UUIDS; n++)
3170 history_uuids[n] = 0;
3171 s->history_uuids_len = HISTORY_UUIDS;
3172 spin_unlock_irq(&md->uuid_lock);
3173
3174 s->dev_disk_flags = md->flags;
3175 q = bdev_get_queue(device->ldev->backing_bdev);
3176 s->dev_lower_blocked =
3177 bdi_congested(&q->backing_dev_info,
3178 (1 << WB_async_congested) |
3179 (1 << WB_sync_congested));
3180 put_ldev(device);
3181 }
3182 s->dev_size = drbd_get_capacity(device->this_bdev);
3183 s->dev_read = device->read_cnt;
3184 s->dev_write = device->writ_cnt;
3185 s->dev_al_writes = device->al_writ_cnt;
3186 s->dev_bm_writes = device->bm_writ_cnt;
3187 s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
3188 s->dev_lower_pending = atomic_read(&device->local_cnt);
3189 s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
3190 s->dev_exposed_data_uuid = device->ed_uuid;
3191}
3192
3193static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
3194{
3195 if (cb->args[0]) {
3196 struct drbd_resource *resource =
3197 (struct drbd_resource *)cb->args[0];
3198 kref_put(&resource->kref, drbd_destroy_resource);
3199 }
3200
3201 return 0;
3202}
3203
3204int drbd_adm_dump_devices_done(struct netlink_callback *cb) {
3205 return put_resource_in_arg0(cb, 7);
3206}
3207
3208static void device_to_info(struct device_info *, struct drbd_device *);
3209
3210int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
3211{
3212 struct nlattr *resource_filter;
3213 struct drbd_resource *resource;
3214 struct drbd_device *uninitialized_var(device);
3215 int minor, err, retcode;
3216 struct drbd_genlmsghdr *dh;
3217 struct device_info device_info;
3218 struct device_statistics device_statistics;
3219 struct idr *idr_to_search;
3220
3221 resource = (struct drbd_resource *)cb->args[0];
3222 if (!cb->args[0] && !cb->args[1]) {
3223 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3224 if (resource_filter) {
3225 retcode = ERR_RES_NOT_KNOWN;
3226 resource = drbd_find_resource(nla_data(resource_filter));
3227 if (!resource)
3228 goto put_result;
3229 cb->args[0] = (long)resource;
3230 }
3231 }
3232
3233 rcu_read_lock();
3234 minor = cb->args[1];
3235 idr_to_search = resource ? &resource->devices : &drbd_devices;
3236 device = idr_get_next(idr_to_search, &minor);
3237 if (!device) {
3238 err = 0;
3239 goto out;
3240 }
3241 idr_for_each_entry_continue(idr_to_search, device, minor) {
3242 retcode = NO_ERROR;
3243 goto put_result; /* only one iteration */
3244 }
3245 err = 0;
3246 goto out; /* no more devices */
3247
3248put_result:
3249 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3250 cb->nlh->nlmsg_seq, &drbd_genl_family,
3251 NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
3252 err = -ENOMEM;
3253 if (!dh)
3254 goto out;
3255 dh->ret_code = retcode;
3256 dh->minor = -1U;
3257 if (retcode == NO_ERROR) {
3258 dh->minor = device->minor;
3259 err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
3260 if (err)
3261 goto out;
3262 if (get_ldev(device)) {
3263 struct disk_conf *disk_conf =
3264 rcu_dereference(device->ldev->disk_conf);
3265
3266 err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
3267 put_ldev(device);
3268 if (err)
3269 goto out;
3270 }
3271 device_to_info(&device_info, device);
3272 err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
3273 if (err)
3274 goto out;
3275
3276 device_to_statistics(&device_statistics, device);
3277 err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
3278 if (err)
3279 goto out;
3280 cb->args[1] = minor + 1;
3281 }
3282 genlmsg_end(skb, dh);
3283 err = 0;
3284
3285out:
3286 rcu_read_unlock();
3287 if (err)
3288 return err;
3289 return skb->len;
3290}
3291
3292int drbd_adm_dump_connections_done(struct netlink_callback *cb)
3293{
3294 return put_resource_in_arg0(cb, 6);
3295}
3296
3297enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
3298
3299int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
3300{
3301 struct nlattr *resource_filter;
3302 struct drbd_resource *resource = NULL, *next_resource;
3303 struct drbd_connection *uninitialized_var(connection);
3304 int err = 0, retcode;
3305 struct drbd_genlmsghdr *dh;
3306 struct connection_info connection_info;
3307 struct connection_statistics connection_statistics;
3308
3309 rcu_read_lock();
3310 resource = (struct drbd_resource *)cb->args[0];
3311 if (!cb->args[0]) {
3312 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3313 if (resource_filter) {
3314 retcode = ERR_RES_NOT_KNOWN;
3315 resource = drbd_find_resource(nla_data(resource_filter));
3316 if (!resource)
3317 goto put_result;
3318 cb->args[0] = (long)resource;
3319 cb->args[1] = SINGLE_RESOURCE;
3320 }
3321 }
3322 if (!resource) {
3323 if (list_empty(&drbd_resources))
3324 goto out;
3325 resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
3326 kref_get(&resource->kref);
3327 cb->args[0] = (long)resource;
3328 cb->args[1] = ITERATE_RESOURCES;
3329 }
3330
3331 next_resource:
3332 rcu_read_unlock();
3333 mutex_lock(&resource->conf_update);
3334 rcu_read_lock();
3335 if (cb->args[2]) {
3336 for_each_connection_rcu(connection, resource)
3337 if (connection == (struct drbd_connection *)cb->args[2])
3338 goto found_connection;
3339 /* connection was probably deleted */
3340 goto no_more_connections;
3341 }
3342 connection = list_entry(&resource->connections, struct drbd_connection, connections);
3343
3344found_connection:
3345 list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
3346 if (!has_net_conf(connection))
3347 continue;
3348 retcode = NO_ERROR;
3349 goto put_result; /* only one iteration */
3350 }
3351
3352no_more_connections:
3353 if (cb->args[1] == ITERATE_RESOURCES) {
3354 for_each_resource_rcu(next_resource, &drbd_resources) {
3355 if (next_resource == resource)
3356 goto found_resource;
3357 }
3358 /* resource was probably deleted */
3359 }
3360 goto out;
3361
3362found_resource:
3363 list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
3364 mutex_unlock(&resource->conf_update);
3365 kref_put(&resource->kref, drbd_destroy_resource);
3366 resource = next_resource;
3367 kref_get(&resource->kref);
3368 cb->args[0] = (long)resource;
3369 cb->args[2] = 0;
3370 goto next_resource;
3371 }
3372 goto out; /* no more resources */
3373
3374put_result:
3375 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3376 cb->nlh->nlmsg_seq, &drbd_genl_family,
3377 NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
3378 err = -ENOMEM;
3379 if (!dh)
3380 goto out;
3381 dh->ret_code = retcode;
3382 dh->minor = -1U;
3383 if (retcode == NO_ERROR) {
3384 struct net_conf *net_conf;
3385
3386 err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
3387 if (err)
3388 goto out;
3389 net_conf = rcu_dereference(connection->net_conf);
3390 if (net_conf) {
3391 err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
3392 if (err)
3393 goto out;
3394 }
3395 connection_to_info(&connection_info, connection);
3396 err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
3397 if (err)
3398 goto out;
3399 connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
3400 err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
3401 if (err)
3402 goto out;
3403 cb->args[2] = (long)connection;
3404 }
3405 genlmsg_end(skb, dh);
3406 err = 0;
3407
3408out:
3409 rcu_read_unlock();
3410 if (resource)
3411 mutex_unlock(&resource->conf_update);
3412 if (err)
3413 return err;
3414 return skb->len;
3415}
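
Both dump loops above prime their cursor with list_entry(&resource->connections, ...), pointing the typed cursor at the list head itself so that list_for_each_entry_continue_rcu() starts at the first real element. A self-contained model of that idiom with a hand-rolled intrusive list; as in the kernel, the "head as fake entry" pointer is never dereferenced beyond its embedded node:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct item { const char *name; struct list_head node; };

int main(void)
{
	struct item a = { "a" }, b = { "b" };
	struct list_head head = { &a.node, &b.node };

	a.node = (struct list_head){ &b.node, &head };
	b.node = (struct list_head){ &head, &a.node };

	/* Prime the cursor with the list head; only its embedded node is used. */
	struct item *cursor = container_of(&head, struct item, node);

	/* "continue" semantics: iterate starting after the cursor. */
	for (struct list_head *p = cursor->node.next; p != &head; p = p->next)
		printf("%s\n", container_of(p, struct item, node)->name);
	return 0;
}
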
3416
3417enum mdf_peer_flag {
3418 MDF_PEER_CONNECTED = 1 << 0,
3419 MDF_PEER_OUTDATED = 1 << 1,
3420 MDF_PEER_FENCING = 1 << 2,
3421 MDF_PEER_FULL_SYNC = 1 << 3,
3422};
3423
3424static void peer_device_to_statistics(struct peer_device_statistics *s,
3425 struct drbd_peer_device *peer_device)
3426{
3427 struct drbd_device *device = peer_device->device;
3428
3429 memset(s, 0, sizeof(*s));
3430 s->peer_dev_received = device->recv_cnt;
3431 s->peer_dev_sent = device->send_cnt;
3432 s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
3433 atomic_read(&device->rs_pending_cnt);
3434 s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
3435 s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
3436 s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
3437 if (get_ldev(device)) {
3438 struct drbd_md *md = &device->ldev->md;
3439
3440 spin_lock_irq(&md->uuid_lock);
3441 s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
3442 spin_unlock_irq(&md->uuid_lock);
3443 s->peer_dev_flags =
3444 (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
3445 MDF_PEER_CONNECTED : 0) +
3446 (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
3447 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
3448 MDF_PEER_OUTDATED : 0) +
3449 /* FIXME: MDF_PEER_FENCING? */
3450 (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
3451 MDF_PEER_FULL_SYNC : 0);
3452 put_ldev(device);
3453 }
3454}
3455
3456int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
3457{
3458 return put_resource_in_arg0(cb, 9);
3459}
3460
3461int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
3462{
3463 struct nlattr *resource_filter;
3464 struct drbd_resource *resource;
3465 struct drbd_device *uninitialized_var(device);
3466 struct drbd_peer_device *peer_device = NULL;
3467 int minor, err, retcode;
3468 struct drbd_genlmsghdr *dh;
3469 struct idr *idr_to_search;
3470
3471 resource = (struct drbd_resource *)cb->args[0];
3472 if (!cb->args[0] && !cb->args[1]) {
3473 resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
3474 if (resource_filter) {
3475 retcode = ERR_RES_NOT_KNOWN;
3476 resource = drbd_find_resource(nla_data(resource_filter));
3477 if (!resource)
3478 goto put_result;
3479 }
3480 cb->args[0] = (long)resource;
3481 }
3482
3483 rcu_read_lock();
3484 minor = cb->args[1];
3485 idr_to_search = resource ? &resource->devices : &drbd_devices;
3486 device = idr_find(idr_to_search, minor);
3487 if (!device) {
3488next_device:
3489 minor++;
3490 cb->args[2] = 0;
3491 device = idr_get_next(idr_to_search, &minor);
3492 if (!device) {
3493 err = 0;
3494 goto out;
3495 }
3496 }
3497 if (cb->args[2]) {
3498 for_each_peer_device(peer_device, device)
3499 if (peer_device == (struct drbd_peer_device *)cb->args[2])
3500 goto found_peer_device;
3501 /* peer device was probably deleted */
3502 goto next_device;
3503 }
3504 /* Make peer_device point to the list head (not the first entry). */
3505 peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
3506
3507found_peer_device:
3508 list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
3509 if (!has_net_conf(peer_device->connection))
3510 continue;
3511 retcode = NO_ERROR;
3512 goto put_result; /* only one iteration */
3513 }
3514 goto next_device;
3515
3516put_result:
3517 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
3518 cb->nlh->nlmsg_seq, &drbd_genl_family,
3519 NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
3520 err = -ENOMEM;
3521 if (!dh)
3522 goto out;
3523 dh->ret_code = retcode;
3524 dh->minor = -1U;
3525 if (retcode == NO_ERROR) {
3526 struct peer_device_info peer_device_info;
3527 struct peer_device_statistics peer_device_statistics;
3528
3529 dh->minor = minor;
3530 err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
3531 if (err)
3532 goto out;
3533 peer_device_to_info(&peer_device_info, peer_device);
3534 err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
3535 if (err)
3536 goto out;
3537 peer_device_to_statistics(&peer_device_statistics, peer_device);
3538 err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
3539 if (err)
3540 goto out;
3541 cb->args[1] = minor;
3542 cb->args[2] = (long)peer_device;
3543 }
3544 genlmsg_end(skb, dh);
3545 err = 0;
3546
3547out:
3548 rcu_read_unlock();
3549 if (err)
3550 return err;
3551 return skb->len;
3552}
3553/*
2913 * Return the connection of @resource if @resource has exactly one connection. 3554 * Return the connection of @resource if @resource has exactly one connection.
2914 */ 3555 */
2915static struct drbd_connection *the_only_connection(struct drbd_resource *resource) 3556static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
@@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx)
3414 return NO_ERROR; 4055 return NO_ERROR;
3415} 4056}
3416 4057
4058static void resource_to_info(struct resource_info *info,
4059 struct drbd_resource *resource)
4060{
4061 info->res_role = conn_highest_role(first_connection(resource));
4062 info->res_susp = resource->susp;
4063 info->res_susp_nod = resource->susp_nod;
4064 info->res_susp_fen = resource->susp_fen;
4065}
4066
3417int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) 4067int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3418{ 4068{
4069 struct drbd_connection *connection;
3419 struct drbd_config_context adm_ctx; 4070 struct drbd_config_context adm_ctx;
3420 enum drbd_ret_code retcode; 4071 enum drbd_ret_code retcode;
3421 struct res_opts res_opts; 4072 struct res_opts res_opts;
@@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3449 } 4100 }
3450 4101
3451 /* not yet safe for genl_family.parallel_ops */ 4102 /* not yet safe for genl_family.parallel_ops */
3452 if (!conn_create(adm_ctx.resource_name, &res_opts)) 4103 mutex_lock(&resources_mutex);
4104 connection = conn_create(adm_ctx.resource_name, &res_opts);
4105 mutex_unlock(&resources_mutex);
4106
4107 if (connection) {
4108 struct resource_info resource_info;
4109
4110 mutex_lock(&notification_mutex);
4111 resource_to_info(&resource_info, connection->resource);
4112 notify_resource_state(NULL, 0, connection->resource,
4113 &resource_info, NOTIFY_CREATE);
4114 mutex_unlock(&notification_mutex);
4115 } else
3453 retcode = ERR_NOMEM; 4116 retcode = ERR_NOMEM;
4117
3454out: 4118out:
3455 drbd_adm_finish(&adm_ctx, info, retcode); 4119 drbd_adm_finish(&adm_ctx, info, retcode);
3456 return 0; 4120 return 0;
3457} 4121}
3458 4122
4123static void device_to_info(struct device_info *info,
4124 struct drbd_device *device)
4125{
4126 info->dev_disk_state = device->state.disk;
4127}
4128
4129
3459int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) 4130int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3460{ 4131{
3461 struct drbd_config_context adm_ctx; 4132 struct drbd_config_context adm_ctx;
@@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3490 4161
3491 mutex_lock(&adm_ctx.resource->adm_mutex); 4162 mutex_lock(&adm_ctx.resource->adm_mutex);
3492 retcode = drbd_create_device(&adm_ctx, dh->minor); 4163 retcode = drbd_create_device(&adm_ctx, dh->minor);
4164 if (retcode == NO_ERROR) {
4165 struct drbd_device *device;
4166 struct drbd_peer_device *peer_device;
4167 struct device_info info;
4168 unsigned int peer_devices = 0;
4169 enum drbd_notification_type flags;
4170
4171 device = minor_to_device(dh->minor);
4172 for_each_peer_device(peer_device, device) {
4173 if (!has_net_conf(peer_device->connection))
4174 continue;
4175 peer_devices++;
4176 }
4177
4178 device_to_info(&info, device);
4179 mutex_lock(&notification_mutex);
4180 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
4181 notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
4182 for_each_peer_device(peer_device, device) {
4183 struct peer_device_info peer_device_info;
4184
4185 if (!has_net_conf(peer_device->connection))
4186 continue;
4187 peer_device_to_info(&peer_device_info, peer_device);
4188 flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
4189 notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
4190 NOTIFY_CREATE | flags);
4191 }
4192 mutex_unlock(&notification_mutex);
4193 }
3493 mutex_unlock(&adm_ctx.resource->adm_mutex); 4194 mutex_unlock(&adm_ctx.resource->adm_mutex);
3494out: 4195out:
3495 drbd_adm_finish(&adm_ctx, info, retcode); 4196 drbd_adm_finish(&adm_ctx, info, retcode);
@@ -3498,13 +4199,35 @@ out:
3498 4199
3499static enum drbd_ret_code adm_del_minor(struct drbd_device *device) 4200static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3500{ 4201{
4202 struct drbd_peer_device *peer_device;
4203
3501 if (device->state.disk == D_DISKLESS && 4204 if (device->state.disk == D_DISKLESS &&
3502 /* no need to be device->state.conn == C_STANDALONE && 4205 /* no need to be device->state.conn == C_STANDALONE &&
3503 * we may want to delete a minor from a live replication group. 4206 * we may want to delete a minor from a live replication group.
3504 */ 4207 */
3505 device->state.role == R_SECONDARY) { 4208 device->state.role == R_SECONDARY) {
4209 struct drbd_connection *connection =
4210 first_connection(device->resource);
4211
3506 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), 4212 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
3507 CS_VERBOSE + CS_WAIT_COMPLETE); 4213 CS_VERBOSE + CS_WAIT_COMPLETE);
4214
4215 /* If the state engine hasn't stopped the sender thread yet, we
4216 * need to flush the sender work queue before generating the
4217 * DESTROY events here. */
4218 if (get_t_state(&connection->worker) == RUNNING)
4219 drbd_flush_workqueue(&connection->sender_work);
4220
4221 mutex_lock(&notification_mutex);
4222 for_each_peer_device(peer_device, device) {
4223 if (!has_net_conf(peer_device->connection))
4224 continue;
4225 notify_peer_device_state(NULL, 0, peer_device, NULL,
4226 NOTIFY_DESTROY | NOTIFY_CONTINUES);
4227 }
4228 notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
4229 mutex_unlock(&notification_mutex);
4230
3508 drbd_delete_device(device); 4231 drbd_delete_device(device);
3509 return NO_ERROR; 4232 return NO_ERROR;
3510 } else 4233 } else
@@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource)
3541 if (!idr_is_empty(&resource->devices)) 4264 if (!idr_is_empty(&resource->devices))
3542 return ERR_RES_IN_USE; 4265 return ERR_RES_IN_USE;
3543 4266
4267 /* The state engine has stopped the sender thread, so we don't
4268 * need to flush the sender work queue before generating the
4269 * DESTROY event here. */
4270 mutex_lock(&notification_mutex);
4271 notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
4272 mutex_unlock(&notification_mutex);
4273
4274 mutex_lock(&resources_mutex);
3544 list_del_rcu(&resource->resources); 4275 list_del_rcu(&resource->resources);
4276 mutex_unlock(&resources_mutex);
3545 /* Make sure all threads have actually stopped: state handling only 4277 /* Make sure all threads have actually stopped: state handling only
3546 * does drbd_thread_stop_nowait(). */ 4278 * does drbd_thread_stop_nowait(). */
3547 list_for_each_entry(connection, &resource->connections, connections) 4279 list_for_each_entry(connection, &resource->connections, connections)
@@ -3637,7 +4369,6 @@ finish:
3637 4369
3638void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) 4370void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3639{ 4371{
3640 static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3641 struct sk_buff *msg; 4372 struct sk_buff *msg;
3642 struct drbd_genlmsghdr *d_out; 4373 struct drbd_genlmsghdr *d_out;
3643 unsigned seq; 4374 unsigned seq;
@@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3658 if (nla_put_status_info(msg, device, sib)) 4389 if (nla_put_status_info(msg, device, sib))
3659 goto nla_put_failure; 4390 goto nla_put_failure;
3660 genlmsg_end(msg, d_out); 4391 genlmsg_end(msg, d_out);
3661 err = drbd_genl_multicast_events(msg, 0); 4392 err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
3662 /* msg has been consumed or freed in netlink_broadcast() */ 4393 /* msg has been consumed or freed in netlink_broadcast() */
3663 if (err && err != -ESRCH) 4394 if (err && err != -ESRCH)
3664 goto failed; 4395 goto failed;
@@ -3672,3 +4403,405 @@ failed:
3672 "Event seq:%u sib_reason:%u\n", 4403 "Event seq:%u sib_reason:%u\n",
3673 err, seq, sib->sib_reason); 4404 err, seq, sib->sib_reason);
3674} 4405}
4406
4407static int nla_put_notification_header(struct sk_buff *msg,
4408 enum drbd_notification_type type)
4409{
4410 struct drbd_notification_header nh = {
4411 .nh_type = type,
4412 };
4413
4414 return drbd_notification_header_to_skb(msg, &nh, true);
4415}
4416
4417void notify_resource_state(struct sk_buff *skb,
4418 unsigned int seq,
4419 struct drbd_resource *resource,
4420 struct resource_info *resource_info,
4421 enum drbd_notification_type type)
4422{
4423 struct resource_statistics resource_statistics;
4424 struct drbd_genlmsghdr *dh;
4425 bool multicast = false;
4426 int err;
4427
4428 if (!skb) {
4429 seq = atomic_inc_return(&notify_genl_seq);
4430 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4431 err = -ENOMEM;
4432 if (!skb)
4433 goto failed;
4434 multicast = true;
4435 }
4436
4437 err = -EMSGSIZE;
4438 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
4439 if (!dh)
4440 goto nla_put_failure;
4441 dh->minor = -1U;
4442 dh->ret_code = NO_ERROR;
4443 if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
4444 nla_put_notification_header(skb, type) ||
4445 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4446 resource_info_to_skb(skb, resource_info, true)))
4447 goto nla_put_failure;
4448 resource_statistics.res_stat_write_ordering = resource->write_ordering;
4449 err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
4450 if (err)
4451 goto nla_put_failure;
4452 genlmsg_end(skb, dh);
4453 if (multicast) {
4454 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4455 /* skb has been consumed or freed in netlink_broadcast() */
4456 if (err && err != -ESRCH)
4457 goto failed;
4458 }
4459 return;
4460
4461nla_put_failure:
4462 nlmsg_free(skb);
4463failed:
4464 drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
4465 err, seq);
4466}
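
notify_resource_state() and the three siblings that follow serve two callers: with skb == NULL they allocate a fresh message and multicast it (spontaneous state events), while with a caller-supplied skb they append to a dump reply that is already being built. A small model of that broadcast-or-append dual use, with no netlink involved and every name invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void notify_state(char *reply_buf, size_t len, const char *event)
{
	char *buf = reply_buf;
	int multicast = 0;

	if (!buf) {			/* standalone event: build our own buffer */
		len = 128;
		buf = malloc(len);
		if (!buf)
			return;
		buf[0] = '\0';
		multicast = 1;
	}
	strncat(buf, event, len - strlen(buf) - 1);

	if (multicast) {		/* ...and broadcast plus free it ourselves */
		printf("multicast: %s\n", buf);
		free(buf);
	}				/* else: the caller owns the reply buffer */
}

int main(void)
{
	char reply[64] = "dump: ";

	notify_state(NULL, 0, "resource created");		/* event path */
	notify_state(reply, sizeof(reply), "resource exists");	/* dump path */
	printf("%s\n", reply);
	return 0;
}
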
4467
4468void notify_device_state(struct sk_buff *skb,
4469 unsigned int seq,
4470 struct drbd_device *device,
4471 struct device_info *device_info,
4472 enum drbd_notification_type type)
4473{
4474 struct device_statistics device_statistics;
4475 struct drbd_genlmsghdr *dh;
4476 bool multicast = false;
4477 int err;
4478
4479 if (!skb) {
4480 seq = atomic_inc_return(&notify_genl_seq);
4481 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4482 err = -ENOMEM;
4483 if (!skb)
4484 goto failed;
4485 multicast = true;
4486 }
4487
4488 err = -EMSGSIZE;
4489 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
4490 if (!dh)
4491 goto nla_put_failure;
4492 dh->minor = device->minor;
4493 dh->ret_code = NO_ERROR;
4494 if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
4495 nla_put_notification_header(skb, type) ||
4496 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4497 device_info_to_skb(skb, device_info, true)))
4498 goto nla_put_failure;
4499 device_to_statistics(&device_statistics, device);
4500 device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
4501 genlmsg_end(skb, dh);
4502 if (multicast) {
4503 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4504 /* skb has been consumed or freed in netlink_broadcast() */
4505 if (err && err != -ESRCH)
4506 goto failed;
4507 }
4508 return;
4509
4510nla_put_failure:
4511 nlmsg_free(skb);
4512failed:
4513 drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
4514 err, seq);
4515}
4516
4517void notify_connection_state(struct sk_buff *skb,
4518 unsigned int seq,
4519 struct drbd_connection *connection,
4520 struct connection_info *connection_info,
4521 enum drbd_notification_type type)
4522{
4523 struct connection_statistics connection_statistics;
4524 struct drbd_genlmsghdr *dh;
4525 bool multicast = false;
4526 int err;
4527
4528 if (!skb) {
4529 seq = atomic_inc_return(&notify_genl_seq);
4530 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4531 err = -ENOMEM;
4532 if (!skb)
4533 goto failed;
4534 multicast = true;
4535 }
4536
4537 err = -EMSGSIZE;
4538 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
4539 if (!dh)
4540 goto nla_put_failure;
4541 dh->minor = -1U;
4542 dh->ret_code = NO_ERROR;
4543 if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
4544 nla_put_notification_header(skb, type) ||
4545 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4546 connection_info_to_skb(skb, connection_info, true)))
4547 goto nla_put_failure;
4548 connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
4549 connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
4550 genlmsg_end(skb, dh);
4551 if (multicast) {
4552 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4553 /* skb has been consumed or freed in netlink_broadcast() */
4554 if (err && err != -ESRCH)
4555 goto failed;
4556 }
4557 return;
4558
4559nla_put_failure:
4560 nlmsg_free(skb);
4561failed:
4562 drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
4563 err, seq);
4564}
4565
4566void notify_peer_device_state(struct sk_buff *skb,
4567 unsigned int seq,
4568 struct drbd_peer_device *peer_device,
4569 struct peer_device_info *peer_device_info,
4570 enum drbd_notification_type type)
4571{
4572 struct peer_device_statistics peer_device_statistics;
4573 struct drbd_resource *resource = peer_device->device->resource;
4574 struct drbd_genlmsghdr *dh;
4575 bool multicast = false;
4576 int err;
4577
4578 if (!skb) {
4579 seq = atomic_inc_return(&notify_genl_seq);
4580 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4581 err = -ENOMEM;
4582 if (!skb)
4583 goto failed;
4584 multicast = true;
4585 }
4586
4587 err = -EMSGSIZE;
4588 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
4589 if (!dh)
4590 goto nla_put_failure;
4591 dh->minor = -1U;
4592 dh->ret_code = NO_ERROR;
4593 if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
4594 nla_put_notification_header(skb, type) ||
4595 ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
4596 peer_device_info_to_skb(skb, peer_device_info, true)))
4597 goto nla_put_failure;
4598 peer_device_to_statistics(&peer_device_statistics, peer_device);
4599 peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
4600 genlmsg_end(skb, dh);
4601 if (multicast) {
4602 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4603 /* skb has been consumed or freed in netlink_broadcast() */
4604 if (err && err != -ESRCH)
4605 goto failed;
4606 }
4607 return;
4608
4609nla_put_failure:
4610 nlmsg_free(skb);
4611failed:
4612 drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
4613 err, seq);
4614}
4615
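
The three notify_*_state helpers above share one skeleton; the only differences are the genetlink command, the cfg-context arguments, and the per-object info/statistics attributes. As a reader's aid (not a function in the patch), here is that skeleton condensed into one hypothetical routine, using notify_device_state() as the model and the drbd-internal helpers already declared above:

	static void notify_object_state_skeleton(struct sk_buff *skb, unsigned int seq,
						 struct drbd_device *device,
						 enum drbd_notification_type type)
	{
		struct drbd_genlmsghdr *dh;
		bool multicast = false;
		int err = -ENOMEM;

		if (!skb) {		/* unsolicited event: build and broadcast our own skb */
			seq = atomic_inc_return(&notify_genl_seq);
			skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
			if (!skb)
				goto failed;
			multicast = true;
		}			/* else: we are filling in someone else's dump reply */

		err = -EMSGSIZE;
		dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
		if (!dh)
			goto nla_put_failure;
		dh->minor = device->minor;	/* -1U for objects without a minor */
		dh->ret_code = NO_ERROR;
		/* nla_put_drbd_cfg_context(), nla_put_notification_header(),
		 * the object info (skipped for NOTIFY_DESTROY) and the
		 * statistics attributes go here, as in the functions above. */
		genlmsg_end(skb, dh);
		if (multicast) {
			err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
			if (err && err != -ESRCH)	/* -ESRCH: nobody listening */
				goto failed;
		}
		return;

	nla_put_failure:
		nlmsg_free(skb);
	failed:
		drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", err, seq);
	}

The skb == NULL case is the "spontaneous state change" broadcast path; the non-NULL case is used when these helpers are called from the initial-state dump further down.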
4616void notify_helper(enum drbd_notification_type type,
4617 struct drbd_device *device, struct drbd_connection *connection,
4618 const char *name, int status)
4619{
4620 struct drbd_resource *resource = device ? device->resource : connection->resource;
4621 struct drbd_helper_info helper_info;
4622 unsigned int seq = atomic_inc_return(&notify_genl_seq);
4623 struct sk_buff *skb = NULL;
4624 struct drbd_genlmsghdr *dh;
4625 int err;
4626
4627 strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
4628 helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
4629 helper_info.helper_status = status;
4630
4631 skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
4632 err = -ENOMEM;
4633 if (!skb)
4634 goto fail;
4635
4636 err = -EMSGSIZE;
4637 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
4638 if (!dh)
4639 goto fail;
4640 dh->minor = device ? device->minor : -1;
4641 dh->ret_code = NO_ERROR;
4642 mutex_lock(&notification_mutex);
4643 if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
4644 nla_put_notification_header(skb, type) ||
4645 drbd_helper_info_to_skb(skb, &helper_info, true))
4646 goto unlock_fail;
4647 genlmsg_end(skb, dh);
4648 err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
4649 skb = NULL;
4650 /* skb has been consumed or freed in netlink_broadcast() */
4651 if (err && err != -ESRCH)
4652 goto unlock_fail;
4653 mutex_unlock(&notification_mutex);
4654 return;
4655
4656unlock_fail:
4657 mutex_unlock(&notification_mutex);
4658fail:
4659 nlmsg_free(skb);
4660 drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
4661 err, seq);
4662}
4663
4664static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
4665{
4666 struct drbd_genlmsghdr *dh;
4667 int err;
4668
4669 err = -EMSGSIZE;
4670 dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
4671 if (!dh)
4672 goto nla_put_failure;
4673 dh->minor = -1U;
4674 dh->ret_code = NO_ERROR;
4675 if (nla_put_notification_header(skb, NOTIFY_EXISTS))
4676 goto nla_put_failure;
4677 genlmsg_end(skb, dh);
4678 return;
4679
4680nla_put_failure:
4681 nlmsg_free(skb);
4682 pr_err("Error %d sending event. Event seq:%u\n", err, seq);
4683}
4684
4685static void free_state_changes(struct list_head *list)
4686{
4687 while (!list_empty(list)) {
4688 struct drbd_state_change *state_change =
4689 list_first_entry(list, struct drbd_state_change, list);
4690 list_del(&state_change->list);
4691 forget_state_change(state_change);
4692 }
4693}
4694
4695static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
4696{
4697 return 1 +
4698 state_change->n_connections +
4699 state_change->n_devices +
4700 state_change->n_devices * state_change->n_connections;
4701}
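
notifications_for_state_change() counts one resource event plus one event per connection, one per device, and one per (device, connection) pair. For example, 2 connections and 3 devices yield 1 + 2 + 3 + 2*3 = 12 notifications. A stand-alone arithmetic check (user-space, hypothetical name):

	#include <stdio.h>

	/* Mirrors notifications_for_state_change(): resource + connections +
	 * devices + peer devices (one per device/connection pair). */
	static unsigned int example_notification_count(unsigned int n_connections,
						       unsigned int n_devices)
	{
		return 1 + n_connections + n_devices + n_devices * n_connections;
	}

	int main(void)
	{
		/* 2 connections, 3 devices -> 1 + 2 + 3 + 6 = 12 dump entries */
		printf("%u\n", example_notification_count(2, 3));
		return 0;
	}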
4702
4703static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
4704{
4705 struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
4706 unsigned int seq = cb->args[2];
4707 unsigned int n;
4708 enum drbd_notification_type flags = 0;
4709
4710 /* There is no need for taking notification_mutex here: it doesn't
 4711 matter if the initial state events mix with later state change
4712 events; we can always tell the events apart by the NOTIFY_EXISTS
4713 flag. */
4714
4715 cb->args[5]--;
4716 if (cb->args[5] == 1) {
4717 notify_initial_state_done(skb, seq);
4718 goto out;
4719 }
4720 n = cb->args[4]++;
4721 if (cb->args[4] < cb->args[3])
4722 flags |= NOTIFY_CONTINUES;
4723 if (n < 1) {
4724 notify_resource_state_change(skb, seq, state_change->resource,
4725 NOTIFY_EXISTS | flags);
4726 goto next;
4727 }
4728 n--;
4729 if (n < state_change->n_connections) {
4730 notify_connection_state_change(skb, seq, &state_change->connections[n],
4731 NOTIFY_EXISTS | flags);
4732 goto next;
4733 }
4734 n -= state_change->n_connections;
4735 if (n < state_change->n_devices) {
4736 notify_device_state_change(skb, seq, &state_change->devices[n],
4737 NOTIFY_EXISTS | flags);
4738 goto next;
4739 }
4740 n -= state_change->n_devices;
4741 if (n < state_change->n_devices * state_change->n_connections) {
4742 notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
4743 NOTIFY_EXISTS | flags);
4744 goto next;
4745 }
4746
4747next:
4748 if (cb->args[4] == cb->args[3]) {
4749 struct drbd_state_change *next_state_change =
4750 list_entry(state_change->list.next,
4751 struct drbd_state_change, list);
4752 cb->args[0] = (long)next_state_change;
4753 cb->args[3] = notifications_for_state_change(next_state_change);
4754 cb->args[4] = 0;
4755 }
4756out:
4757 return skb->len;
4758}
4759
4760int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
4761{
4762 struct drbd_resource *resource;
4763 LIST_HEAD(head);
4764
4765 if (cb->args[5] >= 1) {
4766 if (cb->args[5] > 1)
4767 return get_initial_state(skb, cb);
4768 if (cb->args[0]) {
4769 struct drbd_state_change *state_change =
4770 (struct drbd_state_change *)cb->args[0];
4771
4772 /* connect list to head */
4773 list_add(&head, &state_change->list);
4774 free_state_changes(&head);
4775 }
4776 return 0;
4777 }
4778
4779 cb->args[5] = 2; /* number of iterations */
4780 mutex_lock(&resources_mutex);
4781 for_each_resource(resource, &drbd_resources) {
4782 struct drbd_state_change *state_change;
4783
4784 state_change = remember_old_state(resource, GFP_KERNEL);
4785 if (!state_change) {
4786 if (!list_empty(&head))
4787 free_state_changes(&head);
4788 mutex_unlock(&resources_mutex);
4789 return -ENOMEM;
4790 }
4791 copy_old_to_new_state_change(state_change);
4792 list_add_tail(&state_change->list, &head);
4793 cb->args[5] += notifications_for_state_change(state_change);
4794 }
4795 mutex_unlock(&resources_mutex);
4796
4797 if (!list_empty(&head)) {
4798 struct drbd_state_change *state_change =
4799 list_entry(head.next, struct drbd_state_change, list);
4800 cb->args[0] = (long)state_change;
4801 cb->args[3] = notifications_for_state_change(state_change);
4802 list_del(&head); /* detach list from head */
4803 }
4804
4805 cb->args[2] = cb->nlh->nlmsg_seq;
4806 return get_initial_state(skb, cb);
4807}
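
As a reader's aid (not part of the patch): drbd_adm_get_initial_state() and get_initial_state() communicate through the scratch slots of struct netlink_callback. The enum below only names the slot assignments visible in the two functions above; the identifiers are illustrative, not from the source.

	/* cb->args[] usage in the initial-state dump (names are hypothetical) */
	enum {
		ARG_STATE_CHANGE = 0,	/* current struct drbd_state_change *, cast to long */
		ARG_SEQ          = 2,	/* nlmsg_seq copied from the dump request */
		ARG_N_IN_CHANGE  = 3,	/* notifications in the current state change */
		ARG_NEXT_INDEX   = 4,	/* next notification to emit within it */
		ARG_REMAINING    = 5,	/* starts at 2 + total notifications; one event is
					 * emitted per dump call, DRBD_INITIAL_STATE_DONE is
					 * sent when it reaches 1, and the following call
					 * frees the snapshot and ends the dump */
	};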
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 3b10fa6cb039..6537b25db9c1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
245 char wp; 245 char wp;
246 246
247 static char write_ordering_chars[] = { 247 static char write_ordering_chars[] = {
248 [WO_none] = 'n', 248 [WO_NONE] = 'n',
249 [WO_drain_io] = 'd', 249 [WO_DRAIN_IO] = 'd',
250 [WO_bdev_flush] = 'f', 250 [WO_BDEV_FLUSH] = 'f',
251 }; 251 };
252 252
253 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", 253 seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 2da9104a3851..ef9245363dcc 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -23,7 +23,7 @@ enum drbd_packet {
23 P_AUTH_RESPONSE = 0x11, 23 P_AUTH_RESPONSE = 0x11,
24 P_STATE_CHG_REQ = 0x12, 24 P_STATE_CHG_REQ = 0x12,
25 25
26 /* asender (meta socket */ 26 /* (meta socket) */
27 P_PING = 0x13, 27 P_PING = 0x13,
28 P_PING_ACK = 0x14, 28 P_PING_ACK = 0x14,
29 P_RECV_ACK = 0x15, /* Used in protocol B */ 29 P_RECV_ACK = 0x15, /* Used in protocol B */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b4b5680ac6ad..1957fe8601dc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
215 } 215 }
216} 216}
217 217
218static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) 218static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
219{ 219{
220 LIST_HEAD(reclaimed); 220 LIST_HEAD(reclaimed);
221 struct drbd_peer_request *peer_req, *t; 221 struct drbd_peer_request *peer_req, *t;
@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
223 spin_lock_irq(&device->resource->req_lock); 223 spin_lock_irq(&device->resource->req_lock);
224 reclaim_finished_net_peer_reqs(device, &reclaimed); 224 reclaim_finished_net_peer_reqs(device, &reclaimed);
225 spin_unlock_irq(&device->resource->req_lock); 225 spin_unlock_irq(&device->resource->req_lock);
226
227 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) 226 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
228 drbd_free_net_peer_req(device, peer_req); 227 drbd_free_net_peer_req(device, peer_req);
229} 228}
230 229
230static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
231{
232 struct drbd_peer_device *peer_device;
233 int vnr;
234
235 rcu_read_lock();
236 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
237 struct drbd_device *device = peer_device->device;
238 if (!atomic_read(&device->pp_in_use_by_net))
239 continue;
240
241 kref_get(&device->kref);
242 rcu_read_unlock();
243 drbd_reclaim_net_peer_reqs(device);
244 kref_put(&device->kref, drbd_destroy_device);
245 rcu_read_lock();
246 }
247 rcu_read_unlock();
248}
249
231/** 250/**
232 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) 251 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
233 * @device: DRBD device. 252 * @device: DRBD device.
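
The new conn_reclaim_net_peer_reqs() above uses the usual construct for calling a possibly sleeping function from inside an RCU-protected IDR walk; a short annotation of that pattern, not additional code:

	/*
	 * Pattern in conn_reclaim_net_peer_reqs():
	 *   - the peer_devices IDR is walked under rcu_read_lock(), so entries
	 *     cannot disappear while we inspect them, but we must not sleep;
	 *   - drbd_reclaim_net_peer_reqs() takes req_lock and may sleep, so the
	 *     loop first pins the device with kref_get(), drops the RCU read
	 *     lock, does the reclaim, drops the reference with kref_put(), and
	 *     re-acquires rcu_read_lock() before continuing the walk.
	 */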
@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
265 if (atomic_read(&device->pp_in_use) < mxb) 284 if (atomic_read(&device->pp_in_use) < mxb)
266 page = __drbd_alloc_pages(device, number); 285 page = __drbd_alloc_pages(device, number);
267 286
287 /* Try to keep the fast path fast, but occasionally we need
 288 * to reclaim the pages we lent to the network stack. */
289 if (page && atomic_read(&device->pp_in_use_by_net) > 512)
290 drbd_reclaim_net_peer_reqs(device);
291
268 while (page == NULL) { 292 while (page == NULL) {
269 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 293 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
270 294
271 drbd_kick_lo_and_reclaim_net(device); 295 drbd_reclaim_net_peer_reqs(device);
272 296
273 if (atomic_read(&device->pp_in_use) < mxb) { 297 if (atomic_read(&device->pp_in_use) < mxb) {
274 page = __drbd_alloc_pages(device, number); 298 page = __drbd_alloc_pages(device, number);
@@ -1099,7 +1123,15 @@ randomize:
1099 return 0; 1123 return 0;
1100 } 1124 }
1101 1125
1102 drbd_thread_start(&connection->asender); 1126 drbd_thread_start(&connection->ack_receiver);
1127 /* opencoded create_singlethread_workqueue(),
1128 * to be able to use format string arguments */
1129 connection->ack_sender =
1130 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
1131 if (!connection->ack_sender) {
1132 drbd_err(connection, "Failed to create workqueue ack_sender\n");
1133 return 0;
1134 }
1103 1135
1104 mutex_lock(&connection->resource->conf_update); 1136 mutex_lock(&connection->resource->conf_update);
1105 /* The discard_my_data flag is a single-shot modifier to the next 1137 /* The discard_my_data flag is a single-shot modifier to the next
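
The hunk above replaces the asender thread start with an ack_receiver thread plus a per-connection ordered workqueue. A minimal, in-kernel-context illustration of the workqueue call it uses (the example_ function name is not from the patch):

	#include <linux/workqueue.h>

	/* Equivalent of create_singlethread_workqueue(), but with a printf-style
	 * name so the resource name shows up in the workqueue/worker names.
	 * WQ_MEM_RECLAIM: the queue sits in the block I/O completion path and
	 * must be able to make forward progress under memory pressure. */
	static struct workqueue_struct *example_alloc_ack_sender(const char *resource_name)
	{
		return alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, resource_name);
	}

The matching destroy_workqueue() for connection->ack_sender is added in conn_disconnect() further down in this patch.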
@@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection)
1178 struct drbd_peer_device *peer_device; 1210 struct drbd_peer_device *peer_device;
1179 int vnr; 1211 int vnr;
1180 1212
1181 if (connection->resource->write_ordering >= WO_bdev_flush) { 1213 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1182 rcu_read_lock(); 1214 rcu_read_lock();
1183 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1215 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1184 struct drbd_device *device = peer_device->device; 1216 struct drbd_device *device = peer_device->device;
@@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection)
1203 /* would rather check on EOPNOTSUPP, but that is not reliable. 1235 /* would rather check on EOPNOTSUPP, but that is not reliable.
1204 * don't try again for ANY return value != 0 1236 * don't try again for ANY return value != 0
1205 * if (rv == -EOPNOTSUPP) */ 1237 * if (rv == -EOPNOTSUPP) */
1206 drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); 1238 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1207 } 1239 }
1208 put_ldev(device); 1240 put_ldev(device);
1209 kref_put(&device->kref, drbd_destroy_device); 1241 kref_put(&device->kref, drbd_destroy_device);
@@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1299 1331
1300 dc = rcu_dereference(bdev->disk_conf); 1332 dc = rcu_dereference(bdev->disk_conf);
1301 1333
1302 if (wo == WO_bdev_flush && !dc->disk_flushes) 1334 if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1303 wo = WO_drain_io; 1335 wo = WO_DRAIN_IO;
1304 if (wo == WO_drain_io && !dc->disk_drain) 1336 if (wo == WO_DRAIN_IO && !dc->disk_drain)
1305 wo = WO_none; 1337 wo = WO_NONE;
1306 1338
1307 return wo; 1339 return wo;
1308} 1340}
@@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1319 enum write_ordering_e pwo; 1351 enum write_ordering_e pwo;
1320 int vnr; 1352 int vnr;
1321 static char *write_ordering_str[] = { 1353 static char *write_ordering_str[] = {
1322 [WO_none] = "none", 1354 [WO_NONE] = "none",
1323 [WO_drain_io] = "drain", 1355 [WO_DRAIN_IO] = "drain",
1324 [WO_bdev_flush] = "flush", 1356 [WO_BDEV_FLUSH] = "flush",
1325 }; 1357 };
1326 1358
1327 pwo = resource->write_ordering; 1359 pwo = resource->write_ordering;
1328 if (wo != WO_bdev_flush) 1360 if (wo != WO_BDEV_FLUSH)
1329 wo = min(pwo, wo); 1361 wo = min(pwo, wo);
1330 rcu_read_lock(); 1362 rcu_read_lock();
1331 idr_for_each_entry(&resource->devices, device, vnr) { 1363 idr_for_each_entry(&resource->devices, device, vnr) {
@@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1343 rcu_read_unlock(); 1375 rcu_read_unlock();
1344 1376
1345 resource->write_ordering = wo; 1377 resource->write_ordering = wo;
1346 if (pwo != resource->write_ordering || wo == WO_bdev_flush) 1378 if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1347 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1379 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1348} 1380}
1349 1381
@@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
1380 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { 1412 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
1381 /* wait for all pending IO completions, before we start 1413 /* wait for all pending IO completions, before we start
1382 * zeroing things out. */ 1414 * zeroing things out. */
1383 conn_wait_active_ee_empty(first_peer_device(device)->connection); 1415 conn_wait_active_ee_empty(peer_req->peer_device->connection);
1384 /* add it to the active list now, 1416 /* add it to the active list now,
1385 * so we can find it to present it in debugfs */ 1417 * so we can find it to present it in debugfs */
1386 peer_req->submit_jif = jiffies; 1418 peer_req->submit_jif = jiffies;
@@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1508 rcu_read_unlock(); 1540 rcu_read_unlock();
1509} 1541}
1510 1542
1511static struct drbd_peer_device *
1512conn_peer_device(struct drbd_connection *connection, int volume_number)
1513{
1514 return idr_find(&connection->peer_devices, volume_number);
1515}
1516
1517static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) 1543static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1518{ 1544{
1519 int rv; 1545 int rv;
@@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1533 * Therefore we must send the barrier_ack after the barrier request was 1559 * Therefore we must send the barrier_ack after the barrier request was
1534 * completed. */ 1560 * completed. */
1535 switch (connection->resource->write_ordering) { 1561 switch (connection->resource->write_ordering) {
1536 case WO_none: 1562 case WO_NONE:
1537 if (rv == FE_RECYCLED) 1563 if (rv == FE_RECYCLED)
1538 return 0; 1564 return 0;
1539 1565
@@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1546 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); 1572 drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1547 /* Fall through */ 1573 /* Fall through */
1548 1574
1549 case WO_bdev_flush: 1575 case WO_BDEV_FLUSH:
1550 case WO_drain_io: 1576 case WO_DRAIN_IO:
1551 conn_wait_active_ee_empty(connection); 1577 conn_wait_active_ee_empty(connection);
1552 drbd_flush(connection); 1578 drbd_flush(connection);
1553 1579
@@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
1752} 1778}
1753 1779
1754/* 1780/*
1755 * e_end_resync_block() is called in asender context via 1781 * e_end_resync_block() is called in ack_sender context via
1756 * drbd_finish_peer_reqs(). 1782 * drbd_finish_peer_reqs().
1757 */ 1783 */
1758static int e_end_resync_block(struct drbd_work *w, int unused) 1784static int e_end_resync_block(struct drbd_work *w, int unused)
@@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
1926} 1952}
1927 1953
1928/* 1954/*
1929 * e_end_block() is called in asender context via drbd_finish_peer_reqs(). 1955 * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
1930 */ 1956 */
1931static int e_end_block(struct drbd_work *w, int cancel) 1957static int e_end_block(struct drbd_work *w, int cancel)
1932{ 1958{
@@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
1966 } else 1992 } else
1967 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 1993 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1968 1994
1969 drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); 1995 drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1970 1996
1971 return err; 1997 return err;
1972} 1998}
@@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
2098 } 2124 }
2099 2125
2100 rcu_read_lock(); 2126 rcu_read_lock();
2101 tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; 2127 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2102 rcu_read_unlock(); 2128 rcu_read_unlock();
2103 2129
2104 if (!tp) 2130 if (!tp)
@@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device,
2217 peer_req->w.cb = superseded ? e_send_superseded : 2243 peer_req->w.cb = superseded ? e_send_superseded :
2218 e_send_retry_write; 2244 e_send_retry_write;
2219 list_add_tail(&peer_req->w.list, &device->done_ee); 2245 list_add_tail(&peer_req->w.list, &device->done_ee);
2220 wake_asender(connection); 2246 queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
2221 2247
2222 err = -ENOENT; 2248 err = -ENOENT;
2223 goto out; 2249 goto out;
@@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2364 if (dp_flags & DP_SEND_RECEIVE_ACK) { 2390 if (dp_flags & DP_SEND_RECEIVE_ACK) {
2365 /* I really don't like it that the receiver thread 2391 /* I really don't like it that the receiver thread
2366 * sends on the msock, but anyways */ 2392 * sends on the msock, but anyways */
2367 drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); 2393 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
2368 } 2394 }
2369 2395
2370 if (tp) { 2396 if (tp) {
@@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
4056 os = ns = drbd_read_state(device); 4082 os = ns = drbd_read_state(device);
4057 spin_unlock_irq(&device->resource->req_lock); 4083 spin_unlock_irq(&device->resource->req_lock);
4058 4084
4059 /* If some other part of the code (asender thread, timeout) 4085 /* If some other part of the code (ack_receiver thread, timeout)
4060 * already decided to close the connection again, 4086 * already decided to close the connection again,
4061 * we must not "re-establish" it here. */ 4087 * we must not "re-establish" it here. */
4062 if (os.conn <= C_TEAR_DOWN) 4088 if (os.conn <= C_TEAR_DOWN)
@@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection)
4661 */ 4687 */
4662 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); 4688 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4663 4689
4664 /* asender does not clean up anything. it must not interfere, either */ 4690 /* ack_receiver does not clean up anything. it must not interfere, either */
4665 drbd_thread_stop(&connection->asender); 4691 drbd_thread_stop(&connection->ack_receiver);
4692 if (connection->ack_sender) {
4693 destroy_workqueue(connection->ack_sender);
4694 connection->ack_sender = NULL;
4695 }
4666 drbd_free_sock(connection); 4696 drbd_free_sock(connection);
4667 4697
4668 rcu_read_lock(); 4698 rcu_read_lock();
@@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
5431 return 0; 5461 return 0;
5432} 5462}
5433 5463
5434static int connection_finish_peer_reqs(struct drbd_connection *connection) 5464struct meta_sock_cmd {
5465 size_t pkt_size;
5466 int (*fn)(struct drbd_connection *connection, struct packet_info *);
5467};
5468
5469static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5435{ 5470{
5436 struct drbd_peer_device *peer_device; 5471 long t;
5437 int vnr, not_empty = 0; 5472 struct net_conf *nc;
5438 5473
5439 do { 5474 rcu_read_lock();
5440 clear_bit(SIGNAL_ASENDER, &connection->flags); 5475 nc = rcu_dereference(connection->net_conf);
5441 flush_signals(current); 5476 t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5477 rcu_read_unlock();
5442 5478
5443 rcu_read_lock(); 5479 t *= HZ;
5444 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5480 if (ping_timeout)
5445 struct drbd_device *device = peer_device->device; 5481 t /= 10;
5446 kref_get(&device->kref);
5447 rcu_read_unlock();
5448 if (drbd_finish_peer_reqs(device)) {
5449 kref_put(&device->kref, drbd_destroy_device);
5450 return 1;
5451 }
5452 kref_put(&device->kref, drbd_destroy_device);
5453 rcu_read_lock();
5454 }
5455 set_bit(SIGNAL_ASENDER, &connection->flags);
5456 5482
5457 spin_lock_irq(&connection->resource->req_lock); 5483 connection->meta.socket->sk->sk_rcvtimeo = t;
5458 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 5484}
5459 struct drbd_device *device = peer_device->device;
5460 not_empty = !list_empty(&device->done_ee);
5461 if (not_empty)
5462 break;
5463 }
5464 spin_unlock_irq(&connection->resource->req_lock);
5465 rcu_read_unlock();
5466 } while (not_empty);
5467 5485
5468 return 0; 5486static void set_ping_timeout(struct drbd_connection *connection)
5487{
5488 set_rcvtimeo(connection, 1);
5469} 5489}
5470 5490
5471struct asender_cmd { 5491static void set_idle_timeout(struct drbd_connection *connection)
5472 size_t pkt_size; 5492{
5473 int (*fn)(struct drbd_connection *connection, struct packet_info *); 5493 set_rcvtimeo(connection, 0);
5474}; 5494}
5475 5495
5476static struct asender_cmd asender_tbl[] = { 5496static struct meta_sock_cmd ack_receiver_tbl[] = {
5477 [P_PING] = { 0, got_Ping }, 5497 [P_PING] = { 0, got_Ping },
5478 [P_PING_ACK] = { 0, got_PingAck }, 5498 [P_PING_ACK] = { 0, got_PingAck },
5479 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, 5499 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = {
5493 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, 5513 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
5494}; 5514};
5495 5515
5496int drbd_asender(struct drbd_thread *thi) 5516int drbd_ack_receiver(struct drbd_thread *thi)
5497{ 5517{
5498 struct drbd_connection *connection = thi->connection; 5518 struct drbd_connection *connection = thi->connection;
5499 struct asender_cmd *cmd = NULL; 5519 struct meta_sock_cmd *cmd = NULL;
5500 struct packet_info pi; 5520 struct packet_info pi;
5521 unsigned long pre_recv_jif;
5501 int rv; 5522 int rv;
5502 void *buf = connection->meta.rbuf; 5523 void *buf = connection->meta.rbuf;
5503 int received = 0; 5524 int received = 0;
5504 unsigned int header_size = drbd_header_size(connection); 5525 unsigned int header_size = drbd_header_size(connection);
5505 int expect = header_size; 5526 int expect = header_size;
5506 bool ping_timeout_active = false; 5527 bool ping_timeout_active = false;
5507 struct net_conf *nc;
5508 int ping_timeo, tcp_cork, ping_int;
5509 struct sched_param param = { .sched_priority = 2 }; 5528 struct sched_param param = { .sched_priority = 2 };
5510 5529
5511 rv = sched_setscheduler(current, SCHED_RR, &param); 5530 rv = sched_setscheduler(current, SCHED_RR, &param);
5512 if (rv < 0) 5531 if (rv < 0)
5513 drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); 5532 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
5514 5533
5515 while (get_t_state(thi) == RUNNING) { 5534 while (get_t_state(thi) == RUNNING) {
5516 drbd_thread_current_set_cpu(thi); 5535 drbd_thread_current_set_cpu(thi);
5517 5536
5518 rcu_read_lock(); 5537 conn_reclaim_net_peer_reqs(connection);
5519 nc = rcu_dereference(connection->net_conf);
5520 ping_timeo = nc->ping_timeo;
5521 tcp_cork = nc->tcp_cork;
5522 ping_int = nc->ping_int;
5523 rcu_read_unlock();
5524 5538
5525 if (test_and_clear_bit(SEND_PING, &connection->flags)) { 5539 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5526 if (drbd_send_ping(connection)) { 5540 if (drbd_send_ping(connection)) {
5527 drbd_err(connection, "drbd_send_ping has failed\n"); 5541 drbd_err(connection, "drbd_send_ping has failed\n");
5528 goto reconnect; 5542 goto reconnect;
5529 } 5543 }
5530 connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; 5544 set_ping_timeout(connection);
5531 ping_timeout_active = true; 5545 ping_timeout_active = true;
5532 } 5546 }
5533 5547
5534 /* TODO: conditionally cork; it may hurt latency if we cork without 5548 pre_recv_jif = jiffies;
5535 much to send */
5536 if (tcp_cork)
5537 drbd_tcp_cork(connection->meta.socket);
5538 if (connection_finish_peer_reqs(connection)) {
5539 drbd_err(connection, "connection_finish_peer_reqs() failed\n");
5540 goto reconnect;
5541 }
5542 /* but unconditionally uncork unless disabled */
5543 if (tcp_cork)
5544 drbd_tcp_uncork(connection->meta.socket);
5545
5546 /* short circuit, recv_msg would return EINTR anyways. */
5547 if (signal_pending(current))
5548 continue;
5549
5550 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); 5549 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5551 clear_bit(SIGNAL_ASENDER, &connection->flags);
5552
5553 flush_signals(current);
5554 5550
5555 /* Note: 5551 /* Note:
5556 * -EINTR (on meta) we got a signal 5552 * -EINTR (on meta) we got a signal
@@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi)
5562 * rv < expected: "woken" by signal during receive 5558 * rv < expected: "woken" by signal during receive
5563 * rv == 0 : "connection shut down by peer" 5559 * rv == 0 : "connection shut down by peer"
5564 */ 5560 */
5565received_more:
5566 if (likely(rv > 0)) { 5561 if (likely(rv > 0)) {
5567 received += rv; 5562 received += rv;
5568 buf += rv; 5563 buf += rv;
@@ -5584,8 +5579,7 @@ received_more:
5584 } else if (rv == -EAGAIN) { 5579 } else if (rv == -EAGAIN) {
5585 /* If the data socket received something meanwhile, 5580 /* If the data socket received something meanwhile,
5586 * that is good enough: peer is still alive. */ 5581 * that is good enough: peer is still alive. */
5587 if (time_after(connection->last_received, 5582 if (time_after(connection->last_received, pre_recv_jif))
5588 jiffies - connection->meta.socket->sk->sk_rcvtimeo))
5589 continue; 5583 continue;
5590 if (ping_timeout_active) { 5584 if (ping_timeout_active) {
5591 drbd_err(connection, "PingAck did not arrive in time.\n"); 5585 drbd_err(connection, "PingAck did not arrive in time.\n");
@@ -5594,6 +5588,10 @@ received_more:
5594 set_bit(SEND_PING, &connection->flags); 5588 set_bit(SEND_PING, &connection->flags);
5595 continue; 5589 continue;
5596 } else if (rv == -EINTR) { 5590 } else if (rv == -EINTR) {
5591 /* maybe drbd_thread_stop(): the while condition will notice.
5592 * maybe woken for send_ping: we'll send a ping above,
5593 * and change the rcvtimeo */
5594 flush_signals(current);
5597 continue; 5595 continue;
5598 } else { 5596 } else {
5599 drbd_err(connection, "sock_recvmsg returned %d\n", rv); 5597 drbd_err(connection, "sock_recvmsg returned %d\n", rv);
@@ -5603,8 +5601,8 @@ received_more:
5603 if (received == expect && cmd == NULL) { 5601 if (received == expect && cmd == NULL) {
5604 if (decode_header(connection, connection->meta.rbuf, &pi)) 5602 if (decode_header(connection, connection->meta.rbuf, &pi))
5605 goto reconnect; 5603 goto reconnect;
5606 cmd = &asender_tbl[pi.cmd]; 5604 cmd = &ack_receiver_tbl[pi.cmd];
5607 if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { 5605 if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
5608 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", 5606 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5609 cmdname(pi.cmd), pi.cmd); 5607 cmdname(pi.cmd), pi.cmd);
5610 goto disconnect; 5608 goto disconnect;
@@ -5627,9 +5625,8 @@ received_more:
5627 5625
5628 connection->last_received = jiffies; 5626 connection->last_received = jiffies;
5629 5627
5630 if (cmd == &asender_tbl[P_PING_ACK]) { 5628 if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
5631 /* restore idle timeout */ 5629 set_idle_timeout(connection);
5632 connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5633 ping_timeout_active = false; 5630 ping_timeout_active = false;
5634 } 5631 }
5635 5632
@@ -5638,11 +5635,6 @@ received_more:
5638 expect = header_size; 5635 expect = header_size;
5639 cmd = NULL; 5636 cmd = NULL;
5640 } 5637 }
5641 if (test_bit(SEND_PING, &connection->flags))
5642 continue;
5643 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
5644 if (rv > 0)
5645 goto received_more;
5646 } 5638 }
5647 5639
5648 if (0) { 5640 if (0) {
@@ -5654,9 +5646,41 @@ reconnect:
5654disconnect: 5646disconnect:
5655 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 5647 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5656 } 5648 }
5657 clear_bit(SIGNAL_ASENDER, &connection->flags);
5658 5649
5659 drbd_info(connection, "asender terminated\n"); 5650 drbd_info(connection, "ack_receiver terminated\n");
5660 5651
5661 return 0; 5652 return 0;
5662} 5653}
5654
5655void drbd_send_acks_wf(struct work_struct *ws)
5656{
5657 struct drbd_peer_device *peer_device =
5658 container_of(ws, struct drbd_peer_device, send_acks_work);
5659 struct drbd_connection *connection = peer_device->connection;
5660 struct drbd_device *device = peer_device->device;
5661 struct net_conf *nc;
5662 int tcp_cork, err;
5663
5664 rcu_read_lock();
5665 nc = rcu_dereference(connection->net_conf);
5666 tcp_cork = nc->tcp_cork;
5667 rcu_read_unlock();
5668
5669 if (tcp_cork)
5670 drbd_tcp_cork(connection->meta.socket);
5671
5672 err = drbd_finish_peer_reqs(device);
5673 kref_put(&device->kref, drbd_destroy_device);
5674 /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
5675 struct work_struct send_acks_work alive, which is in the peer_device object */
5676
5677 if (err) {
5678 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5679 return;
5680 }
5681
5682 if (tcp_cork)
5683 drbd_tcp_uncork(connection->meta.socket);
5684
5685 return;
5686}
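
As a reader's aid (not part of the patch), the pieces of the new ack path fit together as follows:

	/*
	 * Ack path after this patch:
	 *   1. Peer-write completion paths (drbd_endio_write_sec_final(), which
	 *      also takes the device kref released in drbd_send_acks_wf(), and
	 *      handle_write_conflicts() above) no longer wake the asender; they
	 *      queue_work(connection->ack_sender, &peer_device->send_acks_work).
	 *   2. drbd_send_acks_wf() runs from that ordered workqueue: it
	 *      optionally corks the meta socket, sends all pending ACKs via
	 *      drbd_finish_peer_reqs(), then uncorks, so ACKs for several
	 *      requests can share a TCP segment.
	 *   3. drbd_ack_receiver() (the renamed asender thread) is now a pure
	 *      reader of the meta socket: pings, ACK decoding, and timeouts.
	 */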
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3ae2c0086563..2255dcfebd2b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
453 kref_get(&req->kref); /* wait for the DONE */ 453 kref_get(&req->kref); /* wait for the DONE */
454 454
455 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { 455 if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
456 /* potentially already completed in the asender thread */ 456 /* potentially already completed in the ack_receiver thread */
457 if (!(s & RQ_NET_DONE)) { 457 if (!(s & RQ_NET_DONE)) {
458 atomic_add(req->i.size >> 9, &device->ap_in_flight); 458 atomic_add(req->i.size >> 9, &device->ap_in_flight);
459 set_if_null_req_not_net_done(peer_device, req); 459 set_if_null_req_not_net_done(peer_device, req);
460 } 460 }
461 if (s & RQ_NET_PENDING) 461 if (req->rq_state & RQ_NET_PENDING)
462 set_if_null_req_ack_pending(peer_device, req); 462 set_if_null_req_ack_pending(peer_device, req);
463 } 463 }
464 464
@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
1095 return false; 1095 return false;
1096} 1096}
1097 1097
1098bool drbd_should_do_remote(union drbd_dev_state s)
1099{
1100 return s.pdsk == D_UP_TO_DATE ||
1101 (s.pdsk >= D_INCONSISTENT &&
1102 s.conn >= C_WF_BITMAP_T &&
1103 s.conn < C_AHEAD);
1104 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
1105 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
1106 states. */
1107}
1108
1109static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
1110{
1111 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
1112 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
1113 since we enter state C_AHEAD only if proto >= 96 */
1114}
1115
1098/* returns number of connections (== 1, for drbd 8.4) 1116/* returns number of connections (== 1, for drbd 8.4)
1099 * expected to actually write this data, 1117 * expected to actually write this data,
1100 * which does NOT include those that we are L_AHEAD for. */ 1118 * which does NOT include those that we are L_AHEAD for. */
@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1149 * stable storage, and this is a WRITE, we may not even submit 1167 * stable storage, and this is a WRITE, we may not even submit
1150 * this bio. */ 1168 * this bio. */
1151 if (get_ldev(device)) { 1169 if (get_ldev(device)) {
1152 req->pre_submit_jif = jiffies;
1153 if (drbd_insert_fault(device, 1170 if (drbd_insert_fault(device,
1154 rw == WRITE ? DRBD_FAULT_DT_WR 1171 rw == WRITE ? DRBD_FAULT_DT_WR
1155 : rw == READ ? DRBD_FAULT_DT_RD 1172 : rw == READ ? DRBD_FAULT_DT_RD
@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1293 &device->pending_master_completion[rw == WRITE]); 1310 &device->pending_master_completion[rw == WRITE]);
1294 if (req->private_bio) { 1311 if (req->private_bio) {
1295 /* needs to be marked within the same spinlock */ 1312 /* needs to be marked within the same spinlock */
1313 req->pre_submit_jif = jiffies;
1296 list_add_tail(&req->req_pending_local, 1314 list_add_tail(&req->req_pending_local,
1297 &device->pending_completion[rw == WRITE]); 1315 &device->pending_completion[rw == WRITE]);
1298 _req_mod(req, TO_BE_SUBMITTED); 1316 _req_mod(req, TO_BE_SUBMITTED);
@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
1513 return BLK_QC_T_NONE; 1531 return BLK_QC_T_NONE;
1514} 1532}
1515 1533
1534static bool net_timeout_reached(struct drbd_request *net_req,
1535 struct drbd_connection *connection,
1536 unsigned long now, unsigned long ent,
1537 unsigned int ko_count, unsigned int timeout)
1538{
1539 struct drbd_device *device = net_req->device;
1540
1541 if (!time_after(now, net_req->pre_send_jif + ent))
1542 return false;
1543
1544 if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
1545 return false;
1546
1547 if (net_req->rq_state & RQ_NET_PENDING) {
1548 drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
1549 jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
1550 return true;
1551 }
1552
1553 /* We received an ACK already (or are using protocol A),
1554 * but are waiting for the epoch closing barrier ack.
1555 * Check if we sent the barrier already. We should not blame the peer
1556 * for being unresponsive, if we did not even ask it yet. */
1557 if (net_req->epoch == connection->send.current_epoch_nr) {
1558 drbd_warn(device,
1559 "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
1560 jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
1561 return false;
1562 }
1563
1564 /* Worst case: we may have been blocked for whatever reason, then
1565 * suddenly are able to send a lot of requests (and epoch separating
1566 * barriers) in quick succession.
1567 * The timestamp of the net_req may be much too old and not correspond
1568 * to the sending time of the relevant unack'ed barrier packet, so
1569 * would trigger a spurious timeout. The latest barrier packet may
1570 * have a too recent timestamp to trigger the timeout, potentially miss
1571 * a timeout. Right now we don't have a place to conveniently store
1572 * these timestamps.
1573 * But in this particular situation, the application requests are still
1574 * completed to upper layers, DRBD should still "feel" responsive.
1575 * No need yet to kill this connection, it may still recover.
1576 * If not, eventually we will have queued enough into the network for
1577 * us to block. From that point of view, the timestamp of the last sent
1578 * barrier packet is relevant enough.
1579 */
1580 if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
1581 drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
1582 connection->send.last_sent_barrier_jif, now,
1583 jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
1584 return true;
1585 }
1586 return false;
1587}
1588
1589/* A request is considered timed out, if
1590 * - we have some effective timeout from the configuration,
1591 * with some state restrictions applied,
1592 * - the oldest request is waiting for a response from the network
1593 * resp. the local disk,
1594 * - the oldest request is in fact older than the effective timeout,
1595 * - the connection was established (resp. disk was attached)
1596 * for longer than the timeout already.
1597 * Note that for 32bit jiffies and very stable connections/disks,
 1598 * we may have a wrap-around, which is caught by
1599 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
1600 *
1601 * Side effect: once per 32bit wrap-around interval, which means every
1602 * ~198 days with 250 HZ, we have a window where the timeout would need
1603 * to expire twice (worst case) to become effective. Good enough.
1604 */
1605
1516void request_timer_fn(unsigned long data) 1606void request_timer_fn(unsigned long data)
1517{ 1607{
1518 struct drbd_device *device = (struct drbd_device *) data; 1608 struct drbd_device *device = (struct drbd_device *) data;
@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data)
1522 unsigned long oldest_submit_jif; 1612 unsigned long oldest_submit_jif;
1523 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 1613 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
1524 unsigned long now; 1614 unsigned long now;
1615 unsigned int ko_count = 0, timeout = 0;
1525 1616
1526 rcu_read_lock(); 1617 rcu_read_lock();
1527 nc = rcu_dereference(connection->net_conf); 1618 nc = rcu_dereference(connection->net_conf);
1528 if (nc && device->state.conn >= C_WF_REPORT_PARAMS) 1619 if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
1529 ent = nc->timeout * HZ/10 * nc->ko_count; 1620 ko_count = nc->ko_count;
1621 timeout = nc->timeout;
1622 }
1530 1623
1531 if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ 1624 if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
1532 dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; 1625 dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data)
1534 } 1627 }
1535 rcu_read_unlock(); 1628 rcu_read_unlock();
1536 1629
1630
1631 ent = timeout * HZ/10 * ko_count;
1537 et = min_not_zero(dt, ent); 1632 et = min_not_zero(dt, ent);
1538 1633
1539 if (!et) 1634 if (!et)
@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data)
1545 spin_lock_irq(&device->resource->req_lock); 1640 spin_lock_irq(&device->resource->req_lock);
1546 req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); 1641 req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
1547 req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); 1642 req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
1548 req_peer = connection->req_not_net_done; 1643
1549 /* maybe the oldest request waiting for the peer is in fact still 1644 /* maybe the oldest request waiting for the peer is in fact still
1550 * blocking in tcp sendmsg */ 1645 * blocking in tcp sendmsg. That's ok, though, that's handled via the
1551 if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) 1646 * socket send timeout, requesting a ping, and bumping ko-count in
1552 req_peer = connection->req_next; 1647 * we_should_drop_the_connection().
1648 */
1649
 1650 /* check the oldest request we successfully sent,
1651 * but which is still waiting for an ACK. */
1652 req_peer = connection->req_ack_pending;
1653
 1654 /* if we don't have such a request (e.g. protocol A)
 1655 * check the oldest request which is still waiting on its epoch
1656 * closing barrier ack. */
1657 if (!req_peer)
1658 req_peer = connection->req_not_net_done;
1553 1659
1554 /* evaluate the oldest peer request only in one timer! */ 1660 /* evaluate the oldest peer request only in one timer! */
1555 if (req_peer && req_peer->device != device) 1661 if (req_peer && req_peer->device != device)
@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data)
1566 : req_write ? req_write->pre_submit_jif 1672 : req_write ? req_write->pre_submit_jif
1567 : req_read ? req_read->pre_submit_jif : now; 1673 : req_read ? req_read->pre_submit_jif : now;
1568 1674
1569 /* The request is considered timed out, if 1675 if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
1570 * - we have some effective timeout from the configuration,
1571 * with above state restrictions applied,
1572 * - the oldest request is waiting for a response from the network
1573 * resp. the local disk,
1574 * - the oldest request is in fact older than the effective timeout,
1575 * - the connection was established (resp. disk was attached)
1576 * for longer than the timeout already.
1577 * Note that for 32bit jiffies and very stable connections/disks,
1578 * we may have a wrap around, which is catched by
1579 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
1580 *
1581 * Side effect: once per 32bit wrap-around interval, which means every
1582 * ~198 days with 250 HZ, we have a window where the timeout would need
1583 * to expire twice (worst case) to become effective. Good enough.
1584 */
1585 if (ent && req_peer &&
1586 time_after(now, req_peer->pre_send_jif + ent) &&
1587 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
1588 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
1589 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); 1676 _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
1590 } 1677
1591 if (dt && oldest_submit_jif != now && 1678 if (dt && oldest_submit_jif != now &&
1592 time_after(now, oldest_submit_jif + dt) && 1679 time_after(now, oldest_submit_jif + dt) &&
1593 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 1680 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 9f6a04080e9f..bb2ef78165e5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req,
331 return rv; 331 return rv;
332} 332}
333 333
334static inline bool drbd_should_do_remote(union drbd_dev_state s) 334extern bool drbd_should_do_remote(union drbd_dev_state);
335{
336 return s.pdsk == D_UP_TO_DATE ||
337 (s.pdsk >= D_INCONSISTENT &&
338 s.conn >= C_WF_BITMAP_T &&
339 s.conn < C_AHEAD);
340 /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
341 That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
342 states. */
343}
344static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
345{
346 return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
347 /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
348 since we enter state C_AHEAD only if proto >= 96 */
349}
350 335
351#endif 336#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 2d7dd269b6a8..5a7ef7873b67 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -29,6 +29,7 @@
29#include "drbd_int.h" 29#include "drbd_int.h"
30#include "drbd_protocol.h" 30#include "drbd_protocol.h"
31#include "drbd_req.h" 31#include "drbd_req.h"
32#include "drbd_state_change.h"
32 33
33struct after_state_chg_work { 34struct after_state_chg_work {
34 struct drbd_work w; 35 struct drbd_work w;
@@ -37,6 +38,7 @@ struct after_state_chg_work {
37 union drbd_state ns; 38 union drbd_state ns;
38 enum chg_state_flags flags; 39 enum chg_state_flags flags;
39 struct completion *done; 40 struct completion *done;
41 struct drbd_state_change *state_change;
40}; 42};
41 43
42enum sanitize_state_warnings { 44enum sanitize_state_warnings {
@@ -48,9 +50,248 @@ enum sanitize_state_warnings {
48 IMPLICITLY_UPGRADED_PDSK, 50 IMPLICITLY_UPGRADED_PDSK,
49}; 51};
50 52
53static void count_objects(struct drbd_resource *resource,
54 unsigned int *n_devices,
55 unsigned int *n_connections)
56{
57 struct drbd_device *device;
58 struct drbd_connection *connection;
59 int vnr;
60
61 *n_devices = 0;
62 *n_connections = 0;
63
64 idr_for_each_entry(&resource->devices, device, vnr)
65 (*n_devices)++;
66 for_each_connection(connection, resource)
67 (*n_connections)++;
68}
69
70static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
71{
72 struct drbd_state_change *state_change;
73 unsigned int size, n;
74
75 size = sizeof(struct drbd_state_change) +
76 n_devices * sizeof(struct drbd_device_state_change) +
77 n_connections * sizeof(struct drbd_connection_state_change) +
78 n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
79 state_change = kmalloc(size, gfp);
80 if (!state_change)
81 return NULL;
82 state_change->n_devices = n_devices;
83 state_change->n_connections = n_connections;
84 state_change->devices = (void *)(state_change + 1);
85 state_change->connections = (void *)&state_change->devices[n_devices];
86 state_change->peer_devices = (void *)&state_change->connections[n_connections];
87 state_change->resource->resource = NULL;
88 for (n = 0; n < n_devices; n++)
89 state_change->devices[n].device = NULL;
90 for (n = 0; n < n_connections; n++)
91 state_change->connections[n].connection = NULL;
92 return state_change;
93}
94
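
alloc_state_change() above uses a single kmalloc() and carves the three variable-length arrays out of the tail of the allocation: state_change + 1 points just past the fixed-size header, and each array pointer starts where the previous array ends. A stand-alone user-space analogue of that layout trick, with hypothetical types:

	#include <stdlib.h>

	struct item { int old_val, new_val; };

	struct snapshot {
		unsigned int n_a, n_b;
		struct item *a;		/* n_a entries, stored right after the header */
		struct item *b;		/* n_b entries, stored after the a[] array */
	};

	static struct snapshot *snapshot_alloc(unsigned int n_a, unsigned int n_b)
	{
		struct snapshot *s;
		size_t size = sizeof(*s) + (n_a + n_b) * sizeof(struct item);

		s = malloc(size);	/* one allocation, one free */
		if (!s)
			return NULL;
		s->n_a = n_a;
		s->n_b = n_b;
		s->a = (struct item *)(s + 1);	/* first byte after the header */
		s->b = s->a + n_a;		/* first byte after a[] */
		return s;
	}

forget_state_change() further down can then drop every object reference and release the whole snapshot with a single kfree().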
95struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
96{
97 struct drbd_state_change *state_change;
98 struct drbd_device *device;
99 unsigned int n_devices;
100 struct drbd_connection *connection;
101 unsigned int n_connections;
102 int vnr;
103
104 struct drbd_device_state_change *device_state_change;
105 struct drbd_peer_device_state_change *peer_device_state_change;
106 struct drbd_connection_state_change *connection_state_change;
107
108 /* Caller holds req_lock spinlock.
109 * No state, no device IDR, no connections lists can change. */
110 count_objects(resource, &n_devices, &n_connections);
111 state_change = alloc_state_change(n_devices, n_connections, gfp);
112 if (!state_change)
113 return NULL;
114
115 kref_get(&resource->kref);
116 state_change->resource->resource = resource;
117 state_change->resource->role[OLD] =
118 conn_highest_role(first_connection(resource));
119 state_change->resource->susp[OLD] = resource->susp;
120 state_change->resource->susp_nod[OLD] = resource->susp_nod;
121 state_change->resource->susp_fen[OLD] = resource->susp_fen;
122
123 connection_state_change = state_change->connections;
124 for_each_connection(connection, resource) {
125 kref_get(&connection->kref);
126 connection_state_change->connection = connection;
127 connection_state_change->cstate[OLD] =
128 connection->cstate;
129 connection_state_change->peer_role[OLD] =
130 conn_highest_peer(connection);
131 connection_state_change++;
132 }
133
134 device_state_change = state_change->devices;
135 peer_device_state_change = state_change->peer_devices;
136 idr_for_each_entry(&resource->devices, device, vnr) {
137 kref_get(&device->kref);
138 device_state_change->device = device;
139 device_state_change->disk_state[OLD] = device->state.disk;
140
141 /* The peer_devices for each device have to be enumerated in
142 the order of the connections. We may not use for_each_peer_device() here. */
143 for_each_connection(connection, resource) {
144 struct drbd_peer_device *peer_device;
145
146 peer_device = conn_peer_device(connection, device->vnr);
147 peer_device_state_change->peer_device = peer_device;
148 peer_device_state_change->disk_state[OLD] =
149 device->state.pdsk;
150 peer_device_state_change->repl_state[OLD] =
151 max_t(enum drbd_conns,
152 C_WF_REPORT_PARAMS, device->state.conn);
153 peer_device_state_change->resync_susp_user[OLD] =
154 device->state.user_isp;
155 peer_device_state_change->resync_susp_peer[OLD] =
156 device->state.peer_isp;
157 peer_device_state_change->resync_susp_dependency[OLD] =
158 device->state.aftr_isp;
159 peer_device_state_change++;
160 }
161 device_state_change++;
162 }
163
164 return state_change;
165}
166
167static void remember_new_state(struct drbd_state_change *state_change)
168{
169 struct drbd_resource_state_change *resource_state_change;
170 struct drbd_resource *resource;
171 unsigned int n;
172
173 if (!state_change)
174 return;
175
176 resource_state_change = &state_change->resource[0];
177 resource = resource_state_change->resource;
178
179 resource_state_change->role[NEW] =
180 conn_highest_role(first_connection(resource));
181 resource_state_change->susp[NEW] = resource->susp;
182 resource_state_change->susp_nod[NEW] = resource->susp_nod;
183 resource_state_change->susp_fen[NEW] = resource->susp_fen;
184
185 for (n = 0; n < state_change->n_devices; n++) {
186 struct drbd_device_state_change *device_state_change =
187 &state_change->devices[n];
188 struct drbd_device *device = device_state_change->device;
189
190 device_state_change->disk_state[NEW] = device->state.disk;
191 }
192
193 for (n = 0; n < state_change->n_connections; n++) {
194 struct drbd_connection_state_change *connection_state_change =
195 &state_change->connections[n];
196 struct drbd_connection *connection =
197 connection_state_change->connection;
198
199 connection_state_change->cstate[NEW] = connection->cstate;
200 connection_state_change->peer_role[NEW] =
201 conn_highest_peer(connection);
202 }
203
204 for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
205 struct drbd_peer_device_state_change *peer_device_state_change =
206 &state_change->peer_devices[n];
207 struct drbd_device *device =
208 peer_device_state_change->peer_device->device;
209 union drbd_dev_state state = device->state;
210
211 peer_device_state_change->disk_state[NEW] = state.pdsk;
212 peer_device_state_change->repl_state[NEW] =
213 max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
214 peer_device_state_change->resync_susp_user[NEW] =
215 state.user_isp;
216 peer_device_state_change->resync_susp_peer[NEW] =
217 state.peer_isp;
218 peer_device_state_change->resync_susp_dependency[NEW] =
219 state.aftr_isp;
220 }
221}
222
223void copy_old_to_new_state_change(struct drbd_state_change *state_change)
224{
225 struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
226 unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
227
228#define OLD_TO_NEW(x) \
229 (x[NEW] = x[OLD])
230
231 OLD_TO_NEW(resource_state_change->role);
232 OLD_TO_NEW(resource_state_change->susp);
233 OLD_TO_NEW(resource_state_change->susp_nod);
234 OLD_TO_NEW(resource_state_change->susp_fen);
235
236 for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
237 struct drbd_connection_state_change *connection_state_change =
238 &state_change->connections[n_connection];
239
240 OLD_TO_NEW(connection_state_change->peer_role);
241 OLD_TO_NEW(connection_state_change->cstate);
242 }
243
244 for (n_device = 0; n_device < state_change->n_devices; n_device++) {
245 struct drbd_device_state_change *device_state_change =
246 &state_change->devices[n_device];
247
248 OLD_TO_NEW(device_state_change->disk_state);
249 }
250
251 n_peer_devices = state_change->n_devices * state_change->n_connections;
252 for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
253 struct drbd_peer_device_state_change *p =
254 &state_change->peer_devices[n_peer_device];
255
256 OLD_TO_NEW(p->disk_state);
257 OLD_TO_NEW(p->repl_state);
258 OLD_TO_NEW(p->resync_susp_user);
259 OLD_TO_NEW(p->resync_susp_peer);
260 OLD_TO_NEW(p->resync_susp_dependency);
261 }
262
263#undef OLD_TO_NEW
264}
265
266void forget_state_change(struct drbd_state_change *state_change)
267{
268 unsigned int n;
269
270 if (!state_change)
271 return;
272
273 if (state_change->resource->resource)
274 kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
275 for (n = 0; n < state_change->n_devices; n++) {
276 struct drbd_device *device = state_change->devices[n].device;
277
278 if (device)
279 kref_put(&device->kref, drbd_destroy_device);
280 }
281 for (n = 0; n < state_change->n_connections; n++) {
282 struct drbd_connection *connection =
283 state_change->connections[n].connection;
284
285 if (connection)
286 kref_put(&connection->kref, drbd_destroy_connection);
287 }
288 kfree(state_change);
289}
290
51static int w_after_state_ch(struct drbd_work *w, int unused); 291static int w_after_state_ch(struct drbd_work *w, int unused);
52static void after_state_ch(struct drbd_device *device, union drbd_state os, 292static void after_state_ch(struct drbd_device *device, union drbd_state os,
53 union drbd_state ns, enum chg_state_flags flags); 293 union drbd_state ns, enum chg_state_flags flags,
294 struct drbd_state_change *);
54static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); 295static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
55static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); 296static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
56static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); 297static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
@@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
93 return R_SECONDARY; 334 return R_SECONDARY;
94 return R_UNKNOWN; 335 return R_UNKNOWN;
95} 336}
337
96static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) 338static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
97{ 339{
98 if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) 340 if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
@@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device)
937 drbd_info(device, "Resumed AL updates\n"); 1179 drbd_info(device, "Resumed AL updates\n");
938} 1180}
939 1181
940/* helper for __drbd_set_state */ 1182/* helper for _drbd_set_state */
941static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) 1183static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
942{ 1184{
943 if (first_peer_device(device)->connection->agreed_pro_version < 90) 1185 if (first_peer_device(device)->connection->agreed_pro_version < 90)
@@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
965} 1207}
966 1208
967/** 1209/**
968 * __drbd_set_state() - Set a new DRBD state 1210 * _drbd_set_state() - Set a new DRBD state
969 * @device: DRBD device. 1211 * @device: DRBD device.
970 * @ns: new state. 1212 * @ns: new state.
971 * @flags: Flags 1213 * @flags: Flags
 972 * @done: Optional completion that will be completed after after_state_ch() has finished 1214 * @done: Optional completion that will be completed after after_state_ch() has finished
973 * 1215 *
974 * Caller needs to hold req_lock, and global_state_lock. Do not call directly. 1216 * Caller needs to hold req_lock. Do not call directly.
975 */ 1217 */
976enum drbd_state_rv 1218enum drbd_state_rv
977__drbd_set_state(struct drbd_device *device, union drbd_state ns, 1219_drbd_set_state(struct drbd_device *device, union drbd_state ns,
978 enum chg_state_flags flags, struct completion *done) 1220 enum chg_state_flags flags, struct completion *done)
979{ 1221{
980 struct drbd_peer_device *peer_device = first_peer_device(device); 1222 struct drbd_peer_device *peer_device = first_peer_device(device);
981 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1223 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
@@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
983 enum drbd_state_rv rv = SS_SUCCESS; 1225 enum drbd_state_rv rv = SS_SUCCESS;
984 enum sanitize_state_warnings ssw; 1226 enum sanitize_state_warnings ssw;
985 struct after_state_chg_work *ascw; 1227 struct after_state_chg_work *ascw;
1228 struct drbd_state_change *state_change;
986 1229
987 os = drbd_read_state(device); 1230 os = drbd_read_state(device);
988 1231
@@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1037 if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) 1280 if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
1038 clear_bit(RS_DONE, &device->flags); 1281 clear_bit(RS_DONE, &device->flags);
1039 1282
1283 /* FIXME: Have any flags been set earlier in this function already? */
1284 state_change = remember_old_state(device->resource, GFP_ATOMIC);
1285
1040 /* changes to local_cnt and device flags should be visible before 1286 /* changes to local_cnt and device flags should be visible before
1041 * changes to state, which again should be visible before anything else 1287 * changes to state, which again should be visible before anything else
1042 * depending on that change happens. */ 1288 * depending on that change happens. */
@@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1047 device->resource->susp_fen = ns.susp_fen; 1293 device->resource->susp_fen = ns.susp_fen;
1048 smp_wmb(); 1294 smp_wmb();
1049 1295
1296 remember_new_state(state_change);
1297
1050 /* put replicated vs not-replicated requests in separate epochs */ 1298 /* put replicated vs not-replicated requests in separate epochs */
1051 if (drbd_should_do_remote((union drbd_dev_state)os.i) != 1299 if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
1052 drbd_should_do_remote((union drbd_dev_state)ns.i)) 1300 drbd_should_do_remote((union drbd_dev_state)ns.i))
@@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1184 ascw->w.cb = w_after_state_ch; 1432 ascw->w.cb = w_after_state_ch;
1185 ascw->device = device; 1433 ascw->device = device;
1186 ascw->done = done; 1434 ascw->done = done;
1435 ascw->state_change = state_change;
1187 drbd_queue_work(&connection->sender_work, 1436 drbd_queue_work(&connection->sender_work,
1188 &ascw->w); 1437 &ascw->w);
1189 } else { 1438 } else {
@@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused)
1199 container_of(w, struct after_state_chg_work, w); 1448 container_of(w, struct after_state_chg_work, w);
1200 struct drbd_device *device = ascw->device; 1449 struct drbd_device *device = ascw->device;
1201 1450
1202 after_state_ch(device, ascw->os, ascw->ns, ascw->flags); 1451 after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
1452 forget_state_change(ascw->state_change);
1203 if (ascw->flags & CS_WAIT_COMPLETE) 1453 if (ascw->flags & CS_WAIT_COMPLETE)
1204 complete(ascw->done); 1454 complete(ascw->done);
1205 kfree(ascw); 1455 kfree(ascw);
@@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1234 D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); 1484 D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
1235 1485
1236 /* open coded non-blocking drbd_suspend_io(device); */ 1486 /* open coded non-blocking drbd_suspend_io(device); */
1237 set_bit(SUSPEND_IO, &device->flags); 1487 atomic_inc(&device->suspend_cnt);
1238 1488
1239 drbd_bm_lock(device, why, flags); 1489 drbd_bm_lock(device, why, flags);
1240 rv = io_fn(device); 1490 rv = io_fn(device);
@@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1245 return rv; 1495 return rv;
1246} 1496}
1247 1497
1498void notify_resource_state_change(struct sk_buff *skb,
1499 unsigned int seq,
1500 struct drbd_resource_state_change *resource_state_change,
1501 enum drbd_notification_type type)
1502{
1503 struct drbd_resource *resource = resource_state_change->resource;
1504 struct resource_info resource_info = {
1505 .res_role = resource_state_change->role[NEW],
1506 .res_susp = resource_state_change->susp[NEW],
1507 .res_susp_nod = resource_state_change->susp_nod[NEW],
1508 .res_susp_fen = resource_state_change->susp_fen[NEW],
1509 };
1510
1511 notify_resource_state(skb, seq, resource, &resource_info, type);
1512}
1513
1514void notify_connection_state_change(struct sk_buff *skb,
1515 unsigned int seq,
1516 struct drbd_connection_state_change *connection_state_change,
1517 enum drbd_notification_type type)
1518{
1519 struct drbd_connection *connection = connection_state_change->connection;
1520 struct connection_info connection_info = {
1521 .conn_connection_state = connection_state_change->cstate[NEW],
1522 .conn_role = connection_state_change->peer_role[NEW],
1523 };
1524
1525 notify_connection_state(skb, seq, connection, &connection_info, type);
1526}
1527
1528void notify_device_state_change(struct sk_buff *skb,
1529 unsigned int seq,
1530 struct drbd_device_state_change *device_state_change,
1531 enum drbd_notification_type type)
1532{
1533 struct drbd_device *device = device_state_change->device;
1534 struct device_info device_info = {
1535 .dev_disk_state = device_state_change->disk_state[NEW],
1536 };
1537
1538 notify_device_state(skb, seq, device, &device_info, type);
1539}
1540
1541void notify_peer_device_state_change(struct sk_buff *skb,
1542 unsigned int seq,
1543 struct drbd_peer_device_state_change *p,
1544 enum drbd_notification_type type)
1545{
1546 struct drbd_peer_device *peer_device = p->peer_device;
1547 struct peer_device_info peer_device_info = {
1548 .peer_repl_state = p->repl_state[NEW],
1549 .peer_disk_state = p->disk_state[NEW],
1550 .peer_resync_susp_user = p->resync_susp_user[NEW],
1551 .peer_resync_susp_peer = p->resync_susp_peer[NEW],
1552 .peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
1553 };
1554
1555 notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
1556}
1557
1558static void broadcast_state_change(struct drbd_state_change *state_change)
1559{
1560 struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
1561 bool resource_state_has_changed;
1562 unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
1563 void (*last_func)(struct sk_buff *, unsigned int, void *,
1564 enum drbd_notification_type) = NULL;
1565 void *uninitialized_var(last_arg);
1566
1567#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
1568#define FINAL_STATE_CHANGE(type) \
1569 ({ if (last_func) \
1570 last_func(NULL, 0, last_arg, type); \
1571 })
1572#define REMEMBER_STATE_CHANGE(func, arg, type) \
1573 ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
1574 last_func = (typeof(last_func))func; \
1575 last_arg = arg; \
1576 })
1577
1578 mutex_lock(&notification_mutex);
1579
1580 resource_state_has_changed =
1581 HAS_CHANGED(resource_state_change->role) ||
1582 HAS_CHANGED(resource_state_change->susp) ||
1583 HAS_CHANGED(resource_state_change->susp_nod) ||
1584 HAS_CHANGED(resource_state_change->susp_fen);
1585
1586 if (resource_state_has_changed)
1587 REMEMBER_STATE_CHANGE(notify_resource_state_change,
1588 resource_state_change, NOTIFY_CHANGE);
1589
1590 for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
1591 struct drbd_connection_state_change *connection_state_change =
1592 &state_change->connections[n_connection];
1593
1594 if (HAS_CHANGED(connection_state_change->peer_role) ||
1595 HAS_CHANGED(connection_state_change->cstate))
1596 REMEMBER_STATE_CHANGE(notify_connection_state_change,
1597 connection_state_change, NOTIFY_CHANGE);
1598 }
1599
1600 for (n_device = 0; n_device < state_change->n_devices; n_device++) {
1601 struct drbd_device_state_change *device_state_change =
1602 &state_change->devices[n_device];
1603
1604 if (HAS_CHANGED(device_state_change->disk_state))
1605 REMEMBER_STATE_CHANGE(notify_device_state_change,
1606 device_state_change, NOTIFY_CHANGE);
1607 }
1608
1609 n_peer_devices = state_change->n_devices * state_change->n_connections;
1610 for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
1611 struct drbd_peer_device_state_change *p =
1612 &state_change->peer_devices[n_peer_device];
1613
1614 if (HAS_CHANGED(p->disk_state) ||
1615 HAS_CHANGED(p->repl_state) ||
1616 HAS_CHANGED(p->resync_susp_user) ||
1617 HAS_CHANGED(p->resync_susp_peer) ||
1618 HAS_CHANGED(p->resync_susp_dependency))
1619 REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
1620 p, NOTIFY_CHANGE);
1621 }
1622
1623 FINAL_STATE_CHANGE(NOTIFY_CHANGE);
1624 mutex_unlock(&notification_mutex);
1625
1626#undef HAS_CHANGED
1627#undef FINAL_STATE_CHANGE
1628#undef REMEMBER_STATE_CHANGE
1629}
1630
1248/** 1631/**
1249 * after_state_ch() - Perform after state change actions that may sleep 1632 * after_state_ch() - Perform after state change actions that may sleep
1250 * @device: DRBD device. 1633 * @device: DRBD device.
@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
1253 * @flags: Flags 1636 * @flags: Flags
1254 */ 1637 */
1255static void after_state_ch(struct drbd_device *device, union drbd_state os, 1638static void after_state_ch(struct drbd_device *device, union drbd_state os,
1256 union drbd_state ns, enum chg_state_flags flags) 1639 union drbd_state ns, enum chg_state_flags flags,
1640 struct drbd_state_change *state_change)
1257{ 1641{
1258 struct drbd_resource *resource = device->resource; 1642 struct drbd_resource *resource = device->resource;
1259 struct drbd_peer_device *peer_device = first_peer_device(device); 1643 struct drbd_peer_device *peer_device = first_peer_device(device);
1260 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1644 struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
1261 struct sib_info sib; 1645 struct sib_info sib;
1262 1646
1647 broadcast_state_change(state_change);
1648
1263 sib.sib_reason = SIB_STATE_CHANGE; 1649 sib.sib_reason = SIB_STATE_CHANGE;
1264 sib.os = os; 1650 sib.os = os;
1265 sib.ns = ns; 1651 sib.ns = ns;
@@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1377 } 1763 }
1378 1764
1379 if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { 1765 if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
1380 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && 1766 if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
1381 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1767 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1382 drbd_uuid_new_current(device); 1768 drbd_uuid_new_current(device);
1383 drbd_send_uuids(peer_device); 1769 drbd_send_uuids(peer_device);
@@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1444 if (os.disk != D_FAILED && ns.disk == D_FAILED) { 1830 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1445 enum drbd_io_error_p eh = EP_PASS_ON; 1831 enum drbd_io_error_p eh = EP_PASS_ON;
1446 int was_io_error = 0; 1832 int was_io_error = 0;
1447 /* corresponding get_ldev was in __drbd_set_state, to serialize 1833 /* corresponding get_ldev was in _drbd_set_state, to serialize
1448 * our cleanup here with the transition to D_DISKLESS. 1834 * our cleanup here with the transition to D_DISKLESS.
1449 * But it is still not safe to dereference ldev here, since 1835 * But it is still not safe to dereference ldev here, since
1450 * we might come from a failed Attach before ldev was set. */ 1836 * we might come from a failed Attach before ldev was set. */
@@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1455 1841
1456 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); 1842 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
1457 1843
1844 /* Intentionally call this handler first, before drbd_send_state().
1845 * See: 2932204 drbd: call local-io-error handler early
1846 * People may choose to hard-reset the box from this handler.
1847 * It is useful if this looks like a "regular node crash". */
1458 if (was_io_error && eh == EP_CALL_HELPER) 1848 if (was_io_error && eh == EP_CALL_HELPER)
1459 drbd_khelper(device, "local-io-error"); 1849 drbd_khelper(device, "local-io-error");
1460 1850
@@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work {
1572 union drbd_state ns_max; /* new, max state, over all devices */ 1962 union drbd_state ns_max; /* new, max state, over all devices */
1573 enum chg_state_flags flags; 1963 enum chg_state_flags flags;
1574 struct drbd_connection *connection; 1964 struct drbd_connection *connection;
1965 struct drbd_state_change *state_change;
1575}; 1966};
1576 1967
1577static int w_after_conn_state_ch(struct drbd_work *w, int unused) 1968static int w_after_conn_state_ch(struct drbd_work *w, int unused)
@@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1584 struct drbd_peer_device *peer_device; 1975 struct drbd_peer_device *peer_device;
1585 int vnr; 1976 int vnr;
1586 1977
1978 broadcast_state_change(acscw->state_change);
1979 forget_state_change(acscw->state_change);
1587 kfree(acscw); 1980 kfree(acscw);
1588 1981
1589 /* Upon network configuration, we need to start the receiver */ 1982 /* Upon network configuration, we need to start the receiver */
@@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
1593 if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { 1986 if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
1594 struct net_conf *old_conf; 1987 struct net_conf *old_conf;
1595 1988
1989 mutex_lock(&notification_mutex);
1990 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1991 notify_peer_device_state(NULL, 0, peer_device, NULL,
1992 NOTIFY_DESTROY | NOTIFY_CONTINUES);
1993 notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
1994 mutex_unlock(&notification_mutex);
1995
1596 mutex_lock(&connection->resource->conf_update); 1996 mutex_lock(&connection->resource->conf_update);
1597 old_conf = connection->net_conf; 1997 old_conf = connection->net_conf;
1598 connection->my_addr_len = 0; 1998 connection->my_addr_len = 0;
@@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
1759 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) 2159 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
1760 ns.disk = os.disk; 2160 ns.disk = os.disk;
1761 2161
1762 rv = __drbd_set_state(device, ns, flags, NULL); 2162 rv = _drbd_set_state(device, ns, flags, NULL);
1763 if (rv < SS_SUCCESS) 2163 if (rv < SS_SUCCESS)
1764 BUG(); 2164 BUG();
1765 2165
@@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1823 enum drbd_conns oc = connection->cstate; 2223 enum drbd_conns oc = connection->cstate;
1824 union drbd_state ns_max, ns_min, os; 2224 union drbd_state ns_max, ns_min, os;
1825 bool have_mutex = false; 2225 bool have_mutex = false;
2226 struct drbd_state_change *state_change;
1826 2227
1827 if (mask.conn) { 2228 if (mask.conn) {
1828 rv = is_valid_conn_transition(oc, val.conn); 2229 rv = is_valid_conn_transition(oc, val.conn);
@@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1868 goto abort; 2269 goto abort;
1869 } 2270 }
1870 2271
2272 state_change = remember_old_state(connection->resource, GFP_ATOMIC);
1871 conn_old_common_state(connection, &os, &flags); 2273 conn_old_common_state(connection, &os, &flags);
1872 flags |= CS_DC_SUSP; 2274 flags |= CS_DC_SUSP;
1873 conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); 2275 conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
1874 conn_pr_state_change(connection, os, ns_max, flags); 2276 conn_pr_state_change(connection, os, ns_max, flags);
2277 remember_new_state(state_change);
1875 2278
1876 acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); 2279 acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
1877 if (acscw) { 2280 if (acscw) {
@@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
1882 acscw->w.cb = w_after_conn_state_ch; 2285 acscw->w.cb = w_after_conn_state_ch;
1883 kref_get(&connection->kref); 2286 kref_get(&connection->kref);
1884 acscw->connection = connection; 2287 acscw->connection = connection;
2288 acscw->state_change = state_change;
1885 drbd_queue_work(&connection->sender_work, &acscw->w); 2289 drbd_queue_work(&connection->sender_work, &acscw->w);
1886 } else { 2290 } else {
1887 drbd_err(connection, "Could not kmalloc an acscw\n"); 2291 drbd_err(connection, "Could not kmalloc an acscw\n");
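
The drbd_state.c changes above follow a single pattern: remember_old_state() snapshots every object before the transition, remember_new_state() fills in the NEW slots afterwards, and broadcast_state_change() walks the snapshot, emitting one notification per object whose OLD and NEW values differ while flagging every notification except the last as "continues". The following is a minimal userspace sketch of that coalescing walk; the item_change struct, the broadcast() helper and the printed flags are invented stand-ins, not the kernel's API:

#include <stdio.h>

enum { OLD, NEW };

/* Invented stand-in for one tracked object's before/after snapshot. */
struct item_change { const char *name; int state[2]; };

#define HAS_CHANGED(s) ((s)[OLD] != (s)[NEW])

/* Emit one line per changed item; every line except the last says "continues". */
static void broadcast(struct item_change *items, int n)
{
	struct item_change *pending = NULL;

	for (int i = 0; i < n; i++) {
		if (!HAS_CHANGED(items[i].state))
			continue;
		if (pending)	/* flush the previously remembered change as "not final" */
			printf("%s: %d -> %d (continues)\n", pending->name,
			       pending->state[OLD], pending->state[NEW]);
		pending = &items[i];
	}
	if (pending)		/* the last remembered change is the final one */
		printf("%s: %d -> %d (final)\n", pending->name,
		       pending->state[OLD], pending->state[NEW]);
}

int main(void)
{
	struct item_change items[] = {
		{ "resource",   { 1, 1 } },	/* unchanged: skipped entirely */
		{ "connection", { 0, 2 } },
		{ "device",     { 3, 4 } },
	};

	broadcast(items, 3);	/* connection ... (continues), device ... (final) */
	return 0;
}

The kernel version keeps the pending notification as a function pointer plus argument (last_func/last_arg) so that resource, connection, device and peer-device notifications can all share one deferred-flush chain.
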
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index 7f53c40823cd..bd989536f888 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -122,9 +122,9 @@ extern enum drbd_state_rv
122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, 122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
123 union drbd_state, enum chg_state_flags); 123 union drbd_state, enum chg_state_flags);
124 124
125extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, 125extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
126 enum chg_state_flags, 126 enum chg_state_flags,
127 struct completion *done); 127 struct completion *done);
128extern void print_st_err(struct drbd_device *, union drbd_state, 128extern void print_st_err(struct drbd_device *, union drbd_state,
129 union drbd_state, int); 129 union drbd_state, int);
130 130
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
new file mode 100644
index 000000000000..9e503a1a0bfb
--- /dev/null
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -0,0 +1,63 @@
1#ifndef DRBD_STATE_CHANGE_H
2#define DRBD_STATE_CHANGE_H
3
4struct drbd_resource_state_change {
5 struct drbd_resource *resource;
6 enum drbd_role role[2];
7 bool susp[2];
8 bool susp_nod[2];
9 bool susp_fen[2];
10};
11
12struct drbd_device_state_change {
13 struct drbd_device *device;
14 enum drbd_disk_state disk_state[2];
15};
16
17struct drbd_connection_state_change {
18 struct drbd_connection *connection;
19 enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */
20 enum drbd_role peer_role[2];
21};
22
23struct drbd_peer_device_state_change {
24 struct drbd_peer_device *peer_device;
25 enum drbd_disk_state disk_state[2];
26 enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */
27 bool resync_susp_user[2];
28 bool resync_susp_peer[2];
29 bool resync_susp_dependency[2];
30};
31
32struct drbd_state_change {
33 struct list_head list;
34 unsigned int n_devices;
35 unsigned int n_connections;
36 struct drbd_resource_state_change resource[1];
37 struct drbd_device_state_change *devices;
38 struct drbd_connection_state_change *connections;
39 struct drbd_peer_device_state_change *peer_devices;
40};
41
42extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
43extern void copy_old_to_new_state_change(struct drbd_state_change *);
44extern void forget_state_change(struct drbd_state_change *);
45
46extern void notify_resource_state_change(struct sk_buff *,
47 unsigned int,
48 struct drbd_resource_state_change *,
49 enum drbd_notification_type type);
50extern void notify_connection_state_change(struct sk_buff *,
51 unsigned int,
52 struct drbd_connection_state_change *,
53 enum drbd_notification_type type);
54extern void notify_device_state_change(struct sk_buff *,
55 unsigned int,
56 struct drbd_device_state_change *,
57 enum drbd_notification_type type);
58extern void notify_peer_device_state_change(struct sk_buff *,
59 unsigned int,
60 struct drbd_peer_device_state_change *,
61 enum drbd_notification_type type);
62
63#endif /* DRBD_STATE_CHANGE_H */
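
The new header keeps each tracked value as a two-element array indexed by OLD and NEW, and stores one drbd_peer_device_state_change per (device, connection) pair in a single flat allocation of n_devices * n_connections entries, which is how the loops in drbd_state.c iterate it. A small sketch of that layout, with invented field names and assuming a device-major ordering:

#include <stdio.h>

enum { OLD, NEW };

/* Invented stand-in for drbd_peer_device_state_change. */
struct peer_state {
	unsigned int device, connection;
	int repl_state[2];		/* two-slot OLD/NEW snapshot */
};

int main(void)
{
	const unsigned int n_devices = 2, n_connections = 3;
	struct peer_state peers[2 * 3];

	/* Fill the flat array device-major: all connections of device 0, then device 1. */
	for (unsigned int d = 0; d < n_devices; d++)
		for (unsigned int c = 0; c < n_connections; c++) {
			struct peer_state *p = &peers[d * n_connections + c];
			p->device = d;
			p->connection = c;
			p->repl_state[OLD] = 0;
			p->repl_state[NEW] = (d == 1 && c == 2);	/* one pair changes */
		}

	/* One loop over all pairs, the way broadcast_state_change() walks them. */
	for (unsigned int n = 0; n < n_devices * n_connections; n++)
		if (peers[n].repl_state[OLD] != peers[n].repl_state[NEW])
			printf("device %u / connection %u changed\n",
			       peers[n].device, peers[n].connection);
	return 0;
}
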
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5578c1477ba6..eff716c27b1f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int);
55 * 55 *
56 */ 56 */
57 57
58
59/* About the global_state_lock
 60 Each state transition on a device holds a read lock. In case we have
61 to evaluate the resync after dependencies, we grab a write lock, because
62 we need stable states on all devices for that. */
63rwlock_t global_state_lock;
64
65/* used for synchronous meta data and bitmap IO 58/* used for synchronous meta data and bitmap IO
66 * submitted by drbd_md_sync_page_io() 59 * submitted by drbd_md_sync_page_io()
67 */ 60 */
@@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
120 unsigned long flags = 0; 113 unsigned long flags = 0;
121 struct drbd_peer_device *peer_device = peer_req->peer_device; 114 struct drbd_peer_device *peer_device = peer_req->peer_device;
122 struct drbd_device *device = peer_device->device; 115 struct drbd_device *device = peer_device->device;
116 struct drbd_connection *connection = peer_device->connection;
123 struct drbd_interval i; 117 struct drbd_interval i;
124 int do_wake; 118 int do_wake;
125 u64 block_id; 119 u64 block_id;
@@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
152 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ 146 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
153 if (peer_req->flags & EE_WAS_ERROR) 147 if (peer_req->flags & EE_WAS_ERROR)
154 __drbd_chk_io_error(device, DRBD_WRITE_ERROR); 148 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
149
150 if (connection->cstate >= C_WF_REPORT_PARAMS) {
151 kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
152 if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
153 kref_put(&device->kref, drbd_destroy_device);
154 }
155 spin_unlock_irqrestore(&device->resource->req_lock, flags); 155 spin_unlock_irqrestore(&device->resource->req_lock, flags);
156 156
157 if (block_id == ID_SYNCER) 157 if (block_id == ID_SYNCER)
@@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
163 if (do_al_complete_io) 163 if (do_al_complete_io)
164 drbd_al_complete_io(device, &i); 164 drbd_al_complete_io(device, &i);
165 165
166 wake_asender(peer_device->connection);
167 put_ldev(device); 166 put_ldev(device);
168} 167}
169 168
@@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio)
195 } 194 }
196} 195}
197 196
197void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
198{
199 panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
200 device->minor, device->resource->name, device->vnr);
201}
202
198/* read, readA or write requests on R_PRIMARY coming from drbd_make_request 203/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
199 */ 204 */
200void drbd_request_endio(struct bio *bio) 205void drbd_request_endio(struct bio *bio)
@@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio)
238 drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); 243 drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
239 244
240 if (!bio->bi_error) 245 if (!bio->bi_error)
241 panic("possible random memory corruption caused by delayed completion of aborted local request\n"); 246 drbd_panic_after_delayed_completion_of_aborted_request(device);
242 } 247 }
243 248
244 /* to avoid recursion in __req_mod */ 249 /* to avoid recursion in __req_mod */
@@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
1291 p->barrier = connection->send.current_epoch_nr; 1296 p->barrier = connection->send.current_epoch_nr;
1292 p->pad = 0; 1297 p->pad = 0;
1293 connection->send.current_epoch_writes = 0; 1298 connection->send.current_epoch_writes = 0;
1299 connection->send.last_sent_barrier_jif = jiffies;
1294 1300
1295 return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); 1301 return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1296} 1302}
@@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
1315 connection->send.seen_any_write_yet = true; 1321 connection->send.seen_any_write_yet = true;
1316 connection->send.current_epoch_nr = epoch; 1322 connection->send.current_epoch_nr = epoch;
1317 connection->send.current_epoch_writes = 0; 1323 connection->send.current_epoch_writes = 0;
1324 connection->send.last_sent_barrier_jif = jiffies;
1318 } 1325 }
1319} 1326}
1320 1327
@@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device)
1456} 1463}
1457 1464
1458/** 1465/**
1459 * _drbd_pause_after() - Pause resync on all devices that may not resync now 1466 * drbd_pause_after() - Pause resync on all devices that may not resync now
1460 * @device: DRBD device. 1467 * @device: DRBD device.
1461 * 1468 *
1462 * Called from process context only (admin command and after_state_ch). 1469 * Called from process context only (admin command and after_state_ch).
1463 */ 1470 */
1464static int _drbd_pause_after(struct drbd_device *device) 1471static bool drbd_pause_after(struct drbd_device *device)
1465{ 1472{
1473 bool changed = false;
1466 struct drbd_device *odev; 1474 struct drbd_device *odev;
1467 int i, rv = 0; 1475 int i;
1468 1476
1469 rcu_read_lock(); 1477 rcu_read_lock();
1470 idr_for_each_entry(&drbd_devices, odev, i) { 1478 idr_for_each_entry(&drbd_devices, odev, i) {
1471 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1479 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1472 continue; 1480 continue;
1473 if (!_drbd_may_sync_now(odev)) 1481 if (!_drbd_may_sync_now(odev) &&
1474 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) 1482 _drbd_set_state(_NS(odev, aftr_isp, 1),
1475 != SS_NOTHING_TO_DO); 1483 CS_HARD, NULL) != SS_NOTHING_TO_DO)
1484 changed = true;
1476 } 1485 }
1477 rcu_read_unlock(); 1486 rcu_read_unlock();
1478 1487
1479 return rv; 1488 return changed;
1480} 1489}
1481 1490
1482/** 1491/**
1483 * _drbd_resume_next() - Resume resync on all devices that may resync now 1492 * drbd_resume_next() - Resume resync on all devices that may resync now
1484 * @device: DRBD device. 1493 * @device: DRBD device.
1485 * 1494 *
1486 * Called from process context only (admin command and worker). 1495 * Called from process context only (admin command and worker).
1487 */ 1496 */
1488static int _drbd_resume_next(struct drbd_device *device) 1497static bool drbd_resume_next(struct drbd_device *device)
1489{ 1498{
1499 bool changed = false;
1490 struct drbd_device *odev; 1500 struct drbd_device *odev;
1491 int i, rv = 0; 1501 int i;
1492 1502
1493 rcu_read_lock(); 1503 rcu_read_lock();
1494 idr_for_each_entry(&drbd_devices, odev, i) { 1504 idr_for_each_entry(&drbd_devices, odev, i) {
1495 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) 1505 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1496 continue; 1506 continue;
1497 if (odev->state.aftr_isp) { 1507 if (odev->state.aftr_isp) {
1498 if (_drbd_may_sync_now(odev)) 1508 if (_drbd_may_sync_now(odev) &&
1499 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), 1509 _drbd_set_state(_NS(odev, aftr_isp, 0),
1500 CS_HARD, NULL) 1510 CS_HARD, NULL) != SS_NOTHING_TO_DO)
1501 != SS_NOTHING_TO_DO) ; 1511 changed = true;
1502 } 1512 }
1503 } 1513 }
1504 rcu_read_unlock(); 1514 rcu_read_unlock();
1505 return rv; 1515 return changed;
1506} 1516}
1507 1517
1508void resume_next_sg(struct drbd_device *device) 1518void resume_next_sg(struct drbd_device *device)
1509{ 1519{
1510 write_lock_irq(&global_state_lock); 1520 lock_all_resources();
1511 _drbd_resume_next(device); 1521 drbd_resume_next(device);
1512 write_unlock_irq(&global_state_lock); 1522 unlock_all_resources();
1513} 1523}
1514 1524
1515void suspend_other_sg(struct drbd_device *device) 1525void suspend_other_sg(struct drbd_device *device)
1516{ 1526{
1517 write_lock_irq(&global_state_lock); 1527 lock_all_resources();
1518 _drbd_pause_after(device); 1528 drbd_pause_after(device);
1519 write_unlock_irq(&global_state_lock); 1529 unlock_all_resources();
1520} 1530}
1521 1531
1522/* caller must hold global_state_lock */ 1532/* caller must lock_all_resources() */
1523enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) 1533enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1524{ 1534{
1525 struct drbd_device *odev; 1535 struct drbd_device *odev;
@@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min
1557 } 1567 }
1558} 1568}
1559 1569
1560/* caller must hold global_state_lock */ 1570/* caller must lock_all_resources() */
1561void drbd_resync_after_changed(struct drbd_device *device) 1571void drbd_resync_after_changed(struct drbd_device *device)
1562{ 1572{
1563 int changes; 1573 int changed;
1564 1574
1565 do { 1575 do {
1566 changes = _drbd_pause_after(device); 1576 changed = drbd_pause_after(device);
1567 changes |= _drbd_resume_next(device); 1577 changed |= drbd_resume_next(device);
1568 } while (changes); 1578 } while (changed);
1569} 1579}
1570 1580
1571void drbd_rs_controller_reset(struct drbd_device *device) 1581void drbd_rs_controller_reset(struct drbd_device *device)
@@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1685 } else { 1695 } else {
1686 mutex_lock(device->state_mutex); 1696 mutex_lock(device->state_mutex);
1687 } 1697 }
1688 clear_bit(B_RS_H_DONE, &device->flags);
1689 1698
1690 /* req_lock: serialize with drbd_send_and_submit() and others 1699 lock_all_resources();
1691 * global_state_lock: for stable sync-after dependencies */ 1700 clear_bit(B_RS_H_DONE, &device->flags);
1692 spin_lock_irq(&device->resource->req_lock);
1693 write_lock(&global_state_lock);
1694 /* Did some connection breakage or IO error race with us? */ 1701 /* Did some connection breakage or IO error race with us? */
1695 if (device->state.conn < C_CONNECTED 1702 if (device->state.conn < C_CONNECTED
1696 || !get_ldev_if_state(device, D_NEGOTIATING)) { 1703 || !get_ldev_if_state(device, D_NEGOTIATING)) {
1697 write_unlock(&global_state_lock); 1704 unlock_all_resources();
1698 spin_unlock_irq(&device->resource->req_lock); 1705 goto out;
1699 mutex_unlock(device->state_mutex);
1700 return;
1701 } 1706 }
1702 1707
1703 ns = drbd_read_state(device); 1708 ns = drbd_read_state(device);
@@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1711 else /* side == C_SYNC_SOURCE */ 1716 else /* side == C_SYNC_SOURCE */
1712 ns.pdsk = D_INCONSISTENT; 1717 ns.pdsk = D_INCONSISTENT;
1713 1718
1714 r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); 1719 r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1715 ns = drbd_read_state(device); 1720 ns = drbd_read_state(device);
1716 1721
1717 if (ns.conn < C_CONNECTED) 1722 if (ns.conn < C_CONNECTED)
@@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1732 device->rs_mark_left[i] = tw; 1737 device->rs_mark_left[i] = tw;
1733 device->rs_mark_time[i] = now; 1738 device->rs_mark_time[i] = now;
1734 } 1739 }
1735 _drbd_pause_after(device); 1740 drbd_pause_after(device);
1736 /* Forget potentially stale cached per resync extent bit-counts. 1741 /* Forget potentially stale cached per resync extent bit-counts.
1737 * Open coded drbd_rs_cancel_all(device), we already have IRQs 1742 * Open coded drbd_rs_cancel_all(device), we already have IRQs
1738 * disabled, and know the disk state is ok. */ 1743 * disabled, and know the disk state is ok. */
@@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1742 device->resync_wenr = LC_FREE; 1747 device->resync_wenr = LC_FREE;
1743 spin_unlock(&device->al_lock); 1748 spin_unlock(&device->al_lock);
1744 } 1749 }
1745 write_unlock(&global_state_lock); 1750 unlock_all_resources();
1746 spin_unlock_irq(&device->resource->req_lock);
1747 1751
1748 if (r == SS_SUCCESS) { 1752 if (r == SS_SUCCESS) {
1749 wake_up(&device->al_wait); /* for lc_reset() above */ 1753 wake_up(&device->al_wait); /* for lc_reset() above */
@@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1807 drbd_md_sync(device); 1811 drbd_md_sync(device);
1808 } 1812 }
1809 put_ldev(device); 1813 put_ldev(device);
1814out:
1810 mutex_unlock(device->state_mutex); 1815 mutex_unlock(device->state_mutex);
1811} 1816}
1812 1817
@@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device)
1836 device->act_log = NULL; 1841 device->act_log = NULL;
1837 1842
1838 __acquire(local); 1843 __acquire(local);
1839 drbd_free_ldev(device->ldev); 1844 drbd_backing_dev_free(device, device->ldev);
1840 device->ldev = NULL; 1845 device->ldev = NULL;
1841 __release(local); 1846 __release(local);
1842 1847
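
The reworked drbd_resync_after_changed() above re-runs drbd_pause_after() and drbd_resume_next() until a full pass reports no change, i.e. it iterates the resync-after dependency adjustments to a fixed point. A toy loop of the same shape, with made-up adjustment rules standing in for the two passes:

#include <stdbool.h>
#include <stdio.h>

static int value = 7;

/* Invented rules standing in for the pause/resume passes. */
static bool round_down_to_even(void)    { if (value & 1) { value--; return true; } return false; }
static bool clamp_to_at_most_four(void) { if (value > 4) { value = 4; return true; } return false; }

int main(void)
{
	bool changed;

	do {			/* re-run both passes until a full pass changes nothing */
		changed  = round_down_to_even();
		changed |= clamp_to_at_most_four();
	} while (changed);

	printf("fixed point: %d\n", value);	/* 7 -> 6 -> 4 */
	return 0;
}
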
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 15bec407ac37..9b180dbbd03c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -104,9 +104,9 @@
104/* Device instance number, incremented each time a device is probed. */ 104/* Device instance number, incremented each time a device is probed. */
105static int instance; 105static int instance;
106 106
107struct list_head online_list; 107static struct list_head online_list;
108struct list_head removing_list; 108static struct list_head removing_list;
109spinlock_t dev_lock; 109static spinlock_t dev_lock;
110 110
111/* 111/*
112 * Global variable used to hold the major block device number 112 * Global variable used to hold the major block device number
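
The mtip32xx hunk only narrows linkage: list heads and a lock used by a single file become static, so the symbols get internal linkage and cannot collide with identically named globals elsewhere in the kernel. The same distinction in a standalone sketch (names are illustrative):

#include <stdio.h>

/* External linkage: visible to every translation unit - avoid for private state. */
int shared_counter;

/* Internal linkage: only this file can see it, so the name cannot clash. */
static int private_counter;

int main(void)
{
	shared_counter++;
	private_counter++;
	printf("%d %d\n", shared_counter, private_counter);
	return 0;
}
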
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 95dff91135ad..6f9587156569 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -495,17 +495,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
495 id->ppaf.ch_offset = 56; 495 id->ppaf.ch_offset = 56;
496 id->ppaf.ch_len = 8; 496 id->ppaf.ch_len = 8;
497 497
498 do_div(size, bs); /* convert size to pages */ 498 sector_div(size, bs); /* convert size to pages */
499 do_div(size, 256); /* convert size to pgs per blk */ 499 size >>= 8; /* convert size to pgs per blk */
500 grp = &id->groups[0]; 500 grp = &id->groups[0];
501 grp->mtype = 0; 501 grp->mtype = 0;
502 grp->fmtype = 0; 502 grp->fmtype = 0;
503 grp->num_ch = 1; 503 grp->num_ch = 1;
504 grp->num_pg = 256; 504 grp->num_pg = 256;
505 blksize = size; 505 blksize = size;
506 do_div(size, (1 << 16)); 506 size >>= 16;
507 grp->num_lun = size + 1; 507 grp->num_lun = size + 1;
508 do_div(blksize, grp->num_lun); 508 sector_div(blksize, grp->num_lun);
509 grp->num_blk = blksize; 509 grp->num_blk = blksize;
510 grp->num_pln = 1; 510 grp->num_pln = 1;
511 511
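
In the null_blk hunk, the two divisions by powers of two (256 and 1 << 16) become right shifts, which give the same result for unsigned values, and the remaining divisions switch to sector_div(), the helper intended for sector_t operands. The shift equivalence is plain arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 123456789ULL;

	/* For unsigned integers, dividing by 2^k equals shifting right by k. */
	uint64_t by_div   = size / 256;	/* 2^8 */
	uint64_t by_shift = size >> 8;

	printf("%llu %llu %s\n",
	       (unsigned long long)by_div,
	       (unsigned long long)by_shift,
	       by_div == by_shift ? "equal" : "differ");
	return 0;
}
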
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 59c91d49b14b..ba4bfe933276 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -23,7 +23,7 @@
23#include <linux/workqueue.h> 23#include <linux/workqueue.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/time.h> 26#include <linux/ktime.h>
27#include <linux/hdreg.h> 27#include <linux/hdreg.h>
28#include <linux/dma-mapping.h> 28#include <linux/dma-mapping.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
671static unsigned int carm_fill_sync_time(struct carm_host *host, 671static unsigned int carm_fill_sync_time(struct carm_host *host,
672 unsigned int idx, void *mem) 672 unsigned int idx, void *mem)
673{ 673{
674 struct timeval tv;
675 struct carm_msg_sync_time *st = mem; 674 struct carm_msg_sync_time *st = mem;
676 675
677 do_gettimeofday(&tv); 676 time64_t tv = ktime_get_real_seconds();
678 677
679 memset(st, 0, sizeof(*st)); 678 memset(st, 0, sizeof(*st));
680 st->type = CARM_MSG_MISC; 679 st->type = CARM_MSG_MISC;
681 st->subtype = MISC_SET_TIME; 680 st->subtype = MISC_SET_TIME;
682 st->handle = cpu_to_le32(TAG_ENCODE(idx)); 681 st->handle = cpu_to_le32(TAG_ENCODE(idx));
683 st->timestamp = cpu_to_le32(tv.tv_sec); 682 st->timestamp = cpu_to_le32(tv);
684 683
685 return sizeof(struct carm_msg_sync_time); 684 return sizeof(struct carm_msg_sync_time);
686} 685}
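
The sx8 change replaces do_gettimeofday(), whose tv_sec field is only 32 bits wide on 32-bit systems and overflows in January 2038, with ktime_get_real_seconds(), which returns a 64-bit time64_t; the on-the-wire field stays 32 bits at the cpu_to_le32() conversion, but the in-kernel plumbing no longer truncates. The overflow date itself follows from simple arithmetic:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	/* Largest value a signed 32-bit seconds-since-1970 counter can hold. */
	time_t last = (time_t)INT32_MAX;	/* 2147483647 */
	struct tm tm;

	gmtime_r(&last, &tm);
	printf("32-bit time_t overflows after %04d-%02d-%02d %02d:%02d:%02d UTC\n",
	       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
	       tm.tm_hour, tm.tm_min, tm.tm_sec);	/* 2038-01-19 03:14:07 */
	return 0;
}
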
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 41fb1a917b17..4809c1501d7e 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants,
84 "Maximum number of grants to map persistently"); 84 "Maximum number of grants to map persistently");
85 85
86/* 86/*
87 * Maximum number of rings/queues blkback supports, allow as many queues as there
88 * are CPUs if user has not specified a value.
89 */
90unsigned int xenblk_max_queues;
91module_param_named(max_queues, xenblk_max_queues, uint, 0644);
92MODULE_PARM_DESC(max_queues,
93 "Maximum number of hardware queues per virtual disk." \
94 "By default it is the number of online CPUs.");
95
96/*
87 * Maximum order of pages to be used for the shared ring between front and 97 * Maximum order of pages to be used for the shared ring between front and
88 * backend, 4KB page granularity is used. 98 * backend, 4KB page granularity is used.
89 */ 99 */
@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
113/* Number of free pages to remove on each call to gnttab_free_pages */ 123/* Number of free pages to remove on each call to gnttab_free_pages */
114#define NUM_BATCH_FREE_PAGES 10 124#define NUM_BATCH_FREE_PAGES 10
115 125
116static inline int get_free_page(struct xen_blkif *blkif, struct page **page) 126static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
117{ 127{
118 unsigned long flags; 128 unsigned long flags;
119 129
120 spin_lock_irqsave(&blkif->free_pages_lock, flags); 130 spin_lock_irqsave(&ring->free_pages_lock, flags);
121 if (list_empty(&blkif->free_pages)) { 131 if (list_empty(&ring->free_pages)) {
122 BUG_ON(blkif->free_pages_num != 0); 132 BUG_ON(ring->free_pages_num != 0);
123 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 133 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
124 return gnttab_alloc_pages(1, page); 134 return gnttab_alloc_pages(1, page);
125 } 135 }
126 BUG_ON(blkif->free_pages_num == 0); 136 BUG_ON(ring->free_pages_num == 0);
127 page[0] = list_first_entry(&blkif->free_pages, struct page, lru); 137 page[0] = list_first_entry(&ring->free_pages, struct page, lru);
128 list_del(&page[0]->lru); 138 list_del(&page[0]->lru);
129 blkif->free_pages_num--; 139 ring->free_pages_num--;
130 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 140 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
131 141
132 return 0; 142 return 0;
133} 143}
134 144
135static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, 145static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
136 int num) 146 int num)
137{ 147{
138 unsigned long flags; 148 unsigned long flags;
139 int i; 149 int i;
140 150
141 spin_lock_irqsave(&blkif->free_pages_lock, flags); 151 spin_lock_irqsave(&ring->free_pages_lock, flags);
142 for (i = 0; i < num; i++) 152 for (i = 0; i < num; i++)
143 list_add(&page[i]->lru, &blkif->free_pages); 153 list_add(&page[i]->lru, &ring->free_pages);
144 blkif->free_pages_num += num; 154 ring->free_pages_num += num;
145 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 155 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
146} 156}
147 157
148static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) 158static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
149{ 159{
150 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ 160 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 struct page *page[NUM_BATCH_FREE_PAGES]; 161 struct page *page[NUM_BATCH_FREE_PAGES];
152 unsigned int num_pages = 0; 162 unsigned int num_pages = 0;
153 unsigned long flags; 163 unsigned long flags;
154 164
155 spin_lock_irqsave(&blkif->free_pages_lock, flags); 165 spin_lock_irqsave(&ring->free_pages_lock, flags);
156 while (blkif->free_pages_num > num) { 166 while (ring->free_pages_num > num) {
157 BUG_ON(list_empty(&blkif->free_pages)); 167 BUG_ON(list_empty(&ring->free_pages));
158 page[num_pages] = list_first_entry(&blkif->free_pages, 168 page[num_pages] = list_first_entry(&ring->free_pages,
159 struct page, lru); 169 struct page, lru);
160 list_del(&page[num_pages]->lru); 170 list_del(&page[num_pages]->lru);
161 blkif->free_pages_num--; 171 ring->free_pages_num--;
162 if (++num_pages == NUM_BATCH_FREE_PAGES) { 172 if (++num_pages == NUM_BATCH_FREE_PAGES) {
163 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 173 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
164 gnttab_free_pages(num_pages, page); 174 gnttab_free_pages(num_pages, page);
165 spin_lock_irqsave(&blkif->free_pages_lock, flags); 175 spin_lock_irqsave(&ring->free_pages_lock, flags);
166 num_pages = 0; 176 num_pages = 0;
167 } 177 }
168 } 178 }
169 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 179 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
170 if (num_pages != 0) 180 if (num_pages != 0)
171 gnttab_free_pages(num_pages, page); 181 gnttab_free_pages(num_pages, page);
172} 182}
173 183
174#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) 184#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
175 185
176static int do_block_io_op(struct xen_blkif *blkif); 186static int do_block_io_op(struct xen_blkif_ring *ring);
177static int dispatch_rw_block_io(struct xen_blkif *blkif, 187static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
178 struct blkif_request *req, 188 struct blkif_request *req,
179 struct pending_req *pending_req); 189 struct pending_req *pending_req);
180static void make_response(struct xen_blkif *blkif, u64 id, 190static void make_response(struct xen_blkif_ring *ring, u64 id,
181 unsigned short op, int st); 191 unsigned short op, int st);
182 192
183#define foreach_grant_safe(pos, n, rbtree, node) \ 193#define foreach_grant_safe(pos, n, rbtree, node) \
@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
190 200
191/* 201/*
192 * We don't need locking around the persistent grant helpers 202 * We don't need locking around the persistent grant helpers
193 * because blkback uses a single-thread for each backed, so we 203 * because blkback uses a single-thread for each backend, so we
194 * can be sure that this functions will never be called recursively. 204 * can be sure that this functions will never be called recursively.
195 * 205 *
196 * The only exception to that is put_persistent_grant, that can be called 206 * The only exception to that is put_persistent_grant, that can be called
@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
198 * bit operations to modify the flags of a persistent grant and to count 208 * bit operations to modify the flags of a persistent grant and to count
199 * the number of used grants. 209 * the number of used grants.
200 */ 210 */
201static int add_persistent_gnt(struct xen_blkif *blkif, 211static int add_persistent_gnt(struct xen_blkif_ring *ring,
202 struct persistent_gnt *persistent_gnt) 212 struct persistent_gnt *persistent_gnt)
203{ 213{
204 struct rb_node **new = NULL, *parent = NULL; 214 struct rb_node **new = NULL, *parent = NULL;
205 struct persistent_gnt *this; 215 struct persistent_gnt *this;
216 struct xen_blkif *blkif = ring->blkif;
206 217
207 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { 218 if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
208 if (!blkif->vbd.overflow_max_grants) 219 if (!blkif->vbd.overflow_max_grants)
209 blkif->vbd.overflow_max_grants = 1; 220 blkif->vbd.overflow_max_grants = 1;
210 return -EBUSY; 221 return -EBUSY;
211 } 222 }
212 /* Figure out where to put new node */ 223 /* Figure out where to put new node */
213 new = &blkif->persistent_gnts.rb_node; 224 new = &ring->persistent_gnts.rb_node;
214 while (*new) { 225 while (*new) {
215 this = container_of(*new, struct persistent_gnt, node); 226 this = container_of(*new, struct persistent_gnt, node);
216 227
@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
229 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 240 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
230 /* Add new node and rebalance tree. */ 241 /* Add new node and rebalance tree. */
231 rb_link_node(&(persistent_gnt->node), parent, new); 242 rb_link_node(&(persistent_gnt->node), parent, new);
232 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); 243 rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
233 blkif->persistent_gnt_c++; 244 ring->persistent_gnt_c++;
234 atomic_inc(&blkif->persistent_gnt_in_use); 245 atomic_inc(&ring->persistent_gnt_in_use);
235 return 0; 246 return 0;
236} 247}
237 248
238static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, 249static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
239 grant_ref_t gref) 250 grant_ref_t gref)
240{ 251{
241 struct persistent_gnt *data; 252 struct persistent_gnt *data;
242 struct rb_node *node = NULL; 253 struct rb_node *node = NULL;
243 254
244 node = blkif->persistent_gnts.rb_node; 255 node = ring->persistent_gnts.rb_node;
245 while (node) { 256 while (node) {
246 data = container_of(node, struct persistent_gnt, node); 257 data = container_of(node, struct persistent_gnt, node);
247 258
@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
255 return NULL; 266 return NULL;
256 } 267 }
257 set_bit(PERSISTENT_GNT_ACTIVE, data->flags); 268 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
258 atomic_inc(&blkif->persistent_gnt_in_use); 269 atomic_inc(&ring->persistent_gnt_in_use);
259 return data; 270 return data;
260 } 271 }
261 } 272 }
262 return NULL; 273 return NULL;
263} 274}
264 275
265static void put_persistent_gnt(struct xen_blkif *blkif, 276static void put_persistent_gnt(struct xen_blkif_ring *ring,
266 struct persistent_gnt *persistent_gnt) 277 struct persistent_gnt *persistent_gnt)
267{ 278{
268 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) 279 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
269 pr_alert_ratelimited("freeing a grant already unused\n"); 280 pr_alert_ratelimited("freeing a grant already unused\n");
270 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); 281 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
271 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 282 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
272 atomic_dec(&blkif->persistent_gnt_in_use); 283 atomic_dec(&ring->persistent_gnt_in_use);
273} 284}
274 285
275static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, 286static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
276 unsigned int num) 287 unsigned int num)
277{ 288{
278 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 289 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
303 unmap_data.count = segs_to_unmap; 314 unmap_data.count = segs_to_unmap;
304 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 315 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
305 316
306 put_free_pages(blkif, pages, segs_to_unmap); 317 put_free_pages(ring, pages, segs_to_unmap);
307 segs_to_unmap = 0; 318 segs_to_unmap = 0;
308 } 319 }
309 320
@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
320 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 331 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321 struct persistent_gnt *persistent_gnt; 332 struct persistent_gnt *persistent_gnt;
322 int segs_to_unmap = 0; 333 int segs_to_unmap = 0;
323 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); 334 struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
324 struct gntab_unmap_queue_data unmap_data; 335 struct gntab_unmap_queue_data unmap_data;
325 336
326 unmap_data.pages = pages; 337 unmap_data.pages = pages;
327 unmap_data.unmap_ops = unmap; 338 unmap_data.unmap_ops = unmap;
328 unmap_data.kunmap_ops = NULL; 339 unmap_data.kunmap_ops = NULL;
329 340
330 while(!list_empty(&blkif->persistent_purge_list)) { 341 while(!list_empty(&ring->persistent_purge_list)) {
331 persistent_gnt = list_first_entry(&blkif->persistent_purge_list, 342 persistent_gnt = list_first_entry(&ring->persistent_purge_list,
332 struct persistent_gnt, 343 struct persistent_gnt,
333 remove_node); 344 remove_node);
334 list_del(&persistent_gnt->remove_node); 345 list_del(&persistent_gnt->remove_node);
@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
343 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { 354 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
344 unmap_data.count = segs_to_unmap; 355 unmap_data.count = segs_to_unmap;
345 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 356 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
346 put_free_pages(blkif, pages, segs_to_unmap); 357 put_free_pages(ring, pages, segs_to_unmap);
347 segs_to_unmap = 0; 358 segs_to_unmap = 0;
348 } 359 }
349 kfree(persistent_gnt); 360 kfree(persistent_gnt);
@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
351 if (segs_to_unmap > 0) { 362 if (segs_to_unmap > 0) {
352 unmap_data.count = segs_to_unmap; 363 unmap_data.count = segs_to_unmap;
353 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 364 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
354 put_free_pages(blkif, pages, segs_to_unmap); 365 put_free_pages(ring, pages, segs_to_unmap);
355 } 366 }
356} 367}
357 368
358static void purge_persistent_gnt(struct xen_blkif *blkif) 369static void purge_persistent_gnt(struct xen_blkif_ring *ring)
359{ 370{
360 struct persistent_gnt *persistent_gnt; 371 struct persistent_gnt *persistent_gnt;
361 struct rb_node *n; 372 struct rb_node *n;
@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
363 bool scan_used = false, clean_used = false; 374 bool scan_used = false, clean_used = false;
364 struct rb_root *root; 375 struct rb_root *root;
365 376
366 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || 377 if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
367 (blkif->persistent_gnt_c == xen_blkif_max_pgrants && 378 (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
368 !blkif->vbd.overflow_max_grants)) { 379 !ring->blkif->vbd.overflow_max_grants)) {
369 return; 380 goto out;
370 } 381 }
371 382
372 if (work_busy(&blkif->persistent_purge_work)) { 383 if (work_busy(&ring->persistent_purge_work)) {
373 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); 384 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
374 return; 385 goto out;
375 } 386 }
376 387
377 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; 388 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
378 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; 389 num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
379 num_clean = min(blkif->persistent_gnt_c, num_clean); 390 num_clean = min(ring->persistent_gnt_c, num_clean);
380 if ((num_clean == 0) || 391 if ((num_clean == 0) ||
381 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) 392 (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
382 return; 393 goto out;
383 394
384 /* 395 /*
385 * At this point, we can assure that there will be no calls 396 * At this point, we can assure that there will be no calls
@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
394 405
395 pr_debug("Going to purge %u persistent grants\n", num_clean); 406 pr_debug("Going to purge %u persistent grants\n", num_clean);
396 407
397 BUG_ON(!list_empty(&blkif->persistent_purge_list)); 408 BUG_ON(!list_empty(&ring->persistent_purge_list));
398 root = &blkif->persistent_gnts; 409 root = &ring->persistent_gnts;
399purge_list: 410purge_list:
400 foreach_grant_safe(persistent_gnt, n, root, node) { 411 foreach_grant_safe(persistent_gnt, n, root, node) {
401 BUG_ON(persistent_gnt->handle == 412 BUG_ON(persistent_gnt->handle ==
@@ -414,7 +425,7 @@ purge_list:
414 425
415 rb_erase(&persistent_gnt->node, root); 426 rb_erase(&persistent_gnt->node, root);
416 list_add(&persistent_gnt->remove_node, 427 list_add(&persistent_gnt->remove_node,
417 &blkif->persistent_purge_list); 428 &ring->persistent_purge_list);
418 if (--num_clean == 0) 429 if (--num_clean == 0)
419 goto finished; 430 goto finished;
420 } 431 }
@@ -435,30 +446,32 @@ finished:
435 goto purge_list; 446 goto purge_list;
436 } 447 }
437 448
438 blkif->persistent_gnt_c -= (total - num_clean); 449 ring->persistent_gnt_c -= (total - num_clean);
439 blkif->vbd.overflow_max_grants = 0; 450 ring->blkif->vbd.overflow_max_grants = 0;
440 451
441 /* We can defer this work */ 452 /* We can defer this work */
442 schedule_work(&blkif->persistent_purge_work); 453 schedule_work(&ring->persistent_purge_work);
443 pr_debug("Purged %u/%u\n", (total - num_clean), total); 454 pr_debug("Purged %u/%u\n", (total - num_clean), total);
455
456out:
444 return; 457 return;
445} 458}
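
For reference, the purge sizing above works out to "clean a fixed percentage of the cap, plus however far the ring is over it, but never more than is tracked or than is currently idle". Below is a minimal userspace sketch of that arithmetic only; the 5% figure and the numbers in main() are assumptions for the example, standing in for LRU_PERCENT_CLEAN and the xen_blkif_max_pgrants module parameter.

#include <stdio.h>

#define LRU_PERCENT_CLEAN 5              /* assumed percentage for the sketch */

static unsigned int purge_count(unsigned int max_pgrants,
                                unsigned int gnt_c,
                                unsigned int gnt_in_use)
{
    unsigned int num_clean;

    if (gnt_c < max_pgrants)             /* under the cap: nothing to do */
        return 0;
    /* Base amount: a fixed percentage of the configured maximum ... */
    num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
    /* ... plus however far this ring is over the cap. */
    num_clean = gnt_c - max_pgrants + num_clean;
    /* Never purge more grants than the ring is tracking. */
    if (num_clean > gnt_c)
        num_clean = gnt_c;
    /* Grants still in use cannot be purged; give up if too many are. */
    if (num_clean == 0 || num_clean > gnt_c - gnt_in_use)
        return 0;
    return num_clean;
}

int main(void)
{
    /* e.g. a cap of 1056 grants, 1100 currently tracked, 20 in flight */
    printf("would purge %u grants\n", purge_count(1056, 1100, 20));
    return 0;
}
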
446 459
447/* 460/*
448 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 461 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
449 */ 462 */
450static struct pending_req *alloc_req(struct xen_blkif *blkif) 463static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
451{ 464{
452 struct pending_req *req = NULL; 465 struct pending_req *req = NULL;
453 unsigned long flags; 466 unsigned long flags;
454 467
455 spin_lock_irqsave(&blkif->pending_free_lock, flags); 468 spin_lock_irqsave(&ring->pending_free_lock, flags);
456 if (!list_empty(&blkif->pending_free)) { 469 if (!list_empty(&ring->pending_free)) {
457 req = list_entry(blkif->pending_free.next, struct pending_req, 470 req = list_entry(ring->pending_free.next, struct pending_req,
458 free_list); 471 free_list);
459 list_del(&req->free_list); 472 list_del(&req->free_list);
460 } 473 }
461 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 474 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
462 return req; 475 return req;
463} 476}
464 477
@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
466 * Return the 'pending_req' structure back to the freepool. We also 479 * Return the 'pending_req' structure back to the freepool. We also
467 * wake up the thread if it was waiting for a free page. 480 * wake up the thread if it was waiting for a free page.
468 */ 481 */
469static void free_req(struct xen_blkif *blkif, struct pending_req *req) 482static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
470{ 483{
471 unsigned long flags; 484 unsigned long flags;
472 int was_empty; 485 int was_empty;
473 486
474 spin_lock_irqsave(&blkif->pending_free_lock, flags); 487 spin_lock_irqsave(&ring->pending_free_lock, flags);
475 was_empty = list_empty(&blkif->pending_free); 488 was_empty = list_empty(&ring->pending_free);
476 list_add(&req->free_list, &blkif->pending_free); 489 list_add(&req->free_list, &ring->pending_free);
477 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 490 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
478 if (was_empty) 491 if (was_empty)
479 wake_up(&blkif->pending_free_wq); 492 wake_up(&ring->pending_free_wq);
480} 493}
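
alloc_req() and free_req() above are a lock-protected free pool with a wakeup on the empty-to-non-empty transition, now kept per ring instead of per device. A hedged userspace sketch of the same shape follows, with pthread primitives standing in for the kernel spinlock and wait queue; all names are illustrative.

#include <pthread.h>
#include <stdio.h>

struct pending_req {
    struct pending_req *next;
    /* per-request payload would live here */
};

struct ring_pool {
    pthread_mutex_t lock;
    pthread_cond_t nonempty;
    struct pending_req *free_list;
};

static struct pending_req *pool_alloc(struct ring_pool *p)
{
    struct pending_req *req;

    pthread_mutex_lock(&p->lock);
    req = p->free_list;                  /* NULL when the pool is exhausted */
    if (req)
        p->free_list = req->next;
    pthread_mutex_unlock(&p->lock);
    return req;
}

static void pool_free(struct ring_pool *p, struct pending_req *req)
{
    int was_empty;

    pthread_mutex_lock(&p->lock);
    was_empty = (p->free_list == NULL);
    req->next = p->free_list;
    p->free_list = req;
    pthread_mutex_unlock(&p->lock);
    if (was_empty)                       /* wake a waiter, as free_req() does */
        pthread_cond_signal(&p->nonempty);
}

int main(void)
{
    struct ring_pool pool = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, NULL
    };
    struct pending_req reqs[4];
    struct pending_req *r;

    for (int i = 0; i < 4; i++)          /* preallocate, as the driver does */
        pool_free(&pool, &reqs[i]);
    r = pool_alloc(&pool);
    printf("allocated from pool: %s\n", r ? "yes" : "no");
    pool_free(&pool, r);
    return 0;
}
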
481 494
482/* 495/*
@@ -556,10 +569,10 @@ abort:
556/* 569/*
557 * Notification from the guest OS. 570 * Notification from the guest OS.
558 */ 571 */
559static void blkif_notify_work(struct xen_blkif *blkif) 572static void blkif_notify_work(struct xen_blkif_ring *ring)
560{ 573{
561 blkif->waiting_reqs = 1; 574 ring->waiting_reqs = 1;
562 wake_up(&blkif->wq); 575 wake_up(&ring->wq);
563} 576}
564 577
565irqreturn_t xen_blkif_be_int(int irq, void *dev_id) 578irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
572 * SCHEDULER FUNCTIONS 585 * SCHEDULER FUNCTIONS
573 */ 586 */
574 587
575static void print_stats(struct xen_blkif *blkif) 588static void print_stats(struct xen_blkif_ring *ring)
576{ 589{
577 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 590 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
578 " | ds %4llu | pg: %4u/%4d\n", 591 " | ds %4llu | pg: %4u/%4d\n",
579 current->comm, blkif->st_oo_req, 592 current->comm, ring->st_oo_req,
580 blkif->st_rd_req, blkif->st_wr_req, 593 ring->st_rd_req, ring->st_wr_req,
581 blkif->st_f_req, blkif->st_ds_req, 594 ring->st_f_req, ring->st_ds_req,
582 blkif->persistent_gnt_c, 595 ring->persistent_gnt_c,
583 xen_blkif_max_pgrants); 596 xen_blkif_max_pgrants);
584 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 597 ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
585 blkif->st_rd_req = 0; 598 ring->st_rd_req = 0;
586 blkif->st_wr_req = 0; 599 ring->st_wr_req = 0;
587 blkif->st_oo_req = 0; 600 ring->st_oo_req = 0;
588 blkif->st_ds_req = 0; 601 ring->st_ds_req = 0;
589} 602}
590 603
591int xen_blkif_schedule(void *arg) 604int xen_blkif_schedule(void *arg)
592{ 605{
593 struct xen_blkif *blkif = arg; 606 struct xen_blkif_ring *ring = arg;
607 struct xen_blkif *blkif = ring->blkif;
594 struct xen_vbd *vbd = &blkif->vbd; 608 struct xen_vbd *vbd = &blkif->vbd;
595 unsigned long timeout; 609 unsigned long timeout;
596 int ret; 610 int ret;
597 611
598 xen_blkif_get(blkif); 612 xen_blkif_get(blkif);
599 613
614 set_freezable();
600 while (!kthread_should_stop()) { 615 while (!kthread_should_stop()) {
601 if (try_to_freeze()) 616 if (try_to_freeze())
602 continue; 617 continue;
@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
606 timeout = msecs_to_jiffies(LRU_INTERVAL); 621 timeout = msecs_to_jiffies(LRU_INTERVAL);
607 622
608 timeout = wait_event_interruptible_timeout( 623 timeout = wait_event_interruptible_timeout(
609 blkif->wq, 624 ring->wq,
610 blkif->waiting_reqs || kthread_should_stop(), 625 ring->waiting_reqs || kthread_should_stop(),
611 timeout); 626 timeout);
612 if (timeout == 0) 627 if (timeout == 0)
613 goto purge_gnt_list; 628 goto purge_gnt_list;
614 timeout = wait_event_interruptible_timeout( 629 timeout = wait_event_interruptible_timeout(
615 blkif->pending_free_wq, 630 ring->pending_free_wq,
616 !list_empty(&blkif->pending_free) || 631 !list_empty(&ring->pending_free) ||
617 kthread_should_stop(), 632 kthread_should_stop(),
618 timeout); 633 timeout);
619 if (timeout == 0) 634 if (timeout == 0)
620 goto purge_gnt_list; 635 goto purge_gnt_list;
621 636
622 blkif->waiting_reqs = 0; 637 ring->waiting_reqs = 0;
623 smp_mb(); /* clear flag *before* checking for work */ 638 smp_mb(); /* clear flag *before* checking for work */
624 639
625 ret = do_block_io_op(blkif); 640 ret = do_block_io_op(ring);
626 if (ret > 0) 641 if (ret > 0)
627 blkif->waiting_reqs = 1; 642 ring->waiting_reqs = 1;
628 if (ret == -EACCES) 643 if (ret == -EACCES)
629 wait_event_interruptible(blkif->shutdown_wq, 644 wait_event_interruptible(ring->shutdown_wq,
630 kthread_should_stop()); 645 kthread_should_stop());
631 646
632purge_gnt_list: 647purge_gnt_list:
633 if (blkif->vbd.feature_gnt_persistent && 648 if (blkif->vbd.feature_gnt_persistent &&
634 time_after(jiffies, blkif->next_lru)) { 649 time_after(jiffies, ring->next_lru)) {
635 purge_persistent_gnt(blkif); 650 purge_persistent_gnt(ring);
636 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); 651 ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
637 } 652 }
638 653
639 /* Shrink if we have more than xen_blkif_max_buffer_pages */ 654 /* Shrink if we have more than xen_blkif_max_buffer_pages */
640 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); 655 shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
641 656
642 if (log_stats && time_after(jiffies, blkif->st_print)) 657 if (log_stats && time_after(jiffies, ring->st_print))
643 print_stats(blkif); 658 print_stats(ring);
644 } 659 }
645 660
646 /* Drain pending purge work */ 661 /* Drain pending purge work */
647 flush_work(&blkif->persistent_purge_work); 662 flush_work(&ring->persistent_purge_work);
648 663
649 if (log_stats) 664 if (log_stats)
650 print_stats(blkif); 665 print_stats(ring);
651 666
652 blkif->xenblkd = NULL; 667 ring->xenblkd = NULL;
653 xen_blkif_put(blkif); 668 xen_blkif_put(blkif);
654 669
655 return 0; 670 return 0;
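
The per-ring kthread above follows a simple shape: sleep until the interrupt path flags work or shutdown is requested, clear the flag before servicing the ring, re-arm it if requests remain, and fall out periodically for LRU purging, page-pool shrinking and statistics. The stripped-down userspace model below shows only that loop shape; pthread primitives stand in for the kernel waitqueue, the periodic work is reduced to a comment, and all names and numbers are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct ring {
    pthread_mutex_t lock;
    pthread_cond_t wq;
    bool waiting_reqs;
    bool stop;
    int queued;                          /* pretend ring entries */
};

static int do_block_io(struct ring *r)   /* returns >0 if more work remains */
{
    if (r->queued > 0)
        r->queued--;
    return r->queued;
}

static void *ring_worker(void *arg)
{
    struct ring *r = arg;

    pthread_mutex_lock(&r->lock);
    while (!r->stop) {
        while (!r->waiting_reqs && !r->stop)
            pthread_cond_wait(&r->wq, &r->lock);
        if (r->stop)
            break;
        r->waiting_reqs = false;         /* clear the flag *before* the work */
        if (do_block_io(r) > 0)          /* (the real worker services the ring
                                          * without holding any lock) */
            r->waiting_reqs = true;      /* more requests: go round again */
        /* a timeout here would drive the periodic LRU purge, page-pool
         * shrink and statistics printing */
    }
    pthread_mutex_unlock(&r->lock);
    return NULL;
}

int main(void)
{
    struct ring r = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
        false, false, 3
    };
    pthread_t tid;

    pthread_create(&tid, NULL, ring_worker, &r);

    pthread_mutex_lock(&r.lock);
    r.waiting_reqs = true;               /* what blkif_notify_work() does */
    pthread_cond_signal(&r.wq);
    pthread_mutex_unlock(&r.lock);

    pthread_mutex_lock(&r.lock);
    r.stop = true;                       /* kthread_should_stop() equivalent */
    pthread_cond_signal(&r.wq);
    pthread_mutex_unlock(&r.lock);

    pthread_join(tid, NULL);
    printf("requests left unserviced: %d\n", r.queued);
    return 0;
}
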
@@ -658,22 +673,22 @@ purge_gnt_list:
658/* 673/*
659 * Remove persistent grants and empty the pool of free pages 674 * Remove persistent grants and empty the pool of free pages
660 */ 675 */
661void xen_blkbk_free_caches(struct xen_blkif *blkif) 676void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
662{ 677{
663 /* Free all persistent grant pages */ 678 /* Free all persistent grant pages */
664 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 679 if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
665 free_persistent_gnts(blkif, &blkif->persistent_gnts, 680 free_persistent_gnts(ring, &ring->persistent_gnts,
666 blkif->persistent_gnt_c); 681 ring->persistent_gnt_c);
667 682
668 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 683 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
669 blkif->persistent_gnt_c = 0; 684 ring->persistent_gnt_c = 0;
670 685
671 /* Since we are shutting down remove all pages from the buffer */ 686 /* Since we are shutting down remove all pages from the buffer */
672 shrink_free_pagepool(blkif, 0 /* All */); 687 shrink_free_pagepool(ring, 0 /* All */);
673} 688}
674 689
675static unsigned int xen_blkbk_unmap_prepare( 690static unsigned int xen_blkbk_unmap_prepare(
676 struct xen_blkif *blkif, 691 struct xen_blkif_ring *ring,
677 struct grant_page **pages, 692 struct grant_page **pages,
678 unsigned int num, 693 unsigned int num,
679 struct gnttab_unmap_grant_ref *unmap_ops, 694 struct gnttab_unmap_grant_ref *unmap_ops,
@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
683 698
684 for (i = 0; i < num; i++) { 699 for (i = 0; i < num; i++) {
685 if (pages[i]->persistent_gnt != NULL) { 700 if (pages[i]->persistent_gnt != NULL) {
686 put_persistent_gnt(blkif, pages[i]->persistent_gnt); 701 put_persistent_gnt(ring, pages[i]->persistent_gnt);
687 continue; 702 continue;
688 } 703 }
689 if (pages[i]->handle == BLKBACK_INVALID_HANDLE) 704 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
700 715
701static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) 716static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
702{ 717{
703 struct pending_req* pending_req = (struct pending_req*) (data->data); 718 struct pending_req *pending_req = (struct pending_req *)(data->data);
704 struct xen_blkif *blkif = pending_req->blkif; 719 struct xen_blkif_ring *ring = pending_req->ring;
720 struct xen_blkif *blkif = ring->blkif;
705 721
706 /* BUG_ON used to reproduce existing behaviour, 722 /* BUG_ON used to reproduce existing behaviour,
707 but is this the best way to deal with this? */ 723 but is this the best way to deal with this? */
708 BUG_ON(result); 724 BUG_ON(result);
709 725
710 put_free_pages(blkif, data->pages, data->count); 726 put_free_pages(ring, data->pages, data->count);
711 make_response(blkif, pending_req->id, 727 make_response(ring, pending_req->id,
712 pending_req->operation, pending_req->status); 728 pending_req->operation, pending_req->status);
713 free_req(blkif, pending_req); 729 free_req(ring, pending_req);
714 /* 730 /*
715 * Make sure the request is freed before releasing blkif, 731 * Make sure the request is freed before releasing blkif,
716 * or there could be a race between free_req and the 732 * or there could be a race between free_req and the
@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
723 * pending_free_wq if there's a drain going on, but it has 739 * pending_free_wq if there's a drain going on, but it has
724 * to be taken into account if the current model is changed. 740 * to be taken into account if the current model is changed.
725 */ 741 */
726 if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { 742 if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
727 complete(&blkif->drain_complete); 743 complete(&blkif->drain_complete);
728 } 744 }
729 xen_blkif_put(blkif); 745 xen_blkif_put(blkif);
@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
732static void xen_blkbk_unmap_and_respond(struct pending_req *req) 748static void xen_blkbk_unmap_and_respond(struct pending_req *req)
733{ 749{
734 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; 750 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
735 struct xen_blkif *blkif = req->blkif; 751 struct xen_blkif_ring *ring = req->ring;
736 struct grant_page **pages = req->segments; 752 struct grant_page **pages = req->segments;
737 unsigned int invcount; 753 unsigned int invcount;
738 754
739 invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, 755 invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
740 req->unmap, req->unmap_pages); 756 req->unmap, req->unmap_pages);
741 757
742 work->data = req; 758 work->data = req;
@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
757 * of hypercalls, but since this is only used in error paths there's 773 * of hypercalls, but since this is only used in error paths there's
758 * no real need. 774 * no real need.
759 */ 775 */
760static void xen_blkbk_unmap(struct xen_blkif *blkif, 776static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
761 struct grant_page *pages[], 777 struct grant_page *pages[],
762 int num) 778 int num)
763{ 779{
@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
768 784
769 while (num) { 785 while (num) {
770 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); 786 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
771 787
772 invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, 788 invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
773 unmap, unmap_pages); 789 unmap, unmap_pages);
774 if (invcount) { 790 if (invcount) {
775 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); 791 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
776 BUG_ON(ret); 792 BUG_ON(ret);
777 put_free_pages(blkif, unmap_pages, invcount); 793 put_free_pages(ring, unmap_pages, invcount);
778 } 794 }
779 pages += batch; 795 pages += batch;
780 num -= batch; 796 num -= batch;
781 } 797 }
782} 798}
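
xen_blkbk_unmap() above simply walks the segment array in bounded batches so that each unmap hypercall handles at most BLKIF_MAX_SEGMENTS_PER_REQUEST entries. A tiny sketch of that batching loop, with a stubbed-out batch operation; the batch size and grant references are made up.

#include <stdio.h>

#define BATCH 11       /* stands in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

static void unmap_batch(const int *grefs, unsigned int count)
{
    printf("unmapping %u grants starting at gref %d\n", count, grefs[0]);
}

static void unmap_all(const int *grefs, unsigned int num)
{
    while (num) {
        unsigned int batch = num < BATCH ? num : BATCH;

        unmap_batch(grefs, batch);       /* one bounded "hypercall" per batch */
        grefs += batch;
        num -= batch;
    }
}

int main(void)
{
    int grefs[30];

    for (int i = 0; i < 30; i++)
        grefs[i] = 100 + i;
    unmap_all(grefs, 30);                /* prints batches of 11, 11 and 8 */
    return 0;
}
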
783 799
784static int xen_blkbk_map(struct xen_blkif *blkif, 800static int xen_blkbk_map(struct xen_blkif_ring *ring,
785 struct grant_page *pages[], 801 struct grant_page *pages[],
786 int num, bool ro) 802 int num, bool ro)
787{ 803{
@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
794 int ret = 0; 810 int ret = 0;
795 int last_map = 0, map_until = 0; 811 int last_map = 0, map_until = 0;
796 int use_persistent_gnts; 812 int use_persistent_gnts;
813 struct xen_blkif *blkif = ring->blkif;
797 814
798 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 815 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
799 816
@@ -806,10 +823,11 @@ again:
806 for (i = map_until; i < num; i++) { 823 for (i = map_until; i < num; i++) {
807 uint32_t flags; 824 uint32_t flags;
808 825
809 if (use_persistent_gnts) 826 if (use_persistent_gnts) {
810 persistent_gnt = get_persistent_gnt( 827 persistent_gnt = get_persistent_gnt(
811 blkif, 828 ring,
812 pages[i]->gref); 829 pages[i]->gref);
830 }
813 831
814 if (persistent_gnt) { 832 if (persistent_gnt) {
815 /* 833 /*
@@ -819,7 +837,7 @@ again:
819 pages[i]->page = persistent_gnt->page; 837 pages[i]->page = persistent_gnt->page;
820 pages[i]->persistent_gnt = persistent_gnt; 838 pages[i]->persistent_gnt = persistent_gnt;
821 } else { 839 } else {
822 if (get_free_page(blkif, &pages[i]->page)) 840 if (get_free_page(ring, &pages[i]->page))
823 goto out_of_memory; 841 goto out_of_memory;
824 addr = vaddr(pages[i]->page); 842 addr = vaddr(pages[i]->page);
825 pages_to_gnt[segs_to_map] = pages[i]->page; 843 pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -852,7 +870,7 @@ again:
852 BUG_ON(new_map_idx >= segs_to_map); 870 BUG_ON(new_map_idx >= segs_to_map);
853 if (unlikely(map[new_map_idx].status != 0)) { 871 if (unlikely(map[new_map_idx].status != 0)) {
854 pr_debug("invalid buffer -- could not remap it\n"); 872 pr_debug("invalid buffer -- could not remap it\n");
855 put_free_pages(blkif, &pages[seg_idx]->page, 1); 873 put_free_pages(ring, &pages[seg_idx]->page, 1);
856 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; 874 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
857 ret |= 1; 875 ret |= 1;
858 goto next; 876 goto next;
@@ -862,7 +880,7 @@ again:
862 continue; 880 continue;
863 } 881 }
864 if (use_persistent_gnts && 882 if (use_persistent_gnts &&
865 blkif->persistent_gnt_c < xen_blkif_max_pgrants) { 883 ring->persistent_gnt_c < xen_blkif_max_pgrants) {
866 /* 884 /*
867 * We are using persistent grants, the grant is 885 * We are using persistent grants, the grant is
868 * not mapped but we might have room for it. 886 * not mapped but we might have room for it.
@@ -880,7 +898,7 @@ again:
880 persistent_gnt->gnt = map[new_map_idx].ref; 898 persistent_gnt->gnt = map[new_map_idx].ref;
881 persistent_gnt->handle = map[new_map_idx].handle; 899 persistent_gnt->handle = map[new_map_idx].handle;
882 persistent_gnt->page = pages[seg_idx]->page; 900 persistent_gnt->page = pages[seg_idx]->page;
883 if (add_persistent_gnt(blkif, 901 if (add_persistent_gnt(ring,
884 persistent_gnt)) { 902 persistent_gnt)) {
885 kfree(persistent_gnt); 903 kfree(persistent_gnt);
886 persistent_gnt = NULL; 904 persistent_gnt = NULL;
@@ -888,7 +906,7 @@ again:
888 } 906 }
889 pages[seg_idx]->persistent_gnt = persistent_gnt; 907 pages[seg_idx]->persistent_gnt = persistent_gnt;
890 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", 908 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
891 persistent_gnt->gnt, blkif->persistent_gnt_c, 909 persistent_gnt->gnt, ring->persistent_gnt_c,
892 xen_blkif_max_pgrants); 910 xen_blkif_max_pgrants);
893 goto next; 911 goto next;
894 } 912 }
@@ -913,7 +931,7 @@ next:
913 931
914out_of_memory: 932out_of_memory:
915 pr_alert("%s: out of memory\n", __func__); 933 pr_alert("%s: out of memory\n", __func__);
916 put_free_pages(blkif, pages_to_gnt, segs_to_map); 934 put_free_pages(ring, pages_to_gnt, segs_to_map);
917 return -ENOMEM; 935 return -ENOMEM;
918} 936}
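
The mapping path above checks the ring's persistent-grant tree first and only issues a grant map on a miss, keeping the new grant persistent while the ring is still below xen_blkif_max_pgrants. A much-simplified model of that cache-then-map flow follows: a small linear array stands in for the rb-tree, a printf stands in for the real grant-map hypercall, and the cap and grant references are assumptions.

#include <stdio.h>

#define CACHE_MAX 4                      /* stands in for xen_blkif_max_pgrants */

struct ring_cache {
    unsigned int count;
    unsigned int grefs[CACHE_MAX];
};

static int cache_lookup(const struct ring_cache *c, unsigned int gref)
{
    for (unsigned int i = 0; i < c->count; i++)
        if (c->grefs[i] == gref)
            return 1;                    /* hit: reuse the already-mapped page */
    return 0;
}

static void map_grant(struct ring_cache *c, unsigned int gref)
{
    if (cache_lookup(c, gref)) {
        printf("gref %u: persistent hit\n", gref);
        return;
    }
    printf("gref %u: mapping\n", gref);  /* stub for the real grant map */
    if (c->count < CACHE_MAX)            /* room left below the cap: keep it */
        c->grefs[c->count++] = gref;
}

int main(void)
{
    struct ring_cache cache = { 0 };
    unsigned int refs[] = { 7, 9, 7, 11, 13, 15, 7 };

    for (unsigned int i = 0; i < sizeof(refs) / sizeof(refs[0]); i++)
        map_grant(&cache, refs[i]);
    return 0;
}
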
919 937
@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
921{ 939{
922 int rc; 940 int rc;
923 941
924 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, 942 rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
925 pending_req->nr_segs, 943 pending_req->nr_segs,
926 (pending_req->operation != BLKIF_OP_READ)); 944 (pending_req->operation != BLKIF_OP_READ));
927 945
@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
934 struct phys_req *preq) 952 struct phys_req *preq)
935{ 953{
936 struct grant_page **pages = pending_req->indirect_pages; 954 struct grant_page **pages = pending_req->indirect_pages;
937 struct xen_blkif *blkif = pending_req->blkif; 955 struct xen_blkif_ring *ring = pending_req->ring;
938 int indirect_grefs, rc, n, nseg, i; 956 int indirect_grefs, rc, n, nseg, i;
939 struct blkif_request_segment *segments = NULL; 957 struct blkif_request_segment *segments = NULL;
940 958
@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
945 for (i = 0; i < indirect_grefs; i++) 963 for (i = 0; i < indirect_grefs; i++)
946 pages[i]->gref = req->u.indirect.indirect_grefs[i]; 964 pages[i]->gref = req->u.indirect.indirect_grefs[i];
947 965
948 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); 966 rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
949 if (rc) 967 if (rc)
950 goto unmap; 968 goto unmap;
951 969
@@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
977unmap: 995unmap:
978 if (segments) 996 if (segments)
979 kunmap_atomic(segments); 997 kunmap_atomic(segments);
980 xen_blkbk_unmap(blkif, pages, indirect_grefs); 998 xen_blkbk_unmap(ring, pages, indirect_grefs);
981 return rc; 999 return rc;
982} 1000}
983 1001
984static int dispatch_discard_io(struct xen_blkif *blkif, 1002static int dispatch_discard_io(struct xen_blkif_ring *ring,
985 struct blkif_request *req) 1003 struct blkif_request *req)
986{ 1004{
987 int err = 0; 1005 int err = 0;
988 int status = BLKIF_RSP_OKAY; 1006 int status = BLKIF_RSP_OKAY;
1007 struct xen_blkif *blkif = ring->blkif;
989 struct block_device *bdev = blkif->vbd.bdev; 1008 struct block_device *bdev = blkif->vbd.bdev;
990 unsigned long secure; 1009 unsigned long secure;
991 struct phys_req preq; 1010 struct phys_req preq;
@@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
1002 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); 1021 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
1003 goto fail_response; 1022 goto fail_response;
1004 } 1023 }
1005 blkif->st_ds_req++; 1024 ring->st_ds_req++;
1006 1025
1007 secure = (blkif->vbd.discard_secure && 1026 secure = (blkif->vbd.discard_secure &&
1008 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 1027 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1018,26 +1037,28 @@ fail_response:
1018 } else if (err) 1037 } else if (err)
1019 status = BLKIF_RSP_ERROR; 1038 status = BLKIF_RSP_ERROR;
1020 1039
1021 make_response(blkif, req->u.discard.id, req->operation, status); 1040 make_response(ring, req->u.discard.id, req->operation, status);
1022 xen_blkif_put(blkif); 1041 xen_blkif_put(blkif);
1023 return err; 1042 return err;
1024} 1043}
1025 1044
1026static int dispatch_other_io(struct xen_blkif *blkif, 1045static int dispatch_other_io(struct xen_blkif_ring *ring,
1027 struct blkif_request *req, 1046 struct blkif_request *req,
1028 struct pending_req *pending_req) 1047 struct pending_req *pending_req)
1029{ 1048{
1030 free_req(blkif, pending_req); 1049 free_req(ring, pending_req);
1031 make_response(blkif, req->u.other.id, req->operation, 1050 make_response(ring, req->u.other.id, req->operation,
1032 BLKIF_RSP_EOPNOTSUPP); 1051 BLKIF_RSP_EOPNOTSUPP);
1033 return -EIO; 1052 return -EIO;
1034} 1053}
1035 1054
1036static void xen_blk_drain_io(struct xen_blkif *blkif) 1055static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1037{ 1056{
1057 struct xen_blkif *blkif = ring->blkif;
1058
1038 atomic_set(&blkif->drain, 1); 1059 atomic_set(&blkif->drain, 1);
1039 do { 1060 do {
1040 if (atomic_read(&blkif->inflight) == 0) 1061 if (atomic_read(&ring->inflight) == 0)
1041 break; 1062 break;
1042 wait_for_completion_interruptible_timeout( 1063 wait_for_completion_interruptible_timeout(
1043 &blkif->drain_complete, HZ); 1064 &blkif->drain_complete, HZ);
@@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
1058 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && 1079 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
1059 (error == -EOPNOTSUPP)) { 1080 (error == -EOPNOTSUPP)) {
1060 pr_debug("flush diskcache op failed, not supported\n"); 1081 pr_debug("flush diskcache op failed, not supported\n");
1061 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); 1082 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1062 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1083 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1063 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && 1084 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
1064 (error == -EOPNOTSUPP)) { 1085 (error == -EOPNOTSUPP)) {
1065 pr_debug("write barrier op failed, not supported\n"); 1086 pr_debug("write barrier op failed, not supported\n");
1066 xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); 1087 xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1067 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1088 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1068 } else if (error) { 1089 } else if (error) {
1069 pr_debug("Buffer not up-to-date at end of operation," 1090 pr_debug("Buffer not up-to-date at end of operation,"
@@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio)
1097 * and transmute it to the block API to hand it over to the proper block disk. 1118 * and transmute it to the block API to hand it over to the proper block disk.
1098 */ 1119 */
1099static int 1120static int
1100__do_block_io_op(struct xen_blkif *blkif) 1121__do_block_io_op(struct xen_blkif_ring *ring)
1101{ 1122{
1102 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1123 union blkif_back_rings *blk_rings = &ring->blk_rings;
1103 struct blkif_request req; 1124 struct blkif_request req;
1104 struct pending_req *pending_req; 1125 struct pending_req *pending_req;
1105 RING_IDX rc, rp; 1126 RING_IDX rc, rp;
@@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif)
1112 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { 1133 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1113 rc = blk_rings->common.rsp_prod_pvt; 1134 rc = blk_rings->common.rsp_prod_pvt;
1114 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", 1135 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1115 rp, rc, rp - rc, blkif->vbd.pdevice); 1136 rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1116 return -EACCES; 1137 return -EACCES;
1117 } 1138 }
1118 while (rc != rp) { 1139 while (rc != rp) {
@@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif)
1125 break; 1146 break;
1126 } 1147 }
1127 1148
1128 pending_req = alloc_req(blkif); 1149 pending_req = alloc_req(ring);
1129 if (NULL == pending_req) { 1150 if (NULL == pending_req) {
1130 blkif->st_oo_req++; 1151 ring->st_oo_req++;
1131 more_to_do = 1; 1152 more_to_do = 1;
1132 break; 1153 break;
1133 } 1154 }
1134 1155
1135 switch (blkif->blk_protocol) { 1156 switch (ring->blkif->blk_protocol) {
1136 case BLKIF_PROTOCOL_NATIVE: 1157 case BLKIF_PROTOCOL_NATIVE:
1137 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); 1158 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1138 break; 1159 break;
@@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif)
1156 case BLKIF_OP_WRITE_BARRIER: 1177 case BLKIF_OP_WRITE_BARRIER:
1157 case BLKIF_OP_FLUSH_DISKCACHE: 1178 case BLKIF_OP_FLUSH_DISKCACHE:
1158 case BLKIF_OP_INDIRECT: 1179 case BLKIF_OP_INDIRECT:
1159 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1180 if (dispatch_rw_block_io(ring, &req, pending_req))
1160 goto done; 1181 goto done;
1161 break; 1182 break;
1162 case BLKIF_OP_DISCARD: 1183 case BLKIF_OP_DISCARD:
1163 free_req(blkif, pending_req); 1184 free_req(ring, pending_req);
1164 if (dispatch_discard_io(blkif, &req)) 1185 if (dispatch_discard_io(ring, &req))
1165 goto done; 1186 goto done;
1166 break; 1187 break;
1167 default: 1188 default:
1168 if (dispatch_other_io(blkif, &req, pending_req)) 1189 if (dispatch_other_io(ring, &req, pending_req))
1169 goto done; 1190 goto done;
1170 break; 1191 break;
1171 } 1192 }
@@ -1178,13 +1199,13 @@ done:
1178} 1199}
1179 1200
1180static int 1201static int
1181do_block_io_op(struct xen_blkif *blkif) 1202do_block_io_op(struct xen_blkif_ring *ring)
1182{ 1203{
1183 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1204 union blkif_back_rings *blk_rings = &ring->blk_rings;
1184 int more_to_do; 1205 int more_to_do;
1185 1206
1186 do { 1207 do {
1187 more_to_do = __do_block_io_op(blkif); 1208 more_to_do = __do_block_io_op(ring);
1188 if (more_to_do) 1209 if (more_to_do)
1189 break; 1210 break;
1190 1211
@@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif)
1197 * Transmutation of the 'struct blkif_request' to a proper 'struct bio' 1218 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
1198 * and call the 'submit_bio' to pass it to the underlying storage. 1219 * and call the 'submit_bio' to pass it to the underlying storage.
1199 */ 1220 */
1200static int dispatch_rw_block_io(struct xen_blkif *blkif, 1221static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1201 struct blkif_request *req, 1222 struct blkif_request *req,
1202 struct pending_req *pending_req) 1223 struct pending_req *pending_req)
1203{ 1224{
@@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1225 1246
1226 switch (req_operation) { 1247 switch (req_operation) {
1227 case BLKIF_OP_READ: 1248 case BLKIF_OP_READ:
1228 blkif->st_rd_req++; 1249 ring->st_rd_req++;
1229 operation = READ; 1250 operation = READ;
1230 break; 1251 break;
1231 case BLKIF_OP_WRITE: 1252 case BLKIF_OP_WRITE:
1232 blkif->st_wr_req++; 1253 ring->st_wr_req++;
1233 operation = WRITE_ODIRECT; 1254 operation = WRITE_ODIRECT;
1234 break; 1255 break;
1235 case BLKIF_OP_WRITE_BARRIER: 1256 case BLKIF_OP_WRITE_BARRIER:
1236 drain = true; 1257 drain = true;
1237 case BLKIF_OP_FLUSH_DISKCACHE: 1258 case BLKIF_OP_FLUSH_DISKCACHE:
1238 blkif->st_f_req++; 1259 ring->st_f_req++;
1239 operation = WRITE_FLUSH; 1260 operation = WRITE_FLUSH;
1240 break; 1261 break;
1241 default: 1262 default:
@@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1260 1281
1261 preq.nr_sects = 0; 1282 preq.nr_sects = 0;
1262 1283
1263 pending_req->blkif = blkif; 1284 pending_req->ring = ring;
1264 pending_req->id = req->u.rw.id; 1285 pending_req->id = req->u.rw.id;
1265 pending_req->operation = req_operation; 1286 pending_req->operation = req_operation;
1266 pending_req->status = BLKIF_RSP_OKAY; 1287 pending_req->status = BLKIF_RSP_OKAY;
@@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1287 goto fail_response; 1308 goto fail_response;
1288 } 1309 }
1289 1310
1290 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1311 if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1291 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", 1312 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1292 operation == READ ? "read" : "write", 1313 operation == READ ? "read" : "write",
1293 preq.sector_number, 1314 preq.sector_number,
1294 preq.sector_number + preq.nr_sects, 1315 preq.sector_number + preq.nr_sects,
1295 blkif->vbd.pdevice); 1316 ring->blkif->vbd.pdevice);
1296 goto fail_response; 1317 goto fail_response;
1297 } 1318 }
1298 1319
@@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1304 if (((int)preq.sector_number|(int)seg[i].nsec) & 1325 if (((int)preq.sector_number|(int)seg[i].nsec) &
1305 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { 1326 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1306 pr_debug("Misaligned I/O request from domain %d\n", 1327 pr_debug("Misaligned I/O request from domain %d\n",
1307 blkif->domid); 1328 ring->blkif->domid);
1308 goto fail_response; 1329 goto fail_response;
1309 } 1330 }
1310 } 1331 }
@@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1313 * issue the WRITE_FLUSH. 1334 * issue the WRITE_FLUSH.
1314 */ 1335 */
1315 if (drain) 1336 if (drain)
1316 xen_blk_drain_io(pending_req->blkif); 1337 xen_blk_drain_io(pending_req->ring);
1317 1338
1318 /* 1339 /*
1319 * If we have failed at this point, we need to undo the M2P override, 1340 * If we have failed at this point, we need to undo the M2P override,
@@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1328 * This corresponding xen_blkif_put is done in __end_block_io_op, or 1349 * This corresponding xen_blkif_put is done in __end_block_io_op, or
1329 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. 1350 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
1330 */ 1351 */
1331 xen_blkif_get(blkif); 1352 xen_blkif_get(ring->blkif);
1332 atomic_inc(&blkif->inflight); 1353 atomic_inc(&ring->inflight);
1333 1354
1334 for (i = 0; i < nseg; i++) { 1355 for (i = 0; i < nseg; i++) {
1335 while ((bio == NULL) || 1356 while ((bio == NULL) ||
@@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1377 blk_finish_plug(&plug); 1398 blk_finish_plug(&plug);
1378 1399
1379 if (operation == READ) 1400 if (operation == READ)
1380 blkif->st_rd_sect += preq.nr_sects; 1401 ring->st_rd_sect += preq.nr_sects;
1381 else if (operation & WRITE) 1402 else if (operation & WRITE)
1382 blkif->st_wr_sect += preq.nr_sects; 1403 ring->st_wr_sect += preq.nr_sects;
1383 1404
1384 return 0; 1405 return 0;
1385 1406
1386 fail_flush: 1407 fail_flush:
1387 xen_blkbk_unmap(blkif, pending_req->segments, 1408 xen_blkbk_unmap(ring, pending_req->segments,
1388 pending_req->nr_segs); 1409 pending_req->nr_segs);
1389 fail_response: 1410 fail_response:
1390 /* Haven't submitted any bio's yet. */ 1411 /* Haven't submitted any bio's yet. */
1391 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); 1412 make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1392 free_req(blkif, pending_req); 1413 free_req(ring, pending_req);
1393 msleep(1); /* back off a bit */ 1414 msleep(1); /* back off a bit */
1394 return -EIO; 1415 return -EIO;
1395 1416
@@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1407/* 1428/*
1408 * Put a response on the ring on how the operation fared. 1429 * Put a response on the ring on how the operation fared.
1409 */ 1430 */
1410static void make_response(struct xen_blkif *blkif, u64 id, 1431static void make_response(struct xen_blkif_ring *ring, u64 id,
1411 unsigned short op, int st) 1432 unsigned short op, int st)
1412{ 1433{
1413 struct blkif_response resp; 1434 struct blkif_response resp;
1414 unsigned long flags; 1435 unsigned long flags;
1415 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1436 union blkif_back_rings *blk_rings;
1416 int notify; 1437 int notify;
1417 1438
1418 resp.id = id; 1439 resp.id = id;
1419 resp.operation = op; 1440 resp.operation = op;
1420 resp.status = st; 1441 resp.status = st;
1421 1442
1422 spin_lock_irqsave(&blkif->blk_ring_lock, flags); 1443 spin_lock_irqsave(&ring->blk_ring_lock, flags);
1444 blk_rings = &ring->blk_rings;
1423 /* Place on the response ring for the relevant domain. */ 1445 /* Place on the response ring for the relevant domain. */
1424 switch (blkif->blk_protocol) { 1446 switch (ring->blkif->blk_protocol) {
1425 case BLKIF_PROTOCOL_NATIVE: 1447 case BLKIF_PROTOCOL_NATIVE:
1426 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), 1448 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
1427 &resp, sizeof(resp)); 1449 &resp, sizeof(resp));
@@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1439 } 1461 }
1440 blk_rings->common.rsp_prod_pvt++; 1462 blk_rings->common.rsp_prod_pvt++;
1441 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); 1463 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1442 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); 1464 spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1443 if (notify) 1465 if (notify)
1444 notify_remote_via_irq(blkif->irq); 1466 notify_remote_via_irq(ring->irq);
1445} 1467}
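
make_response() above now takes the per-ring lock, places the response at the ring's private producer index, publishes it, and kicks the ring's own event channel only when the frontend needs waking. The userspace model below shows that shape only; the event check mimics the spirit of the shared-ring push-and-notify macro but omits its memory barriers, and every name and size here is illustrative.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 32u                    /* power of two, illustrative */

struct resp { uint64_t id; int status; };

struct resp_ring {
    pthread_mutex_t lock;
    unsigned int rsp_prod_pvt;           /* private producer index */
    unsigned int rsp_prod;               /* index published to the frontend */
    unsigned int rsp_event;              /* frontend asks for a kick here */
    struct resp ring[RING_SIZE];
};

static void make_response(struct resp_ring *r, uint64_t id, int status)
{
    unsigned int old_prod, new_prod;
    int notify;

    pthread_mutex_lock(&r->lock);
    r->ring[r->rsp_prod_pvt % RING_SIZE] =
        (struct resp){ .id = id, .status = status };
    r->rsp_prod_pvt++;

    old_prod = r->rsp_prod;
    new_prod = r->rsp_prod_pvt;
    r->rsp_prod = new_prod;              /* publish the new producer index */
    /* kick only if the frontend's rsp_event falls in (old_prod, new_prod] */
    notify = (new_prod - r->rsp_event) < (new_prod - old_prod);
    pthread_mutex_unlock(&r->lock);

    if (notify)
        printf("notify frontend (response id=%llu)\n",
               (unsigned long long)id);
}

int main(void)
{
    struct resp_ring r = { .lock = PTHREAD_MUTEX_INITIALIZER, .rsp_event = 1 };

    make_response(&r, 42, 0);            /* rsp_event == 1, so this notifies */
    return 0;
}
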
1446 1468
1447static int __init xen_blkif_init(void) 1469static int __init xen_blkif_init(void)
@@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void)
1457 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; 1479 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1458 } 1480 }
1459 1481
1482 if (xenblk_max_queues == 0)
1483 xenblk_max_queues = num_online_cpus();
1484
1460 rc = xen_blkif_interface_init(); 1485 rc = xen_blkif_interface_init();
1461 if (rc) 1486 if (rc)
1462 goto failed_init; 1487 goto failed_init;
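
The init hunk above defaults the new xenblk_max_queues module parameter to the number of online CPUs when it is left at zero. The same fallback in trivial userspace form, with sysconf() standing in for num_online_cpus():

#include <stdio.h>
#include <unistd.h>

static unsigned int xenblk_max_queues;   /* 0 means "not set by the admin" */

int main(void)
{
    if (xenblk_max_queues == 0)
        xenblk_max_queues = (unsigned int)sysconf(_SC_NPROCESSORS_ONLN);
    printf("defaulting to %u queues per block device\n", xenblk_max_queues);
    return 0;
}
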
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index c929ae22764c..dea61f6ab8cb 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -46,6 +46,7 @@
46#include <xen/interface/io/protocols.h> 46#include <xen/interface/io/protocols.h>
47 47
48extern unsigned int xen_blkif_max_ring_order; 48extern unsigned int xen_blkif_max_ring_order;
49extern unsigned int xenblk_max_queues;
49/* 50/*
50 * This is the maximum number of segments that would be allowed in indirect 51 * This is the maximum number of segments that would be allowed in indirect
51 * requests. This value will also be passed to the frontend. 52 * requests. This value will also be passed to the frontend.
@@ -269,68 +270,79 @@ struct persistent_gnt {
269 struct list_head remove_node; 270 struct list_head remove_node;
270}; 271};
271 272
272struct xen_blkif { 273/* Per-ring information. */
273 /* Unique identifier for this interface. */ 274struct xen_blkif_ring {
274 domid_t domid;
275 unsigned int handle;
276 /* Physical parameters of the comms window. */ 275 /* Physical parameters of the comms window. */
277 unsigned int irq; 276 unsigned int irq;
278 /* Comms information. */
279 enum blkif_protocol blk_protocol;
280 union blkif_back_rings blk_rings; 277 union blkif_back_rings blk_rings;
281 void *blk_ring; 278 void *blk_ring;
282 /* The VBD attached to this interface. */
283 struct xen_vbd vbd;
284 /* Back pointer to the backend_info. */
285 struct backend_info *be;
286 /* Private fields. */ 279 /* Private fields. */
287 spinlock_t blk_ring_lock; 280 spinlock_t blk_ring_lock;
288 atomic_t refcnt;
289 281
290 wait_queue_head_t wq; 282 wait_queue_head_t wq;
291 /* for barrier (drain) requests */
292 struct completion drain_complete;
293 atomic_t drain;
294 atomic_t inflight; 283 atomic_t inflight;
295 /* One thread per one blkif. */ 284 /* One thread per blkif ring. */
296 struct task_struct *xenblkd; 285 struct task_struct *xenblkd;
297 unsigned int waiting_reqs; 286 unsigned int waiting_reqs;
298 287
299 /* tree to store persistent grants */ 288 /* List of all 'pending_req' available */
289 struct list_head pending_free;
290 /* And its spinlock. */
291 spinlock_t pending_free_lock;
292 wait_queue_head_t pending_free_wq;
293
294 /* Tree to store persistent grants. */
295 spinlock_t pers_gnts_lock;
300 struct rb_root persistent_gnts; 296 struct rb_root persistent_gnts;
301 unsigned int persistent_gnt_c; 297 unsigned int persistent_gnt_c;
302 atomic_t persistent_gnt_in_use; 298 atomic_t persistent_gnt_in_use;
303 unsigned long next_lru; 299 unsigned long next_lru;
304 300
305 /* used by the kworker that offload work from the persistent purge */ 301 /* Statistics. */
302 unsigned long st_print;
303 unsigned long long st_rd_req;
304 unsigned long long st_wr_req;
305 unsigned long long st_oo_req;
306 unsigned long long st_f_req;
307 unsigned long long st_ds_req;
308 unsigned long long st_rd_sect;
309 unsigned long long st_wr_sect;
310
311 /* Used by the kworker that offload work from the persistent purge. */
306 struct list_head persistent_purge_list; 312 struct list_head persistent_purge_list;
307 struct work_struct persistent_purge_work; 313 struct work_struct persistent_purge_work;
308 314
309 /* buffer of free pages to map grant refs */ 315 /* Buffer of free pages to map grant refs. */
310 spinlock_t free_pages_lock; 316 spinlock_t free_pages_lock;
311 int free_pages_num; 317 int free_pages_num;
312 struct list_head free_pages; 318 struct list_head free_pages;
313 319
314 /* List of all 'pending_req' available */
315 struct list_head pending_free;
316 /* And its spinlock. */
317 spinlock_t pending_free_lock;
318 wait_queue_head_t pending_free_wq;
319
320 /* statistics */
321 unsigned long st_print;
322 unsigned long long st_rd_req;
323 unsigned long long st_wr_req;
324 unsigned long long st_oo_req;
325 unsigned long long st_f_req;
326 unsigned long long st_ds_req;
327 unsigned long long st_rd_sect;
328 unsigned long long st_wr_sect;
329
330 struct work_struct free_work; 320 struct work_struct free_work;
331 /* Thread shutdown wait queue. */ 321 /* Thread shutdown wait queue. */
332 wait_queue_head_t shutdown_wq; 322 wait_queue_head_t shutdown_wq;
333 unsigned int nr_ring_pages; 323 struct xen_blkif *blkif;
324};
325
326struct xen_blkif {
327 /* Unique identifier for this interface. */
328 domid_t domid;
329 unsigned int handle;
330 /* Comms information. */
331 enum blkif_protocol blk_protocol;
332 /* The VBD attached to this interface. */
333 struct xen_vbd vbd;
334 /* Back pointer to the backend_info. */
335 struct backend_info *be;
336 atomic_t refcnt;
337 /* for barrier (drain) requests */
338 struct completion drain_complete;
339 atomic_t drain;
340
341 struct work_struct free_work;
342 unsigned int nr_ring_pages;
343 /* All rings for this device. */
344 struct xen_blkif_ring *rings;
345 unsigned int nr_rings;
334}; 346};
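
The common.h change above is the heart of the series: per-ring state (ring pages, locks, pools, statistics, the worker thread) moves into struct xen_blkif_ring, while struct xen_blkif keeps only device-wide state plus an array of rings; each ring carries a back pointer so code that only holds a ring can still reach the shared fields. A boiled-down sketch of that ownership shape, with field names trimmed for illustration:

#include <stdio.h>
#include <stdlib.h>

struct blk_device;

struct blk_ring {
    unsigned long long st_rd_req;        /* per-ring statistic */
    struct blk_device *dev;              /* back pointer to the shared state */
};

struct blk_device {
    unsigned int domid;                  /* device-wide, shared by all rings */
    unsigned int nr_rings;
    struct blk_ring *rings;              /* one entry per hardware queue */
};

int main(void)
{
    struct blk_device dev = { .domid = 1, .nr_rings = 4 };

    dev.rings = calloc(dev.nr_rings, sizeof(*dev.rings));
    if (!dev.rings)
        return 1;
    for (unsigned int i = 0; i < dev.nr_rings; i++)
        dev.rings[i].dev = &dev;         /* mirrors ring->blkif = blkif */
    printf("domain %u exposes %u rings\n",
           dev.rings[0].dev->domid, dev.nr_rings);
    free(dev.rings);
    return 0;
}
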
335 347
336struct seg_buf { 348struct seg_buf {
@@ -352,7 +364,7 @@ struct grant_page {
352 * response queued for it, with the saved 'id' passed back. 364 * response queued for it, with the saved 'id' passed back.
353 */ 365 */
354struct pending_req { 366struct pending_req {
355 struct xen_blkif *blkif; 367 struct xen_blkif_ring *ring;
356 u64 id; 368 u64 id;
357 int nr_segs; 369 int nr_segs;
358 atomic_t pendcnt; 370 atomic_t pendcnt;
@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
394irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 406irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
395int xen_blkif_schedule(void *arg); 407int xen_blkif_schedule(void *arg);
396int xen_blkif_purge_persistent(void *arg); 408int xen_blkif_purge_persistent(void *arg);
397void xen_blkbk_free_caches(struct xen_blkif *blkif); 409void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
398 410
399int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 411int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
400 struct backend_info *be, int state); 412 struct backend_info *be, int state);
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index f53cff42f8da..876763f7f13e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
86{ 86{
87 int err; 87 int err;
88 char name[BLKBACK_NAME_LEN]; 88 char name[BLKBACK_NAME_LEN];
89 struct xen_blkif_ring *ring;
90 int i;
89 91
90 /* Not ready to connect? */ 92 /* Not ready to connect? */
91 if (!blkif->irq || !blkif->vbd.bdev) 93 if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
92 return; 94 return;
93 95
94 /* Already connected? */ 96 /* Already connected? */
@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
113 } 115 }
114 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); 116 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
115 117
116 blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); 118 for (i = 0; i < blkif->nr_rings; i++) {
117 if (IS_ERR(blkif->xenblkd)) { 119 ring = &blkif->rings[i];
118 err = PTR_ERR(blkif->xenblkd); 120 ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
119 blkif->xenblkd = NULL; 121 if (IS_ERR(ring->xenblkd)) {
120 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 122 err = PTR_ERR(ring->xenblkd);
121 return; 123 ring->xenblkd = NULL;
124 xenbus_dev_fatal(blkif->be->dev, err,
125 "start %s-%d xenblkd", name, i);
126 goto out;
127 }
128 }
129 return;
130
131out:
132 while (--i >= 0) {
133 ring = &blkif->rings[i];
134 kthread_stop(ring->xenblkd);
135 }
136 return;
137}
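
xen_update_blkif_status() above now starts one xenblkd thread per ring and, if the i-th thread fails to start, stops the i threads already running before bailing out. A userspace sketch of that start-with-unwind pattern, with pthreads in place of kthreads and an artificial failure injected to exercise the unwind path; everything here is illustrative.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_RINGS 4

struct ring {
    pthread_t tid;
    volatile bool stop;
};

static void *ring_worker(void *arg)
{
    struct ring *r = arg;

    while (!r->stop)                     /* stand-in for the service loop */
        sched_yield();
    return NULL;
}

static void stop_ring(struct ring *r)
{
    r->stop = true;                      /* plays the role of kthread_stop() */
    pthread_join(r->tid, NULL);
}

static int start_all(struct ring *rings, int nr, int fail_at)
{
    int i;

    for (i = 0; i < nr; i++) {
        if (i == fail_at ||              /* simulate kthread_run() failing */
            pthread_create(&rings[i].tid, NULL, ring_worker, &rings[i]))
            goto out;
    }
    return 0;
out:
    while (--i >= 0)                     /* unwind the already-started workers */
        stop_ring(&rings[i]);
    return -1;
}

int main(void)
{
    struct ring rings[NR_RINGS];

    for (int i = 0; i < NR_RINGS; i++)
        rings[i].stop = false;
    if (start_all(rings, NR_RINGS, 2))
        fprintf(stderr, "could not start all ring workers, unwound\n");
    return 0;
}
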
138
139static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
140{
141 unsigned int r;
142
143 blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
144 if (!blkif->rings)
145 return -ENOMEM;
146
147 for (r = 0; r < blkif->nr_rings; r++) {
148 struct xen_blkif_ring *ring = &blkif->rings[r];
149
150 spin_lock_init(&ring->blk_ring_lock);
151 init_waitqueue_head(&ring->wq);
152 INIT_LIST_HEAD(&ring->pending_free);
153 INIT_LIST_HEAD(&ring->persistent_purge_list);
154 INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
155 spin_lock_init(&ring->free_pages_lock);
156 INIT_LIST_HEAD(&ring->free_pages);
157
158 spin_lock_init(&ring->pending_free_lock);
159 init_waitqueue_head(&ring->pending_free_wq);
160 init_waitqueue_head(&ring->shutdown_wq);
161 ring->blkif = blkif;
162 ring->st_print = jiffies;
163 xen_blkif_get(blkif);
122 } 164 }
165
166 return 0;
123} 167}
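
Note the xen_blkif_get() per ring in xen_blkif_alloc_rings() above: the device now holds one reference per ring on top of the allocation's own reference, and each is dropped as its ring is torn down in xen_blkif_disconnect(). A minimal refcount sketch of that lifetime rule, using C11 atomics in place of the kernel helpers; names and counts are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct device {
    atomic_int refcnt;
    unsigned int nr_rings;
};

static void device_get(struct device *d)
{
    atomic_fetch_add(&d->refcnt, 1);
}

static void device_put(struct device *d)
{
    if (atomic_fetch_sub(&d->refcnt, 1) == 1) {   /* dropped the last ref */
        printf("freeing device\n");
        free(d);
    }
}

int main(void)
{
    struct device *d = malloc(sizeof(*d));

    if (!d)
        return 1;
    atomic_init(&d->refcnt, 1);          /* the allocation's own reference */
    d->nr_rings = 2;
    for (unsigned int i = 0; i < d->nr_rings; i++)
        device_get(d);                   /* one reference per ring */
    device_put(d);                       /* ring 0 torn down */
    device_put(d);                       /* ring 1 torn down */
    device_put(d);                       /* final put frees the device */
    return 0;
}
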
124 168
125static struct xen_blkif *xen_blkif_alloc(domid_t domid) 169static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
133 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
134 178
135 blkif->domid = domid; 179 blkif->domid = domid;
136 spin_lock_init(&blkif->blk_ring_lock);
137 atomic_set(&blkif->refcnt, 1); 180 atomic_set(&blkif->refcnt, 1);
138 init_waitqueue_head(&blkif->wq);
139 init_completion(&blkif->drain_complete); 181 init_completion(&blkif->drain_complete);
140 atomic_set(&blkif->drain, 0);
141 blkif->st_print = jiffies;
142 blkif->persistent_gnts.rb_node = NULL;
143 spin_lock_init(&blkif->free_pages_lock);
144 INIT_LIST_HEAD(&blkif->free_pages);
145 INIT_LIST_HEAD(&blkif->persistent_purge_list);
146 blkif->free_pages_num = 0;
147 atomic_set(&blkif->persistent_gnt_in_use, 0);
148 atomic_set(&blkif->inflight, 0);
149 INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
150
151 INIT_LIST_HEAD(&blkif->pending_free);
152 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); 182 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
153 spin_lock_init(&blkif->pending_free_lock);
154 init_waitqueue_head(&blkif->pending_free_wq);
155 init_waitqueue_head(&blkif->shutdown_wq);
156 183
157 return blkif; 184 return blkif;
158} 185}
159 186
160static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, 187static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
161 unsigned int nr_grefs, unsigned int evtchn) 188 unsigned int nr_grefs, unsigned int evtchn)
162{ 189{
163 int err; 190 int err;
191 struct xen_blkif *blkif = ring->blkif;
164 192
165 /* Already connected through? */ 193 /* Already connected through? */
166 if (blkif->irq) 194 if (ring->irq)
167 return 0; 195 return 0;
168 196
169 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, 197 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
170 &blkif->blk_ring); 198 &ring->blk_ring);
171 if (err < 0) 199 if (err < 0)
172 return err; 200 return err;
173 201
@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
175 case BLKIF_PROTOCOL_NATIVE: 203 case BLKIF_PROTOCOL_NATIVE:
176 { 204 {
177 struct blkif_sring *sring; 205 struct blkif_sring *sring;
178 sring = (struct blkif_sring *)blkif->blk_ring; 206 sring = (struct blkif_sring *)ring->blk_ring;
179 BACK_RING_INIT(&blkif->blk_rings.native, sring, 207 BACK_RING_INIT(&ring->blk_rings.native, sring,
180 XEN_PAGE_SIZE * nr_grefs); 208 XEN_PAGE_SIZE * nr_grefs);
181 break; 209 break;
182 } 210 }
183 case BLKIF_PROTOCOL_X86_32: 211 case BLKIF_PROTOCOL_X86_32:
184 { 212 {
185 struct blkif_x86_32_sring *sring_x86_32; 213 struct blkif_x86_32_sring *sring_x86_32;
186 sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; 214 sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
187 BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, 215 BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
188 XEN_PAGE_SIZE * nr_grefs); 216 XEN_PAGE_SIZE * nr_grefs);
189 break; 217 break;
190 } 218 }
191 case BLKIF_PROTOCOL_X86_64: 219 case BLKIF_PROTOCOL_X86_64:
192 { 220 {
193 struct blkif_x86_64_sring *sring_x86_64; 221 struct blkif_x86_64_sring *sring_x86_64;
194 sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; 222 sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
195 BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, 223 BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
196 XEN_PAGE_SIZE * nr_grefs); 224 XEN_PAGE_SIZE * nr_grefs);
197 break; 225 break;
198 } 226 }
@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
202 230
203 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, 231 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
204 xen_blkif_be_int, 0, 232 xen_blkif_be_int, 0,
205 "blkif-backend", blkif); 233 "blkif-backend", ring);
206 if (err < 0) { 234 if (err < 0) {
207 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 235 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
208 blkif->blk_rings.common.sring = NULL; 236 ring->blk_rings.common.sring = NULL;
209 return err; 237 return err;
210 } 238 }
211 blkif->irq = err; 239 ring->irq = err;
212 240
213 return 0; 241 return 0;
214} 242}
@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
216static int xen_blkif_disconnect(struct xen_blkif *blkif) 244static int xen_blkif_disconnect(struct xen_blkif *blkif)
217{ 245{
218 struct pending_req *req, *n; 246 struct pending_req *req, *n;
219 int i = 0, j; 247 unsigned int j, r;
220 248
221 if (blkif->xenblkd) { 249 for (r = 0; r < blkif->nr_rings; r++) {
222 kthread_stop(blkif->xenblkd); 250 struct xen_blkif_ring *ring = &blkif->rings[r];
223 wake_up(&blkif->shutdown_wq); 251 unsigned int i = 0;
224 blkif->xenblkd = NULL;
225 }
226 252
227 /* The above kthread_stop() guarantees that at this point we 253 if (ring->xenblkd) {
228 * don't have any discard_io or other_io requests. So, checking 254 kthread_stop(ring->xenblkd);
229 * for inflight IO is enough. 255 wake_up(&ring->shutdown_wq);
230 */ 256 ring->xenblkd = NULL;
231 if (atomic_read(&blkif->inflight) > 0) 257 }
232 return -EBUSY;
233 258
234 if (blkif->irq) { 259 /* The above kthread_stop() guarantees that at this point we
235 unbind_from_irqhandler(blkif->irq, blkif); 260 * don't have any discard_io or other_io requests. So, checking
236 blkif->irq = 0; 261 * for inflight IO is enough.
237 } 262 */
263 if (atomic_read(&ring->inflight) > 0)
264 return -EBUSY;
238 265
239 if (blkif->blk_rings.common.sring) { 266 if (ring->irq) {
240 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 267 unbind_from_irqhandler(ring->irq, ring);
241 blkif->blk_rings.common.sring = NULL; 268 ring->irq = 0;
242 } 269 }
243 270
244 /* Remove all persistent grants and the cache of ballooned pages. */ 271 if (ring->blk_rings.common.sring) {
245 xen_blkbk_free_caches(blkif); 272 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
273 ring->blk_rings.common.sring = NULL;
274 }
246 275
247 /* Check that there is no request in use */ 276 /* Remove all persistent grants and the cache of ballooned pages. */
248 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { 277 xen_blkbk_free_caches(ring);
249 list_del(&req->free_list);
250 278
251 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) 279 /* Check that there is no request in use */
252 kfree(req->segments[j]); 280 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
281 list_del(&req->free_list);
253 282
254 for (j = 0; j < MAX_INDIRECT_PAGES; j++) 283 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
255 kfree(req->indirect_pages[j]); 284 kfree(req->segments[j]);
256 285
257 kfree(req); 286 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
258 i++; 287 kfree(req->indirect_pages[j]);
259 } 288
289 kfree(req);
290 i++;
291 }
260 292
261 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); 293 BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
294 BUG_ON(!list_empty(&ring->persistent_purge_list));
295 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
296 BUG_ON(!list_empty(&ring->free_pages));
297 BUG_ON(ring->free_pages_num != 0);
298 BUG_ON(ring->persistent_gnt_c != 0);
299 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
300 xen_blkif_put(blkif);
301 }
262 blkif->nr_ring_pages = 0; 302 blkif->nr_ring_pages = 0;
303 /*
304 * blkif->rings was allocated in connect_ring, so we should free it
305 * here.
306 */
307 kfree(blkif->rings);
308 blkif->rings = NULL;
309 blkif->nr_rings = 0;
263 310
264 return 0; 311 return 0;
265} 312}
@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
271 xen_vbd_free(&blkif->vbd); 318 xen_vbd_free(&blkif->vbd);
272 319
273 /* Make sure everything is drained before shutting down */ 320 /* Make sure everything is drained before shutting down */
274 BUG_ON(blkif->persistent_gnt_c != 0);
275 BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
276 BUG_ON(blkif->free_pages_num != 0);
277 BUG_ON(!list_empty(&blkif->persistent_purge_list));
278 BUG_ON(!list_empty(&blkif->free_pages));
279 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
280
281 kmem_cache_free(xen_blkif_cachep, blkif); 321 kmem_cache_free(xen_blkif_cachep, blkif);
282} 322}
283 323
@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
296 * sysfs interface for VBD I/O requests 336 * sysfs interface for VBD I/O requests
297 */ 337 */
298 338
299#define VBD_SHOW(name, format, args...) \ 339#define VBD_SHOW_ALLRING(name, format) \
300 static ssize_t show_##name(struct device *_dev, \ 340 static ssize_t show_##name(struct device *_dev, \
301 struct device_attribute *attr, \ 341 struct device_attribute *attr, \
302 char *buf) \ 342 char *buf) \
303 { \ 343 { \
304 struct xenbus_device *dev = to_xenbus_device(_dev); \ 344 struct xenbus_device *dev = to_xenbus_device(_dev); \
305 struct backend_info *be = dev_get_drvdata(&dev->dev); \ 345 struct backend_info *be = dev_get_drvdata(&dev->dev); \
346 struct xen_blkif *blkif = be->blkif; \
347 unsigned int i; \
348 unsigned long long result = 0; \
306 \ 349 \
307 return sprintf(buf, format, ##args); \ 350 if (!blkif->rings) \
351 goto out; \
352 \
353 for (i = 0; i < blkif->nr_rings; i++) { \
354 struct xen_blkif_ring *ring = &blkif->rings[i]; \
355 \
356 result += ring->st_##name; \
357 } \
358 \
359out: \
360 return sprintf(buf, format, result); \
308 } \ 361 } \
309 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) 362 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
310 363
311VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); 364VBD_SHOW_ALLRING(oo_req, "%llu\n");
312VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); 365VBD_SHOW_ALLRING(rd_req, "%llu\n");
313VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); 366VBD_SHOW_ALLRING(wr_req, "%llu\n");
314VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); 367VBD_SHOW_ALLRING(f_req, "%llu\n");
315VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); 368VBD_SHOW_ALLRING(ds_req, "%llu\n");
316VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); 369VBD_SHOW_ALLRING(rd_sect, "%llu\n");
317VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); 370VBD_SHOW_ALLRING(wr_sect, "%llu\n");
318 371
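
With the statistics now per ring, each sysfs attribute generated by VBD_SHOW_ALLRING() above sums its counter across every ring of the device before printing a single value. Roughly what one expansion does, in plain userspace C with illustrative names:

#include <stdio.h>

struct ring_stats { unsigned long long st_oo_req; };

struct device_stats {
    unsigned int nr_rings;
    struct ring_stats *rings;
};

static unsigned long long total_oo_req(const struct device_stats *d)
{
    unsigned long long result = 0;

    if (!d->rings)                       /* no rings connected yet */
        return 0;
    for (unsigned int i = 0; i < d->nr_rings; i++)
        result += d->rings[i].st_oo_req;
    return result;
}

int main(void)
{
    struct ring_stats rings[3] = { { 5 }, { 7 }, { 1 } };
    struct device_stats dev = { 3, rings };

    printf("oo_req %llu\n", total_oo_req(&dev));
    return 0;
}
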
319static struct attribute *xen_vbdstat_attrs[] = { 372static struct attribute *xen_vbdstat_attrs[] = {
320 &dev_attr_oo_req.attr, 373 &dev_attr_oo_req.attr,
@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
332 .attrs = xen_vbdstat_attrs, 385 .attrs = xen_vbdstat_attrs,
333}; 386};
334 387
388#define VBD_SHOW(name, format, args...) \
389 static ssize_t show_##name(struct device *_dev, \
390 struct device_attribute *attr, \
391 char *buf) \
392 { \
393 struct xenbus_device *dev = to_xenbus_device(_dev); \
394 struct backend_info *be = dev_get_drvdata(&dev->dev); \
395 \
396 return sprintf(buf, format, ##args); \
397 } \
398 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
399
335VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); 400VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
336VBD_SHOW(mode, "%s\n", be->mode); 401VBD_SHOW(mode, "%s\n", be->mode);
337 402
@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
440 505
441 dev_set_drvdata(&dev->dev, NULL); 506 dev_set_drvdata(&dev->dev, NULL);
442 507
443 if (be->blkif) { 508 if (be->blkif)
444 xen_blkif_disconnect(be->blkif); 509 xen_blkif_disconnect(be->blkif);
445 xen_blkif_put(be->blkif);
446 }
447 510
511 /* Put the reference we set in xen_blkif_alloc(). */
512 xen_blkif_put(be->blkif);
448 kfree(be->mode); 513 kfree(be->mode);
449 kfree(be); 514 kfree(be);
450 return 0; 515 return 0;
@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
553 goto fail; 618 goto fail;
554 } 619 }
555 620
621 /* Multi-queue: advertise how many queues are supported by us. */
622 err = xenbus_printf(XBT_NIL, dev->nodename,
623 "multi-queue-max-queues", "%u", xenblk_max_queues);
624 if (err)
625 pr_warn("Error writing multi-queue-max-queues\n");
626
556 /* setup back pointer */ 627 /* setup back pointer */
557 be->blkif->be = be; 628 be->blkif->be = be;
558 629
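
The probe hunk above starts the multi-queue negotiation by advertising "multi-queue-max-queues" in xenstore; the frontend later writes back how many queues it wants, and the backend validates that count when the ring is connected. Below is a toy model of that validation step only; the exact policy shown (default to one queue when the frontend writes nothing, reject requests above the advertised maximum) is an assumption for illustration.

#include <stdio.h>

static int negotiate_queues(unsigned int backend_max,
                            unsigned int frontend_requested)
{
    if (frontend_requested == 0)         /* frontend wrote nothing */
        return 1;
    if (frontend_requested > backend_max)
        return -1;                       /* refuse, do not silently clamp */
    return (int)frontend_requested;
}

int main(void)
{
    printf("%d\n", negotiate_queues(4, 0));   /* -> 1  */
    printf("%d\n", negotiate_queues(4, 3));   /* -> 3  */
    printf("%d\n", negotiate_queues(4, 8));   /* -> -1 */
    return 0;
}
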
@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
708 } 779 }
709 780
710 err = connect_ring(be); 781 err = connect_ring(be);
711 if (err) 782 if (err) {
783 /*
784 * Clean up so that memory resources can be used by
785 * other devices. connect_ring reported already error.
786 */
787 xen_blkif_disconnect(be->blkif);
712 break; 788 break;
789 }
713 xen_update_blkif_status(be->blkif); 790 xen_update_blkif_status(be->blkif);
714 break; 791 break;
715 792
@@ -825,50 +902,43 @@ again:
825 xenbus_transaction_end(xbt, 1); 902 xenbus_transaction_end(xbt, 1);
826} 903}
827 904
828 905/*
829static int connect_ring(struct backend_info *be) 906 * Each ring may have multiple pages, depending on "ring-page-order".
907 */
908static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
830{ 909{
831 struct xenbus_device *dev = be->dev;
832 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; 910 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
833 unsigned int evtchn, nr_grefs, ring_page_order;
834 unsigned int pers_grants;
835 char protocol[64] = "";
836 struct pending_req *req, *n; 911 struct pending_req *req, *n;
837 int err, i, j; 912 int err, i, j;
913 struct xen_blkif *blkif = ring->blkif;
914 struct xenbus_device *dev = blkif->be->dev;
915 unsigned int ring_page_order, nr_grefs, evtchn;
838 916
839 pr_debug("%s %s\n", __func__, dev->otherend); 917 err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
840
841 err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
842 &evtchn); 918 &evtchn);
843 if (err != 1) { 919 if (err != 1) {
844 err = -EINVAL; 920 err = -EINVAL;
845 xenbus_dev_fatal(dev, err, "reading %s/event-channel", 921 xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
846 dev->otherend);
847 return err; 922 return err;
848 } 923 }
849 pr_info("event-channel %u\n", evtchn);
850 924
851 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", 925 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
852 &ring_page_order); 926 &ring_page_order);
853 if (err != 1) { 927 if (err != 1) {
854 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", 928 err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
855 "%u", &ring_ref[0]);
856 if (err != 1) { 929 if (err != 1) {
857 err = -EINVAL; 930 err = -EINVAL;
858 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", 931 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
859 dev->otherend);
860 return err; 932 return err;
861 } 933 }
862 nr_grefs = 1; 934 nr_grefs = 1;
863 pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
864 ring_ref[0]);
865 } else { 935 } else {
866 unsigned int i; 936 unsigned int i;
867 937
868 if (ring_page_order > xen_blkif_max_ring_order) { 938 if (ring_page_order > xen_blkif_max_ring_order) {
869 err = -EINVAL; 939 err = -EINVAL;
870 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", 940 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
871 dev->otherend, ring_page_order, 941 dir, ring_page_order,
872 xen_blkif_max_ring_order); 942 xen_blkif_max_ring_order);
873 return err; 943 return err;
874 } 944 }
@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
878 char ring_ref_name[RINGREF_NAME_LEN]; 948 char ring_ref_name[RINGREF_NAME_LEN];
879 949
880 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 950 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
881 err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, 951 err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
882 "%u", &ring_ref[i]); 952 "%u", &ring_ref[i]);
883 if (err != 1) { 953 if (err != 1) {
884 err = -EINVAL; 954 err = -EINVAL;
885 xenbus_dev_fatal(dev, err, "reading %s/%s", 955 xenbus_dev_fatal(dev, err, "reading %s/%s",
886 dev->otherend, ring_ref_name); 956 dir, ring_ref_name);
887 return err; 957 return err;
888 } 958 }
889 pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
890 } 959 }
891 } 960 }
892 961 blkif->nr_ring_pages = nr_grefs;
893 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
894 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
895 "%63s", protocol, NULL);
896 if (err)
897 strcpy(protocol, "unspecified, assuming default");
898 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
899 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
900 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
901 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
902 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
903 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
904 else {
905 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
906 return -1;
907 }
908 err = xenbus_gather(XBT_NIL, dev->otherend,
909 "feature-persistent", "%u",
910 &pers_grants, NULL);
911 if (err)
912 pers_grants = 0;
913
914 be->blkif->vbd.feature_gnt_persistent = pers_grants;
915 be->blkif->vbd.overflow_max_grants = 0;
916 be->blkif->nr_ring_pages = nr_grefs;
917
918 pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
919 nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
920 pers_grants ? "persistent grants" : "");
921 962
922 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { 963 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
923 req = kzalloc(sizeof(*req), GFP_KERNEL); 964 req = kzalloc(sizeof(*req), GFP_KERNEL);
924 if (!req) 965 if (!req)
925 goto fail; 966 goto fail;
926 list_add_tail(&req->free_list, &be->blkif->pending_free); 967 list_add_tail(&req->free_list, &ring->pending_free);
927 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 968 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
928 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); 969 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
929 if (!req->segments[j]) 970 if (!req->segments[j])
@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
938 } 979 }
939 980
940 /* Map the shared frame, irq etc. */ 981 /* Map the shared frame, irq etc. */
941 err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); 982 err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
942 if (err) { 983 if (err) {
943 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); 984 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
944 return err; 985 return err;
@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
947 return 0; 988 return 0;
948 989
949fail: 990fail:
950 list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { 991 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
951 list_del(&req->free_list); 992 list_del(&req->free_list);
952 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 993 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
953 if (!req->segments[j]) 994 if (!req->segments[j])
@@ -962,6 +1003,93 @@ fail:
962 kfree(req); 1003 kfree(req);
963 } 1004 }
964 return -ENOMEM; 1005 return -ENOMEM;
1006
1007}
1008
1009static int connect_ring(struct backend_info *be)
1010{
1011 struct xenbus_device *dev = be->dev;
1012 unsigned int pers_grants;
1013 char protocol[64] = "";
1014 int err, i;
1015 char *xspath;
1016 size_t xspathsize;
1017 const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
1018 unsigned int requested_num_queues = 0;
1019
1020 pr_debug("%s %s\n", __func__, dev->otherend);
1021
1022 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
1023 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
1024 "%63s", protocol, NULL);
1025 if (err)
1026 strcpy(protocol, "unspecified, assuming default");
1027 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
1028 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
1029 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
1030 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
1031 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
1032 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
1033 else {
1034 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
1035 return -ENOSYS;
1036 }
1037 err = xenbus_gather(XBT_NIL, dev->otherend,
1038 "feature-persistent", "%u",
1039 &pers_grants, NULL);
1040 if (err)
1041 pers_grants = 0;
1042
1043 be->blkif->vbd.feature_gnt_persistent = pers_grants;
1044 be->blkif->vbd.overflow_max_grants = 0;
1045
1046 /*
 1047	 * Read the number of hardware queues from the frontend.
1048 */
1049 err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
1050 "%u", &requested_num_queues);
1051 if (err < 0) {
1052 requested_num_queues = 1;
1053 } else {
1054 if (requested_num_queues > xenblk_max_queues
1055 || requested_num_queues == 0) {
1056 /* Buggy or malicious guest. */
1057 xenbus_dev_fatal(dev, err,
1058 "guest requested %u queues, exceeding the maximum of %u.",
1059 requested_num_queues, xenblk_max_queues);
1060 return -ENOSYS;
1061 }
1062 }
1063 be->blkif->nr_rings = requested_num_queues;
1064 if (xen_blkif_alloc_rings(be->blkif))
1065 return -ENOMEM;
1066
1067 pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
1068 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
1069 pers_grants ? "persistent grants" : "");
1070
1071 if (be->blkif->nr_rings == 1)
1072 return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
1073 else {
1074 xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
1075 xspath = kmalloc(xspathsize, GFP_KERNEL);
1076 if (!xspath) {
1077 xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
1078 return -ENOMEM;
1079 }
1080
1081 for (i = 0; i < be->blkif->nr_rings; i++) {
1082 memset(xspath, 0, xspathsize);
1083 snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
1084 err = read_per_ring_refs(&be->blkif->rings[i], xspath);
1085 if (err) {
1086 kfree(xspath);
1087 return err;
1088 }
1089 }
1090 kfree(xspath);
1091 }
1092 return 0;
965} 1093}
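To make the per-queue layout easier to picture, a small standalone sketch follows; it prints the xenstore keys read_per_ring_refs() ends up reading for a hypothetical two-queue frontend. The device path is invented for illustration; only the "%s/queue-%u" construction and the key names mirror the code above.

/* Illustrative sketch only: prints the per-queue xenstore keys for a
 * made-up two-queue frontend. */
#include <stdio.h>

int main(void)
{
	const char *otherend = "/local/domain/1/device/vbd/51712"; /* example */
	unsigned int nr_rings = 2, i;
	char xspath[64];

	for (i = 0; i < nr_rings; i++) {
		/* Same "%s/queue-%u" construction used by connect_ring(). */
		snprintf(xspath, sizeof(xspath), "%s/queue-%u", otherend, i);
		printf("%s/ring-ref\n", xspath);	/* or ring-ref0..N */
		printf("%s/event-channel\n", xspath);
	}
	/* With a single queue the same keys live directly under otherend,
	 * which keeps old single-ring frontends working unchanged. */
	return 0;
}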
966 1094
967static const struct xenbus_device_id xen_blkbk_ids[] = { 1095static const struct xenbus_device_id xen_blkbk_ids[] = {
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2fee2eef988d..8a8dc91c39f7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
60 60
61#include <asm/xen/hypervisor.h> 61#include <asm/xen/hypervisor.h>
62 62
63/*
64 * The minimal size of segment supported by the block framework is PAGE_SIZE.
65 * When Linux is using a different page size than Xen, it may not be possible
66 * to put all the data in a single segment.
 67 * This can happen when the backend doesn't support indirect descriptors and
68 * therefore the maximum amount of data that a request can carry is
69 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
70 *
71 * Note that we only support one extra request. So the Linux page size
72 * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
73 * 88KB.
74 */
75#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
76
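A quick arithmetic sketch of where the 44KB/88KB figures in the comment above come from, assuming the usual values of 11 segments per blkif request and a 4 KiB Xen page size (both defined elsewhere in the tree):

/* Illustrative arithmetic only; the two constants are assumed values. */
#include <stdio.h>

#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11	/* assumed */
#define XEN_PAGE_SIZE 4096			/* assumed */

int main(void)
{
	unsigned int per_req = BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE;

	printf("one request carries %u KB\n", per_req / 1024);		/* 44 */
	printf("two requests carry  %u KB\n", 2 * per_req / 1024);	/* 88 */
	/* A 64 KB Linux page spans 16 Xen pages, more than the 11 segments
	 * one request can hold, hence HAS_EXTRA_REQ above. */
	return 0;
}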
63enum blkif_state { 77enum blkif_state {
64 BLKIF_STATE_DISCONNECTED, 78 BLKIF_STATE_DISCONNECTED,
65 BLKIF_STATE_CONNECTED, 79 BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
72 struct list_head node; 86 struct list_head node;
73}; 87};
74 88
89enum blk_req_status {
90 REQ_WAITING,
91 REQ_DONE,
92 REQ_ERROR,
93 REQ_EOPNOTSUPP,
94};
95
75struct blk_shadow { 96struct blk_shadow {
76 struct blkif_request req; 97 struct blkif_request req;
77 struct request *request; 98 struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
79 struct grant **indirect_grants; 100 struct grant **indirect_grants;
80 struct scatterlist *sg; 101 struct scatterlist *sg;
81 unsigned int num_sg; 102 unsigned int num_sg;
103 enum blk_req_status status;
104
105 #define NO_ASSOCIATED_ID ~0UL
106 /*
107 * Id of the sibling if we ever need 2 requests when handling a
108 * block I/O request
109 */
110 unsigned long associated_id;
82}; 111};
83 112
84struct split_bio { 113struct split_bio {
@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
99module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); 128module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
100MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); 129MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
101 130
131static unsigned int xen_blkif_max_queues = 4;
132module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
133MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
134
102/* 135/*
103 * Maximum order of pages to be used for the shared ring between front and 136 * Maximum order of pages to be used for the shared ring between front and
104 * backend, 4KB page granularity is used. 137 * backend, 4KB page granularity is used.
@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
114 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) 147 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
115 148
116/* 149/*
117 * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 150 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
118 * characters are enough. Define to 20 to keep consist with backend. 151 * characters are enough. Define to 20 to keep consistent with backend.
119 */ 152 */
120#define RINGREF_NAME_LEN (20) 153#define RINGREF_NAME_LEN (20)
154/*
155 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
156 */
157#define QUEUE_NAME_LEN (17)
158
159/*
160 * Per-ring info.
161 * Every blkfront device can associate with one or more blkfront_ring_info,
 162 * depending on how many hardware queues/rings are to be used.
163 */
164struct blkfront_ring_info {
165 /* Lock to protect data in every ring buffer. */
166 spinlock_t ring_lock;
167 struct blkif_front_ring ring;
168 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
169 unsigned int evtchn, irq;
170 struct work_struct work;
171 struct gnttab_free_callback callback;
172 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
173 struct list_head indirect_pages;
174 struct list_head grants;
175 unsigned int persistent_gnts_c;
176 unsigned long shadow_free;
177 struct blkfront_info *dev_info;
178};
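As a rough standalone model (field names trimmed and hypothetical), this is how the new split between per-device and per-ring state is meant to be used: slow-path, device-wide fields stay in blkfront_info, while everything touched per I/O lives in blkfront_ring_info, reached through info->rinfo[i] and carrying a back-pointer to the device.

/* Illustrative layout only: a trimmed, hypothetical model of the
 * per-device vs. per-ring split. */
#include <stdio.h>
#include <stdlib.h>

struct dev_info;

struct ring_info {			/* stands in for blkfront_ring_info */
	unsigned int evtchn;
	unsigned long shadow_free;
	struct dev_info *dev_info;	/* back-pointer, as in the patch */
};

struct dev_info {			/* stands in for blkfront_info */
	unsigned int nr_rings;
	struct ring_info *rinfo;	/* array of nr_rings entries */
};

int main(void)
{
	struct dev_info info = { .nr_rings = 2 };
	unsigned int i;

	info.rinfo = calloc(info.nr_rings, sizeof(*info.rinfo));
	if (!info.rinfo)
		return 1;
	for (i = 0; i < info.nr_rings; i++)
		info.rinfo[i].dev_info = &info;	/* set up the back-pointers */

	printf("ring 1 belongs to a device with %u rings\n",
	       info.rinfo[1].dev_info->nr_rings);
	free(info.rinfo);
	return 0;
}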
121 179
122/* 180/*
123 * We have one of these per vbd, whether ide, scsi or 'other'. They 181 * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
126 */ 184 */
127struct blkfront_info 185struct blkfront_info
128{ 186{
129 spinlock_t io_lock;
130 struct mutex mutex; 187 struct mutex mutex;
131 struct xenbus_device *xbdev; 188 struct xenbus_device *xbdev;
132 struct gendisk *gd; 189 struct gendisk *gd;
133 int vdevice; 190 int vdevice;
134 blkif_vdev_t handle; 191 blkif_vdev_t handle;
135 enum blkif_state connected; 192 enum blkif_state connected;
136 int ring_ref[XENBUS_MAX_RING_GRANTS]; 193 /* Number of pages per ring buffer. */
137 unsigned int nr_ring_pages; 194 unsigned int nr_ring_pages;
138 struct blkif_front_ring ring;
139 unsigned int evtchn, irq;
140 struct request_queue *rq; 195 struct request_queue *rq;
141 struct work_struct work;
142 struct gnttab_free_callback callback;
143 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
144 struct list_head grants;
145 struct list_head indirect_pages;
146 unsigned int persistent_gnts_c;
147 unsigned long shadow_free;
148 unsigned int feature_flush; 196 unsigned int feature_flush;
149 unsigned int feature_discard:1; 197 unsigned int feature_discard:1;
150 unsigned int feature_secdiscard:1; 198 unsigned int feature_secdiscard:1;
@@ -155,6 +203,8 @@ struct blkfront_info
155 unsigned int max_indirect_segments; 203 unsigned int max_indirect_segments;
156 int is_ready; 204 int is_ready;
157 struct blk_mq_tag_set tag_set; 205 struct blk_mq_tag_set tag_set;
206 struct blkfront_ring_info *rinfo;
207 unsigned int nr_rings;
158}; 208};
159 209
160static unsigned int nr_minors; 210static unsigned int nr_minors;
@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
198 248
199#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) 249#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
200 250
201static int blkfront_setup_indirect(struct blkfront_info *info); 251static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
202static int blkfront_gather_backend_features(struct blkfront_info *info); 252static void blkfront_gather_backend_features(struct blkfront_info *info);
203 253
204static int get_id_from_freelist(struct blkfront_info *info) 254static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
205{ 255{
206 unsigned long free = info->shadow_free; 256 unsigned long free = rinfo->shadow_free;
207 BUG_ON(free >= BLK_RING_SIZE(info)); 257
208 info->shadow_free = info->shadow[free].req.u.rw.id; 258 BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
209 info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ 259 rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
260 rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
210 return free; 261 return free;
211} 262}
212 263
213static int add_id_to_freelist(struct blkfront_info *info, 264static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
214 unsigned long id) 265 unsigned long id)
215{ 266{
216 if (info->shadow[id].req.u.rw.id != id) 267 if (rinfo->shadow[id].req.u.rw.id != id)
217 return -EINVAL; 268 return -EINVAL;
218 if (info->shadow[id].request == NULL) 269 if (rinfo->shadow[id].request == NULL)
219 return -EINVAL; 270 return -EINVAL;
220 info->shadow[id].req.u.rw.id = info->shadow_free; 271 rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
221 info->shadow[id].request = NULL; 272 rinfo->shadow[id].request = NULL;
222 info->shadow_free = id; 273 rinfo->shadow_free = id;
223 return 0; 274 return 0;
224} 275}
225 276
226static int fill_grant_buffer(struct blkfront_info *info, int num) 277static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
227{ 278{
279 struct blkfront_info *info = rinfo->dev_info;
228 struct page *granted_page; 280 struct page *granted_page;
229 struct grant *gnt_list_entry, *n; 281 struct grant *gnt_list_entry, *n;
230 int i = 0; 282 int i = 0;
231 283
232 while(i < num) { 284 while (i < num) {
233 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); 285 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
234 if (!gnt_list_entry) 286 if (!gnt_list_entry)
235 goto out_of_memory; 287 goto out_of_memory;
@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
244 } 296 }
245 297
246 gnt_list_entry->gref = GRANT_INVALID_REF; 298 gnt_list_entry->gref = GRANT_INVALID_REF;
247 list_add(&gnt_list_entry->node, &info->grants); 299 list_add(&gnt_list_entry->node, &rinfo->grants);
248 i++; 300 i++;
249 } 301 }
250 302
@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
252 304
253out_of_memory: 305out_of_memory:
254 list_for_each_entry_safe(gnt_list_entry, n, 306 list_for_each_entry_safe(gnt_list_entry, n,
255 &info->grants, node) { 307 &rinfo->grants, node) {
256 list_del(&gnt_list_entry->node); 308 list_del(&gnt_list_entry->node);
257 if (info->feature_persistent) 309 if (info->feature_persistent)
258 __free_page(gnt_list_entry->page); 310 __free_page(gnt_list_entry->page);
@@ -263,17 +315,17 @@ out_of_memory:
263 return -ENOMEM; 315 return -ENOMEM;
264} 316}
265 317
266static struct grant *get_free_grant(struct blkfront_info *info) 318static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
267{ 319{
268 struct grant *gnt_list_entry; 320 struct grant *gnt_list_entry;
269 321
270 BUG_ON(list_empty(&info->grants)); 322 BUG_ON(list_empty(&rinfo->grants));
271 gnt_list_entry = list_first_entry(&info->grants, struct grant, 323 gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
272 node); 324 node);
273 list_del(&gnt_list_entry->node); 325 list_del(&gnt_list_entry->node);
274 326
275 if (gnt_list_entry->gref != GRANT_INVALID_REF) 327 if (gnt_list_entry->gref != GRANT_INVALID_REF)
276 info->persistent_gnts_c--; 328 rinfo->persistent_gnts_c--;
277 329
278 return gnt_list_entry; 330 return gnt_list_entry;
279} 331}
@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
289 341
290static struct grant *get_grant(grant_ref_t *gref_head, 342static struct grant *get_grant(grant_ref_t *gref_head,
291 unsigned long gfn, 343 unsigned long gfn,
292 struct blkfront_info *info) 344 struct blkfront_ring_info *rinfo)
293{ 345{
294 struct grant *gnt_list_entry = get_free_grant(info); 346 struct grant *gnt_list_entry = get_free_grant(rinfo);
347 struct blkfront_info *info = rinfo->dev_info;
295 348
296 if (gnt_list_entry->gref != GRANT_INVALID_REF) 349 if (gnt_list_entry->gref != GRANT_INVALID_REF)
297 return gnt_list_entry; 350 return gnt_list_entry;
@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
312} 365}
313 366
314static struct grant *get_indirect_grant(grant_ref_t *gref_head, 367static struct grant *get_indirect_grant(grant_ref_t *gref_head,
315 struct blkfront_info *info) 368 struct blkfront_ring_info *rinfo)
316{ 369{
317 struct grant *gnt_list_entry = get_free_grant(info); 370 struct grant *gnt_list_entry = get_free_grant(rinfo);
371 struct blkfront_info *info = rinfo->dev_info;
318 372
319 if (gnt_list_entry->gref != GRANT_INVALID_REF) 373 if (gnt_list_entry->gref != GRANT_INVALID_REF)
320 return gnt_list_entry; 374 return gnt_list_entry;
@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
326 struct page *indirect_page; 380 struct page *indirect_page;
327 381
328 /* Fetch a pre-allocated page to use for indirect grefs */ 382 /* Fetch a pre-allocated page to use for indirect grefs */
329 BUG_ON(list_empty(&info->indirect_pages)); 383 BUG_ON(list_empty(&rinfo->indirect_pages));
330 indirect_page = list_first_entry(&info->indirect_pages, 384 indirect_page = list_first_entry(&rinfo->indirect_pages,
331 struct page, lru); 385 struct page, lru);
332 list_del(&indirect_page->lru); 386 list_del(&indirect_page->lru);
333 gnt_list_entry->page = indirect_page; 387 gnt_list_entry->page = indirect_page;
@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
403 457
404static void blkif_restart_queue_callback(void *arg) 458static void blkif_restart_queue_callback(void *arg)
405{ 459{
406 struct blkfront_info *info = (struct blkfront_info *)arg; 460 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
407 schedule_work(&info->work); 461 schedule_work(&rinfo->work);
408} 462}
409 463
410static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) 464static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
456 return 0; 510 return 0;
457} 511}
458 512
459static int blkif_queue_discard_req(struct request *req) 513static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
514 struct request *req,
515 struct blkif_request **ring_req)
460{ 516{
461 struct blkfront_info *info = req->rq_disk->private_data; 517 unsigned long id;
518
519 *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
520 rinfo->ring.req_prod_pvt++;
521
522 id = get_id_from_freelist(rinfo);
523 rinfo->shadow[id].request = req;
524 rinfo->shadow[id].status = REQ_WAITING;
525 rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
526
527 (*ring_req)->u.rw.id = id;
528
529 return id;
530}
531
532static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
533{
534 struct blkfront_info *info = rinfo->dev_info;
462 struct blkif_request *ring_req; 535 struct blkif_request *ring_req;
463 unsigned long id; 536 unsigned long id;
464 537
465 /* Fill out a communications ring structure. */ 538 /* Fill out a communications ring structure. */
466 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 539 id = blkif_ring_get_request(rinfo, req, &ring_req);
467 id = get_id_from_freelist(info);
468 info->shadow[id].request = req;
469 540
470 ring_req->operation = BLKIF_OP_DISCARD; 541 ring_req->operation = BLKIF_OP_DISCARD;
471 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 542 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
476 else 547 else
477 ring_req->u.discard.flag = 0; 548 ring_req->u.discard.flag = 0;
478 549
479 info->ring.req_prod_pvt++;
480
481 /* Keep a private copy so we can reissue requests when recovering. */ 550 /* Keep a private copy so we can reissue requests when recovering. */
482 info->shadow[id].req = *ring_req; 551 rinfo->shadow[id].req = *ring_req;
483 552
484 return 0; 553 return 0;
485} 554}
@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
487struct setup_rw_req { 556struct setup_rw_req {
488 unsigned int grant_idx; 557 unsigned int grant_idx;
489 struct blkif_request_segment *segments; 558 struct blkif_request_segment *segments;
490 struct blkfront_info *info; 559 struct blkfront_ring_info *rinfo;
491 struct blkif_request *ring_req; 560 struct blkif_request *ring_req;
492 grant_ref_t gref_head; 561 grant_ref_t gref_head;
493 unsigned int id; 562 unsigned int id;
@@ -495,6 +564,9 @@ struct setup_rw_req {
495 bool need_copy; 564 bool need_copy;
496 unsigned int bvec_off; 565 unsigned int bvec_off;
497 char *bvec_data; 566 char *bvec_data;
567
568 bool require_extra_req;
569 struct blkif_request *extra_ring_req;
498}; 570};
499 571
500static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, 572static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
507 /* Convenient aliases */ 579 /* Convenient aliases */
508 unsigned int grant_idx = setup->grant_idx; 580 unsigned int grant_idx = setup->grant_idx;
509 struct blkif_request *ring_req = setup->ring_req; 581 struct blkif_request *ring_req = setup->ring_req;
510 struct blkfront_info *info = setup->info; 582 struct blkfront_ring_info *rinfo = setup->rinfo;
511 struct blk_shadow *shadow = &info->shadow[setup->id]; 583 /*
 584	 * We always use the shadow of the first request to store the list
 585	 * of grants associated with the block I/O request. This makes the
 586	 * completion easier to handle even if the block I/O request is
 587	 * split.
588 */
589 struct blk_shadow *shadow = &rinfo->shadow[setup->id];
590
591 if (unlikely(setup->require_extra_req &&
592 grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
593 /*
 594		 * We are using the second request; set up grant_idx
 595		 * to index into its segment array.
596 */
597 grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
598 ring_req = setup->extra_ring_req;
599 }
512 600
513 if ((ring_req->operation == BLKIF_OP_INDIRECT) && 601 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
514 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { 602 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
516 kunmap_atomic(setup->segments); 604 kunmap_atomic(setup->segments);
517 605
518 n = grant_idx / GRANTS_PER_INDIRECT_FRAME; 606 n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
519 gnt_list_entry = get_indirect_grant(&setup->gref_head, info); 607 gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
520 shadow->indirect_grants[n] = gnt_list_entry; 608 shadow->indirect_grants[n] = gnt_list_entry;
521 setup->segments = kmap_atomic(gnt_list_entry->page); 609 setup->segments = kmap_atomic(gnt_list_entry->page);
522 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; 610 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
523 } 611 }
524 612
525 gnt_list_entry = get_grant(&setup->gref_head, gfn, info); 613 gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
526 ref = gnt_list_entry->gref; 614 ref = gnt_list_entry->gref;
527 shadow->grants_used[grant_idx] = gnt_list_entry; 615 /*
616 * All the grants are stored in the shadow of the first
617 * request. Therefore we have to use the global index.
618 */
619 shadow->grants_used[setup->grant_idx] = gnt_list_entry;
528 620
529 if (setup->need_copy) { 621 if (setup->need_copy) {
530 void *shared_data; 622 void *shared_data;
@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
566 (setup->grant_idx)++; 658 (setup->grant_idx)++;
567} 659}
568 660
569static int blkif_queue_rw_req(struct request *req) 661static void blkif_setup_extra_req(struct blkif_request *first,
662 struct blkif_request *second)
570{ 663{
571 struct blkfront_info *info = req->rq_disk->private_data; 664 uint16_t nr_segments = first->u.rw.nr_segments;
572 struct blkif_request *ring_req; 665
573 unsigned long id; 666 /*
667 * The second request is only present when the first request uses
668 * all its segments. It's always the continuity of the first one.
669 */
670 first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
671
672 second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
673 second->u.rw.sector_number = first->u.rw.sector_number +
674 (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
675
676 second->u.rw.handle = first->u.rw.handle;
677 second->operation = first->operation;
678}
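A worked example of the split performed by blkif_setup_extra_req(), modelled standalone with assumed values (11 segments per request, 4 KiB Xen pages, 512-byte sectors): a 16-segment request becomes an 11-segment first half plus a 5-segment second half starting 88 sectors later.

/* Illustrative only: the bookkeeping of blkif_setup_extra_req() with plain
 * integers (11 segments per request, 4 KiB pages, 512-byte sectors assumed). */
#include <stdio.h>
#include <stdint.h>

#define MAX_SEGS	11
#define XEN_PAGE_SIZE	4096

struct rw_req {
	uint16_t nr_segments;
	uint64_t sector_number;
};

static void setup_extra_req(struct rw_req *first, struct rw_req *second)
{
	uint16_t nr_segments = first->nr_segments;

	first->nr_segments = MAX_SEGS;
	second->nr_segments = nr_segments - MAX_SEGS;
	/* The second request simply continues where the first one stops. */
	second->sector_number = first->sector_number +
				(MAX_SEGS * XEN_PAGE_SIZE) / 512;
}

int main(void)
{
	struct rw_req first = { .nr_segments = 16, .sector_number = 1000 };
	struct rw_req second = { 0 };

	setup_extra_req(&first, &second);
	printf("first:  %u segments at sector %llu\n",
	       (unsigned int)first.nr_segments,
	       (unsigned long long)first.sector_number);	/* 11 @ 1000 */
	printf("second: %u segments at sector %llu\n",
	       (unsigned int)second.nr_segments,
	       (unsigned long long)second.sector_number);	/* 5 @ 1088 */
	return 0;
}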
679
680static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
681{
682 struct blkfront_info *info = rinfo->dev_info;
683 struct blkif_request *ring_req, *extra_ring_req = NULL;
684 unsigned long id, extra_id = NO_ASSOCIATED_ID;
685 bool require_extra_req = false;
574 int i; 686 int i;
575 struct setup_rw_req setup = { 687 struct setup_rw_req setup = {
576 .grant_idx = 0, 688 .grant_idx = 0,
577 .segments = NULL, 689 .segments = NULL,
578 .info = info, 690 .rinfo = rinfo,
579 .need_copy = rq_data_dir(req) && info->feature_persistent, 691 .need_copy = rq_data_dir(req) && info->feature_persistent,
580 }; 692 };
581 693
@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
584 * existing persistent grants, or if we have to get new grants, 696 * existing persistent grants, or if we have to get new grants,
585 * as there are not sufficiently many free. 697 * as there are not sufficiently many free.
586 */ 698 */
587 bool new_persistent_gnts;
588 struct scatterlist *sg; 699 struct scatterlist *sg;
589 int num_sg, max_grefs, num_grant; 700 int num_sg, max_grefs, num_grant;
590 701
@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
596 */ 707 */
597 max_grefs += INDIRECT_GREFS(max_grefs); 708 max_grefs += INDIRECT_GREFS(max_grefs);
598 709
599 /* Check if we have enough grants to allocate a requests */ 710 /*
600 if (info->persistent_gnts_c < max_grefs) { 711 * We have to reserve 'max_grefs' grants because persistent
601 new_persistent_gnts = 1; 712 * grants are shared by all rings.
602 if (gnttab_alloc_grant_references( 713 */
603 max_grefs - info->persistent_gnts_c, 714 if (max_grefs > 0)
604 &setup.gref_head) < 0) { 715 if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
605 gnttab_request_free_callback( 716 gnttab_request_free_callback(
606 &info->callback, 717 &rinfo->callback,
607 blkif_restart_queue_callback, 718 blkif_restart_queue_callback,
608 info, 719 rinfo,
609 max_grefs); 720 max_grefs);
610 return 1; 721 return 1;
611 } 722 }
612 } else
613 new_persistent_gnts = 0;
614 723
615 /* Fill out a communications ring structure. */ 724 /* Fill out a communications ring structure. */
616 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 725 id = blkif_ring_get_request(rinfo, req, &ring_req);
617 id = get_id_from_freelist(info);
618 info->shadow[id].request = req;
619
620 BUG_ON(info->max_indirect_segments == 0 &&
621 GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
622 BUG_ON(info->max_indirect_segments &&
623 GREFS(req->nr_phys_segments) > info->max_indirect_segments);
624 726
625 num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); 727 num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
626 num_grant = 0; 728 num_grant = 0;
627 /* Calculate the number of grant used */ 729 /* Calculate the number of grant used */
628 for_each_sg(info->shadow[id].sg, sg, num_sg, i) 730 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
629 num_grant += gnttab_count_grant(sg->offset, sg->length); 731 num_grant += gnttab_count_grant(sg->offset, sg->length);
630 732
631 ring_req->u.rw.id = id; 733 require_extra_req = info->max_indirect_segments == 0 &&
632 info->shadow[id].num_sg = num_sg; 734 num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
633 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { 735 BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
736
737 rinfo->shadow[id].num_sg = num_sg;
738 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
739 likely(!require_extra_req)) {
634 /* 740 /*
635 * The indirect operation can only be a BLKIF_OP_READ or 741 * The indirect operation can only be a BLKIF_OP_READ or
636 * BLKIF_OP_WRITE 742 * BLKIF_OP_WRITE
@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
670 } 776 }
671 } 777 }
672 ring_req->u.rw.nr_segments = num_grant; 778 ring_req->u.rw.nr_segments = num_grant;
779 if (unlikely(require_extra_req)) {
780 extra_id = blkif_ring_get_request(rinfo, req,
781 &extra_ring_req);
782 /*
783 * Only the first request contains the scatter-gather
784 * list.
785 */
786 rinfo->shadow[extra_id].num_sg = 0;
787
788 blkif_setup_extra_req(ring_req, extra_ring_req);
789
790 /* Link the 2 requests together */
791 rinfo->shadow[extra_id].associated_id = id;
792 rinfo->shadow[id].associated_id = extra_id;
793 }
673 } 794 }
674 795
675 setup.ring_req = ring_req; 796 setup.ring_req = ring_req;
676 setup.id = id; 797 setup.id = id;
677 for_each_sg(info->shadow[id].sg, sg, num_sg, i) { 798
799 setup.require_extra_req = require_extra_req;
800 if (unlikely(require_extra_req))
801 setup.extra_ring_req = extra_ring_req;
802
803 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
678 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 804 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
679 805
680 if (setup.need_copy) { 806 if (setup.need_copy) {
@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
694 if (setup.segments) 820 if (setup.segments)
695 kunmap_atomic(setup.segments); 821 kunmap_atomic(setup.segments);
696 822
697 info->ring.req_prod_pvt++;
698
699 /* Keep a private copy so we can reissue requests when recovering. */ 823 /* Keep a private copy so we can reissue requests when recovering. */
700 info->shadow[id].req = *ring_req; 824 rinfo->shadow[id].req = *ring_req;
825 if (unlikely(require_extra_req))
826 rinfo->shadow[extra_id].req = *extra_ring_req;
701 827
702 if (new_persistent_gnts) 828 if (max_grefs > 0)
703 gnttab_free_grant_references(setup.gref_head); 829 gnttab_free_grant_references(setup.gref_head);
704 830
705 return 0; 831 return 0;
@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
711 * 837 *
712 * @req: a request struct 838 * @req: a request struct
713 */ 839 */
714static int blkif_queue_request(struct request *req) 840static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
715{ 841{
716 struct blkfront_info *info = req->rq_disk->private_data; 842 if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
717
718 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
719 return 1; 843 return 1;
720 844
721 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) 845 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
722 return blkif_queue_discard_req(req); 846 return blkif_queue_discard_req(req, rinfo);
723 else 847 else
724 return blkif_queue_rw_req(req); 848 return blkif_queue_rw_req(req, rinfo);
725} 849}
726 850
727static inline void flush_requests(struct blkfront_info *info) 851static inline void flush_requests(struct blkfront_ring_info *rinfo)
728{ 852{
729 int notify; 853 int notify;
730 854
731 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); 855 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
732 856
733 if (notify) 857 if (notify)
734 notify_remote_via_irq(info->irq); 858 notify_remote_via_irq(rinfo->irq);
735} 859}
736 860
737static inline bool blkif_request_flush_invalid(struct request *req, 861static inline bool blkif_request_flush_invalid(struct request *req,
@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
745} 869}
746 870
747static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, 871static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
748 const struct blk_mq_queue_data *qd) 872 const struct blk_mq_queue_data *qd)
749{ 873{
750 struct blkfront_info *info = qd->rq->rq_disk->private_data; 874 unsigned long flags;
875 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
751 876
752 blk_mq_start_request(qd->rq); 877 blk_mq_start_request(qd->rq);
753 spin_lock_irq(&info->io_lock); 878 spin_lock_irqsave(&rinfo->ring_lock, flags);
754 if (RING_FULL(&info->ring)) 879 if (RING_FULL(&rinfo->ring))
755 goto out_busy; 880 goto out_busy;
756 881
757 if (blkif_request_flush_invalid(qd->rq, info)) 882 if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
758 goto out_err; 883 goto out_err;
759 884
760 if (blkif_queue_request(qd->rq)) 885 if (blkif_queue_request(qd->rq, rinfo))
761 goto out_busy; 886 goto out_busy;
762 887
763 flush_requests(info); 888 flush_requests(rinfo);
764 spin_unlock_irq(&info->io_lock); 889 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
765 return BLK_MQ_RQ_QUEUE_OK; 890 return BLK_MQ_RQ_QUEUE_OK;
766 891
767out_err: 892out_err:
768 spin_unlock_irq(&info->io_lock); 893 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
769 return BLK_MQ_RQ_QUEUE_ERROR; 894 return BLK_MQ_RQ_QUEUE_ERROR;
770 895
771out_busy: 896out_busy:
772 spin_unlock_irq(&info->io_lock); 897 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
773 blk_mq_stop_hw_queue(hctx); 898 blk_mq_stop_hw_queue(hctx);
774 return BLK_MQ_RQ_QUEUE_BUSY; 899 return BLK_MQ_RQ_QUEUE_BUSY;
775} 900}
776 901
902static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
903 unsigned int index)
904{
905 struct blkfront_info *info = (struct blkfront_info *)data;
906
907 BUG_ON(info->nr_rings <= index);
908 hctx->driver_data = &info->rinfo[index];
909 return 0;
910}
911
777static struct blk_mq_ops blkfront_mq_ops = { 912static struct blk_mq_ops blkfront_mq_ops = {
778 .queue_rq = blkif_queue_rq, 913 .queue_rq = blkif_queue_rq,
779 .map_queue = blk_mq_map_queue, 914 .map_queue = blk_mq_map_queue,
915 .init_hctx = blk_mq_init_hctx,
780}; 916};
781 917
782static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, 918static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
788 924
789 memset(&info->tag_set, 0, sizeof(info->tag_set)); 925 memset(&info->tag_set, 0, sizeof(info->tag_set));
790 info->tag_set.ops = &blkfront_mq_ops; 926 info->tag_set.ops = &blkfront_mq_ops;
791 info->tag_set.nr_hw_queues = 1; 927 info->tag_set.nr_hw_queues = info->nr_rings;
792 info->tag_set.queue_depth = BLK_RING_SIZE(info); 928 if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
929 /*
 930		 * When indirect descriptors are not supported, the I/O request
 931		 * will be split between multiple requests in the ring.
 932		 * To avoid problems when sending the requests, divide the
 933		 * queue depth by 2.
934 */
935 info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
936 } else
937 info->tag_set.queue_depth = BLK_RING_SIZE(info);
793 info->tag_set.numa_node = NUMA_NO_NODE; 938 info->tag_set.numa_node = NUMA_NO_NODE;
794 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 939 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
795 info->tag_set.cmd_size = 0; 940 info->tag_set.cmd_size = 0;
796 info->tag_set.driver_data = info; 941 info->tag_set.driver_data = info;
797 942
798 if (blk_mq_alloc_tag_set(&info->tag_set)) 943 if (blk_mq_alloc_tag_set(&info->tag_set))
799 return -1; 944 return -EINVAL;
800 rq = blk_mq_init_queue(&info->tag_set); 945 rq = blk_mq_init_queue(&info->tag_set);
801 if (IS_ERR(rq)) { 946 if (IS_ERR(rq)) {
802 blk_mq_free_tag_set(&info->tag_set); 947 blk_mq_free_tag_set(&info->tag_set);
803 return -1; 948 return PTR_ERR(rq);
804 } 949 }
805 950
806 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 951 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
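A small standalone illustration of the queue-depth halving in the hunk above, assuming the typical 32 request slots per 4 KiB ring page: if every block I/O may occupy two slots, advertising a depth of 16 guarantees the ring never overflows.

/* Illustrative only: why the depth is halved when an I/O may need two
 * ring slots. 32 slots per 4 KiB ring page is assumed. */
#include <stdio.h>

int main(void)
{
	unsigned int nr_ring_pages = 1;
	unsigned int ring_size = 32 * nr_ring_pages;	/* assumed BLK_RING_SIZE */
	int may_need_extra_req = 1;	/* 64 KB pages, no indirect descriptors */
	unsigned int depth = may_need_extra_req ? ring_size / 2 : ring_size;

	/* Even if all 16 in-flight requests are split in two, they still
	 * fit into the 32 available ring slots. */
	printf("queue depth = %u\n", depth);
	return 0;
}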
@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1028 1173
1029static void xlvbd_release_gendisk(struct blkfront_info *info) 1174static void xlvbd_release_gendisk(struct blkfront_info *info)
1030{ 1175{
1031 unsigned int minor, nr_minors; 1176 unsigned int minor, nr_minors, i;
1032 1177
1033 if (info->rq == NULL) 1178 if (info->rq == NULL)
1034 return; 1179 return;
@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1036 /* No more blkif_request(). */ 1181 /* No more blkif_request(). */
1037 blk_mq_stop_hw_queues(info->rq); 1182 blk_mq_stop_hw_queues(info->rq);
1038 1183
1039 /* No more gnttab callback work. */ 1184 for (i = 0; i < info->nr_rings; i++) {
1040 gnttab_cancel_free_callback(&info->callback); 1185 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1041 1186
1042 /* Flush gnttab callback work. Must be done with no locks held. */ 1187 /* No more gnttab callback work. */
1043 flush_work(&info->work); 1188 gnttab_cancel_free_callback(&rinfo->callback);
1189
1190 /* Flush gnttab callback work. Must be done with no locks held. */
1191 flush_work(&rinfo->work);
1192 }
1044 1193
1045 del_gendisk(info->gd); 1194 del_gendisk(info->gd);
1046 1195
@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1056 info->gd = NULL; 1205 info->gd = NULL;
1057} 1206}
1058 1207
1059/* Must be called with io_lock holded */ 1208/* Already hold rinfo->ring_lock. */
1060static void kick_pending_request_queues(struct blkfront_info *info) 1209static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1061{ 1210{
1062 if (!RING_FULL(&info->ring)) 1211 if (!RING_FULL(&rinfo->ring))
1063 blk_mq_start_stopped_hw_queues(info->rq, true); 1212 blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
1064} 1213}
1065 1214
1066static void blkif_restart_queue(struct work_struct *work) 1215static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1067{ 1216{
1068 struct blkfront_info *info = container_of(work, struct blkfront_info, work); 1217 unsigned long flags;
1069 1218
1070 spin_lock_irq(&info->io_lock); 1219 spin_lock_irqsave(&rinfo->ring_lock, flags);
1071 if (info->connected == BLKIF_STATE_CONNECTED) 1220 kick_pending_request_queues_locked(rinfo);
1072 kick_pending_request_queues(info); 1221 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1073 spin_unlock_irq(&info->io_lock);
1074} 1222}
1075 1223
1076static void blkif_free(struct blkfront_info *info, int suspend) 1224static void blkif_restart_queue(struct work_struct *work)
1077{ 1225{
1078 struct grant *persistent_gnt; 1226 struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
1079 struct grant *n;
1080 int i, j, segs;
1081 1227
1082 /* Prevent new requests being issued until we fix things up. */ 1228 if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1083 spin_lock_irq(&info->io_lock); 1229 kick_pending_request_queues(rinfo);
1084 info->connected = suspend ? 1230}
1085 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1086 /* No more blkif_request(). */
1087 if (info->rq)
1088 blk_mq_stop_hw_queues(info->rq);
1089 1231
1090 /* Remove all persistent grants */ 1232static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1091 if (!list_empty(&info->grants)) { 1233{
1092 list_for_each_entry_safe(persistent_gnt, n, 1234 struct grant *persistent_gnt, *n;
1093 &info->grants, node) { 1235 struct blkfront_info *info = rinfo->dev_info;
1094 list_del(&persistent_gnt->node); 1236 int i, j, segs;
1095 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1096 gnttab_end_foreign_access(persistent_gnt->gref,
1097 0, 0UL);
1098 info->persistent_gnts_c--;
1099 }
1100 if (info->feature_persistent)
1101 __free_page(persistent_gnt->page);
1102 kfree(persistent_gnt);
1103 }
1104 }
1105 BUG_ON(info->persistent_gnts_c != 0);
1106 1237
1107 /* 1238 /*
1108 * Remove indirect pages, this only happens when using indirect 1239 * Remove indirect pages, this only happens when using indirect
1109 * descriptors but not persistent grants 1240 * descriptors but not persistent grants
1110 */ 1241 */
1111 if (!list_empty(&info->indirect_pages)) { 1242 if (!list_empty(&rinfo->indirect_pages)) {
1112 struct page *indirect_page, *n; 1243 struct page *indirect_page, *n;
1113 1244
1114 BUG_ON(info->feature_persistent); 1245 BUG_ON(info->feature_persistent);
1115 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 1246 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1116 list_del(&indirect_page->lru); 1247 list_del(&indirect_page->lru);
1117 __free_page(indirect_page); 1248 __free_page(indirect_page);
1118 } 1249 }
1119 } 1250 }
1120 1251
1252 /* Remove all persistent grants. */
1253 if (!list_empty(&rinfo->grants)) {
1254 list_for_each_entry_safe(persistent_gnt, n,
1255 &rinfo->grants, node) {
1256 list_del(&persistent_gnt->node);
1257 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1258 gnttab_end_foreign_access(persistent_gnt->gref,
1259 0, 0UL);
1260 rinfo->persistent_gnts_c--;
1261 }
1262 if (info->feature_persistent)
1263 __free_page(persistent_gnt->page);
1264 kfree(persistent_gnt);
1265 }
1266 }
1267 BUG_ON(rinfo->persistent_gnts_c != 0);
1268
1121 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1122 /* 1270 /*
1123 * Clear persistent grants present in requests already 1271 * Clear persistent grants present in requests already
1124 * on the shared ring 1272 * on the shared ring
1125 */ 1273 */
1126 if (!info->shadow[i].request) 1274 if (!rinfo->shadow[i].request)
1127 goto free_shadow; 1275 goto free_shadow;
1128 1276
1129 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 1277 segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1130 info->shadow[i].req.u.indirect.nr_segments : 1278 rinfo->shadow[i].req.u.indirect.nr_segments :
1131 info->shadow[i].req.u.rw.nr_segments; 1279 rinfo->shadow[i].req.u.rw.nr_segments;
1132 for (j = 0; j < segs; j++) { 1280 for (j = 0; j < segs; j++) {
1133 persistent_gnt = info->shadow[i].grants_used[j]; 1281 persistent_gnt = rinfo->shadow[i].grants_used[j];
1134 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1282 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1135 if (info->feature_persistent) 1283 if (info->feature_persistent)
1136 __free_page(persistent_gnt->page); 1284 __free_page(persistent_gnt->page);
1137 kfree(persistent_gnt); 1285 kfree(persistent_gnt);
1138 } 1286 }
1139 1287
1140 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) 1288 if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1141 /* 1289 /*
1142 * If this is not an indirect operation don't try to 1290 * If this is not an indirect operation don't try to
1143 * free indirect segments 1291 * free indirect segments
@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
1145 goto free_shadow; 1293 goto free_shadow;
1146 1294
1147 for (j = 0; j < INDIRECT_GREFS(segs); j++) { 1295 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
1148 persistent_gnt = info->shadow[i].indirect_grants[j]; 1296 persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1149 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1297 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1150 __free_page(persistent_gnt->page); 1298 __free_page(persistent_gnt->page);
1151 kfree(persistent_gnt); 1299 kfree(persistent_gnt);
1152 } 1300 }
1153 1301
1154free_shadow: 1302free_shadow:
1155 kfree(info->shadow[i].grants_used); 1303 kfree(rinfo->shadow[i].grants_used);
1156 info->shadow[i].grants_used = NULL; 1304 rinfo->shadow[i].grants_used = NULL;
1157 kfree(info->shadow[i].indirect_grants); 1305 kfree(rinfo->shadow[i].indirect_grants);
1158 info->shadow[i].indirect_grants = NULL; 1306 rinfo->shadow[i].indirect_grants = NULL;
1159 kfree(info->shadow[i].sg); 1307 kfree(rinfo->shadow[i].sg);
1160 info->shadow[i].sg = NULL; 1308 rinfo->shadow[i].sg = NULL;
1161 } 1309 }
1162 1310
1163 /* No more gnttab callback work. */ 1311 /* No more gnttab callback work. */
1164 gnttab_cancel_free_callback(&info->callback); 1312 gnttab_cancel_free_callback(&rinfo->callback);
1165 spin_unlock_irq(&info->io_lock);
1166 1313
1167 /* Flush gnttab callback work. Must be done with no locks held. */ 1314 /* Flush gnttab callback work. Must be done with no locks held. */
1168 flush_work(&info->work); 1315 flush_work(&rinfo->work);
1169 1316
1170 /* Free resources associated with old device channel. */ 1317 /* Free resources associated with old device channel. */
1171 for (i = 0; i < info->nr_ring_pages; i++) { 1318 for (i = 0; i < info->nr_ring_pages; i++) {
1172 if (info->ring_ref[i] != GRANT_INVALID_REF) { 1319 if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
1173 gnttab_end_foreign_access(info->ring_ref[i], 0, 0); 1320 gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
1174 info->ring_ref[i] = GRANT_INVALID_REF; 1321 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1175 } 1322 }
1176 } 1323 }
1177 free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); 1324 free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
1178 info->ring.sring = NULL; 1325 rinfo->ring.sring = NULL;
1179 1326
1180 if (info->irq) 1327 if (rinfo->irq)
1181 unbind_from_irqhandler(info->irq, info); 1328 unbind_from_irqhandler(rinfo->irq, rinfo);
1182 info->evtchn = info->irq = 0; 1329 rinfo->evtchn = rinfo->irq = 0;
1330}
1183 1331
1332static void blkif_free(struct blkfront_info *info, int suspend)
1333{
1334 unsigned int i;
1335
1336 /* Prevent new requests being issued until we fix things up. */
1337 info->connected = suspend ?
1338 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1339 /* No more blkif_request(). */
1340 if (info->rq)
1341 blk_mq_stop_hw_queues(info->rq);
1342
1343 for (i = 0; i < info->nr_rings; i++)
1344 blkif_free_ring(&info->rinfo[i]);
1345
1346 kfree(info->rinfo);
1347 info->rinfo = NULL;
1348 info->nr_rings = 0;
1184} 1349}
1185 1350
1186struct copy_from_grant { 1351struct copy_from_grant {
@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1209 kunmap_atomic(shared_data); 1374 kunmap_atomic(shared_data);
1210} 1375}
1211 1376
1212static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, 1377static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1378{
1379 switch (rsp)
1380 {
1381 case BLKIF_RSP_OKAY:
1382 return REQ_DONE;
1383 case BLKIF_RSP_EOPNOTSUPP:
1384 return REQ_EOPNOTSUPP;
1385 case BLKIF_RSP_ERROR:
1386 /* Fallthrough. */
1387 default:
1388 return REQ_ERROR;
1389 }
1390}
1391
1392/*
1393 * Get the final status of the block request based on two ring response
1394 */
1395static int blkif_get_final_status(enum blk_req_status s1,
1396 enum blk_req_status s2)
1397{
1398 BUG_ON(s1 == REQ_WAITING);
1399 BUG_ON(s2 == REQ_WAITING);
1400
1401 if (s1 == REQ_ERROR || s2 == REQ_ERROR)
1402 return BLKIF_RSP_ERROR;
1403 else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
1404 return BLKIF_RSP_EOPNOTSUPP;
1405 return BLKIF_RSP_OKAY;
1406}
1407
1408static bool blkif_completion(unsigned long *id,
1409 struct blkfront_ring_info *rinfo,
1213 struct blkif_response *bret) 1410 struct blkif_response *bret)
1214{ 1411{
1215 int i = 0; 1412 int i = 0;
1216 struct scatterlist *sg; 1413 struct scatterlist *sg;
1217 int num_sg, num_grant; 1414 int num_sg, num_grant;
1415 struct blkfront_info *info = rinfo->dev_info;
1416 struct blk_shadow *s = &rinfo->shadow[*id];
1218 struct copy_from_grant data = { 1417 struct copy_from_grant data = {
1219 .s = s,
1220 .grant_idx = 0, 1418 .grant_idx = 0,
1221 }; 1419 };
1222 1420
1223 num_grant = s->req.operation == BLKIF_OP_INDIRECT ? 1421 num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1224 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; 1422 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1423
1424 /* The I/O request may be split in two. */
1425 if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1426 struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1427
1428 /* Keep the status of the current response in shadow. */
1429 s->status = blkif_rsp_to_req_status(bret->status);
1430
 1431		/* Wait for the second response if it is not here yet. */
1432 if (s2->status == REQ_WAITING)
1433 return 0;
1434
1435 bret->status = blkif_get_final_status(s->status,
1436 s2->status);
1437
1438 /*
 1439		 * All the grants are stored in the first shadow in order
1440 * to make the completion code simpler.
1441 */
1442 num_grant += s2->req.u.rw.nr_segments;
1443
1444 /*
1445 * The two responses may not come in order. Only the
1446 * first request will store the scatter-gather list.
1447 */
1448 if (s2->num_sg != 0) {
1449 /* Update "id" with the ID of the first response. */
1450 *id = s->associated_id;
1451 s = s2;
1452 }
1453
1454 /*
 1455		 * We don't need the second request anymore, so recycle
 1456		 * it now.
1457 */
1458 if (add_id_to_freelist(rinfo, s->associated_id))
1459 WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
1460 info->gd->disk_name, s->associated_id);
1461 }
1462
1463 data.s = s;
1225 num_sg = s->num_sg; 1464 num_sg = s->num_sg;
1226 1465
1227 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { 1466 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1252 if (!info->feature_persistent) 1491 if (!info->feature_persistent)
1253 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1492 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1254 s->grants_used[i]->gref); 1493 s->grants_used[i]->gref);
1255 list_add(&s->grants_used[i]->node, &info->grants); 1494 list_add(&s->grants_used[i]->node, &rinfo->grants);
1256 info->persistent_gnts_c++; 1495 rinfo->persistent_gnts_c++;
1257 } else { 1496 } else {
1258 /* 1497 /*
1259 * If the grant is not mapped by the backend we end the 1498 * If the grant is not mapped by the backend we end the
@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1263 */ 1502 */
1264 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); 1503 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1265 s->grants_used[i]->gref = GRANT_INVALID_REF; 1504 s->grants_used[i]->gref = GRANT_INVALID_REF;
1266 list_add_tail(&s->grants_used[i]->node, &info->grants); 1505 list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
1267 } 1506 }
1268 } 1507 }
1269 if (s->req.operation == BLKIF_OP_INDIRECT) { 1508 if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1272 if (!info->feature_persistent) 1511 if (!info->feature_persistent)
1273 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1512 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1274 s->indirect_grants[i]->gref); 1513 s->indirect_grants[i]->gref);
1275 list_add(&s->indirect_grants[i]->node, &info->grants); 1514 list_add(&s->indirect_grants[i]->node, &rinfo->grants);
1276 info->persistent_gnts_c++; 1515 rinfo->persistent_gnts_c++;
1277 } else { 1516 } else {
1278 struct page *indirect_page; 1517 struct page *indirect_page;
1279 1518
@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1284 */ 1523 */
1285 if (!info->feature_persistent) { 1524 if (!info->feature_persistent) {
1286 indirect_page = s->indirect_grants[i]->page; 1525 indirect_page = s->indirect_grants[i]->page;
1287 list_add(&indirect_page->lru, &info->indirect_pages); 1526 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1288 } 1527 }
1289 s->indirect_grants[i]->gref = GRANT_INVALID_REF; 1528 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1290 list_add_tail(&s->indirect_grants[i]->node, &info->grants); 1529 list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
1291 } 1530 }
1292 } 1531 }
1293 } 1532 }
1533
1534 return 1;
1294} 1535}
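The split-request completion above boils down to "remember the first response, finish on the second, whichever order they arrive in". A tiny standalone model of that rule follows; all names are hypothetical.

/* Illustrative only: a tiny model of the "finish on the second half" rule
 * used for split requests. */
#include <stdio.h>

enum half_status { WAITING, DONE, ERROR };

struct half {
	enum half_status status;
	int associated_id;		/* index of the sibling half */
};

/* Returns 1 when the whole block I/O can be completed, 0 to keep waiting. */
static int complete_half(struct half *shadow, int id, enum half_status resp)
{
	struct half *s = &shadow[id];
	struct half *other = &shadow[s->associated_id];

	s->status = resp;
	if (other->status == WAITING)
		return 0;		/* first half seen: remember and wait */
	return 1;			/* both halves seen: complete now */
}

int main(void)
{
	struct half shadow[2] = {
		{ WAITING, 1 },		/* id 0 is paired with id 1 */
		{ WAITING, 0 },
	};

	printf("after first response:  complete? %d\n",
	       complete_half(shadow, 1, DONE));
	printf("after second response: complete? %d\n",
	       complete_half(shadow, 0, DONE));
	return 0;
}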
1295 1536
1296static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1537static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1299 struct blkif_response *bret; 1540 struct blkif_response *bret;
1300 RING_IDX i, rp; 1541 RING_IDX i, rp;
1301 unsigned long flags; 1542 unsigned long flags;
1302 struct blkfront_info *info = (struct blkfront_info *)dev_id; 1543 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
1544 struct blkfront_info *info = rinfo->dev_info;
1303 int error; 1545 int error;
1304 1546
1305 spin_lock_irqsave(&info->io_lock, flags); 1547 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
1306
1307 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1308 spin_unlock_irqrestore(&info->io_lock, flags);
1309 return IRQ_HANDLED; 1548 return IRQ_HANDLED;
1310 }
1311 1549
1550 spin_lock_irqsave(&rinfo->ring_lock, flags);
1312 again: 1551 again:
1313 rp = info->ring.sring->rsp_prod; 1552 rp = rinfo->ring.sring->rsp_prod;
1314 rmb(); /* Ensure we see queued responses up to 'rp'. */ 1553 rmb(); /* Ensure we see queued responses up to 'rp'. */
1315 1554
1316 for (i = info->ring.rsp_cons; i != rp; i++) { 1555 for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1317 unsigned long id; 1556 unsigned long id;
1318 1557
1319 bret = RING_GET_RESPONSE(&info->ring, i); 1558 bret = RING_GET_RESPONSE(&rinfo->ring, i);
1320 id = bret->id; 1559 id = bret->id;
1321 /* 1560 /*
1322 * The backend has messed up and given us an id that we would 1561 * The backend has messed up and given us an id that we would
@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1330 * the id is busted. */ 1569 * the id is busted. */
1331 continue; 1570 continue;
1332 } 1571 }
1333 req = info->shadow[id].request; 1572 req = rinfo->shadow[id].request;
1334 1573
1335 if (bret->operation != BLKIF_OP_DISCARD) 1574 if (bret->operation != BLKIF_OP_DISCARD) {
1336 blkif_completion(&info->shadow[id], info, bret); 1575 /*
1576 * We may need to wait for an extra response if the
1577 * I/O request is split in 2
1578 */
1579 if (!blkif_completion(&id, rinfo, bret))
1580 continue;
1581 }
1337 1582
1338 if (add_id_to_freelist(info, id)) { 1583 if (add_id_to_freelist(rinfo, id)) {
1339 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", 1584 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1340 info->gd->disk_name, op_name(bret->operation), id); 1585 info->gd->disk_name, op_name(bret->operation), id);
1341 continue; 1586 continue;
@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1364 error = -EOPNOTSUPP; 1609 error = -EOPNOTSUPP;
1365 } 1610 }
1366 if (unlikely(bret->status == BLKIF_RSP_ERROR && 1611 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
1367 info->shadow[id].req.u.rw.nr_segments == 0)) { 1612 rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
1368 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", 1613 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
1369 info->gd->disk_name, op_name(bret->operation)); 1614 info->gd->disk_name, op_name(bret->operation));
1370 error = -EOPNOTSUPP; 1615 error = -EOPNOTSUPP;
@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1389 } 1634 }
1390 } 1635 }
1391 1636
1392 info->ring.rsp_cons = i; 1637 rinfo->ring.rsp_cons = i;
1393 1638
1394 if (i != info->ring.req_prod_pvt) { 1639 if (i != rinfo->ring.req_prod_pvt) {
1395 int more_to_do; 1640 int more_to_do;
1396 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); 1641 RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1397 if (more_to_do) 1642 if (more_to_do)
1398 goto again; 1643 goto again;
1399 } else 1644 } else
1400 info->ring.sring->rsp_event = i + 1; 1645 rinfo->ring.sring->rsp_event = i + 1;
1401 1646
1402 kick_pending_request_queues(info); 1647 kick_pending_request_queues_locked(rinfo);
1403 1648
1404 spin_unlock_irqrestore(&info->io_lock, flags); 1649 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1405 1650
1406 return IRQ_HANDLED; 1651 return IRQ_HANDLED;
1407} 1652}
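
blkif_completion() above now returns a value: 0 means this shadow entry still has a sibling response outstanding because the I/O had to be split in two, so blkif_interrupt() skips the id recycling and the blk-mq completion for now; 1 means the request is really finished. A minimal sketch of that pattern, using a hypothetical pending_responses counter rather than the driver's actual bookkeeping:

	/*
	 * Illustrative sketch only -- not blkfront's real bookkeeping.
	 * Each shadow entry remembers how many ring responses it still
	 * expects; the completion helper reports whether the original
	 * request may be finished.
	 */
	struct shadow_entry {
		struct request *request;
		unsigned int pending_responses;	/* 2 if the I/O was split, else 1 */
	};

	/* Returns 1 when the last expected response has arrived. */
	static int shadow_response_done(struct shadow_entry *s)
	{
		return --s->pending_responses == 0;
	}
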
1408 1653
1409 1654
1410static int setup_blkring(struct xenbus_device *dev, 1655static int setup_blkring(struct xenbus_device *dev,
1411 struct blkfront_info *info) 1656 struct blkfront_ring_info *rinfo)
1412{ 1657{
1413 struct blkif_sring *sring; 1658 struct blkif_sring *sring;
1414 int err, i; 1659 int err, i;
1660 struct blkfront_info *info = rinfo->dev_info;
1415 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; 1661 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1416 grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; 1662 grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
1417 1663
1418 for (i = 0; i < info->nr_ring_pages; i++) 1664 for (i = 0; i < info->nr_ring_pages; i++)
1419 info->ring_ref[i] = GRANT_INVALID_REF; 1665 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1420 1666
1421 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, 1667 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
1422 get_order(ring_size)); 1668 get_order(ring_size));
@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
1425 return -ENOMEM; 1671 return -ENOMEM;
1426 } 1672 }
1427 SHARED_RING_INIT(sring); 1673 SHARED_RING_INIT(sring);
1428 FRONT_RING_INIT(&info->ring, sring, ring_size); 1674 FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1429 1675
1430 err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); 1676 err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
1431 if (err < 0) { 1677 if (err < 0) {
1432 free_pages((unsigned long)sring, get_order(ring_size)); 1678 free_pages((unsigned long)sring, get_order(ring_size));
1433 info->ring.sring = NULL; 1679 rinfo->ring.sring = NULL;
1434 goto fail; 1680 goto fail;
1435 } 1681 }
1436 for (i = 0; i < info->nr_ring_pages; i++) 1682 for (i = 0; i < info->nr_ring_pages; i++)
1437 info->ring_ref[i] = gref[i]; 1683 rinfo->ring_ref[i] = gref[i];
1438 1684
1439 err = xenbus_alloc_evtchn(dev, &info->evtchn); 1685 err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1440 if (err) 1686 if (err)
1441 goto fail; 1687 goto fail;
1442 1688
1443 err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, 1689 err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
1444 "blkif", info); 1690 "blkif", rinfo);
1445 if (err <= 0) { 1691 if (err <= 0) {
1446 xenbus_dev_fatal(dev, err, 1692 xenbus_dev_fatal(dev, err,
1447 "bind_evtchn_to_irqhandler failed"); 1693 "bind_evtchn_to_irqhandler failed");
1448 goto fail; 1694 goto fail;
1449 } 1695 }
1450 info->irq = err; 1696 rinfo->irq = err;
1451 1697
1452 return 0; 1698 return 0;
1453fail: 1699fail:
@@ -1455,6 +1701,53 @@ fail:
1455 return err; 1701 return err;
1456} 1702}
1457 1703
1704/*
1705 * Write out the per-ring/queue nodes, i.e. ring-ref(s) and event-channel; each
1706 * ring buffer may span multiple pages depending on ->nr_ring_pages.
1707 */
1708static int write_per_ring_nodes(struct xenbus_transaction xbt,
1709 struct blkfront_ring_info *rinfo, const char *dir)
1710{
1711 int err;
1712 unsigned int i;
1713 const char *message = NULL;
1714 struct blkfront_info *info = rinfo->dev_info;
1715
1716 if (info->nr_ring_pages == 1) {
1717 err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
1718 if (err) {
1719 message = "writing ring-ref";
1720 goto abort_transaction;
1721 }
1722 } else {
1723 for (i = 0; i < info->nr_ring_pages; i++) {
1724 char ring_ref_name[RINGREF_NAME_LEN];
1725
1726 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1727 err = xenbus_printf(xbt, dir, ring_ref_name,
1728 "%u", rinfo->ring_ref[i]);
1729 if (err) {
1730 message = "writing ring-ref";
1731 goto abort_transaction;
1732 }
1733 }
1734 }
1735
1736 err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
1737 if (err) {
1738 message = "writing event-channel";
1739 goto abort_transaction;
1740 }
1741
1742 return 0;
1743
1744abort_transaction:
1745 xenbus_transaction_end(xbt, 1);
1746 if (message)
1747 xenbus_dev_fatal(info->xbdev, err, "%s", message);
1748
1749 return err;
1750}
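
As a hedged illustration of the helper above (the grant references, event channel and the fixed-size path buffer are made up), writing the nodes for queue 0 of a two-page ring inside an already-open transaction looks like this:

	char path[32];	/* illustrative; the driver sizes this buffer dynamically */
	int err;

	snprintf(path, sizeof(path), "%s/queue-0", dev->nodename);
	err = write_per_ring_nodes(xbt, &info->rinfo[0], path);
	/*
	 * Assuming rinfo->ring_ref[] = { 8, 9 }, rinfo->evtchn = 5 and
	 * info->nr_ring_pages = 2, the transaction now contains:
	 *   <nodename>/queue-0/ring-ref0     = "8"
	 *   <nodename>/queue-0/ring-ref1     = "9"
	 *   <nodename>/queue-0/event-channel = "5"
	 */
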
1458 1751
1459/* Common code used when first setting up, and when resuming. */ 1752/* Common code used when first setting up, and when resuming. */
1460static int talk_to_blkback(struct xenbus_device *dev, 1753static int talk_to_blkback(struct xenbus_device *dev,
@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
1462{ 1755{
1463 const char *message = NULL; 1756 const char *message = NULL;
1464 struct xenbus_transaction xbt; 1757 struct xenbus_transaction xbt;
1465 int err, i; 1758 int err;
1466 unsigned int max_page_order = 0; 1759 unsigned int i, max_page_order = 0;
1467 unsigned int ring_page_order = 0; 1760 unsigned int ring_page_order = 0;
1468 1761
1469 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, 1762 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
1475 info->nr_ring_pages = 1 << ring_page_order; 1768 info->nr_ring_pages = 1 << ring_page_order;
1476 } 1769 }
1477 1770
1478 /* Create shared ring, alloc event channel. */ 1771 for (i = 0; i < info->nr_rings; i++) {
1479 err = setup_blkring(dev, info); 1772 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1480 if (err) 1773
1481 goto out; 1774 /* Create shared ring, alloc event channel. */
1775 err = setup_blkring(dev, rinfo);
1776 if (err)
1777 goto destroy_blkring;
1778 }
1482 1779
1483again: 1780again:
1484 err = xenbus_transaction_start(&xbt); 1781 err = xenbus_transaction_start(&xbt);
@@ -1487,38 +1784,49 @@ again:
1487 goto destroy_blkring; 1784 goto destroy_blkring;
1488 } 1785 }
1489 1786
1490 if (info->nr_ring_pages == 1) { 1787 if (info->nr_ring_pages > 1) {
1491 err = xenbus_printf(xbt, dev->nodename, 1788 err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1492 "ring-ref", "%u", info->ring_ref[0]); 1789 ring_page_order);
1493 if (err) { 1790 if (err) {
1494 message = "writing ring-ref"; 1791 message = "writing ring-page-order";
1495 goto abort_transaction; 1792 goto abort_transaction;
1496 } 1793 }
1794 }
1795
1796 /* We already got the number of queues/rings in _probe */
1797 if (info->nr_rings == 1) {
1798 err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
1799 if (err)
1800 goto destroy_blkring;
1497 } else { 1801 } else {
1498 err = xenbus_printf(xbt, dev->nodename, 1802 char *path;
1499 "ring-page-order", "%u", ring_page_order); 1803 size_t pathsize;
1804
1805 err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
1806 info->nr_rings);
1500 if (err) { 1807 if (err) {
1501 message = "writing ring-page-order"; 1808 message = "writing multi-queue-num-queues";
1502 goto abort_transaction; 1809 goto abort_transaction;
1503 } 1810 }
1504 1811
1505 for (i = 0; i < info->nr_ring_pages; i++) { 1812 pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1506 char ring_ref_name[RINGREF_NAME_LEN]; 1813 path = kmalloc(pathsize, GFP_KERNEL);
1814 if (!path) {
1815 err = -ENOMEM;
1816 message = "ENOMEM while writing ring references";
1817 goto abort_transaction;
1818 }
1507 1819
1508 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 1820 for (i = 0; i < info->nr_rings; i++) {
1509 err = xenbus_printf(xbt, dev->nodename, ring_ref_name, 1821 memset(path, 0, pathsize);
1510 "%u", info->ring_ref[i]); 1822 snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1823 err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
1511 if (err) { 1824 if (err) {
1512 message = "writing ring-ref"; 1825 kfree(path);
1513 goto abort_transaction; 1826 goto destroy_blkring;
1514 } 1827 }
1515 } 1828 }
1516 } 1829 kfree(path);
1517 err = xenbus_printf(xbt, dev->nodename,
1518 "event-channel", "%u", info->evtchn);
1519 if (err) {
1520 message = "writing event-channel";
1521 goto abort_transaction;
1522 } 1830 }
1523 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", 1831 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1524 XEN_IO_PROTO_ABI_NATIVE); 1832 XEN_IO_PROTO_ABI_NATIVE);
@@ -1540,9 +1848,14 @@ again:
1540 goto destroy_blkring; 1848 goto destroy_blkring;
1541 } 1849 }
1542 1850
1543 for (i = 0; i < BLK_RING_SIZE(info); i++) 1851 for (i = 0; i < info->nr_rings; i++) {
1544 info->shadow[i].req.u.rw.id = i+1; 1852 unsigned int j;
1545 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; 1853 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1854
1855 for (j = 0; j < BLK_RING_SIZE(info); j++)
1856 rinfo->shadow[j].req.u.rw.id = j + 1;
1857 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1858 }
1546 xenbus_switch_state(dev, XenbusStateInitialised); 1859 xenbus_switch_state(dev, XenbusStateInitialised);
1547 1860
1548 return 0; 1861 return 0;
@@ -1553,7 +1866,10 @@ again:
1553 xenbus_dev_fatal(dev, err, "%s", message); 1866 xenbus_dev_fatal(dev, err, "%s", message);
1554 destroy_blkring: 1867 destroy_blkring:
1555 blkif_free(info, 0); 1868 blkif_free(info, 0);
1556 out: 1869
1870 kfree(info);
1871 dev_set_drvdata(&dev->dev, NULL);
1872
1557 return err; 1873 return err;
1558} 1874}
1559 1875
@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
1567 const struct xenbus_device_id *id) 1883 const struct xenbus_device_id *id)
1568{ 1884{
1569 int err, vdevice; 1885 int err, vdevice;
1886 unsigned int r_index;
1570 struct blkfront_info *info; 1887 struct blkfront_info *info;
1888 unsigned int backend_max_queues = 0;
1571 1889
1572 /* FIXME: Use dynamic device id if this is not set. */ 1890 /* FIXME: Use dynamic device id if this is not set. */
1573 err = xenbus_scanf(XBT_NIL, dev->nodename, 1891 err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
1617 return -ENOMEM; 1935 return -ENOMEM;
1618 } 1936 }
1619 1937
1620 mutex_init(&info->mutex);
1621 spin_lock_init(&info->io_lock);
1622 info->xbdev = dev; 1938 info->xbdev = dev;
1939 /* Check if backend supports multiple queues. */
1940 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1941 "multi-queue-max-queues", "%u", &backend_max_queues);
1942 if (err < 0)
1943 backend_max_queues = 1;
1944
1945 info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
1946 /* We need at least one ring. */
1947 if (!info->nr_rings)
1948 info->nr_rings = 1;
1949
1950 info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
1951 if (!info->rinfo) {
1952 xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
1953 kfree(info);
1954 return -ENOMEM;
1955 }
1956
1957 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1958 struct blkfront_ring_info *rinfo;
1959
1960 rinfo = &info->rinfo[r_index];
1961 INIT_LIST_HEAD(&rinfo->indirect_pages);
1962 INIT_LIST_HEAD(&rinfo->grants);
1963 rinfo->dev_info = info;
1964 INIT_WORK(&rinfo->work, blkif_restart_queue);
1965 spin_lock_init(&rinfo->ring_lock);
1966 }
1967
1968 mutex_init(&info->mutex);
1623 info->vdevice = vdevice; 1969 info->vdevice = vdevice;
1624 INIT_LIST_HEAD(&info->grants);
1625 INIT_LIST_HEAD(&info->indirect_pages);
1626 info->persistent_gnts_c = 0;
1627 info->connected = BLKIF_STATE_DISCONNECTED; 1970 info->connected = BLKIF_STATE_DISCONNECTED;
1628 INIT_WORK(&info->work, blkif_restart_queue);
1629 1971
1630 /* Front end dir is a number, which is used as the id. */ 1972 /* Front end dir is a number, which is used as the id. */
1631 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); 1973 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
1649 1991
1650static int blkif_recover(struct blkfront_info *info) 1992static int blkif_recover(struct blkfront_info *info)
1651{ 1993{
1652 int i; 1994 unsigned int i, r_index;
1653 struct request *req, *n; 1995 struct request *req, *n;
1654 struct blk_shadow *copy; 1996 struct blk_shadow *copy;
1655 int rc; 1997 int rc;
@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
1660 struct split_bio *split_bio; 2002 struct split_bio *split_bio;
1661 struct list_head requests; 2003 struct list_head requests;
1662 2004
1663 /* Stage 1: Make a safe copy of the shadow state. */ 2005 blkfront_gather_backend_features(info);
1664 copy = kmemdup(info->shadow, sizeof(info->shadow),
1665 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1666 if (!copy)
1667 return -ENOMEM;
1668
1669 /* Stage 2: Set up free list. */
1670 memset(&info->shadow, 0, sizeof(info->shadow));
1671 for (i = 0; i < BLK_RING_SIZE(info); i++)
1672 info->shadow[i].req.u.rw.id = i+1;
1673 info->shadow_free = info->ring.req_prod_pvt;
1674 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1675
1676 rc = blkfront_gather_backend_features(info);
1677 if (rc) {
1678 kfree(copy);
1679 return rc;
1680 }
1681
1682 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; 2006 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1683 blk_queue_max_segments(info->rq, segs); 2007 blk_queue_max_segments(info->rq, segs);
1684 bio_list_init(&bio_list); 2008 bio_list_init(&bio_list);
1685 INIT_LIST_HEAD(&requests); 2009 INIT_LIST_HEAD(&requests);
1686 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1687 /* Not in use? */
1688 if (!copy[i].request)
1689 continue;
1690 2010
1691 /* 2011 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1692 * Get the bios in the request so we can re-queue them. 2012 struct blkfront_ring_info *rinfo;
1693 */ 2013
1694 if (copy[i].request->cmd_flags & 2014 rinfo = &info->rinfo[r_index];
1695 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { 2015 /* Stage 1: Make a safe copy of the shadow state. */
2016 copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
2017 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
2018 if (!copy)
2019 return -ENOMEM;
2020
2021 /* Stage 2: Set up free list. */
2022 memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
2023 for (i = 0; i < BLK_RING_SIZE(info); i++)
2024 rinfo->shadow[i].req.u.rw.id = i+1;
2025 rinfo->shadow_free = rinfo->ring.req_prod_pvt;
2026 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
2027
2028 rc = blkfront_setup_indirect(rinfo);
2029 if (rc) {
2030 kfree(copy);
2031 return rc;
2032 }
2033
2034 for (i = 0; i < BLK_RING_SIZE(info); i++) {
2035 /* Not in use? */
2036 if (!copy[i].request)
2037 continue;
2038
1696 /* 2039 /*
1697 * Flush operations don't contain bios, so 2040 * Get the bios in the request so we can re-queue them.
1698 * we need to requeue the whole request
1699 */ 2041 */
1700 list_add(&copy[i].request->queuelist, &requests); 2042 if (copy[i].request->cmd_flags &
1701 continue; 2043 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
2044 /*
2045 * Flush operations don't contain bios, so
2046 * we need to requeue the whole request
2047 */
2048 list_add(&copy[i].request->queuelist, &requests);
2049 continue;
2050 }
2051 merge_bio.head = copy[i].request->bio;
2052 merge_bio.tail = copy[i].request->biotail;
2053 bio_list_merge(&bio_list, &merge_bio);
2054 copy[i].request->bio = NULL;
2055 blk_end_request_all(copy[i].request, 0);
1702 } 2056 }
1703 merge_bio.head = copy[i].request->bio;
1704 merge_bio.tail = copy[i].request->biotail;
1705 bio_list_merge(&bio_list, &merge_bio);
1706 copy[i].request->bio = NULL;
1707 blk_end_request_all(copy[i].request, 0);
1708 }
1709
1710 kfree(copy);
1711 2057
2058 kfree(copy);
2059 }
1712 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2060 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1713 2061
1714 spin_lock_irq(&info->io_lock);
1715
1716 /* Now safe for us to use the shared ring */ 2062 /* Now safe for us to use the shared ring */
1717 info->connected = BLKIF_STATE_CONNECTED; 2063 info->connected = BLKIF_STATE_CONNECTED;
1718 2064
1719 /* Kick any other new requests queued since we resumed */ 2065 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1720 kick_pending_request_queues(info); 2066 struct blkfront_ring_info *rinfo;
2067
2068 rinfo = &info->rinfo[r_index];
2069 /* Kick any other new requests queued since we resumed */
2070 kick_pending_request_queues(rinfo);
2071 }
1721 2072
1722 list_for_each_entry_safe(req, n, &requests, queuelist) { 2073 list_for_each_entry_safe(req, n, &requests, queuelist) {
1723 /* Requeue pending requests (flush or discard) */ 2074 /* Requeue pending requests (flush or discard) */
@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
1725 BUG_ON(req->nr_phys_segments > segs); 2076 BUG_ON(req->nr_phys_segments > segs);
1726 blk_mq_requeue_request(req); 2077 blk_mq_requeue_request(req);
1727 } 2078 }
1728 spin_unlock_irq(&info->io_lock);
1729 blk_mq_kick_requeue_list(info->rq); 2079 blk_mq_kick_requeue_list(info->rq);
1730 2080
1731 while ((bio = bio_list_pop(&bio_list)) != NULL) { 2081 while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
1790 return err; 2140 return err;
1791} 2141}
1792 2142
1793static void 2143static void blkfront_closing(struct blkfront_info *info)
1794blkfront_closing(struct blkfront_info *info)
1795{ 2144{
1796 struct xenbus_device *xbdev = info->xbdev; 2145 struct xenbus_device *xbdev = info->xbdev;
1797 struct block_device *bdev = NULL; 2146 struct block_device *bdev = NULL;
@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1851 info->feature_secdiscard = !!discard_secure; 2200 info->feature_secdiscard = !!discard_secure;
1852} 2201}
1853 2202
1854static int blkfront_setup_indirect(struct blkfront_info *info) 2203static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
1855{ 2204{
1856 unsigned int psegs, grants; 2205 unsigned int psegs, grants;
1857 int err, i; 2206 int err, i;
2207 struct blkfront_info *info = rinfo->dev_info;
1858 2208
1859 if (info->max_indirect_segments == 0) 2209 if (info->max_indirect_segments == 0) {
1860 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2210 if (!HAS_EXTRA_REQ)
2211 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2212 else {
2213 /*
2214 * When an extra req is required, the maximum
2215 * number of grants supported is related to the
2216 * size of the Linux block segment.
2217 */
2218 grants = GRANTS_PER_PSEG;
2219 }
2220 }
1861 else 2221 else
1862 grants = info->max_indirect_segments; 2222 grants = info->max_indirect_segments;
1863 psegs = grants / GRANTS_PER_PSEG; 2223 psegs = grants / GRANTS_PER_PSEG;
1864 2224
1865 err = fill_grant_buffer(info, 2225 err = fill_grant_buffer(rinfo,
1866 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); 2226 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
1867 if (err) 2227 if (err)
1868 goto out_of_memory; 2228 goto out_of_memory;
@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1875 */ 2235 */
1876 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); 2236 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
1877 2237
1878 BUG_ON(!list_empty(&info->indirect_pages)); 2238 BUG_ON(!list_empty(&rinfo->indirect_pages));
1879 for (i = 0; i < num; i++) { 2239 for (i = 0; i < num; i++) {
1880 struct page *indirect_page = alloc_page(GFP_NOIO); 2240 struct page *indirect_page = alloc_page(GFP_NOIO);
1881 if (!indirect_page) 2241 if (!indirect_page)
1882 goto out_of_memory; 2242 goto out_of_memory;
1883 list_add(&indirect_page->lru, &info->indirect_pages); 2243 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1884 } 2244 }
1885 } 2245 }
1886 2246
1887 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2247 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1888 info->shadow[i].grants_used = kzalloc( 2248 rinfo->shadow[i].grants_used = kzalloc(
1889 sizeof(info->shadow[i].grants_used[0]) * grants, 2249 sizeof(rinfo->shadow[i].grants_used[0]) * grants,
1890 GFP_NOIO); 2250 GFP_NOIO);
1891 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); 2251 rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
1892 if (info->max_indirect_segments) 2252 if (info->max_indirect_segments)
1893 info->shadow[i].indirect_grants = kzalloc( 2253 rinfo->shadow[i].indirect_grants = kzalloc(
1894 sizeof(info->shadow[i].indirect_grants[0]) * 2254 sizeof(rinfo->shadow[i].indirect_grants[0]) *
1895 INDIRECT_GREFS(grants), 2255 INDIRECT_GREFS(grants),
1896 GFP_NOIO); 2256 GFP_NOIO);
1897 if ((info->shadow[i].grants_used == NULL) || 2257 if ((rinfo->shadow[i].grants_used == NULL) ||
1898 (info->shadow[i].sg == NULL) || 2258 (rinfo->shadow[i].sg == NULL) ||
1899 (info->max_indirect_segments && 2259 (info->max_indirect_segments &&
1900 (info->shadow[i].indirect_grants == NULL))) 2260 (rinfo->shadow[i].indirect_grants == NULL)))
1901 goto out_of_memory; 2261 goto out_of_memory;
1902 sg_init_table(info->shadow[i].sg, psegs); 2262 sg_init_table(rinfo->shadow[i].sg, psegs);
1903 } 2263 }
1904 2264
1905 2265
@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1907 2267
1908out_of_memory: 2268out_of_memory:
1909 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1910 kfree(info->shadow[i].grants_used); 2270 kfree(rinfo->shadow[i].grants_used);
1911 info->shadow[i].grants_used = NULL; 2271 rinfo->shadow[i].grants_used = NULL;
1912 kfree(info->shadow[i].sg); 2272 kfree(rinfo->shadow[i].sg);
1913 info->shadow[i].sg = NULL; 2273 rinfo->shadow[i].sg = NULL;
1914 kfree(info->shadow[i].indirect_grants); 2274 kfree(rinfo->shadow[i].indirect_grants);
1915 info->shadow[i].indirect_grants = NULL; 2275 rinfo->shadow[i].indirect_grants = NULL;
1916 } 2276 }
1917 if (!list_empty(&info->indirect_pages)) { 2277 if (!list_empty(&rinfo->indirect_pages)) {
1918 struct page *indirect_page, *n; 2278 struct page *indirect_page, *n;
1919 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 2279 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1920 list_del(&indirect_page->lru); 2280 list_del(&indirect_page->lru);
1921 __free_page(indirect_page); 2281 __free_page(indirect_page);
1922 } 2282 }
@@ -1927,7 +2287,7 @@ out_of_memory:
1927/* 2287/*
1928 * Gather all backend feature-* 2288 * Gather all backend feature-*
1929 */ 2289 */
1930static int blkfront_gather_backend_features(struct blkfront_info *info) 2290static void blkfront_gather_backend_features(struct blkfront_info *info)
1931{ 2291{
1932 int err; 2292 int err;
1933 int barrier, flush, discard, persistent; 2293 int barrier, flush, discard, persistent;
@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
1982 else 2342 else
1983 info->max_indirect_segments = min(indirect_segments, 2343 info->max_indirect_segments = min(indirect_segments,
1984 xen_blkif_max_segments); 2344 xen_blkif_max_segments);
1985
1986 return blkfront_setup_indirect(info);
1987} 2345}
1988 2346
1989/* 2347/*
@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
1996 unsigned long sector_size; 2354 unsigned long sector_size;
1997 unsigned int physical_sector_size; 2355 unsigned int physical_sector_size;
1998 unsigned int binfo; 2356 unsigned int binfo;
1999 int err; 2357 int err, i;
2000 2358
2001 switch (info->connected) { 2359 switch (info->connected) {
2002 case BLKIF_STATE_CONNECTED: 2360 case BLKIF_STATE_CONNECTED:
@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
2053 if (err != 1) 2411 if (err != 1)
2054 physical_sector_size = sector_size; 2412 physical_sector_size = sector_size;
2055 2413
2056 err = blkfront_gather_backend_features(info); 2414 blkfront_gather_backend_features(info);
2057 if (err) { 2415 for (i = 0; i < info->nr_rings; i++) {
2058 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", 2416 err = blkfront_setup_indirect(&info->rinfo[i]);
2059 info->xbdev->otherend); 2417 if (err) {
2060 return; 2418 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2419 info->xbdev->otherend);
2420 blkif_free(info, 0);
2421 break;
2422 }
2061 } 2423 }
2062 2424
2063 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, 2425 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
2071 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2433 xenbus_switch_state(info->xbdev, XenbusStateConnected);
2072 2434
2073 /* Kick pending requests. */ 2435 /* Kick pending requests. */
2074 spin_lock_irq(&info->io_lock);
2075 info->connected = BLKIF_STATE_CONNECTED; 2436 info->connected = BLKIF_STATE_CONNECTED;
2076 kick_pending_request_queues(info); 2437 for (i = 0; i < info->nr_rings; i++)
2077 spin_unlock_irq(&info->io_lock); 2438 kick_pending_request_queues(&info->rinfo[i]);
2078 2439
2079 add_disk(info->gd); 2440 add_disk(info->gd);
2080 2441
@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
2095 case XenbusStateInitWait: 2456 case XenbusStateInitWait:
2096 if (dev->state != XenbusStateInitialising) 2457 if (dev->state != XenbusStateInitialising)
2097 break; 2458 break;
2098 if (talk_to_blkback(dev, info)) { 2459 if (talk_to_blkback(dev, info))
2099 kfree(info);
2100 dev_set_drvdata(&dev->dev, NULL);
2101 break; 2460 break;
2102 }
2103 case XenbusStateInitialising: 2461 case XenbusStateInitialising:
2104 case XenbusStateInitialised: 2462 case XenbusStateInitialised:
2105 case XenbusStateReconfiguring: 2463 case XenbusStateReconfiguring:
@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
2108 break; 2466 break;
2109 2467
2110 case XenbusStateConnected: 2468 case XenbusStateConnected:
2469 if (dev->state != XenbusStateInitialised) {
2470 if (talk_to_blkback(dev, info))
2471 break;
2472 }
2111 blkfront_connect(info); 2473 blkfront_connect(info);
2112 break; 2474 break;
2113 2475
@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
2281static int __init xlblk_init(void) 2643static int __init xlblk_init(void)
2282{ 2644{
2283 int ret; 2645 int ret;
2646 int nr_cpus = num_online_cpus();
2284 2647
2285 if (!xen_domain()) 2648 if (!xen_domain())
2286 return -ENODEV; 2649 return -ENODEV;
@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
2288 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { 2651 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2289 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", 2652 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2290 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); 2653 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2291 xen_blkif_max_ring_order = 0; 2654 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2655 }
2656
2657 if (xen_blkif_max_queues > nr_cpus) {
2658 pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2659 xen_blkif_max_queues, nr_cpus);
2660 xen_blkif_max_queues = nr_cpus;
2292 } 2661 }
2293 2662
2294 if (!xen_has_pv_disk_devices()) 2663 if (!xen_has_pv_disk_devices())
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 83392f856dfd..22b9e34ceb75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
1741 do { 1741 do {
1742 ret = btree_root(gc_root, c, &op, &writes, &stats); 1742 ret = btree_root(gc_root, c, &op, &writes, &stats);
1743 closure_sync(&writes); 1743 closure_sync(&writes);
1744 cond_resched();
1744 1745
1745 if (ret && ret != -EAGAIN) 1746 if (ret && ret != -EAGAIN)
1746 pr_warn("gc failed!"); 1747 pr_warn("gc failed!");
@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2162 rw_lock(true, b, b->level); 2163 rw_lock(true, b, b->level);
2163 2164
2164 if (b->key.ptr[0] != btree_ptr || 2165 if (b->key.ptr[0] != btree_ptr ||
2165 b->seq != seq + 1) 2166 b->seq != seq + 1) {
2167 op->lock = b->level;
2166 goto out; 2168 goto out;
2169 }
2167 } 2170 }
2168 2171
2169 SET_KEY_PTRS(check_key, 1); 2172 SET_KEY_PTRS(check_key, 1);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093a3bf6..8d0ead98eb6e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
685 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || 685 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
686 sysfs_create_link(&c->kobj, &d->kobj, d->name), 686 sysfs_create_link(&c->kobj, &d->kobj, d->name),
687 "Couldn't create device <-> cache set symlinks"); 687 "Couldn't create device <-> cache set symlinks");
688
689 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
688} 690}
689 691
690static void bcache_device_detach(struct bcache_device *d) 692static void bcache_device_detach(struct bcache_device *d)
@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
847 buf[SB_LABEL_SIZE] = '\0'; 849 buf[SB_LABEL_SIZE] = '\0';
848 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); 850 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
849 851
850 if (atomic_xchg(&dc->running, 1)) 852 if (atomic_xchg(&dc->running, 1)) {
853 kfree(env[1]);
854 kfree(env[2]);
851 return; 855 return;
856 }
852 857
853 if (!d->c && 858 if (!d->c &&
854 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { 859 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1933 else 1938 else
1934 err = "device busy"; 1939 err = "device busy";
1935 mutex_unlock(&bch_register_lock); 1940 mutex_unlock(&bch_register_lock);
1941 if (attr == &ksysfs_register_quiet)
1942 goto out;
1936 } 1943 }
1937 goto err; 1944 goto err;
1938 } 1945 }
@@ -1971,8 +1978,7 @@ out:
1971err_close: 1978err_close:
1972 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1979 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1973err: 1980err:
1974 if (attr != &ksysfs_register_quiet) 1981 pr_info("error opening %s: %s", path, err);
1975 pr_info("error opening %s: %s", path, err);
1976 ret = -EINVAL; 1982 ret = -EINVAL;
1977 goto out; 1983 goto out;
1978} 1984}
@@ -2066,8 +2072,10 @@ static int __init bcache_init(void)
2066 closure_debug_init(); 2072 closure_debug_init();
2067 2073
2068 bcache_major = register_blkdev(0, "bcache"); 2074 bcache_major = register_blkdev(0, "bcache");
2069 if (bcache_major < 0) 2075 if (bcache_major < 0) {
2076 unregister_reboot_notifier(&reboot);
2070 return bcache_major; 2077 return bcache_major;
2078 }
2071 2079
2072 if (!(bcache_wq = create_workqueue("bcache")) || 2080 if (!(bcache_wq = create_workqueue("bcache")) ||
2073 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2081 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b23f88d9f18c..b9346cd9cda1 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
323 323
324static bool dirty_pred(struct keybuf *buf, struct bkey *k) 324static bool dirty_pred(struct keybuf *buf, struct bkey *k)
325{ 325{
326 struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
327
328 BUG_ON(KEY_INODE(k) != dc->disk.id);
329
326 return KEY_DIRTY(k); 330 return KEY_DIRTY(k);
327} 331}
328 332
@@ -372,11 +376,24 @@ next:
372 } 376 }
373} 377}
374 378
379/*
380 * Returns true if we scanned the entire disk
381 */
375static bool refill_dirty(struct cached_dev *dc) 382static bool refill_dirty(struct cached_dev *dc)
376{ 383{
377 struct keybuf *buf = &dc->writeback_keys; 384 struct keybuf *buf = &dc->writeback_keys;
385 struct bkey start = KEY(dc->disk.id, 0, 0);
378 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); 386 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
379 bool searched_from_start = false; 387 struct bkey start_pos;
388
389 /*
390 * make sure keybuf pos is inside the range for this disk - at bringup
391 * we might not be attached yet, so this disk's inode nr isn't
392 * initialized yet
393 */
394 if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
395 bkey_cmp(&buf->last_scanned, &end) > 0)
396 buf->last_scanned = start;
380 397
381 if (dc->partial_stripes_expensive) { 398 if (dc->partial_stripes_expensive) {
382 refill_full_stripes(dc); 399 refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
384 return false; 401 return false;
385 } 402 }
386 403
387 if (bkey_cmp(&buf->last_scanned, &end) >= 0) { 404 start_pos = buf->last_scanned;
388 buf->last_scanned = KEY(dc->disk.id, 0, 0);
389 searched_from_start = true;
390 }
391
392 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); 405 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
393 406
394 return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; 407 if (bkey_cmp(&buf->last_scanned, &end) < 0)
408 return false;
409
410 /*
411 * If we get to the end, start scanning again from the beginning, and
412 * only scan up to where we initially started scanning from:
413 */
414 buf->last_scanned = start;
415 bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
416
417 return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
395} 418}
396 419
397static int bch_writeback_thread(void *arg) 420static int bch_writeback_thread(void *arg)
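
The rewritten refill_dirty() above scans from the remembered position towards the end of this disk's keyspace and, only if that scan reaches the end, wraps around and scans from the start up to where the pass began, reporting a full pass only when that second scan gets back to the original position. A self-contained toy in plain C (integers standing in for bkeys, and a scan() that never fills its buffer) showing the same control flow:

	#include <stdbool.h>
	#include <stdio.h>

	#define KEYSPACE_END 100		/* stand-in for the end-of-disk key */

	/* Pretend scan: returns the position it stopped at (here always 'to'). */
	static int scan(int from, int to)
	{
		printf("scanned [%d, %d)\n", from, to);
		return to;
	}

	/* Returns true only when the whole keyspace was covered in this pass. */
	static bool refill(int *last_scanned)
	{
		int start_pos = *last_scanned;

		*last_scanned = scan(*last_scanned, KEYSPACE_END);
		if (*last_scanned < KEYSPACE_END)
			return false;		/* buffer filled before the end */

		/* Wrap around and finish the pass where it began. */
		*last_scanned = scan(0, start_pos);
		return *last_scanned >= start_pos;
	}

	int main(void)
	{
		int pos = 40;

		printf("full pass: %d\n", refill(&pos));	/* prints 1 */
		return 0;
	}
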
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab187b79..073a042aed24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
63 63
64static inline void bch_writeback_queue(struct cached_dev *dc) 64static inline void bch_writeback_queue(struct cached_dev *dc)
65{ 65{
66 wake_up_process(dc->writeback_thread); 66 if (!IS_ERR_OR_NULL(dc->writeback_thread))
67 wake_up_process(dc->writeback_thread);
67} 68}
68 69
69static inline void bch_writeback_add(struct cached_dev *dc) 70static inline void bch_writeback_add(struct cached_dev *dc)
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 8723f2a99e15..d6b3c9943a2c 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -25,7 +25,6 @@
25*/ 25*/
26#ifndef DRBD_H 26#ifndef DRBD_H
27#define DRBD_H 27#define DRBD_H
28#include <linux/connector.h>
29#include <asm/types.h> 28#include <asm/types.h>
30 29
31#ifdef __KERNEL__ 30#ifdef __KERNEL__
@@ -52,7 +51,7 @@
52#endif 51#endif
53 52
54extern const char *drbd_buildtag(void); 53extern const char *drbd_buildtag(void);
55#define REL_VERSION "8.4.5" 54#define REL_VERSION "8.4.6"
56#define API_VERSION 1 55#define API_VERSION 1
57#define PRO_VERSION_MIN 86 56#define PRO_VERSION_MIN 86
58#define PRO_VERSION_MAX 101 57#define PRO_VERSION_MAX 101
@@ -339,6 +338,8 @@ enum drbd_state_rv {
339#define MDF_AL_CLEAN (1 << 7) 338#define MDF_AL_CLEAN (1 << 7)
340#define MDF_AL_DISABLED (1 << 8) 339#define MDF_AL_DISABLED (1 << 8)
341 340
341#define MAX_PEERS 32
342
342enum drbd_uuid_index { 343enum drbd_uuid_index {
343 UI_CURRENT, 344 UI_CURRENT,
344 UI_BITMAP, 345 UI_BITMAP,
@@ -349,14 +350,35 @@ enum drbd_uuid_index {
349 UI_EXTENDED_SIZE /* Everything. */ 350 UI_EXTENDED_SIZE /* Everything. */
350}; 351};
351 352
353#define HISTORY_UUIDS MAX_PEERS
354
352enum drbd_timeout_flag { 355enum drbd_timeout_flag {
353 UT_DEFAULT = 0, 356 UT_DEFAULT = 0,
354 UT_DEGRADED = 1, 357 UT_DEGRADED = 1,
355 UT_PEER_OUTDATED = 2, 358 UT_PEER_OUTDATED = 2,
356}; 359};
357 360
361enum drbd_notification_type {
362 NOTIFY_EXISTS,
363 NOTIFY_CREATE,
364 NOTIFY_CHANGE,
365 NOTIFY_DESTROY,
366 NOTIFY_CALL,
367 NOTIFY_RESPONSE,
368
369 NOTIFY_CONTINUES = 0x8000,
370 NOTIFY_FLAGS = NOTIFY_CONTINUES,
371};
372
358#define UUID_JUST_CREATED ((__u64)4) 373#define UUID_JUST_CREATED ((__u64)4)
359 374
375enum write_ordering_e {
376 WO_NONE,
377 WO_DRAIN_IO,
378 WO_BDEV_FLUSH,
379 WO_BIO_BARRIER
380};
381
360/* magic numbers used in meta data and network packets */ 382/* magic numbers used in meta data and network packets */
361#define DRBD_MAGIC 0x83740267 383#define DRBD_MAGIC 0x83740267
362#define DRBD_MAGIC_BIG 0x835a 384#define DRBD_MAGIC_BIG 0x835a
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 7b131ed8f9c6..2d0e5ad5de9d 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
250 __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) 250 __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
251) 251)
252 252
253GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
254 __u32_field(1, 0, res_role)
255 __flg_field(2, 0, res_susp)
256 __flg_field(3, 0, res_susp_nod)
257 __flg_field(4, 0, res_susp_fen)
258 /* __flg_field(5, 0, res_weak) */
259)
260
261GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info,
262 __u32_field(1, 0, dev_disk_state)
263)
264
265GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info,
266 __u32_field(1, 0, conn_connection_state)
267 __u32_field(2, 0, conn_role)
268)
269
270GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info,
271 __u32_field(1, 0, peer_repl_state)
272 __u32_field(2, 0, peer_disk_state)
273 __u32_field(3, 0, peer_resync_susp_user)
274 __u32_field(4, 0, peer_resync_susp_peer)
275 __u32_field(5, 0, peer_resync_susp_dependency)
276)
277
278GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics,
279 __u32_field(1, 0, res_stat_write_ordering)
280)
281
282GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics,
283 __u64_field(1, 0, dev_size) /* (sectors) */
284 __u64_field(2, 0, dev_read) /* (sectors) */
285 __u64_field(3, 0, dev_write) /* (sectors) */
286 __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */
287 __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */
288 __u32_field(6, 0, dev_upper_pending) /* application requests in progress */
289 __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */
290 __flg_field(8, 0, dev_upper_blocked)
291 __flg_field(9, 0, dev_lower_blocked)
292 __flg_field(10, 0, dev_al_suspended) /* activity log suspended */
293 __u64_field(11, 0, dev_exposed_data_uuid)
294 __u64_field(12, 0, dev_current_uuid)
295 __u32_field(13, 0, dev_disk_flags)
296 __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64))
297)
298
299GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics,
300 __flg_field(1, 0, conn_congested)
301)
302
303GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
304 __u64_field(1, 0, peer_dev_received) /* sectors */
305 __u64_field(2, 0, peer_dev_sent) /* sectors */
306 __u32_field(3, 0, peer_dev_pending) /* number of requests */
307 __u32_field(4, 0, peer_dev_unacked) /* number of requests */
308 __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */
309 __u64_field(6, 0, peer_dev_resync_failed) /* sectors */
310 __u64_field(7, 0, peer_dev_bitmap_uuid)
311 __u32_field(9, 0, peer_dev_flags)
312)
313
314GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header,
315 __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type)
316)
317
318GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info,
319 __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32)
320 __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status)
321)
322
253/* 323/*
254 * Notifications and commands (genlmsghdr->cmd) 324 * Notifications and commands (genlmsghdr->cmd)
255 */ 325 */
@@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
382 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) 452 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
383GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), 453GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
384 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) 454 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
455
456GENL_op(DRBD_ADM_GET_RESOURCES, 30,
457 GENL_op_init(
458 .dumpit = drbd_adm_dump_resources,
459 ),
460 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
461 GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
462 GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
463
464GENL_op(DRBD_ADM_GET_DEVICES, 31,
465 GENL_op_init(
466 .dumpit = drbd_adm_dump_devices,
467 .done = drbd_adm_dump_devices_done,
468 ),
469 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
470 GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
471 GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
472
473GENL_op(DRBD_ADM_GET_CONNECTIONS, 32,
474 GENL_op_init(
475 .dumpit = drbd_adm_dump_connections,
476 .done = drbd_adm_dump_connections_done,
477 ),
478 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
479 GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
480 GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
481
482GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33,
483 GENL_op_init(
484 .dumpit = drbd_adm_dump_peer_devices,
485 .done = drbd_adm_dump_peer_devices_done,
486 ),
487 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
488 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
489 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
490
491GENL_notification(
492 DRBD_RESOURCE_STATE, 34, events,
493 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
494 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
495 GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED)
496 GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED))
497
498GENL_notification(
499 DRBD_DEVICE_STATE, 35, events,
500 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
501 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
502 GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED)
503 GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED))
504
505GENL_notification(
506 DRBD_CONNECTION_STATE, 36, events,
507 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
508 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
509 GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED)
510 GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED))
511
512GENL_notification(
513 DRBD_PEER_DEVICE_STATE, 37, events,
514 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
515 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
516 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED)
517 GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED))
518
519GENL_op(
520 DRBD_ADM_GET_INITIAL_STATE, 38,
521 GENL_op_init(
522 .dumpit = drbd_adm_get_initial_state,
523 ),
524 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY))
525
526GENL_notification(
527 DRBD_HELPER, 40, events,
528 GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
529 GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED))
530
531GENL_notification(
532 DRBD_INITIAL_STATE_DONE, 41, events,
533 GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED))
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 013fd9bc4cb6..083d61e92706 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id)
135#define idr_for_each_entry(idp, entry, id) \ 135#define idr_for_each_entry(idp, entry, id) \
136 for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) 136 for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
137 137
138/**
139 * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
140 * @idp: idr handle
141 * @entry: the type * to use as cursor
142 * @id: id entry's key
143 *
144 * Continue to iterate over list of given type, continuing after
145 * the current position.
146 */
147#define idr_for_each_entry_continue(idp, entry, id) \
148 for ((entry) = idr_get_next((idp), &(id)); \
149 entry; \
150 ++id, (entry) = idr_get_next((idp), &(id)))
151
138/* 152/*
139 * IDA - IDR based id allocator, use when translation from id to 153 * IDA - IDR based id allocator, use when translation from id to
140 * pointer isn't necessary. 154 * pointer isn't necessary.
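
A hedged usage sketch of the new macro: registry (a populated struct idr), struct item, process() and last_id are all hypothetical; the point is that the walk resumes at the id where a previous, interrupted pass stopped instead of restarting from zero:

	struct item *entry;		/* hypothetical value type stored in the idr */
	int id = last_id;		/* id reached before the walk was interrupted */

	idr_for_each_entry_continue(&registry, entry, id) {
		/* visits the entry stored at 'id' (if populated) and every later one */
		process(entry);
	}
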
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 46262284de47..04fc6e6c7ff0 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
264extern void lc_committed(struct lru_cache *lc); 264extern void lc_committed(struct lru_cache *lc);
265 265
266struct seq_file; 266struct seq_file;
267extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); 267extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
268 268
269extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, 269extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
270 void (*detail) (struct seq_file *, struct lc_element *)); 270 void (*detail) (struct seq_file *, struct lc_element *));
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c33e1c489eb2..8b8cfadf7833 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t;
28typedef uint64_t blkif_sector_t; 28typedef uint64_t blkif_sector_t;
29 29
30/* 30/*
31 * Multiple hardware queues/rings:
32 * If supported, the backend will write the key "multi-queue-max-queues" to
33 * the directory for that vbd, and set its value to the maximum supported
34 * number of queues.
35 * Frontends that are aware of this feature and wish to use it can write the
36 * key "multi-queue-num-queues" with the number they wish to use, which must be
37 * greater than zero, and no more than the value reported by the backend in
38 * "multi-queue-max-queues".
39 *
40 * For frontends requesting just one queue, the usual event-channel and
41 * ring-ref keys are written as before, simplifying the backend processing
42 * to avoid distinguishing between a frontend that doesn't understand the
43 * multi-queue feature, and one that does, but requested only one queue.
44 *
45 * Frontends requesting two or more queues must not write the toplevel
46 * event-channel and ring-ref keys, instead writing those keys under sub-keys
47 * having the name "queue-N", where N is the integer ID of the queue/ring to
48 * which those keys belong. Queues are indexed from zero.
49 * For example, a frontend with two queues must write the following set of
50 * queue-related keys:
51 *
52 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
53 * /local/domain/1/device/vbd/0/queue-0 = ""
54 * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
55 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
56 * /local/domain/1/device/vbd/0/queue-1 = ""
57 * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
58 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
59 *
60 * It is also possible to use multiple queues/rings together with the
61 * multi-page ring buffer feature.
62 * For example, a frontend that requests two queues/rings, where each ring
63 * buffer is two pages long, must write the following set of related keys:
64 *
65 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
66 * /local/domain/1/device/vbd/0/ring-page-order = "1"
67 * /local/domain/1/device/vbd/0/queue-0 = ""
68 * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
69 * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
70 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
71 * /local/domain/1/device/vbd/0/queue-1 = ""
72 * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
73 * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
74 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
75 *
76 */
77
78/*
31 * REQUEST CODES. 79 * REQUEST CODES.
32 */ 80 */
33#define BLKIF_OP_READ 0 81#define BLKIF_OP_READ 0
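
To make the layout documented above concrete, here is a hedged sketch of how the reading side could consume it; frontend_path stands in for the frontend's xenstore directory, and the real consumer lives in xen-blkback/xenbus.c, which is not part of this excerpt:

	unsigned int nr_queues, i, evtchn;
	char dir[64];
	int err;

	err = xenbus_scanf(XBT_NIL, frontend_path, "multi-queue-num-queues",
			   "%u", &nr_queues);
	if (err != 1)
		nr_queues = 1;	/* key absent: legacy single-queue layout */

	for (i = 0; i < nr_queues; i++) {
		if (nr_queues == 1)	/* toplevel keys, no queue-N subdirectory */
			snprintf(dir, sizeof(dir), "%s", frontend_path);
		else
			snprintf(dir, sizeof(dir), "%s/queue-%u", frontend_path, i);

		err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn);
		/*
		 * ring-ref (or ring-ref0..ring-refN) is read from the same
		 * directory, honouring the toplevel ring-page-order key.
		 */
	}
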
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 028f5d996eef..28ba40b99337 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc)
238 * @seq: the seq_file to print into 238 * @seq: the seq_file to print into
239 * @lc: the lru cache to print statistics of 239 * @lc: the lru cache to print statistics of
240 */ 240 */
241size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) 241void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
242{ 242{
243 /* NOTE: 243 /* NOTE:
244 * total calls to lc_get are 244 * total calls to lc_get are
@@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
250 seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", 250 seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
251 lc->name, lc->used, lc->nr_elements, 251 lc->name, lc->used, lc->nr_elements,
252 lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); 252 lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
253
254 return 0;
255} 253}
256 254
257static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) 255static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)