Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/brd.c                  |   4
-rw-r--r--  drivers/block/cciss.c                |   3
-rw-r--r--  drivers/block/drbd/drbd_actlog.c     |  29
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c     |  84
-rw-r--r--  drivers/block/drbd/drbd_debugfs.c    |  13
-rw-r--r--  drivers/block/drbd/drbd_int.h        |  49
-rw-r--r--  drivers/block/drbd/drbd_interval.h   |  14
-rw-r--r--  drivers/block/drbd/drbd_main.c       | 115
-rw-r--r--  drivers/block/drbd/drbd_nl.c         | 282
-rw-r--r--  drivers/block/drbd/drbd_proc.c       |  30
-rw-r--r--  drivers/block/drbd/drbd_protocol.h   |  77
-rw-r--r--  drivers/block/drbd/drbd_receiver.c   | 535
-rw-r--r--  drivers/block/drbd/drbd_req.c        | 118
-rw-r--r--  drivers/block/drbd/drbd_req.h        |   5
-rw-r--r--  drivers/block/drbd/drbd_state.c      |  61
-rw-r--r--  drivers/block/drbd/drbd_state.h      |   2
-rw-r--r--  drivers/block/drbd/drbd_strings.c    |   8
-rw-r--r--  drivers/block/drbd/drbd_worker.c     | 115
-rw-r--r--  drivers/block/floppy.c               |   3
-rw-r--r--  drivers/block/loop.c                 |   1
-rw-r--r--  drivers/block/mg_disk.c              |   9
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c    |   5
-rw-r--r--  drivers/block/null_blk.c             |   2
-rw-r--r--  drivers/block/ps3disk.c              |   3
-rw-r--r--  drivers/block/ps3vram.c              |   3
-rw-r--r--  drivers/block/rsxx/dev.c             |   4
-rw-r--r--  drivers/block/skd_main.c             |   8
-rw-r--r--  drivers/block/sunvdc.c               |   3
-rw-r--r--  drivers/block/umem.c                 |   6
-rw-r--r--  drivers/block/virtio_blk.c           |  24
-rw-r--r--  drivers/block/xen-blkback/xenbus.c   |   2
-rw-r--r--  drivers/block/xen-blkfront.c         |  18
32 files changed, 1258 insertions(+), 377 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index dd96a935fba0..ba5145d384d8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
 		goto out;
 	}
 
-	rw = bio_rw(bio);
-	if (rw == READA)
-		rw = READ;
+	rw = bio_data_dir(bio);
 
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 63c2064689f8..db9d6bb6352d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	if (cciss_create_ld_sysfs_entry(h, drv_index))
 		goto cleanup_queue;
 	disk->private_data = h->drv[drv_index];
-	disk->driverfs_dev = &h->drv[drv_index]->dev;
 
 	/* Set up queue information */
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	/* allows the interrupt handler to start the queue */
 	wmb();
 	h->drv[drv_index]->queue = disk->queue;
-	add_disk(disk);
+	device_add_disk(&h->drv[drv_index]->dev, disk);
 	return 0;
 
 cleanup_queue:
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d524973f94b3..0a1aaf8c24c4 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -258,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
 	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
 	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
 
-	D_ASSERT(device, (unsigned)(last - first) <= 1);
+	D_ASSERT(device, first <= last);
 	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
 
 	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
@@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact
 
 	i = 0;
 
+	drbd_bm_reset_al_hints(device);
+
 	/* Even though no one can start to change this list
 	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
 	 * lc_try_lock_for_transaction() --, someone may still
@@ -770,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device)
 
 static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
 {
-	if (rs_done)
-		set_bit(RS_DONE, &device->flags);
-		/* and also set RS_PROGRESS below */
-	else if (!lazy_bitmap_update_due(device))
+	if (rs_done) {
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+		if (connection->agreed_pro_version <= 95 ||
+		    is_sync_target_state(device->state.conn))
+			set_bit(RS_DONE, &device->flags);
+			/* and also set RS_PROGRESS below */
+
+		/* Else: rather wait for explicit notification via receive_state,
+		 * to avoid uuids-rotated-too-fast causing full resync
+		 * in next handshake, in case the replication link breaks
+		 * at the most unfortunate time... */
+	} else if (!lazy_bitmap_update_due(device))
 		return;
 
 	drbd_device_post_work(device, RS_PROGRESS);
@@ -832,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device,
 	return count;
 }
 
+static bool plausible_request_size(int size)
+{
+	return size > 0
+		&& size <= DRBD_MAX_BATCH_BIO_SIZE
+		&& IS_ALIGNED(size, 512);
+}
+
 /* clear the bit corresponding to the piece of storage in question:
  * size byte of data starting from sector. Only clear a bits of the affected
  * one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -851,7 +868,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
 	if ((mode == SET_OUT_OF_SYNC) && size == 0)
 		return 0;
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+	if (!plausible_request_size(size)) {
 		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
 			 drbd_change_sync_fname[mode],
 			 (unsigned long long)sector, size);
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e5d89f623b90..ab62b81c2ca7 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -96,6 +96,13 @@ struct drbd_bitmap {
 	struct page **bm_pages;
 	spinlock_t bm_lock;
 
+	/* exclusively to be used by __al_write_transaction(),
+	 * drbd_bm_mark_for_writeout() and
+	 * and drbd_bm_write_hinted() -> bm_rw() called from there.
+	 */
+	unsigned int n_bitmap_hints;
+	unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
+
 	/* see LIMITATIONS: above */
 
 	unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
@@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page)
 	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
 }
 
+void drbd_bm_reset_al_hints(struct drbd_device *device)
+{
+	device->bitmap->n_bitmap_hints = 0;
+}
+
 /**
  * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
  * @device: DRBD device.
@@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page)
  */
 void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
 {
+	struct drbd_bitmap *b = device->bitmap;
 	struct page *page;
 	if (page_nr >= device->bitmap->bm_number_of_pages) {
 		drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
@@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
 		return;
 	}
 	page = device->bitmap->bm_pages[page_nr];
-	set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+	BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
+	if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
+		b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
 }
 
 static int bm_test_page_unchanged(struct page *page)
@@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 }
 
 /*
- * called on driver init only. TODO call when a device is created.
- * allocates the drbd_bitmap, and stores it in device->bitmap.
+ * allocates the drbd_bitmap and stores it in device->bitmap.
  */
 int drbd_bm_init(struct drbd_device *device)
 {
@@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 	unsigned long bits, words, owords, obits;
 	unsigned long want, have, onpages; /* number of pages */
 	struct page **npages, **opages = NULL;
-	int err = 0, growing;
+	int err = 0;
+	bool growing;
 
 	if (!expect(b))
 		return -ENOMEM;
@@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 {
 	struct drbd_bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = device->bitmap;
-	int num_pages, i, count = 0;
+	unsigned int num_pages, i, count = 0;
 	unsigned long now;
 	char ppb[10];
 	int err = 0;
@@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 	now = jiffies;
 
 	/* let the layers below us try to merge these bios... */
-	for (i = 0; i < num_pages; i++) {
-		/* ignore completely unchanged pages */
-		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
-			break;
-		if (!(flags & BM_AIO_READ)) {
-			if ((flags & BM_AIO_WRITE_HINTED) &&
-			    !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
-					&page_private(b->bm_pages[i])))
-				continue;
 
+	if (flags & BM_AIO_READ) {
+		for (i = 0; i < num_pages; i++) {
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+			cond_resched();
+		}
+	} else if (flags & BM_AIO_WRITE_HINTED) {
+		/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
+		unsigned int hint;
+		for (hint = 0; hint < b->n_bitmap_hints; hint++) {
+			i = b->al_bitmap_hints[hint];
+			if (i >= num_pages) /* == -1U: no hint here. */
+				continue;
+			/* Several AL-extents may point to the same page. */
+			if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+					&page_private(b->bm_pages[i])))
+				continue;
+			/* Has it even changed? */
+			if (bm_test_page_unchanged(b->bm_pages[i]))
+				continue;
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+		}
+	} else {
+		for (i = 0; i < num_pages; i++) {
+			/* ignore completely unchanged pages */
+			if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+				break;
 		if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
 		    bm_test_page_unchanged(b->bm_pages[i])) {
 			dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
@@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 			dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
 			continue;
 		}
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+			cond_resched();
 		}
-		atomic_inc(&ctx->in_flight);
-		bm_page_io_async(ctx, i);
-		++count;
-		cond_resched();
 	}
 
 	/*
@@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 
 	/* summary for global bitmap IO */
-	if (flags == 0)
-		drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
-			 (flags & BM_AIO_READ) ? "READ" : "WRITE",
-			 count, jiffies - now);
+	if (flags == 0) {
+		unsigned int ms = jiffies_to_msecs(jiffies - now);
+		if (ms > 5) {
+			drbd_info(device, "bitmap %s of %u pages took %u ms\n",
+				  (flags & BM_AIO_READ) ? "READ" : "WRITE",
+				  count, ms);
+		}
+	}
 
 	if (ctx->error) {
 		drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 4de95bbff486..be91a8d7c22a 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
 	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
 	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
 
-	if (f & EE_IS_TRIM) {
-		seq_putc(m, sep);
-		sep = '|';
-		if (f & EE_IS_TRIM_USE_ZEROOUT)
-			seq_puts(m, "zero-out");
-		else
-			seq_puts(m, "trim");
-	}
+	if (f & EE_IS_TRIM)
+		__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
+	seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
 	seq_putc(m, '\n');
 }
 
@@ -908,7 +903,7 @@ static int drbd_version_open(struct inode *inode, struct file *file)
 	return single_open(file, drbd_version_show, NULL);
 }
 
-static struct file_operations drbd_version_fops = {
+static const struct file_operations drbd_version_fops = {
 	.owner = THIS_MODULE,
 	.open = drbd_version_open,
 	.llseek = seq_lseek,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a64c645b4184..7b54354976a5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -468,9 +468,15 @@ enum {
 	/* this is/was a write request */
 	__EE_WRITE,
 
+	/* this is/was a write same request */
+	__EE_WRITE_SAME,
+
 	/* this originates from application on peer
 	 * (not some resync or verify or other DRBD internal request) */
 	__EE_APPLICATION,
+
+	/* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
+	__EE_RS_THIN_REQ,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
@@ -484,7 +490,9 @@ enum {
 #define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
 #define EE_SUBMITTED		(1<<__EE_SUBMITTED)
 #define EE_WRITE		(1<<__EE_WRITE)
+#define EE_WRITE_SAME		(1<<__EE_WRITE_SAME)
 #define EE_APPLICATION		(1<<__EE_APPLICATION)
+#define EE_RS_THIN_REQ		(1<<__EE_RS_THIN_REQ)
 
 /* flag bits per device */
 enum {
@@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
 extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
@@ -1342,11 +1351,11 @@ struct bm_extent {
 #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
 #define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
 
-/* For now, don't allow more than one activity log extent worth of data
- * to be discarded in one go. We may need to rework drbd_al_begin_io()
- * to allow for even larger discard ranges */
-#define DRBD_MAX_DISCARD_SIZE	AL_EXTENT_SIZE
-#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+/* For now, don't allow more than half of what we can "activate" in one
+ * activity log transaction to be discarded in one go. We may need to rework
+ * drbd_al_begin_io() to allow for even larger discard ranges */
+#define DRBD_MAX_BATCH_BIO_SIZE	 (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
+#define DRBD_MAX_BBIO_SECTORS    (DRBD_MAX_BATCH_BIO_SIZE >> 9)
 
 extern int drbd_bm_init(struct drbd_device *device);
 extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
 extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
 extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
 extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
 extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
@@ -1483,12 +1493,14 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
+			struct drbd_backing_dev *bdev, struct o_qlim *o);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
 					enum drbd_role new_role,
 					int force);
 extern bool conn_try_outdate_peer(struct drbd_connection *connection);
 extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
 extern int drbd_khelper(struct drbd_device *device, char *cmd);
 
 /* drbd_worker.c */
@@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data);
 extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 
 /* drbd_receiver.c */
+extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
+		sector_t start, unsigned int nr_sectors, bool discard);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_ack_receiver(struct drbd_thread *thi);
 extern void drbd_send_ping_wf(struct work_struct *ws);
@@ -1561,7 +1575,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
 extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
 extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
						     sector_t, unsigned int,
-						     bool,
+						     unsigned int,
						     gfp_t) __must_hold(local);
 extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
				 int);
@@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 /* drbd_proc.c */
 extern struct proc_dir_entry *drbd_proc;
 extern const struct file_operations drbd_proc_fops;
-extern const char *drbd_conn_str(enum drbd_conns s);
-extern const char *drbd_role_str(enum drbd_role s);
 
 /* drbd_actlog.c */
 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
@@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
 	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
 }
 
+static inline bool is_sync_target_state(enum drbd_conns connection_state)
+{
+	return connection_state == C_SYNC_TARGET ||
+	       connection_state == C_PAUSED_SYNC_T;
+}
+
+static inline bool is_sync_source_state(enum drbd_conns connection_state)
+{
+	return connection_state == C_SYNC_SOURCE ||
+	       connection_state == C_PAUSED_SYNC_S;
+}
+
 static inline bool is_sync_state(enum drbd_conns connection_state)
 {
-	return
-		(connection_state == C_SYNC_SOURCE
-		|| connection_state == C_SYNC_TARGET
-		|| connection_state == C_PAUSED_SYNC_S
-		|| connection_state == C_PAUSED_SYNC_T);
+	return is_sync_source_state(connection_state) ||
+	       is_sync_target_state(connection_state);
 }
 
 /**
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f210543f05f4..23c5a94428d2 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -6,13 +6,13 @@
 
 struct drbd_interval {
 	struct rb_node rb;
 	sector_t sector;	/* start sector of the interval */
 	unsigned int size;	/* size in bytes */
 	sector_t end;		/* highest interval end in subtree */
-	int local:1		/* local or remote request? */;
-	int waiting:1;		/* someone is waiting for this to complete */
-	int completed:1;	/* this has been completed already;
+	unsigned int local:1	/* local or remote request? */;
+	unsigned int waiting:1;	/* someone is waiting for completion */
+	unsigned int completed:1; /* this has been completed already;
 				 * ignore for conflict detection */
 };
 
 static inline void drbd_clear_interval(struct drbd_interval *i)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2b37744db0fa..0501ae0c517b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -31,7 +31,7 @@
 #include <linux/module.h>
 #include <linux/jiffies.h>
 #include <linux/drbd.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/types.h>
 #include <net/sock.h>
 #include <linux/ctype.h>
@@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
 	}
 }
 
+/* communicated if (agreed_features & DRBD_FF_WSAME) */
+void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
+{
+	if (q) {
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = blk_queue_discard(q);
+		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
+		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
+	} else {
+		q = device->rq_queue;
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = 0;
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = 0;
+		p->qlim->discard_zeroes_data = 0;
+		p->qlim->write_same_capable = 0;
+	}
+}
+
 int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
 {
 	struct drbd_device *device = peer_device->device;
@@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
 	sector_t d_size, u_size;
 	int q_order_type;
 	unsigned int max_bio_size;
+	unsigned int packet_size;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
 
+	packet_size = sizeof(*p);
+	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
+		packet_size += sizeof(p->qlim[0]);
+
+	memset(p, 0, packet_size);
 	if (get_ldev_if_state(device, D_NEGOTIATING)) {
-		D_ASSERT(device, device->ldev->backing_bdev);
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
 		d_size = drbd_get_max_capacity(device->ldev);
 		rcu_read_lock();
 		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 		rcu_read_unlock();
 		q_order_type = drbd_queue_order_type(device);
-		max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+		max_bio_size = queue_max_hw_sectors(q) << 9;
 		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+		assign_p_sizes_qlim(device, p, q);
 		put_ldev(device);
 	} else {
 		d_size = 0;
 		u_size = 0;
 		q_order_type = QUEUE_ORDERED_NONE;
 		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+		assign_p_sizes_qlim(device, p, NULL);
 	}
 
-	sock = &peer_device->connection->data;
-	p = drbd_prepare_command(peer_device, sock);
-	if (!p)
-		return -EIO;
-
 	if (peer_device->connection->agreed_pro_version <= 94)
 		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
 	else if (peer_device->connection->agreed_pro_version < 100)
@@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
 	p->max_bio_size = cpu_to_be32(max_bio_size);
 	p->queue_order_type = cpu_to_be16(q_order_type);
 	p->dds_flags = cpu_to_be16(flags);
-	return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+
+	return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
 }
 
 /**
@@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
 			      cpu_to_be64(block_id));
 }
 
+int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
+			     struct drbd_peer_request *peer_req)
+{
+	struct drbd_socket *sock;
+	struct p_block_desc *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(peer_req->i.sector);
+	p->blksize = cpu_to_be32(peer_req->i.size);
+	p->pad = 0;
+	return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
+}
+
 int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
 		       sector_t sector, int size, u64 block_id)
 {
@@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
 			     ? 0 : MSG_MORE);
 		if (err)
 			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
 	}
 	return 0;
 }
@@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
 			      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
 		if (err)
 			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
 	}
 	return 0;
 }
@@ -1610,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 		return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
 			(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
 			(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
 			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
 	else
 		return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@@ -1623,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 	struct drbd_device *device = peer_device->device;
 	struct drbd_socket *sock;
 	struct p_data *p;
+	struct p_wsame *wsame = NULL;
+	void *digest_out;
 	unsigned int dp_flags = 0;
 	int digest_size;
 	int err;
@@ -1658,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
 		goto out;
 	}
+	if (dp_flags & DP_WSAME) {
+		/* this will only work if DRBD_FF_WSAME is set AND the
+		 * handshake agreed that all nodes and backend devices are
+		 * WRITE_SAME capable and agree on logical_block_size */
+		wsame = (struct p_wsame*)p;
+		digest_out = wsame + 1;
+		wsame->size = cpu_to_be32(req->i.size);
+	} else
+		digest_out = p + 1;
 
 	/* our digest is still only over the payload.
 	 * TRIM does not carry any payload. */
 	if (digest_size)
-		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
-	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
+	if (wsame) {
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
+				   sizeof(*wsame) + digest_size, NULL,
+				   bio_iovec(req->master_bio).bv_len);
+	} else
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_DATA,
+				   sizeof(*p) + digest_size, NULL, req->i.size);
 	if (!err) {
 		/* For protocol A, we have to memcpy the payload into
 		 * socket buffers, as we may complete right away
@@ -3507,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
 	struct bm_io_work *work = &device->bm_io_work;
 	int rv = -EIO;
 
-	D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
+	if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
+		int cnt = atomic_read(&device->ap_bio_cnt);
+		if (cnt)
+			drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
+					cnt, work->why);
+	}
 
 	if (get_ldev(device)) {
 		drbd_bm_lock(device, work->why, work->flags);
@@ -3587,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
 		char *why, enum bm_flag flags)
 {
+	/* Only suspend io, if some operation is supposed to be locked out */
+	const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
 	int rv;
 
 	D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
 
-	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+	if (do_suspend_io)
 		drbd_suspend_io(device);
 
 	drbd_bm_lock(device, why, flags);
 	rv = io_fn(device);
 	drbd_bm_unlock(device);
 
-	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+	if (do_suspend_io)
 		drbd_resume_io(device);
 
 	return rv;
@@ -3637,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
 	 * one PRO_VERSION */
 	static const char *cmdnames[] = {
 		[P_DATA]	        = "Data",
+		[P_WSAME]	        = "WriteSame",
+		[P_TRIM]	        = "Trim",
 		[P_DATA_REPLY]	        = "DataReply",
 		[P_RS_DATA_REPLY]	= "RSDataReply",
 		[P_BARRIER]	        = "Barrier",
@@ -3681,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd)
 		[P_CONN_ST_CHG_REPLY]	= "conn_st_chg_reply",
 		[P_RETRY_WRITE]		= "retry_write",
 		[P_PROTOCOL_UPDATE]	= "protocol_update",
+		[P_RS_THIN_REQ]         = "rs_thin_req",
+		[P_RS_DEALLOCATED]      = "rs_deallocated",
 
 		/* enum drbd_packet, but not commands - obsoleted flags:
 		 *	P_MAY_IGNORE
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0bac9c8246bc..f35db29cac76 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
 					(char[20]) { }, /* address family */
 					(char[60]) { }, /* address */
 					NULL };
-	char mb[12];
+	char mb[14];
 	char *argv[] = {usermode_helper, cmd, mb, NULL };
 	struct drbd_connection *connection = first_peer_device(device)->connection;
 	struct sib_info sib;
@@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
 	if (current == connection->worker.task)
 		set_bit(CALLBACK_PENDING, &connection->flags);
 
-	snprintf(mb, 12, "minor-%d", device_to_minor(device));
+	snprintf(mb, 14, "minor-%d", device_to_minor(device));
 	setup_khelper_env(connection, envp);
 
 	/* The helper may take some time.
@@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
 	return ret;
 }
 
-static int conn_khelper(struct drbd_connection *connection, char *cmd)
+enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd)
 {
 	char *envp[] = { "HOME=/",
 			"TERM=linux",
@@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
 	}
 	rcu_read_unlock();
 
-	if (fp == FP_NOT_AVAIL) {
-		/* IO Suspending works on the whole resource.
-		   Do it only for one device. */
-		vnr = 0;
-		peer_device = idr_get_next(&connection->peer_devices, &vnr);
-		drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
-	}
-
 	return fp;
 }
 
+static bool resource_is_supended(struct drbd_resource *resource)
+{
+	return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
 bool conn_try_outdate_peer(struct drbd_connection *connection)
 {
+	struct drbd_resource * const resource = connection->resource;
 	unsigned int connect_cnt;
 	union drbd_state mask = { };
 	union drbd_state val = { };
@@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 	char *ex_to_string;
 	int r;
 
-	spin_lock_irq(&connection->resource->req_lock);
+	spin_lock_irq(&resource->req_lock);
 	if (connection->cstate >= C_WF_REPORT_PARAMS) {
 		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
-		spin_unlock_irq(&connection->resource->req_lock);
+		spin_unlock_irq(&resource->req_lock);
 		return false;
 	}
 
 	connect_cnt = connection->connect_cnt;
-	spin_unlock_irq(&connection->resource->req_lock);
+	spin_unlock_irq(&resource->req_lock);
 
 	fp = highest_fencing_policy(connection);
 	switch (fp) {
 	case FP_NOT_AVAIL:
 		drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
-		goto out;
+		spin_lock_irq(&resource->req_lock);
+		if (connection->cstate < C_WF_REPORT_PARAMS) {
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE | CS_HARD | CS_DC_SUSP);
+			/* We are no longer suspended due to the fencing policy.
+			 * We may still be suspended due to the on-no-data-accessible policy.
+			 * If that was OND_IO_ERROR, fail pending requests. */
+			if (!resource_is_supended(resource))
+				_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+		}
+		/* Else: in case we raced with a connection handshake,
+		 * let the handshake figure out if we maybe can RESEND,
+		 * and do not resume/fail pending requests here.
+		 * Worst case is we stay suspended for now, which may be
+		 * resolved by either re-establishing the replication link, or
+		 * the next link failure, or eventually the administrator. */
+		spin_unlock_irq(&resource->req_lock);
+		return false;
+
 	case FP_DONT_CARE:
 		return true;
 	default: ;
@@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 	r = conn_khelper(connection, "fence-peer");
 
 	switch ((r>>8) & 0xff) {
-	case 3: /* peer is inconsistent */
+	case P_INCONSISTENT: /* peer is inconsistent */
 		ex_to_string = "peer is inconsistent or worse";
 		mask.pdsk = D_MASK;
 		val.pdsk = D_INCONSISTENT;
 		break;
-	case 4: /* peer got outdated, or was already outdated */
+	case P_OUTDATED: /* peer got outdated, or was already outdated */
 		ex_to_string = "peer was fenced";
 		mask.pdsk = D_MASK;
 		val.pdsk = D_OUTDATED;
 		break;
-	case 5: /* peer was down */
+	case P_DOWN: /* peer was down */
 		if (conn_highest_disk(connection) == D_UP_TO_DATE) {
 			/* we will(have) create(d) a new UUID anyways... */
 			ex_to_string = "peer is unreachable, assumed to be dead";
@@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
 		}
 		break;
-	case 6: /* Peer is primary, voluntarily outdate myself.
+	case P_PRIMARY: /* Peer is primary, voluntarily outdate myself.
 		 * This is useful when an unconnected R_SECONDARY is asked to
 		 * become R_PRIMARY, but finds the other peer being active. */
 		ex_to_string = "peer is active";
@@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 		mask.disk = D_MASK;
 		val.disk = D_OUTDATED;
 		break;
-	case 7:
+	case P_FENCING:
+		/* THINK: do we need to handle this
+		 * like case 4, or more like case 5? */
 		if (fp != FP_STONITH)
 			drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
 		ex_to_string = "peer was stonithed";
@@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 	drbd_info(connection, "fence-peer helper returned %d (%s)\n",
 		  (r>>8) & 0xff, ex_to_string);
 
- out:
-
 	/* Not using
 	   conn_request_state(connection, mask, val, CS_VERBOSE);
 	   here, because we might were able to re-establish the connection in the
 	   meantime. */
-	spin_lock_irq(&connection->resource->req_lock);
+	spin_lock_irq(&resource->req_lock);
 	if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
 		if (connection->connect_cnt != connect_cnt)
 			/* In case the connection was established and droped
@@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 		else
 			_conn_request_state(connection, mask, val, CS_VERBOSE);
 	}
-	spin_unlock_irq(&connection->resource->req_lock);
+	spin_unlock_irq(&resource->req_lock);
 
 	return conn_highest_pdsk(connection) <= D_OUTDATED;
 }
@@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
 	return 0;
 }
 
+static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
+{
+	q->limits.discard_granularity = granularity;
+}
+
+static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
+{
+	/* when we introduced REQ_WRITE_SAME support, we also bumped
+	 * our maximum supported batch bio size used for discards. */
+	if (connection->agreed_features & DRBD_FF_WSAME)
+		return DRBD_MAX_BBIO_SECTORS;
+	/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
+	return AL_EXTENT_SIZE >> 9;
+}
+
+static void decide_on_discard_support(struct drbd_device *device,
+			struct request_queue *q,
+			struct request_queue *b,
+			bool discard_zeroes_if_aligned)
+{
+	/* q = drbd device queue (device->rq_queue)
+	 * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue),
+	 *     or NULL if diskless
+	 */
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	bool can_do = b ? blk_queue_discard(b) : true;
+
+	if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
+		can_do = false;
+		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
+	}
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
+		can_do = false;
+		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
+	}
+	if (can_do) {
+		/* We don't care for the granularity, really.
+		 * Stacking limits below should fix it for the local
+		 * device. Whether or not it is a suitable granularity
+		 * on the remote device is not our problem, really. If
+		 * you care, you need to use devices with similar
+		 * topology on all peers. */
+		blk_queue_discard_granularity(q, 512);
+		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	} else {
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_discard_granularity(q, 0);
+		q->limits.max_discard_sectors = 0;
+	}
+}
+
+static void fixup_discard_if_not_supported(struct request_queue *q)
+{
+	/* To avoid confusion, if this queue does not support discard, clear
+	 * max_discard_sectors, which is what lsblk -D reports to the user.
+	 * Older kernels got this wrong in "stack limits".
+	 * */
+	if (!blk_queue_discard(q)) {
+		blk_queue_max_discard_sectors(q, 0);
+		blk_queue_discard_granularity(q, 0);
+	}
+}
+
+static void decide_on_write_same_support(struct drbd_device *device,
+			struct request_queue *q,
+			struct request_queue *b, struct o_qlim *o)
+{
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
+	bool can_do = b ? b->limits.max_write_same_sectors : true;
+
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
+		can_do = false;
+		drbd_info(peer_device, "peer does not support WRITE_SAME\n");
+	}
+
+	if (o) {
+		/* logical block size; queue_logical_block_size(NULL) is 512 */
+		unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
+		unsigned int me_lbs_b = queue_logical_block_size(b);
+		unsigned int me_lbs = queue_logical_block_size(q);
+
+		if (me_lbs_b != me_lbs) {
+			drbd_warn(device,
+				"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
+				me_lbs, me_lbs_b);
+			/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
+			can_do = false;
+		}
+		if (me_lbs_b != peer_lbs) {
+			drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
+				me_lbs, peer_lbs);
+			if (can_do) {
+				drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
+				can_do = false;
+			}
+			me_lbs = max(me_lbs, me_lbs_b);
+			/* We cannot change the logical block size of an in-use queue.
+			 * We can only hope that access happens to be properly aligned.
+			 * If not, the peer will likely produce an IO error, and detach. */
+			if (peer_lbs > me_lbs) {
+				if (device->state.role != R_PRIMARY) {
+					blk_queue_logical_block_size(q, peer_lbs);
+					drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
+				} else {
+					drbd_warn(peer_device,
+						"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
+						me_lbs, peer_lbs);
+				}
+			}
+		}
+		if (can_do && !o->write_same_capable) {
+			/* If we introduce an open-coded write-same loop on the receiving side,
+			 * the peer would present itself as "capable". */
+			drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
+			can_do = false;
+		}
+	}
+
+	blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
+}
+
 static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
-				   unsigned int max_bio_size)
+				   unsigned int max_bio_size, struct o_qlim *o)
 {
 	struct request_queue * const q = device->rq_queue;
 	unsigned int max_hw_sectors = max_bio_size >> 9;
 	unsigned int max_segments = 0;
 	struct request_queue *b = NULL;
+	struct disk_conf *dc;
+	bool discard_zeroes_if_aligned = true;
 
 	if (bdev) {
 		b = bdev->backing_bdev->bd_disk->queue;
 
 		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
 		rcu_read_lock();
-		max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+		dc = rcu_dereference(device->ldev->disk_conf);
+		max_segments = dc->max_bio_bvecs;
+		discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned;
 		rcu_read_unlock();
 
 		blk_set_stacking_limits(&q->limits);
-		blk_queue_max_write_same_sectors(q, 0);
 	}
 
-	blk_queue_logical_block_size(q, 512);
 	blk_queue_max_hw_sectors(q, max_hw_sectors);
 	/* This is the workaround for "bio would need to, but cannot, be split" */
 	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
 	blk_queue_segment_boundary(q, PAGE_SIZE-1);
+	decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
+	decide_on_write_same_support(device, q, b, o);
 
 	if (b) {
-		struct drbd_connection *connection = first_peer_device(device)->connection;
-
-		blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
-
-		if (blk_queue_discard(b) &&
-		    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
-			/* We don't care, stacking below should fix it for the local device.
-			 * Whether or not it is a suitable granularity on the remote device
-			 * is not our problem, really. If you care, you need to
-			 * use devices with similar topology on all peers. */
-			q->limits.discard_granularity = 512;
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
-		} else {
-			blk_queue_max_discard_sectors(q, 0);
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
-			q->limits.discard_granularity = 0;
-		}
-
 		blk_queue_stack_limits(q, b);
 
 		if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
1208 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1335 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1209 } 1336 }
1210 } 1337 }
1211 /* To avoid confusion, if this queue does not support discard, clear 1338 fixup_discard_if_not_supported(q);
1212 * max_discard_sectors, which is what lsblk -D reports to the user. */
1213 if (!blk_queue_discard(q)) {
1214 blk_queue_max_discard_sectors(q, 0);
1215 q->limits.discard_granularity = 0;
1216 }
1217} 1339}
1218 1340
1219void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) 1341void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
1220{ 1342{
1221 unsigned int now, new, local, peer; 1343 unsigned int now, new, local, peer;
1222 1344
@@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin
1259 if (new != now) 1381 if (new != now)
1260 drbd_info(device, "max BIO size = %u\n", new); 1382 drbd_info(device, "max BIO size = %u\n", new);
1261 1383
1262 drbd_setup_queue_param(device, bdev, new); 1384 drbd_setup_queue_param(device, bdev, new, o);
1263} 1385}
1264 1386
1265/* Starts the worker thread */ 1387/* Starts the worker thread */
@@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
1348 a->disk_drain != b->disk_drain; 1470 a->disk_drain != b->disk_drain;
1349} 1471}
1350 1472
1473static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf,
1474 struct drbd_backing_dev *nbc)
1475{
1476 struct request_queue * const q = nbc->backing_bdev->bd_disk->queue;
1477
1478 if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1479 disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1480 if (disk_conf->al_extents > drbd_al_extents_max(nbc))
1481 disk_conf->al_extents = drbd_al_extents_max(nbc);
1482
1483 if (!blk_queue_discard(q)
1484 || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
1485 if (disk_conf->rs_discard_granularity) {
1486 disk_conf->rs_discard_granularity = 0; /* disable feature */
1487 drbd_info(device, "rs_discard_granularity feature disabled\n");
1488 }
1489 }
1490
1491 if (disk_conf->rs_discard_granularity) {
1492 int orig_value = disk_conf->rs_discard_granularity;
1493 int remainder;
1494
1495 if (q->limits.discard_granularity > disk_conf->rs_discard_granularity)
1496 disk_conf->rs_discard_granularity = q->limits.discard_granularity;
1497
1498 remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity;
1499 disk_conf->rs_discard_granularity += remainder;
1500
1501 if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9)
1502 disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9;
1503
1504 if (disk_conf->rs_discard_granularity != orig_value)
1505 drbd_info(device, "rs_discard_granularity changed to %d\n",
1506 disk_conf->rs_discard_granularity);
1507 }
1508}
1509
1351int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1510int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1352{ 1511{
1353 struct drbd_config_context adm_ctx; 1512 struct drbd_config_context adm_ctx;
@@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1395 if (!expect(new_disk_conf->resync_rate >= 1)) 1554 if (!expect(new_disk_conf->resync_rate >= 1))
1396 new_disk_conf->resync_rate = 1; 1555 new_disk_conf->resync_rate = 1;
1397 1556
1398 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) 1557 sanitize_disk_conf(device, new_disk_conf, device->ldev);
1399 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1400 if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
1401 new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
1402 1558
1403 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) 1559 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1404 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; 1560 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
@@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1457 if (write_ordering_changed(old_disk_conf, new_disk_conf)) 1613 if (write_ordering_changed(old_disk_conf, new_disk_conf))
1458 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); 1614 drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
1459 1615
1616 if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
1617 drbd_reconsider_queue_parameters(device, device->ldev, NULL);
1618
1460 drbd_md_sync(device); 1619 drbd_md_sync(device);
1461 1620
1462 if (device->state.conn >= C_CONNECTED) { 1621 if (device->state.conn >= C_CONNECTED) {
@@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1693 if (retcode != NO_ERROR) 1852 if (retcode != NO_ERROR)
1694 goto fail; 1853 goto fail;
1695 1854
1696 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) 1855 sanitize_disk_conf(device, new_disk_conf, nbc);
1697 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1698 if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1699 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1700 1856
1701 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { 1857 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1702 drbd_err(device, "max capacity %llu smaller than disk size %llu\n", 1858 drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
@@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1838 device->read_cnt = 0; 1994 device->read_cnt = 0;
1839 device->writ_cnt = 0; 1995 device->writ_cnt = 0;
1840 1996
1841 drbd_reconsider_max_bio_size(device, device->ldev); 1997 drbd_reconsider_queue_parameters(device, device->ldev, NULL);
1842 1998
1843 /* If I am currently not R_PRIMARY, 1999 /* If I am currently not R_PRIMARY,
1844 * but meta data primary indicator is set, 2000 * but meta data primary indicator is set,
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 6537b25db9c1..be2b93fd2c11 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -25,7 +25,7 @@
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27 27
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
@@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
122 122
123 x = res/50; 123 x = res/50;
124 y = 20-x; 124 y = 20-x;
125 seq_printf(seq, "\t["); 125 seq_puts(seq, "\t[");
126 for (i = 1; i < x; i++) 126 for (i = 1; i < x; i++)
127 seq_printf(seq, "="); 127 seq_putc(seq, '=');
128 seq_printf(seq, ">"); 128 seq_putc(seq, '>');
129 for (i = 0; i < y; i++) 129 for (i = 0; i < y; i++)
130 seq_printf(seq, "."); 130 seq_printf(seq, ".");
131 seq_printf(seq, "] "); 131 seq_puts(seq, "] ");
132 132
133 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) 133 if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
134 seq_printf(seq, "verified:"); 134 seq_puts(seq, "verified:");
135 else 135 else
136 seq_printf(seq, "sync'ed:"); 136 seq_puts(seq, "sync'ed:");
137 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); 137 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
138 138
139 /* if more than a few GB, display in MB */ 139 /* if more than a few GB, display in MB */
@@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
146 (unsigned long) Bit2KB(rs_left), 146 (unsigned long) Bit2KB(rs_left),
147 (unsigned long) Bit2KB(rs_total)); 147 (unsigned long) Bit2KB(rs_total));
148 148
149 seq_printf(seq, "\n\t"); 149 seq_puts(seq, "\n\t");
150 150
151 /* see drivers/md/md.c 151 /* see drivers/md/md.c
152 * We do not want to overflow, so the order of operands and 152 * We do not want to overflow, so the order of operands and
@@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
175 rt / 3600, (rt % 3600) / 60, rt % 60); 175 rt / 3600, (rt % 3600) / 60, rt % 60);
176 176
177 dbdt = Bit2KB(db/dt); 177 dbdt = Bit2KB(db/dt);
178 seq_printf(seq, " speed: "); 178 seq_puts(seq, " speed: ");
179 seq_printf_with_thousands_grouping(seq, dbdt); 179 seq_printf_with_thousands_grouping(seq, dbdt);
180 seq_printf(seq, " ("); 180 seq_puts(seq, " (");
181 /* ------------------------- ~3s average ------------------------ */ 181 /* ------------------------- ~3s average ------------------------ */
182 if (proc_details >= 1) { 182 if (proc_details >= 1) {
183 /* this is what drbd_rs_should_slow_down() uses */ 183 /* this is what drbd_rs_should_slow_down() uses */
@@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
188 db = device->rs_mark_left[i] - rs_left; 188 db = device->rs_mark_left[i] - rs_left;
189 dbdt = Bit2KB(db/dt); 189 dbdt = Bit2KB(db/dt);
190 seq_printf_with_thousands_grouping(seq, dbdt); 190 seq_printf_with_thousands_grouping(seq, dbdt);
191 seq_printf(seq, " -- "); 191 seq_puts(seq, " -- ");
192 } 192 }
193 193
194 /* --------------------- long term average ---------------------- */ 194 /* --------------------- long term average ---------------------- */
@@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
200 db = rs_total - rs_left; 200 db = rs_total - rs_left;
201 dbdt = Bit2KB(db/dt); 201 dbdt = Bit2KB(db/dt);
202 seq_printf_with_thousands_grouping(seq, dbdt); 202 seq_printf_with_thousands_grouping(seq, dbdt);
203 seq_printf(seq, ")"); 203 seq_putc(seq, ')');
204 204
205 if (state.conn == C_SYNC_TARGET || 205 if (state.conn == C_SYNC_TARGET ||
206 state.conn == C_VERIFY_S) { 206 state.conn == C_VERIFY_S) {
207 seq_printf(seq, " want: "); 207 seq_puts(seq, " want: ");
208 seq_printf_with_thousands_grouping(seq, device->c_sync_rate); 208 seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
209 } 209 }
210 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); 210 seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
@@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
231 (unsigned long long)bm_bits * BM_SECT_PER_BIT); 231 (unsigned long long)bm_bits * BM_SECT_PER_BIT);
232 if (stop_sector != 0 && stop_sector != ULLONG_MAX) 232 if (stop_sector != 0 && stop_sector != ULLONG_MAX)
233 seq_printf(seq, " stop sector: %llu", stop_sector); 233 seq_printf(seq, " stop sector: %llu", stop_sector);
234 seq_printf(seq, "\n"); 234 seq_putc(seq, '\n');
235 } 235 }
236} 236}
237 237
@@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
276 rcu_read_lock(); 276 rcu_read_lock();
277 idr_for_each_entry(&drbd_devices, device, i) { 277 idr_for_each_entry(&drbd_devices, device, i) {
278 if (prev_i != i - 1) 278 if (prev_i != i - 1)
279 seq_printf(seq, "\n"); 279 seq_putc(seq, '\n');
280 prev_i = i; 280 prev_i = i;
281 281
282 state = device->state; 282 state = device->state;
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 129f8c76c9b1..4d296800f706 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -60,6 +60,15 @@ enum drbd_packet {
60 * which is why I chose TRIM here, to disambiguate. */ 60 * which is why I chose TRIM here, to disambiguate. */
61 P_TRIM = 0x31, 61 P_TRIM = 0x31,
62 62
63 /* Only use these two if both support FF_THIN_RESYNC */
64 P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
65 P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
66
67 /* REQ_WRITE_SAME.
68 * On a receiving side without REQ_WRITE_SAME,
69 * we may fall back to an opencoded loop instead. */
70 P_WSAME = 0x34,
71
63 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 72 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
64 P_MAX_OPT_CMD = 0x101, 73 P_MAX_OPT_CMD = 0x101,
65 74
@@ -106,8 +115,11 @@ struct p_header100 {
106 u32 pad; 115 u32 pad;
107} __packed; 116} __packed;
108 117
109/* these defines must not be changed without changing the protocol version */ 118/* These defines must not be changed without changing the protocol version.
110#define DP_HARDBARRIER 1 /* depricated */ 119 * New defines may only be introduced together with protocol version bump or
120 * new protocol feature flags.
121 */
122#define DP_HARDBARRIER 1 /* no longer used */
111#define DP_RW_SYNC 2 /* equals REQ_SYNC */ 123#define DP_RW_SYNC 2 /* equals REQ_SYNC */
112#define DP_MAY_SET_IN_SYNC 4 124#define DP_MAY_SET_IN_SYNC 4
113#define DP_UNPLUG 8 /* not used anymore */ 125#define DP_UNPLUG 8 /* not used anymore */
@@ -116,6 +128,7 @@ struct p_header100 {
116#define DP_DISCARD 64 /* equals REQ_DISCARD */ 128#define DP_DISCARD 64 /* equals REQ_DISCARD */
117#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ 129#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
118#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ 130#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
131#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */
119 132
120struct p_data { 133struct p_data {
121 u64 sector; /* 64 bits sector number */ 134 u64 sector; /* 64 bits sector number */
@@ -129,6 +142,11 @@ struct p_trim {
129 u32 size; /* == bio->bi_size */ 142 u32 size; /* == bio->bi_size */
130} __packed; 143} __packed;
131 144
145struct p_wsame {
146 struct p_data p_data;
147 u32 size; /* == bio->bi_size */
148} __packed;
149
132/* 150/*
133 * commands which share a struct: 151 * commands which share a struct:
134 * p_block_ack: 152 * p_block_ack:
@@ -160,7 +178,23 @@ struct p_block_req {
160 * ReportParams 178 * ReportParams
161 */ 179 */
162 180
163#define FF_TRIM 1 181/* supports TRIM/DISCARD on the "wire" protocol */
182#define DRBD_FF_TRIM 1
183
184/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
185 * instead of fully allocating a supposedly thin volume on initial resync */
186#define DRBD_FF_THIN_RESYNC 2
187
188/* supports REQ_WRITE_SAME on the "wire" protocol.
189 * Note: this flag is overloaded,
190 * its presence also
191 * - indicates support for 128 MiB "batch bios",
192 * max discard size of 128 MiB
193 * instead of 4M before that.
194 * - indicates that we exchange additional settings in p_sizes
195 * drbd_send_sizes()/receive_sizes()
196 */
197#define DRBD_FF_WSAME 4
164 198
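
These DRBD_FF_* bits are advertised via PRO_FEATURES (see the drbd_receiver.c hunk below), and the set actually used is in effect the bitwise AND of what both sides offer; connection->agreed_features is then tested with expressions like (agreed_features & DRBD_FF_WSAME). A minimal sketch of that intersection, assuming the peer's mask arrives as a big-endian u32 in the features packet (the helper name is illustrative; the real negotiation lives in the handshake code, which is not part of this excerpt):

/* Sketch only: keep just the flags that both sides advertise. */
static u32 agree_features(u32 my_features, __be32 peer_features)
{
	return my_features & be32_to_cpu(peer_features);
}
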
165struct p_connection_features { 199struct p_connection_features {
166 u32 protocol_min; 200 u32 protocol_min;
@@ -235,6 +269,40 @@ struct p_rs_uuid {
235 u64 uuid; 269 u64 uuid;
236} __packed; 270} __packed;
237 271
272/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
273 * see also struct queue_limits, as of late 2015 */
274struct o_qlim {
275 /* we don't need it yet, but we may as well communicate it now */
276 u32 physical_block_size;
277
278 /* so the original in struct queue_limits is unsigned short,
279 * but I'd have to put in padding anyways. */
280 u32 logical_block_size;
281
282 /* One incoming bio becomes one DRBD request,
283 * which may be translated to several bio on the receiving side.
284 * We don't need to communicate chunk/boundary/segment ... limits.
285 */
286
287 /* various IO hints may be useful with "diskless client" setups */
288 u32 alignment_offset;
289 u32 io_min;
290 u32 io_opt;
291
292 /* We may need to communicate integrity stuff at some point,
293 * but let's not get ahead of ourselves. */
294
295 /* Backend discard capabilities.
296 * Receiving side uses "blkdev_issue_discard()", no need to communicate
297 * more specifics. If the backend cannot do discards, the DRBD peer
298 * may fall back to blkdev_issue_zeroout().
299 */
300 u8 discard_enabled;
301 u8 discard_zeroes_data;
302 u8 write_same_capable;
303 u8 _pad;
304} __packed;
305
238struct p_sizes { 306struct p_sizes {
239 u64 d_size; /* size of disk */ 307 u64 d_size; /* size of disk */
240 u64 u_size; /* user requested size */ 308 u64 u_size; /* user requested size */
@@ -242,6 +310,9 @@ struct p_sizes {
242 u32 max_bio_size; /* Maximal size of a BIO */ 310 u32 max_bio_size; /* Maximal size of a BIO */
243 u16 queue_order_type; /* not yet implemented in DRBD*/ 311 u16 queue_order_type; /* not yet implemented in DRBD*/
244 u16 dds_flags; /* use enum dds_flags here. */ 312 u16 dds_flags; /* use enum dds_flags here. */
313
314 /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
315 struct o_qlim qlim[0];
245} __packed; 316} __packed;
246 317
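
The receive path for these limits is receive_sizes()/drbd_reconsider_queue_parameters() further down; the sending side (drbd_send_sizes() in drbd_main.c, referenced in the comment above but not shown in this excerpt) fills the struct from the backing device's queue limits. A hedged sketch of such a fill using stock block-layer accessors; the helper name and exact field sources are assumptions, the on-the-wire fields are big-endian:

/* Illustrative only: populate an o_qlim from a backing request queue. */
static void assemble_o_qlim(struct o_qlim *o, struct request_queue *q)
{
	o->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
	o->logical_block_size  = cpu_to_be32(queue_logical_block_size(q));
	o->alignment_offset    = cpu_to_be32(queue_alignment_offset(q));
	o->io_min              = cpu_to_be32(queue_io_min(q));
	o->io_opt              = cpu_to_be32(queue_io_opt(q));
	o->discard_enabled     = blk_queue_discard(q);
	o->discard_zeroes_data = queue_discard_zeroes_data(q);
	o->write_same_capable  = !!q->limits.max_write_same_sectors;
}
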
247struct p_state { 318struct p_state {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1ee002352ea2..df45713dfbe8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -25,7 +25,7 @@
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27 27
28#include <asm/uaccess.h> 28#include <linux/uaccess.h>
29#include <net/sock.h> 29#include <net/sock.h>
30 30
31#include <linux/drbd.h> 31#include <linux/drbd.h>
@@ -48,7 +48,7 @@
48#include "drbd_req.h" 48#include "drbd_req.h"
49#include "drbd_vli.h" 49#include "drbd_vli.h"
50 50
51#define PRO_FEATURES (FF_TRIM) 51#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
52 52
53struct packet_info { 53struct packet_info {
54 enum drbd_packet cmd; 54 enum drbd_packet cmd;
@@ -361,14 +361,17 @@ You must not have the req_lock:
361 drbd_wait_ee_list_empty() 361 drbd_wait_ee_list_empty()
362*/ 362*/
363 363
364/* normal: payload_size == request size (bi_size)
365 * w_same: payload_size == logical_block_size
366 * trim: payload_size == 0 */
364struct drbd_peer_request * 367struct drbd_peer_request *
365drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 368drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
366 unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) 369 unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
367{ 370{
368 struct drbd_device *device = peer_device->device; 371 struct drbd_device *device = peer_device->device;
369 struct drbd_peer_request *peer_req; 372 struct drbd_peer_request *peer_req;
370 struct page *page = NULL; 373 struct page *page = NULL;
371 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 374 unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
372 375
373 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) 376 if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
374 return NULL; 377 return NULL;
@@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
380 return NULL; 383 return NULL;
381 } 384 }
382 385
383 if (has_payload && data_size) { 386 if (nr_pages) {
384 page = drbd_alloc_pages(peer_device, nr_pages, 387 page = drbd_alloc_pages(peer_device, nr_pages,
385 gfpflags_allow_blocking(gfp_mask)); 388 gfpflags_allow_blocking(gfp_mask));
386 if (!page) 389 if (!page)
@@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
390 memset(peer_req, 0, sizeof(*peer_req)); 393 memset(peer_req, 0, sizeof(*peer_req));
391 INIT_LIST_HEAD(&peer_req->w.list); 394 INIT_LIST_HEAD(&peer_req->w.list);
392 drbd_clear_interval(&peer_req->i); 395 drbd_clear_interval(&peer_req->i);
393 peer_req->i.size = data_size; 396 peer_req->i.size = request_size;
394 peer_req->i.sector = sector; 397 peer_req->i.sector = sector;
395 peer_req->submit_jif = jiffies; 398 peer_req->submit_jif = jiffies;
396 peer_req->peer_device = peer_device; 399 peer_req->peer_device = peer_device;
@@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in
1204 return err; 1207 return err;
1205} 1208}
1206 1209
1207static void drbd_flush(struct drbd_connection *connection) 1210/* This is blkdev_issue_flush, but asynchronous.
1211 * We want to submit to all component volumes in parallel,
1212 * then wait for all completions.
1213 */
1214struct issue_flush_context {
1215 atomic_t pending;
1216 int error;
1217 struct completion done;
1218};
1219struct one_flush_context {
1220 struct drbd_device *device;
1221 struct issue_flush_context *ctx;
1222};
1223
1224void one_flush_endio(struct bio *bio)
1208{ 1225{
1209 int rv; 1226 struct one_flush_context *octx = bio->bi_private;
1210 struct drbd_peer_device *peer_device; 1227 struct drbd_device *device = octx->device;
1211 int vnr; 1228 struct issue_flush_context *ctx = octx->ctx;
1212 1229
1230 if (bio->bi_error) {
1231 ctx->error = bio->bi_error;
1232 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
1233 }
1234 kfree(octx);
1235 bio_put(bio);
1236
1237 clear_bit(FLUSH_PENDING, &device->flags);
1238 put_ldev(device);
1239 kref_put(&device->kref, drbd_destroy_device);
1240
1241 if (atomic_dec_and_test(&ctx->pending))
1242 complete(&ctx->done);
1243}
1244
1245static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1246{
1247 struct bio *bio = bio_alloc(GFP_NOIO, 0);
1248 struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1249 if (!bio || !octx) {
1250 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
1251 /* FIXME: what else can I do now? disconnecting or detaching
1252 * really does not help to improve the state of the world, either.
1253 */
1254 kfree(octx);
1255 if (bio)
1256 bio_put(bio);
1257
1258 ctx->error = -ENOMEM;
1259 put_ldev(device);
1260 kref_put(&device->kref, drbd_destroy_device);
1261 return;
1262 }
1263
1264 octx->device = device;
1265 octx->ctx = ctx;
1266 bio->bi_bdev = device->ldev->backing_bdev;
1267 bio->bi_private = octx;
1268 bio->bi_end_io = one_flush_endio;
1269 bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH);
1270
1271 device->flush_jif = jiffies;
1272 set_bit(FLUSH_PENDING, &device->flags);
1273 atomic_inc(&ctx->pending);
1274 submit_bio(bio);
1275}
1276
1277static void drbd_flush(struct drbd_connection *connection)
1278{
1213 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { 1279 if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1280 struct drbd_peer_device *peer_device;
1281 struct issue_flush_context ctx;
1282 int vnr;
1283
1284 atomic_set(&ctx.pending, 1);
1285 ctx.error = 0;
1286 init_completion(&ctx.done);
1287
1214 rcu_read_lock(); 1288 rcu_read_lock();
1215 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1289 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1216 struct drbd_device *device = peer_device->device; 1290 struct drbd_device *device = peer_device->device;
@@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection)
1220 kref_get(&device->kref); 1294 kref_get(&device->kref);
1221 rcu_read_unlock(); 1295 rcu_read_unlock();
1222 1296
1223 /* Right now, we have only this one synchronous code path 1297 submit_one_flush(device, &ctx);
1224 * for flushes between request epochs.
1225 * We may want to make those asynchronous,
1226 * or at least parallelize the flushes to the volume devices.
1227 */
1228 device->flush_jif = jiffies;
1229 set_bit(FLUSH_PENDING, &device->flags);
1230 rv = blkdev_issue_flush(device->ldev->backing_bdev,
1231 GFP_NOIO, NULL);
1232 clear_bit(FLUSH_PENDING, &device->flags);
1233 if (rv) {
1234 drbd_info(device, "local disk flush failed with status %d\n", rv);
1235 /* would rather check on EOPNOTSUPP, but that is not reliable.
1236 * don't try again for ANY return value != 0
1237 * if (rv == -EOPNOTSUPP) */
1238 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1239 }
1240 put_ldev(device);
1241 kref_put(&device->kref, drbd_destroy_device);
1242 1298
1243 rcu_read_lock(); 1299 rcu_read_lock();
1244 if (rv)
1245 break;
1246 } 1300 }
1247 rcu_read_unlock(); 1301 rcu_read_unlock();
1302
1303 /* Do we want to add a timeout,
1304 * if disk-timeout is set? */
1305 if (!atomic_dec_and_test(&ctx.pending))
1306 wait_for_completion(&ctx.done);
1307
1308 if (ctx.error) {
1309 /* would rather check on EOPNOTSUPP, but that is not reliable.
1310 * don't try again for ANY return value != 0
1311 * if (rv == -EOPNOTSUPP) */
1312 /* Any error is already reported by bio_endio callback. */
1313 drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1314 }
1248 } 1315 }
1249} 1316}
1250 1317
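
The accounting above follows the usual "bias the pending counter by one" idiom: ctx.pending starts at 1 for the submitter itself, each submit_one_flush() adds one and one_flush_endio() drops one, and drbd_flush() finally drops its own reference and only sleeps on the completion if some flush is still in flight. A stripped-down illustration of the same idiom (all names are illustrative):

struct flush_waiter {
	atomic_t pending;		/* starts at 1: the submitter's own reference */
	struct completion done;
};

static void one_done(struct flush_waiter *w)
{
	if (atomic_dec_and_test(&w->pending))
		complete(&w->done);	/* last completion wakes the waiter */
}

static void submit_all_and_wait(struct flush_waiter *w, int n)
{
	int i;

	atomic_set(&w->pending, 1);
	init_completion(&w->done);
	for (i = 0; i < n; i++) {
		atomic_inc(&w->pending);
		/* ... kick off asynchronous work that ends in one_done(w) ... */
	}
	/* drop the bias; sleep only if work is still outstanding */
	if (!atomic_dec_and_test(&w->pending))
		wait_for_completion(&w->done);
}
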
@@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
1379 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1446 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1380} 1447}
1381 1448
1449/*
1450 * We *may* ignore the discard-zeroes-data setting, if so configured.
1451 *
1452 * Assumption is that "discard_zeroes_data=0" is only because the backend
1453 * may ignore partial unaligned discards.
1454 *
1455 * LVM/DM thin as of at least
1456 * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
1457 * Library version: 1.02.93-RHEL7 (2015-01-28)
1458 * Driver version: 4.29.0
1459 * still behaves this way.
1460 *
1461 * For unaligned (wrt. alignment and granularity) or too small discards,
1462 * we zero-out the initial (and/or) trailing unaligned partial chunks,
1463 * but discard all the aligned full chunks.
1464 *
1465 * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
1466 */
1467int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
1468{
1469 struct block_device *bdev = device->ldev->backing_bdev;
1470 struct request_queue *q = bdev_get_queue(bdev);
1471 sector_t tmp, nr;
1472 unsigned int max_discard_sectors, granularity;
1473 int alignment;
1474 int err = 0;
1475
1476 if (!discard)
1477 goto zero_out;
1478
1479 /* Zero-sector (unknown) and one-sector granularities are the same. */
1480 granularity = max(q->limits.discard_granularity >> 9, 1U);
1481 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1482
1483 max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
1484 max_discard_sectors -= max_discard_sectors % granularity;
1485 if (unlikely(!max_discard_sectors))
1486 goto zero_out;
1487
1488 if (nr_sectors < granularity)
1489 goto zero_out;
1490
1491 tmp = start;
1492 if (sector_div(tmp, granularity) != alignment) {
1493 if (nr_sectors < 2*granularity)
1494 goto zero_out;
1495 /* start + gran - (start + gran - align) % gran */
1496 tmp = start + granularity - alignment;
1497 tmp = start + granularity - sector_div(tmp, granularity);
1498
1499 nr = tmp - start;
1500 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
1501 nr_sectors -= nr;
1502 start = tmp;
1503 }
1504 while (nr_sectors >= granularity) {
1505 nr = min_t(sector_t, nr_sectors, max_discard_sectors);
1506 err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
1507 nr_sectors -= nr;
1508 start += nr;
1509 }
1510 zero_out:
1511 if (nr_sectors) {
1512 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
1513 }
1514 return err != 0;
1515}
1516
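
Worked example of the split above: with a discard granularity of 8 sectors, alignment 0, a request starting at sector 5 covering 100 sectors, and max_discard_sectors clamped to 96, the unaligned head (sectors 5-7) is zeroed out, sectors 8-103 are discarded as one aligned chunk, and the single trailing sector 104 is zeroed out. A userspace-style sketch of the same head/body/tail split; the small-request guards and the sector_div() handling of the kernel code are simplified, and the numbers are only illustrative:

#include <stdio.h>

int main(void)
{
	unsigned long long start = 5, nr = 100;		/* sectors */
	unsigned long long gran = 8, align = 0, max = 96;

	if (start % gran != align) {
		/* zero out up to the next properly aligned boundary */
		unsigned long long head = gran - (start + gran - align) % gran;
		printf("zero-out head: %llu..%llu\n", start, start + head - 1);
		start += head;
		nr -= head;
	}
	while (nr >= gran) {
		/* discard aligned chunks, at most max sectors at a time */
		unsigned long long chunk = nr < max ? nr : max;
		printf("discard:       %llu..%llu\n", start, start + chunk - 1);
		start += chunk;
		nr -= chunk;
	}
	if (nr)	/* zero out whatever unaligned tail remains */
		printf("zero-out tail: %llu..%llu\n", start, start + nr - 1);
	return 0;
}
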
1517static bool can_do_reliable_discards(struct drbd_device *device)
1518{
1519 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
1520 struct disk_conf *dc;
1521 bool can_do;
1522
1523 if (!blk_queue_discard(q))
1524 return false;
1525
1526 if (q->limits.discard_zeroes_data)
1527 return true;
1528
1529 rcu_read_lock();
1530 dc = rcu_dereference(device->ldev->disk_conf);
1531 can_do = dc->discard_zeroes_if_aligned;
1532 rcu_read_unlock();
1533 return can_do;
1534}
1535
1536static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
1537{
1538 /* If the backend cannot discard, or does not guarantee
1539 * read-back zeroes in discarded ranges, we fall back to
1540 * zero-out. Unless configuration specifically requested
1541 * otherwise. */
1542 if (!can_do_reliable_discards(device))
1543 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
1544
1545 if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1546 peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
1547 peer_req->flags |= EE_WAS_ERROR;
1548 drbd_endio_write_sec_final(peer_req);
1549}
1550
1551static void drbd_issue_peer_wsame(struct drbd_device *device,
1552 struct drbd_peer_request *peer_req)
1553{
1554 struct block_device *bdev = device->ldev->backing_bdev;
1555 sector_t s = peer_req->i.sector;
1556 sector_t nr = peer_req->i.size >> 9;
1557 if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
1558 peer_req->flags |= EE_WAS_ERROR;
1559 drbd_endio_write_sec_final(peer_req);
1560}
1561
1562
1382/** 1563/**
1383 * drbd_submit_peer_request() 1564 * drbd_submit_peer_request()
1384 * @device: DRBD device. 1565 * @device: DRBD device.
@@ -1410,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
1410 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; 1591 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
1411 int err = -ENOMEM; 1592 int err = -ENOMEM;
1412 1593
1413 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { 1594 /* TRIM/DISCARD: for now, always use the helper function
1595 * blkdev_issue_zeroout(..., discard=true).
1596 * It's synchronous, but it does the right thing wrt. bio splitting.
1597 * Correctness first, performance later. Next step is to code an
1598 * asynchronous variant of the same.
1599 */
1600 if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
1414 /* wait for all pending IO completions, before we start 1601 /* wait for all pending IO completions, before we start
1415 * zeroing things out. */ 1602 * zeroing things out. */
1416 conn_wait_active_ee_empty(peer_req->peer_device->connection); 1603 conn_wait_active_ee_empty(peer_req->peer_device->connection);
@@ -1418,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device,
1418 * so we can find it to present it in debugfs */ 1605 * so we can find it to present it in debugfs */
1419 peer_req->submit_jif = jiffies; 1606 peer_req->submit_jif = jiffies;
1420 peer_req->flags |= EE_SUBMITTED; 1607 peer_req->flags |= EE_SUBMITTED;
1421 spin_lock_irq(&device->resource->req_lock); 1608
1422 list_add_tail(&peer_req->w.list, &device->active_ee); 1609 /* If this was a resync request from receive_rs_deallocated(),
1423 spin_unlock_irq(&device->resource->req_lock); 1610 * it is already on the sync_ee list */
1424 if (blkdev_issue_zeroout(device->ldev->backing_bdev, 1611 if (list_empty(&peer_req->w.list)) {
1425 sector, data_size >> 9, GFP_NOIO, false)) 1612 spin_lock_irq(&device->resource->req_lock);
1426 peer_req->flags |= EE_WAS_ERROR; 1613 list_add_tail(&peer_req->w.list, &device->active_ee);
1427 drbd_endio_write_sec_final(peer_req); 1614 spin_unlock_irq(&device->resource->req_lock);
1615 }
1616
1617 if (peer_req->flags & EE_IS_TRIM)
1618 drbd_issue_peer_discard(device, peer_req);
1619 else /* EE_WRITE_SAME */
1620 drbd_issue_peer_wsame(device, peer_req);
1428 return 0; 1621 return 0;
1429 } 1622 }
1430 1623
1431 /* Discards don't have any payload.
1432 * But the scsi layer still expects a bio_vec it can use internally,
1433 * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
1434 if (peer_req->flags & EE_IS_TRIM)
1435 nr_pages = 1;
1436
1437 /* In most cases, we will only need one bio. But in case the lower 1624 /* In most cases, we will only need one bio. But in case the lower
1438 * level restrictions happen to be different at this offset on this 1625 * level restrictions happen to be different at this offset on this
1439 * side than those of the sending peer, we may need to submit the 1626 * side than those of the sending peer, we may need to submit the
@@ -1459,11 +1646,6 @@ next_bio:
1459 bios = bio; 1646 bios = bio;
1460 ++n_bios; 1647 ++n_bios;
1461 1648
1462 if (op == REQ_OP_DISCARD) {
1463 bio->bi_iter.bi_size = data_size;
1464 goto submit;
1465 }
1466
1467 page_chain_for_each(page) { 1649 page_chain_for_each(page) {
1468 unsigned len = min_t(unsigned, data_size, PAGE_SIZE); 1650 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1469 if (!bio_add_page(bio, page, len, 0)) { 1651 if (!bio_add_page(bio, page, len, 0)) {
@@ -1485,7 +1667,6 @@ next_bio:
1485 --nr_pages; 1667 --nr_pages;
1486 } 1668 }
1487 D_ASSERT(device, data_size == 0); 1669 D_ASSERT(device, data_size == 0);
1488submit:
1489 D_ASSERT(device, page == NULL); 1670 D_ASSERT(device, page == NULL);
1490 1671
1491 atomic_set(&peer_req->pending_bios, n_bios); 1672 atomic_set(&peer_req->pending_bios, n_bios);
@@ -1609,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1609 return 0; 1790 return 0;
1610} 1791}
1611 1792
1793/* quick wrapper in case payload size != request_size (write same) */
1794static void drbd_csum_ee_size(struct crypto_ahash *h,
1795 struct drbd_peer_request *r, void *d,
1796 unsigned int payload_size)
1797{
1798 unsigned int tmp = r->i.size;
1799 r->i.size = payload_size;
1800 drbd_csum_ee(h, r, d);
1801 r->i.size = tmp;
1802}
1803
1612/* used from receive_RSDataReply (recv_resync_read) 1804/* used from receive_RSDataReply (recv_resync_read)
1613 * and from receive_Data */ 1805 * and from receive_Data.
1806 * data_size: actual payload ("data in")
1807 * for normal writes that is bi_size.
1808 * for discards, that is zero.
1809 * for write same, it is logical_block_size.
1810 * both trim and write same have the bi_size ("data len to be affected")
1811 * as extra argument in the packet header.
1812 */
1614static struct drbd_peer_request * 1813static struct drbd_peer_request *
1615read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 1814read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1616 struct packet_info *pi) __must_hold(local) 1815 struct packet_info *pi) __must_hold(local)
@@ -1625,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1625 void *dig_vv = peer_device->connection->int_dig_vv; 1824 void *dig_vv = peer_device->connection->int_dig_vv;
1626 unsigned long *data; 1825 unsigned long *data;
1627 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; 1826 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1827 struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
1628 1828
1629 digest_size = 0; 1829 digest_size = 0;
1630 if (!trim && peer_device->connection->peer_integrity_tfm) { 1830 if (!trim && peer_device->connection->peer_integrity_tfm) {
@@ -1639,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1639 data_size -= digest_size; 1839 data_size -= digest_size;
1640 } 1840 }
1641 1841
1842 /* assume request_size == data_size, but special case trim and wsame. */
1843 ds = data_size;
1642 if (trim) { 1844 if (trim) {
1643 D_ASSERT(peer_device, data_size == 0); 1845 if (!expect(data_size == 0))
1644 data_size = be32_to_cpu(trim->size); 1846 return NULL;
1847 ds = be32_to_cpu(trim->size);
1848 } else if (wsame) {
1849 if (data_size != queue_logical_block_size(device->rq_queue)) {
1850 drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1851 data_size, queue_logical_block_size(device->rq_queue));
1852 return NULL;
1853 }
1854 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1855 drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1856 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1857 return NULL;
1858 }
1859 ds = be32_to_cpu(wsame->size);
1645 } 1860 }
1646 1861
1647 if (!expect(IS_ALIGNED(data_size, 512))) 1862 if (!expect(IS_ALIGNED(ds, 512)))
1648 return NULL; 1863 return NULL;
1649 /* prepare for larger trim requests. */ 1864 if (trim || wsame) {
1650 if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) 1865 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1866 return NULL;
1867 } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
1651 return NULL; 1868 return NULL;
1652 1869
1653 /* even though we trust out peer, 1870 /* even though we trust out peer,
1654 * we sometimes have to double check. */ 1871 * we sometimes have to double check. */
1655 if (sector + (data_size>>9) > capacity) { 1872 if (sector + (ds>>9) > capacity) {
1656 drbd_err(device, "request from peer beyond end of local disk: " 1873 drbd_err(device, "request from peer beyond end of local disk: "
1657 "capacity: %llus < sector: %llus + size: %u\n", 1874 "capacity: %llus < sector: %llus + size: %u\n",
1658 (unsigned long long)capacity, 1875 (unsigned long long)capacity,
1659 (unsigned long long)sector, data_size); 1876 (unsigned long long)sector, ds);
1660 return NULL; 1877 return NULL;
1661 } 1878 }
1662 1879
1663 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1880 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1664 * "criss-cross" setup, that might cause write-out on some other DRBD, 1881 * "criss-cross" setup, that might cause write-out on some other DRBD,
1665 * which in turn might block on the other node at this very place. */ 1882 * which in turn might block on the other node at this very place. */
1666 peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); 1883 peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1667 if (!peer_req) 1884 if (!peer_req)
1668 return NULL; 1885 return NULL;
1669 1886
1670 peer_req->flags |= EE_WRITE; 1887 peer_req->flags |= EE_WRITE;
1671 if (trim) 1888 if (trim) {
1889 peer_req->flags |= EE_IS_TRIM;
1672 return peer_req; 1890 return peer_req;
1891 }
1892 if (wsame)
1893 peer_req->flags |= EE_WRITE_SAME;
1673 1894
1895 /* receive payload size bytes into page chain */
1674 ds = data_size; 1896 ds = data_size;
1675 page = peer_req->pages; 1897 page = peer_req->pages;
1676 page_chain_for_each(page) { 1898 page_chain_for_each(page) {
@@ -1690,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1690 } 1912 }
1691 1913
1692 if (digest_size) { 1914 if (digest_size) {
1693 drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); 1915 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1694 if (memcmp(dig_in, dig_vv, digest_size)) { 1916 if (memcmp(dig_in, dig_vv, digest_size)) {
1695 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", 1917 drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1696 (unsigned long long)sector, data_size); 1918 (unsigned long long)sector, data_size);
@@ -2067,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2067static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) 2289static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2068{ 2290{
2069 struct drbd_peer_request *rs_req; 2291 struct drbd_peer_request *rs_req;
2070 bool rv = 0; 2292 bool rv = false;
2071 2293
2072 spin_lock_irq(&device->resource->req_lock); 2294 spin_lock_irq(&device->resource->req_lock);
2073 list_for_each_entry(rs_req, &device->sync_ee, w.list) { 2295 list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2074 if (overlaps(peer_req->i.sector, peer_req->i.size, 2296 if (overlaps(peer_req->i.sector, peer_req->i.size,
2075 rs_req->i.sector, rs_req->i.size)) { 2297 rs_req->i.sector, rs_req->i.size)) {
2076 rv = 1; 2298 rv = true;
2077 break; 2299 break;
2078 } 2300 }
2079 } 2301 }
@@ -2354,10 +2576,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2354 op = wire_flags_to_bio_op(dp_flags); 2576 op = wire_flags_to_bio_op(dp_flags);
2355 op_flags = wire_flags_to_bio_flags(dp_flags); 2577 op_flags = wire_flags_to_bio_flags(dp_flags);
2356 if (pi->cmd == P_TRIM) { 2578 if (pi->cmd == P_TRIM) {
2357 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
2358 peer_req->flags |= EE_IS_TRIM;
2359 if (!blk_queue_discard(q))
2360 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
2361 D_ASSERT(peer_device, peer_req->i.size > 0); 2579 D_ASSERT(peer_device, peer_req->i.size > 0);
2362 D_ASSERT(peer_device, op == REQ_OP_DISCARD); 2580 D_ASSERT(peer_device, op == REQ_OP_DISCARD);
2363 D_ASSERT(peer_device, peer_req->pages == NULL); 2581 D_ASSERT(peer_device, peer_req->pages == NULL);
@@ -2424,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2424 update_peer_seq(peer_device, peer_seq); 2642 update_peer_seq(peer_device, peer_seq);
2425 spin_lock_irq(&device->resource->req_lock); 2643 spin_lock_irq(&device->resource->req_lock);
2426 } 2644 }
2427 /* if we use the zeroout fallback code, we process synchronously 2645 /* TRIM and WRITE_SAME are processed synchronously,
2428 * and we wait for all pending requests, respectively wait for 2646 * we wait for all pending requests, respectively wait for
2429 * active_ee to become empty in drbd_submit_peer_request(); 2647 * active_ee to become empty in drbd_submit_peer_request();
2430 * better not add ourselves here. */ 2648 * better not add ourselves here. */
2431 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) 2649 if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
2432 list_add_tail(&peer_req->w.list, &device->active_ee); 2650 list_add_tail(&peer_req->w.list, &device->active_ee);
2433 spin_unlock_irq(&device->resource->req_lock); 2651 spin_unlock_irq(&device->resource->req_lock);
2434 2652
@@ -2460,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2460 } 2678 }
2461 2679
2462out_interrupted: 2680out_interrupted:
2463 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); 2681 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
2464 put_ldev(device); 2682 put_ldev(device);
2465 drbd_free_peer_req(device, peer_req); 2683 drbd_free_peer_req(device, peer_req);
2466 return err; 2684 return err;
@@ -2585,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2585 case P_DATA_REQUEST: 2803 case P_DATA_REQUEST:
2586 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); 2804 drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2587 break; 2805 break;
2806 case P_RS_THIN_REQ:
2588 case P_RS_DATA_REQUEST: 2807 case P_RS_DATA_REQUEST:
2589 case P_CSUM_RS_REQUEST: 2808 case P_CSUM_RS_REQUEST:
2590 case P_OV_REQUEST: 2809 case P_OV_REQUEST:
@@ -2610,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2610 * "criss-cross" setup, that might cause write-out on some other DRBD, 2829 * "criss-cross" setup, that might cause write-out on some other DRBD,
2611 * which in turn might block on the other node at this very place. */ 2830 * which in turn might block on the other node at this very place. */
2612 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, 2831 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2613 true /* has real payload */, GFP_NOIO); 2832 size, GFP_NOIO);
2614 if (!peer_req) { 2833 if (!peer_req) {
2615 put_ldev(device); 2834 put_ldev(device);
2616 return -ENOMEM; 2835 return -ENOMEM;
@@ -2624,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2624 peer_req->flags |= EE_APPLICATION; 2843 peer_req->flags |= EE_APPLICATION;
2625 goto submit; 2844 goto submit;
2626 2845
2846 case P_RS_THIN_REQ:
2847 /* If at some point in the future we have a smart way to
2848 find out if this data block is completely deallocated,
2849 then we would do something smarter here than reading
2850 the block... */
2851 peer_req->flags |= EE_RS_THIN_REQ;
2627 case P_RS_DATA_REQUEST: 2852 case P_RS_DATA_REQUEST:
2628 peer_req->w.cb = w_e_end_rsdata_req; 2853 peer_req->w.cb = w_e_end_rsdata_req;
2629 fault_type = DRBD_FAULT_RS_RD; 2854 fault_type = DRBD_FAULT_RS_RD;
@@ -2969,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
2969-1091 requires proto 91 3194-1091 requires proto 91
2970-1096 requires proto 96 3195-1096 requires proto 96
2971 */ 3196 */
2972static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) 3197
3198static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
2973{ 3199{
2974 struct drbd_peer_device *const peer_device = first_peer_device(device); 3200 struct drbd_peer_device *const peer_device = first_peer_device(device);
2975 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 3201 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
@@ -3049,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m
3049 * next bit (weight 2) is set when peer was primary */ 3275 * next bit (weight 2) is set when peer was primary */
3050 *rule_nr = 40; 3276 *rule_nr = 40;
3051 3277
3278 /* Neither has the "crashed primary" flag set,
3279 * only a replication link hiccup. */
3280 if (rct == 0)
3281 return 0;
3282
3283 /* Current UUID equal and no bitmap uuid; does not necessarily
3284 * mean this was a "simultaneous hard crash", maybe IO was
3285 * frozen, so no UUID-bump happened.
3286 * This is a protocol change, overload DRBD_FF_WSAME as flag
3287 * for "new-enough" peer DRBD version. */
3288 if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
3289 *rule_nr = 41;
3290 if (!(connection->agreed_features & DRBD_FF_WSAME)) {
3291 drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
3292 return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
3293 }
3294 if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
3295 /* At least one has the "crashed primary" bit set,
3296 * both are primary now, but neither has rotated its UUIDs?
3297 * "Can not happen." */
3298 drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
3299 return -100;
3300 }
3301 if (device->state.role == R_PRIMARY)
3302 return 1;
3303 return -1;
3304 }
3305
3306 /* Both are secondary.
3307 * Really looks like recovery from simultaneous hard crash.
3308 * Check which had been primary before, and arbitrate. */
3052 switch (rct) { 3309 switch (rct) {
3053 case 0: /* !self_pri && !peer_pri */ return 0; 3310 case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
3054 case 1: /* self_pri && !peer_pri */ return 1; 3311 case 1: /* self_pri && !peer_pri */ return 1;
3055 case 2: /* !self_pri && peer_pri */ return -1; 3312 case 2: /* !self_pri && peer_pri */ return -1;
3056 case 3: /* self_pri && peer_pri */ 3313 case 3: /* self_pri && peer_pri */
@@ -3177,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3177 drbd_uuid_dump(device, "peer", device->p_uuid, 3434 drbd_uuid_dump(device, "peer", device->p_uuid,
3178 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); 3435 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3179 3436
3180 hg = drbd_uuid_compare(device, &rule_nr); 3437 hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3181 spin_unlock_irq(&device->ldev->md.uuid_lock); 3438 spin_unlock_irq(&device->ldev->md.uuid_lock);
3182 3439
3183 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); 3440 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
@@ -3186,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3186 drbd_alert(device, "Unrelated data, aborting!\n"); 3443 drbd_alert(device, "Unrelated data, aborting!\n");
3187 return C_MASK; 3444 return C_MASK;
3188 } 3445 }
3446 if (hg < -0x10000) {
3447 int proto, fflags;
3448 hg = -hg;
3449 proto = hg & 0xff;
3450 fflags = (hg >> 8) & 0xff;
3451 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3452 proto, fflags);
3453 return C_MASK;
3454 }
3189 if (hg < -1000) { 3455 if (hg < -1000) {
3190 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); 3456 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3191 return C_MASK; 3457 return C_MASK;
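
The new rule-41 return value packs both the required protocol version and the required feature-flag mask into a single negative integer, which the (hg < -0x10000) branch above unpacks again. For instance, assuming PRO_VERSION_MAX were 101, -(0x10000 | 101 | (DRBD_FF_WSAME << 8)) is -66661; negating and masking gives back 101 for the protocol and 0x04 == DRBD_FF_WSAME for the flags. A minimal round-trip sketch (the concrete version number is an assumption used only to make the arithmetic concrete):

static void rule41_roundtrip_example(void)
{
	/* encode: "need at least protocol 101 plus DRBD_FF_WSAME" */
	int encoded = -(0x10000 | 101 | (DRBD_FF_WSAME << 8));	/* == -66661 */
	int v, proto, fflags;

	if (encoded < -0x10000) {
		v = -encoded;
		proto  = v & 0xff;		/* 101 */
		fflags = (v >> 8) & 0xff;	/* 0x04 == DRBD_FF_WSAME */
	}
}
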
@@ -3415,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
3415 */ 3681 */
3416 3682
3417 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); 3683 peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3418 if (!peer_integrity_tfm) { 3684 if (IS_ERR(peer_integrity_tfm)) {
3685 peer_integrity_tfm = NULL;
3419 drbd_err(connection, "peer data-integrity-alg %s not supported\n", 3686 drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3420 integrity_alg); 3687 integrity_alg);
3421 goto disconnect; 3688 goto disconnect;
@@ -3766,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3766 struct drbd_peer_device *peer_device; 4033 struct drbd_peer_device *peer_device;
3767 struct drbd_device *device; 4034 struct drbd_device *device;
3768 struct p_sizes *p = pi->data; 4035 struct p_sizes *p = pi->data;
4036 struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
3769 enum determine_dev_size dd = DS_UNCHANGED; 4037 enum determine_dev_size dd = DS_UNCHANGED;
3770 sector_t p_size, p_usize, p_csize, my_usize; 4038 sector_t p_size, p_usize, p_csize, my_usize;
3771 int ldsc = 0; /* local disk size changed */ 4039 int ldsc = 0; /* local disk size changed */
@@ -3785,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3785 device->p_size = p_size; 4053 device->p_size = p_size;
3786 4054
3787 if (get_ldev(device)) { 4055 if (get_ldev(device)) {
4056 sector_t new_size, cur_size;
3788 rcu_read_lock(); 4057 rcu_read_lock();
3789 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; 4058 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
3790 rcu_read_unlock(); 4059 rcu_read_unlock();
@@ -3801,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3801 4070
3802 /* Never shrink a device with usable data during connect. 4071 /* Never shrink a device with usable data during connect.
3803 But allow online shrinking if we are connected. */ 4072 But allow online shrinking if we are connected. */
3804 if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < 4073 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
3805 drbd_get_capacity(device->this_bdev) && 4074 cur_size = drbd_get_capacity(device->this_bdev);
4075 if (new_size < cur_size &&
3806 device->state.disk >= D_OUTDATED && 4076 device->state.disk >= D_OUTDATED &&
3807 device->state.conn < C_CONNECTED) { 4077 device->state.conn < C_CONNECTED) {
3808 drbd_err(device, "The peer's disk size is too small!\n"); 4078 drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4079 (unsigned long long)new_size, (unsigned long long)cur_size);
3809 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); 4080 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3810 put_ldev(device); 4081 put_ldev(device);
3811 return -EIO; 4082 return -EIO;
@@ -3839,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3839 } 4110 }
3840 4111
3841 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 4112 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3842 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). 4113 /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
3843 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 4114 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3844 drbd_reconsider_max_bio_size(), we can be sure that after 4115 drbd_reconsider_queue_parameters(), we can be sure that after
3845 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ 4116 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
3846 4117
3847 ddsf = be16_to_cpu(p->dds_flags); 4118 ddsf = be16_to_cpu(p->dds_flags);
3848 if (get_ldev(device)) { 4119 if (get_ldev(device)) {
3849 drbd_reconsider_max_bio_size(device, device->ldev); 4120 drbd_reconsider_queue_parameters(device, device->ldev, o);
3850 dd = drbd_determine_dev_size(device, ddsf, NULL); 4121 dd = drbd_determine_dev_size(device, ddsf, NULL);
3851 put_ldev(device); 4122 put_ldev(device);
3852 if (dd == DS_ERROR) 4123 if (dd == DS_ERROR)
@@ -3866,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3866 * However, if he sends a zero current size, 4137 * However, if he sends a zero current size,
3867 * take his (user-capped or) backing disk size anyways. 4138 * take his (user-capped or) backing disk size anyways.
3868 */ 4139 */
3869 drbd_reconsider_max_bio_size(device, NULL); 4140 drbd_reconsider_queue_parameters(device, NULL, o);
3870 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); 4141 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
3871 } 4142 }
3872 4143
@@ -4599,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
4599 return 0; 4870 return 0;
4600} 4871}
4601 4872
4873static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4874{
4875 struct drbd_peer_device *peer_device;
4876 struct p_block_desc *p = pi->data;
4877 struct drbd_device *device;
4878 sector_t sector;
4879 int size, err = 0;
4880
4881 peer_device = conn_peer_device(connection, pi->vnr);
4882 if (!peer_device)
4883 return -EIO;
4884 device = peer_device->device;
4885
4886 sector = be64_to_cpu(p->sector);
4887 size = be32_to_cpu(p->blksize);
4888
4889 dec_rs_pending(device);
4890
4891 if (get_ldev(device)) {
4892 struct drbd_peer_request *peer_req;
4893 const int op = REQ_OP_DISCARD;
4894
4895 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4896 size, 0, GFP_NOIO);
4897 if (!peer_req) {
4898 put_ldev(device);
4899 return -ENOMEM;
4900 }
4901
4902 peer_req->w.cb = e_end_resync_block;
4903 peer_req->submit_jif = jiffies;
4904 peer_req->flags |= EE_IS_TRIM;
4905
4906 spin_lock_irq(&device->resource->req_lock);
4907 list_add_tail(&peer_req->w.list, &device->sync_ee);
4908 spin_unlock_irq(&device->resource->req_lock);
4909
4910 atomic_add(pi->size >> 9, &device->rs_sect_ev);
4911 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
4912
4913 if (err) {
4914 spin_lock_irq(&device->resource->req_lock);
4915 list_del(&peer_req->w.list);
4916 spin_unlock_irq(&device->resource->req_lock);
4917
4918 drbd_free_peer_req(device, peer_req);
4919 put_ldev(device);
4920 err = 0;
4921 goto fail;
4922 }
4923
4924 inc_unacked(device);
4925
4926 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4927 as well as drbd_rs_complete_io() */
4928 } else {
4929 fail:
4930 drbd_rs_complete_io(device, sector);
4931 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
4932 }
4933
4934 atomic_add(size >> 9, &device->rs_sect_in);
4935
4936 return err;
4937}
4938
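The new receive_rs_deallocated() handler above starts by pulling the sector and block size out of the wire packet, which carries them in network byte order, before queueing a local discard for that range. A minimal user-space sketch of that decoding step, using endian.h helpers in place of the kernel's be64_to_cpu()/be32_to_cpu(); the struct layout here is illustrative only (the real p_block_desc also carries padding):

/* Illustrative sketch, not the DRBD wire structure. */
#include <endian.h>
#include <inttypes.h>
#include <stdio.h>

struct p_block_desc_wire {
	uint64_t sector;	/* big endian on the wire */
	uint32_t blksize;	/* big endian on the wire */
};

int main(void)
{
	struct p_block_desc_wire p = {
		.sector  = htobe64(123456789ULL),
		.blksize = htobe32(1u << 20),
	};
	uint64_t sector = be64toh(p.sector);	/* kernel: be64_to_cpu() */
	uint32_t size   = be32toh(p.blksize);	/* kernel: be32_to_cpu() */

	printf("sector=%" PRIu64 " size=%" PRIu32 "\n", sector, size);
	return 0;
}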
4602struct data_cmd { 4939struct data_cmd {
4603 int expect_payload; 4940 int expect_payload;
4604 size_t pkt_size; 4941 unsigned int pkt_size;
4605 int (*fn)(struct drbd_connection *, struct packet_info *); 4942 int (*fn)(struct drbd_connection *, struct packet_info *);
4606}; 4943};
4607 4944
@@ -4626,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = {
4626 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, 4963 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4627 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4964 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4628 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, 4965 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4966 [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4629 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, 4967 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
4630 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, 4968 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4631 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 4969 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4632 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 4970 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4633 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, 4971 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
4972 [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
4973 [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
4634}; 4974};
4635 4975
4636static void drbdd(struct drbd_connection *connection) 4976static void drbdd(struct drbd_connection *connection)
@@ -4640,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection)
4640 int err; 4980 int err;
4641 4981
4642 while (get_t_state(&connection->receiver) == RUNNING) { 4982 while (get_t_state(&connection->receiver) == RUNNING) {
4643 struct data_cmd *cmd; 4983 struct data_cmd const *cmd;
4644 4984
4645 drbd_thread_current_set_cpu(&connection->receiver); 4985 drbd_thread_current_set_cpu(&connection->receiver);
4646 update_receiver_timing_details(connection, drbd_recv_header); 4986 update_receiver_timing_details(connection, drbd_recv_header);
@@ -4655,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection)
4655 } 4995 }
4656 4996
4657 shs = cmd->pkt_size; 4997 shs = cmd->pkt_size;
4998 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
4999 shs += sizeof(struct o_qlim);
4658 if (pi.size > shs && !cmd->expect_payload) { 5000 if (pi.size > shs && !cmd->expect_payload) {
4659 drbd_err(connection, "No payload expected %s l:%d\n", 5001 drbd_err(connection, "No payload expected %s l:%d\n",
4660 cmdname(pi.cmd), pi.size); 5002 cmdname(pi.cmd), pi.size);
4661 goto err_out; 5003 goto err_out;
4662 } 5004 }
5005 if (pi.size < shs) {
5006 drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5007 cmdname(pi.cmd), (int)shs, pi.size);
5008 goto err_out;
5009 }
4663 5010
4664 if (shs) { 5011 if (shs) {
4665 update_receiver_timing_details(connection, drbd_recv_all_warn); 5012 update_receiver_timing_details(connection, drbd_recv_all_warn);
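The added length checks above guard the receive path in both directions: a command that carries no payload must not arrive with extra bytes, and a packet shorter than its fixed header part is rejected outright (with P_SIZES growing by sizeof(struct o_qlim) once the WRITE_SAME feature is agreed). A small stand-alone sketch of the same check, with hypothetical names rather than the DRBD structures:

/* Hypothetical sketch of the header-size validation added above. */
#include <stdbool.h>
#include <stdio.h>

static bool packet_size_ok(unsigned int received, unsigned int expected,
			   bool has_payload)
{
	if (received > expected && !has_payload)
		return false;	/* no payload expected, but one arrived */
	if (received < expected)
		return false;	/* truncated fixed-size header */
	return true;
}

int main(void)
{
	printf("%d\n", packet_size_ok(24, 24, false));	/* 1: exact size */
	printf("%d\n", packet_size_ok(16, 24, false));	/* 0: too short */
	printf("%d\n", packet_size_ok(32, 24, false));	/* 0: unexpected payload */
	return 0;
}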
@@ -4795,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
4795 5142
4796 drbd_md_sync(device); 5143 drbd_md_sync(device);
4797 5144
4798 /* serialize with bitmap writeout triggered by the state change, 5145 if (get_ldev(device)) {
4799 * if any. */ 5146 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
4800 wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); 5147 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5148 put_ldev(device);
5149 }
4801 5150
4802 /* tcp_close and release of sendpage pages can be deferred. I don't 5151 /* tcp_close and release of sendpage pages can be deferred. I don't
4803 * want to use SO_LINGER, because apparently it can be deferred for 5152 * want to use SO_LINGER, because apparently it can be deferred for
@@ -4904,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection)
4904 drbd_info(connection, "Handshake successful: " 5253 drbd_info(connection, "Handshake successful: "
4905 "Agreed network protocol version %d\n", connection->agreed_pro_version); 5254 "Agreed network protocol version %d\n", connection->agreed_pro_version);
4906 5255
4907 drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", 5256 drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
4908 connection->agreed_features & FF_TRIM ? " " : " not "); 5257 connection->agreed_features,
5258 connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5259 connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5260 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
5261 connection->agreed_features ? "" : " none");
4909 5262
4910 return 1; 5263 return 1;
4911 5264
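The reworked handshake message above reports the whole agreed feature bitmask instead of only the TRIM bit. A user-space sketch of the same formatting idea follows; the flag values are placeholders, not the real DRBD_FF_* definitions:

/* Illustrative sketch of the feature-flag log message. */
#include <stdio.h>

#define FF_TRIM		1u
#define FF_THIN_RESYNC	2u
#define FF_WSAME	4u

static void print_features(unsigned int features)
{
	printf("Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
	       features,
	       features & FF_TRIM ? " TRIM" : "",
	       features & FF_THIN_RESYNC ? " THIN_RESYNC" : "",
	       features & FF_WSAME ? " WRITE_SAME" : "",
	       features ? "" : " none");
}

int main(void)
{
	print_features(FF_TRIM | FF_THIN_RESYNC);	/* 0x3 TRIM THIN_RESYNC */
	print_features(0);				/* 0x0 none */
	return 0;
}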
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index eef6e9575b4e..66b8e4bb74d8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
47 &device->vdisk->part0, req->start_jif); 47 &device->vdisk->part0, req->start_jif);
48} 48}
49 49
50static struct drbd_request *drbd_req_new(struct drbd_device *device, 50static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
51 struct bio *bio_src)
52{ 51{
53 struct drbd_request *req; 52 struct drbd_request *req;
54 53
@@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
58 memset(req, 0, sizeof(*req)); 57 memset(req, 0, sizeof(*req));
59 58
60 drbd_req_make_private_bio(req, bio_src); 59 drbd_req_make_private_bio(req, bio_src);
61 req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; 60 req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
62 req->device = device; 61 | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
63 req->master_bio = bio_src; 62 | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
64 req->epoch = 0; 63 req->device = device;
64 req->master_bio = bio_src;
65 req->epoch = 0;
65 66
66 drbd_clear_interval(&req->i); 67 drbd_clear_interval(&req->i);
67 req->i.sector = bio_src->bi_iter.bi_sector; 68 req->i.sector = bio_src->bi_iter.bi_sector;
@@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
218{ 219{
219 const unsigned s = req->rq_state; 220 const unsigned s = req->rq_state;
220 struct drbd_device *device = req->device; 221 struct drbd_device *device = req->device;
221 int rw;
222 int error, ok; 222 int error, ok;
223 223
224 /* we must not complete the master bio, while it is 224 /* we must not complete the master bio, while it is
@@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
242 return; 242 return;
243 } 243 }
244 244
245 rw = bio_rw(req->master_bio);
246
247 /* 245 /*
248 * figure out whether to report success or failure. 246 * figure out whether to report success or failure.
249 * 247 *
@@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
267 * epoch number. If they match, increase the current_tle_nr, 265 * epoch number. If they match, increase the current_tle_nr,
268 * and reset the transfer log epoch write_cnt. 266 * and reset the transfer log epoch write_cnt.
269 */ 267 */
270 if (rw == WRITE && 268 if (op_is_write(bio_op(req->master_bio)) &&
271 req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) 269 req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
272 start_new_tl_epoch(first_peer_device(device)->connection); 270 start_new_tl_epoch(first_peer_device(device)->connection);
273 271
@@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
284 * because no path was available, in which case 282 * because no path was available, in which case
285 * it was not even added to the transfer_log. 283 * it was not even added to the transfer_log.
286 * 284 *
287 * READA may fail, and will not be retried. 285 * read-ahead may fail, and will not be retried.
288 * 286 *
289 * WRITE should have used all available paths already. 287 * WRITE should have used all available paths already.
290 */ 288 */
291 if (!ok && rw == READ && !list_empty(&req->tl_requests)) 289 if (!ok &&
290 bio_op(req->master_bio) == REQ_OP_READ &&
291 !(req->master_bio->bi_rw & REQ_RAHEAD) &&
292 !list_empty(&req->tl_requests))
292 req->rq_state |= RQ_POSTPONED; 293 req->rq_state |= RQ_POSTPONED;
293 294
294 if (!(req->rq_state & RQ_POSTPONED)) { 295 if (!(req->rq_state & RQ_POSTPONED)) {
@@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
644 __drbd_chk_io_error(device, DRBD_READ_ERROR); 645 __drbd_chk_io_error(device, DRBD_READ_ERROR);
645 /* fall through. */ 646 /* fall through. */
646 case READ_AHEAD_COMPLETED_WITH_ERROR: 647 case READ_AHEAD_COMPLETED_WITH_ERROR:
647 /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ 648 /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */
648 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 649 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
649 break; 650 break;
650 651
@@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
656 break; 657 break;
657 658
658 case QUEUE_FOR_NET_READ: 659 case QUEUE_FOR_NET_READ:
659 /* READ or READA, and 660 /* READ, and
660 * no local disk, 661 * no local disk,
661 * or target area marked as invalid, 662 * or target area marked as invalid,
662 * or just got an io-error. */ 663 * or just got an io-error. */
@@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req)
977 sector_t sector = req->i.sector; 978 sector_t sector = req->i.sector;
978 int size = req->i.size; 979 int size = req->i.size;
979 980
980 i = drbd_find_overlap(&device->write_requests, sector, size);
981 if (!i)
982 return;
983
984 for (;;) { 981 for (;;) {
985 prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 982 drbd_for_each_overlap(i, &device->write_requests, sector, size) {
986 i = drbd_find_overlap(&device->write_requests, sector, size); 983 /* Ignore, if already completed to upper layers. */
987 if (!i) 984 if (i->completed)
985 continue;
986 /* Handle the first found overlap. After the schedule
987 * we have to restart the tree walk. */
988 break;
989 }
990 if (!i) /* if any */
988 break; 991 break;
992
989 /* Indicate to wake up device->misc_wait on progress. */ 993 /* Indicate to wake up device->misc_wait on progress. */
994 prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
990 i->waiting = true; 995 i->waiting = true;
991 spin_unlock_irq(&device->resource->req_lock); 996 spin_unlock_irq(&device->resource->req_lock);
992 schedule(); 997 schedule();
@@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req)
995 finish_wait(&device->misc_wait, &wait); 1000 finish_wait(&device->misc_wait, &wait);
996} 1001}
997 1002
998/* called within req_lock and rcu_read_lock() */ 1003/* called within req_lock */
999static void maybe_pull_ahead(struct drbd_device *device) 1004static void maybe_pull_ahead(struct drbd_device *device)
1000{ 1005{
1001 struct drbd_connection *connection = first_peer_device(device)->connection; 1006 struct drbd_connection *connection = first_peer_device(device)->connection;
@@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req)
1152 return remote; 1157 return remote;
1153} 1158}
1154 1159
1160static void drbd_process_discard_req(struct drbd_request *req)
1161{
1162 int err = drbd_issue_discard_or_zero_out(req->device,
1163 req->i.sector, req->i.size >> 9, true);
1164
1165 if (err)
1166 req->private_bio->bi_error = -EIO;
1167 bio_endio(req->private_bio);
1168}
1169
1155static void 1170static void
1156drbd_submit_req_private_bio(struct drbd_request *req) 1171drbd_submit_req_private_bio(struct drbd_request *req)
1157{ 1172{
1158 struct drbd_device *device = req->device; 1173 struct drbd_device *device = req->device;
1159 struct bio *bio = req->private_bio; 1174 struct bio *bio = req->private_bio;
1160 const int rw = bio_rw(bio); 1175 unsigned int type;
1176
1177 if (bio_op(bio) != REQ_OP_READ)
1178 type = DRBD_FAULT_DT_WR;
1179 else if (bio->bi_rw & REQ_RAHEAD)
1180 type = DRBD_FAULT_DT_RA;
1181 else
1182 type = DRBD_FAULT_DT_RD;
1161 1183
1162 bio->bi_bdev = device->ldev->backing_bdev; 1184 bio->bi_bdev = device->ldev->backing_bdev;
1163 1185
@@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1167 * stable storage, and this is a WRITE, we may not even submit 1189 * stable storage, and this is a WRITE, we may not even submit
1168 * this bio. */ 1190 * this bio. */
1169 if (get_ldev(device)) { 1191 if (get_ldev(device)) {
1170 if (drbd_insert_fault(device, 1192 if (drbd_insert_fault(device, type))
1171 rw == WRITE ? DRBD_FAULT_DT_WR
1172 : rw == READ ? DRBD_FAULT_DT_RD
1173 : DRBD_FAULT_DT_RA))
1174 bio_io_error(bio); 1193 bio_io_error(bio);
1194 else if (bio_op(bio) == REQ_OP_DISCARD)
1195 drbd_process_discard_req(req);
1175 else 1196 else
1176 generic_make_request(bio); 1197 generic_make_request(bio);
1177 put_ldev(device); 1198 put_ldev(device);
@@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
1223 /* Update disk stats */ 1244 /* Update disk stats */
1224 _drbd_start_io_acct(device, req); 1245 _drbd_start_io_acct(device, req);
1225 1246
1247 /* process discards always from our submitter thread */
 1248 if (bio_op(bio) == REQ_OP_DISCARD)
1249 goto queue_for_submitter_thread;
1250
1226 if (rw == WRITE && req->private_bio && req->i.size 1251 if (rw == WRITE && req->private_bio && req->i.size
1227 && !test_bit(AL_SUSPENDED, &device->flags)) { 1252 && !test_bit(AL_SUSPENDED, &device->flags)) {
1228 if (!drbd_al_begin_io_fastpath(device, &req->i)) { 1253 if (!drbd_al_begin_io_fastpath(device, &req->i))
1229 atomic_inc(&device->ap_actlog_cnt); 1254 goto queue_for_submitter_thread;
1230 drbd_queue_write(device, req);
1231 return NULL;
1232 }
1233 req->rq_state |= RQ_IN_ACT_LOG; 1255 req->rq_state |= RQ_IN_ACT_LOG;
1234 req->in_actlog_jif = jiffies; 1256 req->in_actlog_jif = jiffies;
1235 } 1257 }
1236
1237 return req; 1258 return req;
1259
1260 queue_for_submitter_thread:
1261 atomic_inc(&device->ap_actlog_cnt);
1262 drbd_queue_write(device, req);
1263 return NULL;
1264}
1265
1266/* Require at least one path to current data.
1267 * We don't want to allow writes on C_STANDALONE D_INCONSISTENT:
 1268 * We would not be able to read back what was written,
1269 * we would not have bumped the data generation uuids,
1270 * we would cause data divergence for all the wrong reasons.
1271 *
1272 * If we don't see at least one D_UP_TO_DATE, we will fail this request,
1273 * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO,
1274 * and queues for retry later.
1275 */
1276static bool may_do_writes(struct drbd_device *device)
1277{
1278 const union drbd_dev_state s = device->state;
1279 return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE;
1238} 1280}
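may_do_writes() above allows a write only while at least one copy of the data, local disk or peer disk, is UpToDate, so a Primary cannot silently diverge its data set. A trivial user-space sketch of the rule, with an illustrative disk-state enum rather than DRBD's:

/* Illustrative sketch of the write-permission rule. */
#include <stdbool.h>
#include <stdio.h>

enum disk_state { D_DISKLESS, D_INCONSISTENT, D_OUTDATED, D_UP_TO_DATE };

static bool may_do_writes(enum disk_state local, enum disk_state peer)
{
	return local == D_UP_TO_DATE || peer == D_UP_TO_DATE;
}

int main(void)
{
	/* No current copy anywhere: writing would only create divergence. */
	printf("%d\n", may_do_writes(D_INCONSISTENT, D_DISKLESS));	/* 0 */
	/* Local disk is current: writes are fine even without a peer. */
	printf("%d\n", may_do_writes(D_UP_TO_DATE, D_DISKLESS));	/* 1 */
	return 0;
}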
1239 1281
1240static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) 1282static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
1241{ 1283{
1242 struct drbd_resource *resource = device->resource; 1284 struct drbd_resource *resource = device->resource;
1243 const int rw = bio_rw(req->master_bio); 1285 const int rw = bio_data_dir(req->master_bio);
1244 struct bio_and_error m = { NULL, }; 1286 struct bio_and_error m = { NULL, };
1245 bool no_remote = false; 1287 bool no_remote = false;
1246 bool submit_private_bio = false; 1288 bool submit_private_bio = false;
@@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1270 goto out; 1312 goto out;
1271 } 1313 }
1272 1314
1273 /* We fail READ/READA early, if we can not serve it. 1315 /* We fail READ early, if we can not serve it.
1274 * We must do this before req is registered on any lists. 1316 * We must do this before req is registered on any lists.
1275 * Otherwise, drbd_req_complete() will queue failed READ for retry. */ 1317 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
1276 if (rw != WRITE) { 1318 if (rw != WRITE) {
@@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
1291 } 1333 }
1292 1334
1293 if (rw == WRITE) { 1335 if (rw == WRITE) {
1336 if (req->private_bio && !may_do_writes(device)) {
1337 bio_put(req->private_bio);
1338 req->private_bio = NULL;
1339 put_ldev(device);
1340 goto nodata;
1341 }
1294 if (!drbd_process_write_request(req)) 1342 if (!drbd_process_write_request(req))
1295 no_remote = true; 1343 no_remote = true;
1296 } else { 1344 } else {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index bb2ef78165e5..eb49e7f2da91 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -206,6 +206,8 @@ enum drbd_req_state_bits {
206 206
207 /* Set when this is a write, clear for a read */ 207 /* Set when this is a write, clear for a read */
208 __RQ_WRITE, 208 __RQ_WRITE,
209 __RQ_WSAME,
210 __RQ_UNMAP,
209 211
210 /* Should call drbd_al_complete_io() for this request... */ 212 /* Should call drbd_al_complete_io() for this request... */
211 __RQ_IN_ACT_LOG, 213 __RQ_IN_ACT_LOG,
@@ -241,10 +243,11 @@ enum drbd_req_state_bits {
241#define RQ_NET_OK (1UL << __RQ_NET_OK) 243#define RQ_NET_OK (1UL << __RQ_NET_OK)
242#define RQ_NET_SIS (1UL << __RQ_NET_SIS) 244#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
243 245
244/* 0x1f8 */
245#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) 246#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
246 247
247#define RQ_WRITE (1UL << __RQ_WRITE) 248#define RQ_WRITE (1UL << __RQ_WRITE)
249#define RQ_WSAME (1UL << __RQ_WSAME)
250#define RQ_UNMAP (1UL << __RQ_UNMAP)
248#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) 251#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
249#define RQ_POSTPONED (1UL << __RQ_POSTPONED) 252#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
250#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) 253#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 5a7ef7873b67..eea0c4aec978 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
814 } 814 }
815 815
816 if (rv <= 0) 816 if (rv <= 0)
817 /* already found a reason to abort */; 817 goto out; /* already found a reason to abort */
818 else if (ns.role == R_SECONDARY && device->open_cnt) 818 else if (ns.role == R_SECONDARY && device->open_cnt)
819 rv = SS_DEVICE_IN_USE; 819 rv = SS_DEVICE_IN_USE;
820 820
@@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
862 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) 862 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
863 rv = SS_CONNECTED_OUTDATES; 863 rv = SS_CONNECTED_OUTDATES;
864 864
865out:
865 rcu_read_unlock(); 866 rcu_read_unlock();
866 867
867 return rv; 868 return rv;
@@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
906 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) 907 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
907 rv = SS_IN_TRANSIENT_STATE; 908 rv = SS_IN_TRANSIENT_STATE;
908 909
910 /* Do not promote during resync handshake triggered by "force primary".
911 * This is a hack. It should really be rejected by the peer during the
912 * cluster wide state change request. */
913 if (os.role != R_PRIMARY && ns.role == R_PRIMARY
914 && ns.pdsk == D_UP_TO_DATE
915 && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
916 && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
917 rv = SS_IN_TRANSIENT_STATE;
918
909 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 919 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
910 rv = SS_NEED_CONNECTION; 920 rv = SS_NEED_CONNECTION;
911 921
@@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
1628#undef REMEMBER_STATE_CHANGE 1638#undef REMEMBER_STATE_CHANGE
1629} 1639}
1630 1640
1641/* takes old and new peer disk state */
1642static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
1643{
1644 if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
1645 && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
1646 return true;
1647
1648 /* Scenario, starting with normal operation
1649 * Connected Primary/Secondary UpToDate/UpToDate
1650 * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
1651 * ...
1652 * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
1653 */
1654 if (os == D_UNKNOWN
1655 && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
1656 return true;
1657
1658 return false;
1659}
1660
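lost_contact_to_peer_data() above folds the old inline condition into a helper and adds the case of a frozen peer that resumes as Diskless. A user-space sketch of the predicate, with an illustrative enum ordering rather than the real drbd_disk_state values:

/* Illustrative sketch; enum values and ordering are placeholders. */
#include <stdbool.h>
#include <stdio.h>

enum pdsk { D_DISKLESS, D_FAILED, D_INCONSISTENT, D_OUTDATED, D_UNKNOWN, D_UP_TO_DATE };

static bool usable(enum pdsk s)
{
	return s >= D_INCONSISTENT && s != D_UNKNOWN && s != D_OUTDATED;
}

static bool lost_contact(enum pdsk os, enum pdsk ns)
{
	if (usable(os) && !usable(ns))
		return true;
	/* resumed after a frozen NetworkFailure: peer came back without a disk */
	if (os == D_UNKNOWN && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
		return true;
	return false;
}

int main(void)
{
	printf("%d\n", lost_contact(D_UP_TO_DATE, D_UNKNOWN));	/* 1 */
	printf("%d\n", lost_contact(D_UNKNOWN, D_DISKLESS));	/* 1 */
	printf("%d\n", lost_contact(D_OUTDATED, D_DISKLESS));	/* 0 */
	return 0;
}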
1631/** 1661/**
1632 * after_state_ch() - Perform after state change actions that may sleep 1662 * after_state_ch() - Perform after state change actions that may sleep
1633 * @device: DRBD device. 1663 * @device: DRBD device.
@@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1675 what = RESEND; 1705 what = RESEND;
1676 1706
1677 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && 1707 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1678 conn_lowest_disk(connection) > D_NEGOTIATING) 1708 conn_lowest_disk(connection) == D_UP_TO_DATE)
1679 what = RESTART_FROZEN_DISK_IO; 1709 what = RESTART_FROZEN_DISK_IO;
1680 1710
1681 if (resource->susp_nod && what != NOTHING) { 1711 if (resource->susp_nod && what != NOTHING) {
@@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1699 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) 1729 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1700 clear_bit(NEW_CUR_UUID, &peer_device->device->flags); 1730 clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
1701 rcu_read_unlock(); 1731 rcu_read_unlock();
1732
1733 /* We should actively create a new uuid, _before_
 1734 * we resume/resend, if the peer is diskless
1735 * (recovery from a multiple error scenario).
1736 * Currently, this happens with a slight delay
1737 * below when checking lost_contact_to_peer_data() ...
1738 */
1702 _tl_restart(connection, RESEND); 1739 _tl_restart(connection, RESEND);
1703 _conn_request_state(connection, 1740 _conn_request_state(connection,
1704 (union drbd_state) { { .susp_fen = 1 } }, 1741 (union drbd_state) { { .susp_fen = 1 } },
@@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1742 BM_LOCKED_TEST_ALLOWED); 1779 BM_LOCKED_TEST_ALLOWED);
1743 1780
1744 /* Lost contact to peer's copy of the data */ 1781 /* Lost contact to peer's copy of the data */
1745 if ((os.pdsk >= D_INCONSISTENT && 1782 if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
1746 os.pdsk != D_UNKNOWN &&
1747 os.pdsk != D_OUTDATED)
1748 && (ns.pdsk < D_INCONSISTENT ||
1749 ns.pdsk == D_UNKNOWN ||
1750 ns.pdsk == D_OUTDATED)) {
1751 if (get_ldev(device)) { 1783 if (get_ldev(device)) {
1752 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1784 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1753 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1785 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
@@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
1934 1966
1935 /* This triggers bitmap writeout of potentially still unwritten pages 1967 /* This triggers bitmap writeout of potentially still unwritten pages
1936 * if the resync finished cleanly, or aborted because of peer disk 1968 * if the resync finished cleanly, or aborted because of peer disk
1937 * failure, or because of connection loss. 1969 * failure, or on transition from resync back to AHEAD/BEHIND.
1970 *
1971 * Connection loss is handled in drbd_disconnected() by the receiver.
1972 *
1938 * For resync aborted because of local disk failure, we cannot do 1973 * For resync aborted because of local disk failure, we cannot do
1939 * any bitmap writeout anymore. 1974 * any bitmap writeout anymore.
1975 *
1940 * No harm done if some bits change during this phase. 1976 * No harm done if some bits change during this phase.
1941 */ 1977 */
1942 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { 1978 if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
1979 (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
1943 drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, 1980 drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
1944 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); 1981 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1945 put_ldev(device); 1982 put_ldev(device);
@@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
2160 ns.disk = os.disk; 2197 ns.disk = os.disk;
2161 2198
2162 rv = _drbd_set_state(device, ns, flags, NULL); 2199 rv = _drbd_set_state(device, ns, flags, NULL);
2163 if (rv < SS_SUCCESS) 2200 BUG_ON(rv < SS_SUCCESS);
2164 BUG();
2165
2166 ns.i = device->state.i; 2201 ns.i = device->state.i;
2167 ns_max.role = max_role(ns.role, ns_max.role); 2202 ns_max.role = max_role(ns.role, ns_max.role);
2168 ns_max.peer = max_role(ns.peer, ns_max.peer); 2203 ns_max.peer = max_role(ns.peer, ns_max.peer);
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index bd989536f888..6c9d5d4a8a75 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device);
140extern bool conn_all_vols_unconf(struct drbd_connection *connection); 140extern bool conn_all_vols_unconf(struct drbd_connection *connection);
141 141
142/** 142/**
143 * drbd_request_state() - Reqest a state change 143 * drbd_request_state() - Request a state change
144 * @device: DRBD device. 144 * @device: DRBD device.
145 * @mask: mask of state bits to change. 145 * @mask: mask of state bits to change.
146 * @val: value of new state bits. 146 * @val: value of new state bits.
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 80b0f63c7075..0eeab14776e9 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -26,7 +26,7 @@
26#include <linux/drbd.h> 26#include <linux/drbd.h>
27#include "drbd_strings.h" 27#include "drbd_strings.h"
28 28
29static const char *drbd_conn_s_names[] = { 29static const char * const drbd_conn_s_names[] = {
30 [C_STANDALONE] = "StandAlone", 30 [C_STANDALONE] = "StandAlone",
31 [C_DISCONNECTING] = "Disconnecting", 31 [C_DISCONNECTING] = "Disconnecting",
32 [C_UNCONNECTED] = "Unconnected", 32 [C_UNCONNECTED] = "Unconnected",
@@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = {
53 [C_BEHIND] = "Behind", 53 [C_BEHIND] = "Behind",
54}; 54};
55 55
56static const char *drbd_role_s_names[] = { 56static const char * const drbd_role_s_names[] = {
57 [R_PRIMARY] = "Primary", 57 [R_PRIMARY] = "Primary",
58 [R_SECONDARY] = "Secondary", 58 [R_SECONDARY] = "Secondary",
59 [R_UNKNOWN] = "Unknown" 59 [R_UNKNOWN] = "Unknown"
60}; 60};
61 61
62static const char *drbd_disk_s_names[] = { 62static const char * const drbd_disk_s_names[] = {
63 [D_DISKLESS] = "Diskless", 63 [D_DISKLESS] = "Diskless",
64 [D_ATTACHING] = "Attaching", 64 [D_ATTACHING] = "Attaching",
65 [D_FAILED] = "Failed", 65 [D_FAILED] = "Failed",
@@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = {
71 [D_UP_TO_DATE] = "UpToDate", 71 [D_UP_TO_DATE] = "UpToDate",
72}; 72};
73 73
74static const char *drbd_state_sw_errors[] = { 74static const char * const drbd_state_sw_errors[] = {
75 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", 75 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
76 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", 76 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
77 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", 77 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 51fab978eb61..35dbb3dca47e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio)
173{ 173{
174 struct drbd_peer_request *peer_req = bio->bi_private; 174 struct drbd_peer_request *peer_req = bio->bi_private;
175 struct drbd_device *device = peer_req->peer_device->device; 175 struct drbd_device *device = peer_req->peer_device->device;
176 int is_write = bio_data_dir(bio) == WRITE; 176 bool is_write = bio_data_dir(bio) == WRITE;
177 int is_discard = !!(bio_op(bio) == REQ_OP_DISCARD); 177 bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
178 178
179 if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) 179 if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
180 drbd_warn(device, "%s: error=%d s=%llus\n", 180 drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio)
248 248
249 /* to avoid recursion in __req_mod */ 249 /* to avoid recursion in __req_mod */
250 if (unlikely(bio->bi_error)) { 250 if (unlikely(bio->bi_error)) {
251 if (bio_op(bio) == REQ_OP_DISCARD) 251 switch (bio_op(bio)) {
252 what = (bio->bi_error == -EOPNOTSUPP) 252 case REQ_OP_DISCARD:
253 ? DISCARD_COMPLETED_NOTSUPP 253 if (bio->bi_error == -EOPNOTSUPP)
254 : DISCARD_COMPLETED_WITH_ERROR; 254 what = DISCARD_COMPLETED_NOTSUPP;
255 else 255 else
256 what = (bio_data_dir(bio) == WRITE) 256 what = DISCARD_COMPLETED_WITH_ERROR;
257 ? WRITE_COMPLETED_WITH_ERROR 257 break;
258 : (bio_rw(bio) == READ) 258 case REQ_OP_READ:
259 ? READ_COMPLETED_WITH_ERROR 259 if (bio->bi_rw & REQ_RAHEAD)
260 : READ_AHEAD_COMPLETED_WITH_ERROR; 260 what = READ_AHEAD_COMPLETED_WITH_ERROR;
261 } else 261 else
262 what = READ_COMPLETED_WITH_ERROR;
263 break;
264 default:
265 what = WRITE_COMPLETED_WITH_ERROR;
266 break;
267 }
268 } else {
262 what = COMPLETED_OK; 269 what = COMPLETED_OK;
270 }
263 271
264 bio_put(req->private_bio); 272 bio_put(req->private_bio);
265 req->private_bio = ERR_PTR(bio->bi_error); 273 req->private_bio = ERR_PTR(bio->bi_error);
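The switch above classifies a failed completion by operation type before handing it to __req_mod(): discards distinguish "not supported" from real errors, and reads with the read-ahead hint map to their own event. A stand-alone sketch of that mapping, with illustrative event names rather than the __req_mod() constants:

/* Illustrative sketch of the completion classification. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum op { OP_READ, OP_WRITE, OP_DISCARD };
enum event {
	COMPLETED_OK,
	WRITE_ERROR,
	READ_ERROR,
	READ_AHEAD_ERROR,
	DISCARD_NOTSUPP,
	DISCARD_ERROR,
};

static enum event classify_completion(enum op op, int error, bool readahead)
{
	if (!error)
		return COMPLETED_OK;

	switch (op) {
	case OP_DISCARD:
		return error == -EOPNOTSUPP ? DISCARD_NOTSUPP : DISCARD_ERROR;
	case OP_READ:
		return readahead ? READ_AHEAD_ERROR : READ_ERROR;
	default:
		return WRITE_ERROR;
	}
}

int main(void)
{
	printf("%d\n", classify_completion(OP_DISCARD, -EOPNOTSUPP, false));	/* 4 */
	printf("%d\n", classify_completion(OP_READ, -EIO, true));		/* 3 */
	printf("%d\n", classify_completion(OP_WRITE, 0, false));		/* 0 */
	return 0;
}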
@@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
320 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); 328 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
321 ahash_request_set_crypt(req, &sg, NULL, sg.length); 329 ahash_request_set_crypt(req, &sg, NULL, sg.length);
322 crypto_ahash_update(req); 330 crypto_ahash_update(req);
331 /* REQ_OP_WRITE_SAME has only one segment,
332 * checksum the payload only once. */
333 if (bio_op(bio) == REQ_OP_WRITE_SAME)
334 break;
323 } 335 }
324 ahash_request_set_crypt(req, NULL, digest, 0); 336 ahash_request_set_crypt(req, NULL, digest, 0);
325 crypto_ahash_final(req); 337 crypto_ahash_final(req);
@@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
387 /* GFP_TRY, because if there is no memory available right now, this may 399 /* GFP_TRY, because if there is no memory available right now, this may
388 * be rescheduled for later. It is "only" background resync, after all. */ 400 * be rescheduled for later. It is "only" background resync, after all. */
389 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, 401 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
390 size, true /* has real payload */, GFP_TRY); 402 size, size, GFP_TRY);
391 if (!peer_req) 403 if (!peer_req)
392 goto defer; 404 goto defer;
393 405
@@ -583,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
583 int number, rollback_i, size; 595 int number, rollback_i, size;
584 int align, requeue = 0; 596 int align, requeue = 0;
585 int i = 0; 597 int i = 0;
598 int discard_granularity = 0;
586 599
587 if (unlikely(cancel)) 600 if (unlikely(cancel))
588 return 0; 601 return 0;
@@ -602,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
602 return 0; 615 return 0;
603 } 616 }
604 617
618 if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
619 rcu_read_lock();
620 discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
621 rcu_read_unlock();
622 }
623
605 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; 624 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
606 number = drbd_rs_number_requests(device); 625 number = drbd_rs_number_requests(device);
607 if (number <= 0) 626 if (number <= 0)
@@ -666,6 +685,9 @@ next_sector:
666 if (sector & ((1<<(align+3))-1)) 685 if (sector & ((1<<(align+3))-1))
667 break; 686 break;
668 687
688 if (discard_granularity && size == discard_granularity)
689 break;
690
669 /* do not cross extent boundaries */ 691 /* do not cross extent boundaries */
670 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) 692 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
671 break; 693 break;
@@ -712,7 +734,8 @@ next_sector:
712 int err; 734 int err;
713 735
714 inc_rs_pending(device); 736 inc_rs_pending(device);
715 err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, 737 err = drbd_send_drequest(peer_device,
738 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
716 sector, size, ID_SYNCER); 739 sector, size, ID_SYNCER);
717 if (err) { 740 if (err) {
718 drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); 741 drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
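With the thin-resync feature agreed, the requester above sends P_RS_THIN_REQ whenever a resync request exactly covers the configured rs_discard_granularity, letting the peer answer "deallocated" instead of shipping zeroes. A minimal sketch of that selection; the granularity value in the example is arbitrary:

/* Illustrative sketch of the resync request-type choice. */
#include <stdio.h>

enum req_type { P_RS_DATA_REQUEST, P_RS_THIN_REQ };

static enum req_type resync_request_type(unsigned int size,
					  unsigned int discard_granularity)
{
	if (discard_granularity && size == discard_granularity)
		return P_RS_THIN_REQ;
	return P_RS_DATA_REQUEST;
}

int main(void)
{
	printf("%d\n", resync_request_type(65536, 65536));	/* 1: thin request */
	printf("%d\n", resync_request_type(4096, 65536));	/* 0: normal request */
	printf("%d\n", resync_request_type(65536, 0));		/* 0: feature off */
	return 0;
}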
@@ -829,6 +852,7 @@ static void ping_peer(struct drbd_device *device)
829 852
830int drbd_resync_finished(struct drbd_device *device) 853int drbd_resync_finished(struct drbd_device *device)
831{ 854{
855 struct drbd_connection *connection = first_peer_device(device)->connection;
832 unsigned long db, dt, dbdt; 856 unsigned long db, dt, dbdt;
833 unsigned long n_oos; 857 unsigned long n_oos;
834 union drbd_state os, ns; 858 union drbd_state os, ns;
@@ -850,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device)
850 if (dw) { 874 if (dw) {
851 dw->w.cb = w_resync_finished; 875 dw->w.cb = w_resync_finished;
852 dw->device = device; 876 dw->device = device;
853 drbd_queue_work(&first_peer_device(device)->connection->sender_work, 877 drbd_queue_work(&connection->sender_work, &dw->w);
854 &dw->w);
855 return 1; 878 return 1;
856 } 879 }
857 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); 880 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
@@ -964,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device)
964 _drbd_set_state(device, ns, CS_VERBOSE, NULL); 987 _drbd_set_state(device, ns, CS_VERBOSE, NULL);
965out_unlock: 988out_unlock:
966 spin_unlock_irq(&device->resource->req_lock); 989 spin_unlock_irq(&device->resource->req_lock);
990
991 /* If we have been sync source, and have an effective fencing-policy,
992 * once *all* volumes are back in sync, call "unfence". */
993 if (os.conn == C_SYNC_SOURCE) {
994 enum drbd_disk_state disk_state = D_MASK;
995 enum drbd_disk_state pdsk_state = D_MASK;
996 enum drbd_fencing_p fp = FP_DONT_CARE;
997
998 rcu_read_lock();
999 fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1000 if (fp != FP_DONT_CARE) {
1001 struct drbd_peer_device *peer_device;
1002 int vnr;
1003 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1004 struct drbd_device *device = peer_device->device;
1005 disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
1006 pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
1007 }
1008 }
1009 rcu_read_unlock();
1010 if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
1011 conn_khelper(connection, "unfence-peer");
1012 }
1013
967 put_ldev(device); 1014 put_ldev(device);
968out: 1015out:
969 device->rs_total = 0; 1016 device->rs_total = 0;
@@ -1000,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_
1000 1047
1001/** 1048/**
1002 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST 1049 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
1003 * @device: DRBD device.
1004 * @w: work object. 1050 * @w: work object.
1005 * @cancel: The connection will be closed anyways 1051 * @cancel: The connection will be closed anyways
1006 */ 1052 */
@@ -1036,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
1036 return err; 1082 return err;
1037} 1083}
1038 1084
1085static bool all_zero(struct drbd_peer_request *peer_req)
1086{
1087 struct page *page = peer_req->pages;
1088 unsigned int len = peer_req->i.size;
1089
1090 page_chain_for_each(page) {
1091 unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
1092 unsigned int i, words = l / sizeof(long);
1093 unsigned long *d;
1094
1095 d = kmap_atomic(page);
1096 for (i = 0; i < words; i++) {
1097 if (d[i]) {
1098 kunmap_atomic(d);
1099 return false;
1100 }
1101 }
1102 kunmap_atomic(d);
1103 len -= l;
1104 }
1105
1106 return true;
1107}
1108
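all_zero() above lets the sync source detect that a requested resync block contains only zero bytes, so it can reply with P_RS_DEALLOCATED instead of sending the data. A user-space sketch of the same scan over a flat buffer (the kernel version walks a page chain and kmaps each page); it assumes the length is a multiple of sizeof(long), as resync block sizes are:

/* Illustrative sketch of the zero-block scan. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool buffer_all_zero(const void *buf, size_t len)
{
	const unsigned long *d = buf;
	size_t i, words = len / sizeof(long);

	for (i = 0; i < words; i++)
		if (d[i])
			return false;	/* bail out at the first non-zero word */
	return true;
}

int main(void)
{
	unsigned long blk[512] = { 0 };

	printf("%d\n", buffer_all_zero(blk, sizeof(blk)));	/* 1 */
	blk[100] = 42;
	printf("%d\n", buffer_all_zero(blk, sizeof(blk)));	/* 0 */
	return 0;
}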
1039/** 1109/**
1040 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST 1110 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1041 * @w: work object. 1111 * @w: work object.
@@ -1064,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1064 } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { 1134 } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1065 if (likely(device->state.pdsk >= D_INCONSISTENT)) { 1135 if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1066 inc_rs_pending(device); 1136 inc_rs_pending(device);
1067 err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); 1137 if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
1138 err = drbd_send_rs_deallocated(peer_device, peer_req);
1139 else
1140 err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
1068 } else { 1141 } else {
1069 if (__ratelimit(&drbd_ratelimit_state)) 1142 if (__ratelimit(&drbd_ratelimit_state))
1070 drbd_err(device, "Not sending RSDataReply, " 1143 drbd_err(device, "Not sending RSDataReply, "
@@ -1634,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct
1634 rcu_read_unlock(); 1707 rcu_read_unlock();
1635 return connection->agreed_pro_version >= 89 && /* supported? */ 1708 return connection->agreed_pro_version >= 89 && /* supported? */
1636 connection->csums_tfm && /* configured? */ 1709 connection->csums_tfm && /* configured? */
1637 (csums_after_crash_only == 0 /* use for each resync? */ 1710 (csums_after_crash_only == false /* use for each resync? */
1638 || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ 1711 || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
1639} 1712}
1640 1713
@@ -1769,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1769 device->bm_resync_fo = 0; 1842 device->bm_resync_fo = 0;
1770 device->use_csums = use_checksum_based_resync(connection, device); 1843 device->use_csums = use_checksum_based_resync(connection, device);
1771 } else { 1844 } else {
1772 device->use_csums = 0; 1845 device->use_csums = false;
1773 } 1846 }
1774 1847
1775 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid 1848 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f9bfecd733a8..c557057fe8ae 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4350,8 +4350,7 @@ static int __init do_floppy_init(void)
4350 /* to be cleaned up... */ 4350 /* to be cleaned up... */
4351 disks[drive]->private_data = (void *)(long)drive; 4351 disks[drive]->private_data = (void *)(long)drive;
4352 disks[drive]->flags |= GENHD_FL_REMOVABLE; 4352 disks[drive]->flags |= GENHD_FL_REMOVABLE;
4353 disks[drive]->driverfs_dev = &floppy_device[drive].dev; 4353 device_add_disk(&floppy_device[drive].dev, disks[drive]);
4354 add_disk(disks[drive]);
4355 } 4354 }
4356 4355
4357 return 0; 4356 return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 364d491d4bdd..075377eee0c0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i)
1765 */ 1765 */
1766 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); 1766 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
1767 1767
1768 err = -ENOMEM;
1768 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1769 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1769 if (!disk) 1770 if (!disk)
1770 goto out_free_queue; 1771 goto out_free_queue;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 145ce2aa2e78..e937fcf71769 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req,
687 unsigned int sect_num, 687 unsigned int sect_num,
688 unsigned int sect_cnt) 688 unsigned int sect_cnt)
689{ 689{
690 switch (rq_data_dir(req)) { 690 if (rq_data_dir(req) == READ) {
691 case READ:
692 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) 691 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
693 != MG_ERR_NONE) { 692 != MG_ERR_NONE) {
694 mg_bad_rw_intr(host); 693 mg_bad_rw_intr(host);
695 return host->error; 694 return host->error;
696 } 695 }
697 break; 696 } else {
698 case WRITE:
699 /* TODO : handler */ 697 /* TODO : handler */
700 outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); 698 outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
701 if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) 699 if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
@@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req,
714 mod_timer(&host->timer, jiffies + 3 * HZ); 712 mod_timer(&host->timer, jiffies + 3 * HZ);
715 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + 713 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
716 MG_REG_COMMAND); 714 MG_REG_COMMAND);
717 break;
718 } 715 }
719 return MG_ERR_NONE; 716 return MG_ERR_NONE;
720} 717}
@@ -1018,7 +1015,7 @@ probe_err_7:
1018probe_err_6: 1015probe_err_6:
1019 blk_cleanup_queue(host->breq); 1016 blk_cleanup_queue(host->breq);
1020probe_err_5: 1017probe_err_5:
1021 unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); 1018 unregister_blkdev(host->major, MG_DISK_NAME);
1022probe_err_4: 1019probe_err_4:
1023 if (!prv_data->use_polling) 1020 if (!prv_data->use_polling)
1024 free_irq(host->irq, host); 1021 free_irq(host->irq, host);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8e3e708cb9ee..2aca98e8e427 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd)
3956 if (rv) 3956 if (rv)
3957 goto disk_index_error; 3957 goto disk_index_error;
3958 3958
3959 dd->disk->driverfs_dev = &dd->pdev->dev;
3960 dd->disk->major = dd->major; 3959 dd->disk->major = dd->major;
3961 dd->disk->first_minor = index * MTIP_MAX_MINORS; 3960 dd->disk->first_minor = index * MTIP_MAX_MINORS;
3962 dd->disk->minors = MTIP_MAX_MINORS; 3961 dd->disk->minors = MTIP_MAX_MINORS;
@@ -4008,7 +4007,7 @@ skip_create_disk:
4008 4007
4009 /* 4008 /*
4010 * if rebuild pending, start the service thread, and delay the block 4009 * if rebuild pending, start the service thread, and delay the block
4011 * queue creation and add_disk() 4010 * queue creation and device_add_disk()
4012 */ 4011 */
4013 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) 4012 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
4014 goto start_service_thread; 4013 goto start_service_thread;
@@ -4042,7 +4041,7 @@ skip_create_disk:
4042 set_capacity(dd->disk, capacity); 4041 set_capacity(dd->disk, capacity);
4043 4042
4044 /* Enable the block device and add it to /dev */ 4043 /* Enable the block device and add it to /dev */
4045 add_disk(dd->disk); 4044 device_add_disk(&dd->pdev->dev, dd->disk);
4046 4045
4047 dd->bdev = bdget_disk(dd->disk, 0); 4046 dd->bdev = bdget_disk(dd->disk, 0);
4048 /* 4047 /*
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cab97593ba54..75a7f88d6717 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
448 struct request *rq; 448 struct request *rq;
449 struct bio *bio = rqd->bio; 449 struct bio *bio = rqd->bio;
450 450
451 rq = blk_mq_alloc_request(q, bio_rw(bio), 0); 451 rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
452 if (IS_ERR(rq)) 452 if (IS_ERR(rq))
453 return -ENOMEM; 453 return -ENOMEM;
454 454
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index acb44529c05e..76f33c84ce3d 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
487 gendisk->fops = &ps3disk_fops; 487 gendisk->fops = &ps3disk_fops;
488 gendisk->queue = queue; 488 gendisk->queue = queue;
489 gendisk->private_data = dev; 489 gendisk->private_data = dev;
490 gendisk->driverfs_dev = &dev->sbd.core;
491 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, 490 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
492 devidx+'a'); 491 devidx+'a');
493 priv->blocking_factor = dev->blk_size >> 9; 492 priv->blocking_factor = dev->blk_size >> 9;
@@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
499 gendisk->disk_name, priv->model, priv->raw_capacity >> 11, 498 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
500 get_capacity(gendisk) >> 11); 499 get_capacity(gendisk) >> 11);
501 500
502 add_disk(gendisk); 501 device_add_disk(&dev->sbd.core, gendisk);
503 return 0; 502 return 0;
504 503
505fail_cleanup_queue: 504fail_cleanup_queue:
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 56847fcda086..456b4fe21559 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
773 gendisk->fops = &ps3vram_fops; 773 gendisk->fops = &ps3vram_fops;
774 gendisk->queue = queue; 774 gendisk->queue = queue;
775 gendisk->private_data = dev; 775 gendisk->private_data = dev;
776 gendisk->driverfs_dev = &dev->core;
777 strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); 776 strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
778 set_capacity(gendisk, priv->size >> 9); 777 set_capacity(gendisk, priv->size >> 9);
779 778
780 dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", 779 dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
781 gendisk->disk_name, get_capacity(gendisk) >> 11); 780 gendisk->disk_name, get_capacity(gendisk) >> 11);
782 781
783 add_disk(gendisk); 782 device_add_disk(&dev->core, gendisk);
784 return 0; 783 return 0;
785 784
786fail_cleanup_queue: 785fail_cleanup_queue:
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index e1b8b7061d2f..f81d70b39d10 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
230 set_capacity(card->gendisk, card->size8 >> 9); 230 set_capacity(card->gendisk, card->size8 >> 9);
231 else 231 else
232 set_capacity(card->gendisk, 0); 232 set_capacity(card->gendisk, 0);
233 add_disk(card->gendisk); 233 device_add_disk(CARD_TO_DEV(card), card->gendisk);
234
235 card->bdev_attached = 1; 234 card->bdev_attached = 1;
236 } 235 }
237 236
@@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
308 307
309 snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), 308 snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
310 "rsxx%d", card->disk_id); 309 "rsxx%d", card->disk_id);
311 card->gendisk->driverfs_dev = &card->dev->dev;
312 card->gendisk->major = card->major; 310 card->gendisk->major = card->major;
313 card->gendisk->first_minor = 0; 311 card->gendisk->first_minor = 0;
314 card->gendisk->fops = &rsxx_fops; 312 card->gendisk->fops = &rsxx_fops;
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 5c07a23e2ada..3822eae102db 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4690 return -EIO; 4690 return -EIO;
4691} 4691}
4692 4692
4693static int skd_bdev_attach(struct skd_device *skdev) 4693static int skd_bdev_attach(struct device *parent, struct skd_device *skdev)
4694{ 4694{
4695 pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); 4695 pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
4696 add_disk(skdev->disk); 4696 device_add_disk(parent, skdev->disk);
4697 return 0; 4697 return 0;
4698} 4698}
4699 4699
@@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
4812 4812
4813 pci_set_drvdata(pdev, skdev); 4813 pci_set_drvdata(pdev, skdev);
4814 4814
4815 skdev->disk->driverfs_dev = &pdev->dev;
4816
4817 for (i = 0; i < SKD_MAX_BARS; i++) { 4815 for (i = 0; i < SKD_MAX_BARS; i++) {
4818 skdev->mem_phys[i] = pci_resource_start(pdev, i); 4816 skdev->mem_phys[i] = pci_resource_start(pdev, i);
4819 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); 4817 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
@@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
4851 (SKD_START_WAIT_SECONDS * HZ)); 4849 (SKD_START_WAIT_SECONDS * HZ));
4852 if (skdev->gendisk_on > 0) { 4850 if (skdev->gendisk_on > 0) {
4853 /* device came on-line after reset */ 4851 /* device came on-line after reset */
4854 skd_bdev_attach(skdev); 4852 skd_bdev_attach(&pdev->dev, skdev);
4855 rc = 0; 4853 rc = 0;
4856 } else { 4854 } else {
4857 /* we timed out, something is wrong with the device, 4855 /* we timed out, something is wrong with the device,
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 4b911ed96ea3..cab157331c4e 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port)
804 g->fops = &vdc_fops; 804 g->fops = &vdc_fops;
805 g->queue = q; 805 g->queue = q;
806 g->private_data = port; 806 g->private_data = port;
807 g->driverfs_dev = &port->vio.vdev->dev;
808 807
809 set_capacity(g, port->vdisk_size); 808 set_capacity(g, port->vdisk_size);
810 809
@@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port)
835 port->vdisk_size, (port->vdisk_size >> (20 - 9)), 834 port->vdisk_size, (port->vdisk_size >> (20 - 9)),
836 port->vio.ver.major, port->vio.ver.minor); 835 port->vio.ver.major, port->vio.ver.minor);
837 836
838 add_disk(g); 837 device_add_disk(&port->vio.vdev->dev, g);
839 838
840 return 0; 839 return 0;
841} 840}
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 4b3ba74e9d22..d0a3e6d4515f 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card)
344 int offset; 344 int offset;
345 struct bio *bio; 345 struct bio *bio;
346 struct bio_vec vec; 346 struct bio_vec vec;
347 int rw;
348 347
349 bio = card->currentbio; 348 bio = card->currentbio;
350 if (!bio && card->bio) { 349 if (!bio && card->bio) {
@@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card)
359 if (!bio) 358 if (!bio)
360 return 0; 359 return 0;
361 360
362 rw = bio_rw(bio);
363 if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) 361 if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
364 return 0; 362 return 0;
365 363
@@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card)
369 vec.bv_page, 367 vec.bv_page,
370 vec.bv_offset, 368 vec.bv_offset,
371 vec.bv_len, 369 vec.bv_len,
372 (rw == READ) ? 370 bio_op(bio) == REQ_OP_READ ?
373 PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); 371 PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
374 372
375 p = &card->mm_pages[card->Ready]; 373 p = &card->mm_pages[card->Ready];
@@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card)
398 DMASCR_CHAIN_EN | 396 DMASCR_CHAIN_EN |
399 DMASCR_SEM_EN | 397 DMASCR_SEM_EN |
400 pci_cmds); 398 pci_cmds);
401 if (rw == WRITE) 399 if (bio_op(bio) == REQ_OP_WRITE)
402 desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); 400 desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
403 desc->sem_control_bits = desc->control_bits; 401 desc->sem_control_bits = desc->control_bits;
404 402
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 18e4069dd24b..1523e05c46fc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
236static int virtblk_get_id(struct gendisk *disk, char *id_str) 236static int virtblk_get_id(struct gendisk *disk, char *id_str)
237{ 237{
238 struct virtio_blk *vblk = disk->private_data; 238 struct virtio_blk *vblk = disk->private_data;
239 struct request_queue *q = vblk->disk->queue;
239 struct request *req; 240 struct request *req;
240 struct bio *bio;
241 int err; 241 int err;
242 242
243 bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, 243 req = blk_get_request(q, READ, GFP_KERNEL);
244 GFP_KERNEL); 244 if (IS_ERR(req))
245 if (IS_ERR(bio))
246 return PTR_ERR(bio);
247
248 req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
249 if (IS_ERR(req)) {
250 bio_put(bio);
251 return PTR_ERR(req); 245 return PTR_ERR(req);
252 }
253
254 req->cmd_type = REQ_TYPE_DRV_PRIV; 246 req->cmd_type = REQ_TYPE_DRV_PRIV;
247
248 err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
249 if (err)
250 goto out;
251
255 err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); 252 err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
253out:
256 blk_put_request(req); 254 blk_put_request(req);
257
258 return err; 255 return err;
259} 256}
260 257
@@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev)
656 vblk->disk->first_minor = index_to_minor(index); 653 vblk->disk->first_minor = index_to_minor(index);
657 vblk->disk->private_data = vblk; 654 vblk->disk->private_data = vblk;
658 vblk->disk->fops = &virtblk_fops; 655 vblk->disk->fops = &virtblk_fops;
659 vblk->disk->driverfs_dev = &vdev->dev;
660 vblk->disk->flags |= GENHD_FL_EXT_DEVT; 656 vblk->disk->flags |= GENHD_FL_EXT_DEVT;
661 vblk->index = index; 657 vblk->index = index;
662 658
@@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev)
733 729
734 virtio_device_ready(vdev); 730 virtio_device_ready(vdev);
735 731
736 add_disk(vblk->disk); 732 device_add_disk(&vdev->dev, vblk->disk);
737 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); 733 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
738 if (err) 734 if (err)
739 goto out_del_disk; 735 goto out_del_disk;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 3355f1cdd4e5..2994cfa44c8a 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
480 if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 480 if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
481 vbd->flush_support = true; 481 vbd->flush_support = true;
482 482
483 if (q && blk_queue_secdiscard(q)) 483 if (q && blk_queue_secure_erase(q))
484 vbd->discard_secure = true; 484 vbd->discard_secure = true;
485 485
486 pr_debug("Successful creation of handle=%04x (dom=%u)\n", 486 pr_debug("Successful creation of handle=%04x (dom=%u)\n",
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index da05d3f9bad2..0b6682a33e3b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -548,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
548 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 548 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
549 ring_req->u.discard.id = id; 549 ring_req->u.discard.id = id;
550 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); 550 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
551 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 551 if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
552 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 552 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
553 else 553 else
554 ring_req->u.discard.flag = 0; 554 ring_req->u.discard.flag = 0;
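Editor's note: a before/after sketch of the check this hunk rewrites — secure erase moves from a modifier flag on a discard request to a distinct request operation read via req_op().

	/* before: discard with a "secure" modifier flag */
	if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;

	/* after: secure erase is its own op */
	if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;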
@@ -844,7 +844,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r
844 return 1; 844 return 1;
845 845
846 if (unlikely(req_op(req) == REQ_OP_DISCARD || 846 if (unlikely(req_op(req) == REQ_OP_DISCARD ||
847 req->cmd_flags & REQ_SECURE)) 847 req_op(req) == REQ_OP_SECURE_ERASE))
848 return blkif_queue_discard_req(req, rinfo); 848 return blkif_queue_discard_req(req, rinfo);
849 else 849 else
850 return blkif_queue_rw_req(req, rinfo); 850 return blkif_queue_rw_req(req, rinfo);
@@ -952,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
952 rq->limits.discard_granularity = info->discard_granularity; 952 rq->limits.discard_granularity = info->discard_granularity;
953 rq->limits.discard_alignment = info->discard_alignment; 953 rq->limits.discard_alignment = info->discard_alignment;
954 if (info->feature_secdiscard) 954 if (info->feature_secdiscard)
955 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); 955 queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
956 } 956 }
957 957
958 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 958 /* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -1134,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1134 gd->first_minor = minor; 1134 gd->first_minor = minor;
1135 gd->fops = &xlvbd_block_fops; 1135 gd->fops = &xlvbd_block_fops;
1136 gd->private_data = info; 1136 gd->private_data = info;
1137 gd->driverfs_dev = &(info->xbdev->dev);
1138 set_capacity(gd, capacity); 1137 set_capacity(gd, capacity);
1139 1138
1140 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, 1139 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
@@ -1592,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1592 info->feature_discard = 0; 1591 info->feature_discard = 0;
1593 info->feature_secdiscard = 0; 1592 info->feature_secdiscard = 0;
1594 queue_flag_clear(QUEUE_FLAG_DISCARD, rq); 1593 queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
1595 queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); 1594 queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
1596 } 1595 }
1597 blk_mq_complete_request(req, error); 1596 blk_mq_complete_request(req, error);
1598 break; 1597 break;
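Editor's note: the flag rename is applied symmetrically — set when the frontend advertises secure discard, cleared again when the backend rejects such a request; a sketch combining the two hunks above, not additional patch content.

	/* xlvbd_init_blk_queue(): advertise the capability */
	if (info->feature_secdiscard)
		queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);

	/* blkif_interrupt(): backend failed the op, withdraw both features */
	info->feature_discard = 0;
	info->feature_secdiscard = 0;
	queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
	queue_flag_clear(QUEUE_FLAG_SECERASE, rq);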
@@ -2106,11 +2105,14 @@ static int blkfront_resume(struct xenbus_device *dev)
2106 */ 2105 */
2107 if (req_op(shadow[i].request) == REQ_OP_FLUSH || 2106 if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
2108 req_op(shadow[i].request) == REQ_OP_DISCARD || 2107 req_op(shadow[i].request) == REQ_OP_DISCARD ||
2109 shadow[j].request->cmd_flags & (REQ_FUA | REQ_SECURE)) { 2108 req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
2110 2109 shadow[j].request->cmd_flags & REQ_FUA) {
2111 /* 2110 /*
2112 * Flush operations don't contain bios, so 2111 * Flush operations don't contain bios, so
2113 * we need to requeue the whole request 2112 * we need to requeue the whole request
2113 *
2114 * XXX: but this doesn't make any sense for a
2115 * write with the FUA flag set..
2114 */ 2116 */
2115 list_add(&shadow[j].request->queuelist, &info->requests); 2117 list_add(&shadow[j].request->queuelist, &info->requests);
2116 continue; 2118 continue;
@@ -2445,7 +2447,7 @@ static void blkfront_connect(struct blkfront_info *info)
2445 for (i = 0; i < info->nr_rings; i++) 2447 for (i = 0; i < info->nr_rings; i++)
2446 kick_pending_request_queues(&info->rinfo[i]); 2448 kick_pending_request_queues(&info->rinfo[i]);
2447 2449
2448 add_disk(info->gd); 2450 device_add_disk(&info->xbdev->dev, info->gd);
2449 2451
2450 info->is_ready = 1; 2452 info->is_ready = 1;
2451} 2453}