Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Kconfig               |  22
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c    |  21
-rw-r--r--  drivers/block/drbd/drbd_int.h       | 151
-rw-r--r--  drivers/block/drbd/drbd_main.c      | 158
-rw-r--r--  drivers/block/drbd/drbd_nl.c        |  52
-rw-r--r--  drivers/block/drbd/drbd_proc.c      |  19
-rw-r--r--  drivers/block/drbd/drbd_receiver.c  | 666
-rw-r--r--  drivers/block/drbd/drbd_req.c       |  40
-rw-r--r--  drivers/block/drbd/drbd_strings.c   |   2
-rw-r--r--  drivers/block/drbd/drbd_worker.c    | 206
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h  |  16
-rw-r--r--  drivers/block/virtio_blk.c          |  46
12 files changed, 957 insertions(+), 442 deletions(-)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 77bfce52e9ca..de277689da61 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -76,6 +76,17 @@ config BLK_DEV_XD
76 76
77 It's pretty unlikely that you have one of these: say N. 77 It's pretty unlikely that you have one of these: say N.
78 78
79config GDROM
80 tristate "SEGA Dreamcast GD-ROM drive"
81 depends on SH_DREAMCAST
82 help
83 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
84 "GD-ROM" by SEGA to signify it is capable of reading special disks
85 with up to 1 GB of data. This drive will also read standard CD ROM
86 disks. Select this option to access any disks in your GD ROM drive.
87 Most users will want to say "Y" here.
88 You can also build this as a module which will be called gdrom.
89
79config PARIDE 90config PARIDE
80 tristate "Parallel port IDE device support" 91 tristate "Parallel port IDE device support"
81 depends on PARPORT_PC 92 depends on PARPORT_PC
@@ -103,17 +114,6 @@ config PARIDE
103 "MicroSolutions backpack protocol", "DataStor Commuter protocol" 114 "MicroSolutions backpack protocol", "DataStor Commuter protocol"
104 etc.). 115 etc.).
105 116
106config GDROM
107 tristate "SEGA Dreamcast GD-ROM drive"
108 depends on SH_DREAMCAST
109 help
110 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
111 "GD-ROM" by SEGA to signify it is capable of reading special disks
112 with up to 1 GB of data. This drive will also read standard CD ROM
113 disks. Select this option to access any disks in your GD ROM drive.
114 Most users will want to say "Y" here.
115 You can also build this as a module which will be called gdrom.
116
117source "drivers/block/paride/Kconfig" 117source "drivers/block/paride/Kconfig"
118 118
119config BLK_CPQ_DA 119config BLK_CPQ_DA
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 3390716898d5..e3f88d6e1412 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -84,6 +84,9 @@ struct drbd_bitmap {
84#define BM_MD_IO_ERROR 1 84#define BM_MD_IO_ERROR 1
85#define BM_P_VMALLOCED 2 85#define BM_P_VMALLOCED 2
86 86
87static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
88 unsigned long e, int val, const enum km_type km);
89
87static int bm_is_locked(struct drbd_bitmap *b) 90static int bm_is_locked(struct drbd_bitmap *b)
88{ 91{
89 return test_bit(BM_LOCKED, &b->bm_flags); 92 return test_bit(BM_LOCKED, &b->bm_flags);
@@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
441 * In case this is actually a resize, we copy the old bitmap into the new one. 444 * In case this is actually a resize, we copy the old bitmap into the new one.
442 * Otherwise, the bitmap is initialized to all bits set. 445 * Otherwise, the bitmap is initialized to all bits set.
443 */ 446 */
444int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) 447int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
445{ 448{
446 struct drbd_bitmap *b = mdev->bitmap; 449 struct drbd_bitmap *b = mdev->bitmap;
447 unsigned long bits, words, owords, obits, *p_addr, *bm; 450 unsigned long bits, words, owords, obits, *p_addr, *bm;
@@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
516 obits = b->bm_bits; 519 obits = b->bm_bits;
517 520
518 growing = bits > obits; 521 growing = bits > obits;
519 if (opages) 522 if (opages && growing && set_new_bits)
520 bm_set_surplus(b); 523 bm_set_surplus(b);
521 524
522 b->bm_pages = npages; 525 b->bm_pages = npages;
@@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
526 b->bm_dev_capacity = capacity; 529 b->bm_dev_capacity = capacity;
527 530
528 if (growing) { 531 if (growing) {
529 bm_memset(b, owords, 0xff, words-owords); 532 if (set_new_bits) {
530 b->bm_set += bits - obits; 533 bm_memset(b, owords, 0xff, words-owords);
534 b->bm_set += bits - obits;
535 } else
536 bm_memset(b, owords, 0x00, words-owords);
537
531 } 538 }
532 539
533 if (want < have) { 540 if (want < have) {
@@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
773 /* nothing to do, on disk == in memory */ 780 /* nothing to do, on disk == in memory */
774# define bm_cpu_to_lel(x) ((void)0) 781# define bm_cpu_to_lel(x) ((void)0)
775# else 782# else
776void bm_cpu_to_lel(struct drbd_bitmap *b) 783static void bm_cpu_to_lel(struct drbd_bitmap *b)
777{ 784{
778 /* need to cpu_to_lel all the pages ... 785 /* need to cpu_to_lel all the pages ...
779 * this may be optimized by using 786 * this may be optimized by using
@@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
1015 * wants bitnr, not sector. 1022 * wants bitnr, not sector.
1016 * expected to be called for only a few bits (e - s about BITS_PER_LONG). 1023 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1017 * Must hold bitmap lock already. */ 1024 * Must hold bitmap lock already. */
1018int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1025static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1019 unsigned long e, int val, const enum km_type km) 1026 unsigned long e, int val, const enum km_type km)
1020{ 1027{
1021 struct drbd_bitmap *b = mdev->bitmap; 1028 struct drbd_bitmap *b = mdev->bitmap;
@@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1053 * for val != 0, we change 0 -> 1, return code positive 1060 * for val != 0, we change 0 -> 1, return code positive
1054 * for val == 0, we change 1 -> 0, return code negative 1061 * for val == 0, we change 1 -> 0, return code negative
1055 * wants bitnr, not sector */ 1062 * wants bitnr, not sector */
1056int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, 1063static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1057 const unsigned long e, int val) 1064 const unsigned long e, int val)
1058{ 1065{
1059 unsigned long flags; 1066 unsigned long flags;
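Aside, not part of the patch: drbd_bm_resize() now takes a set_new_bits argument that decides whether bits covering newly added space start out as 1 (out of sync, so a resync will cover the new area) or as 0 (what DDSF_NO_RESYNC asks for). A minimal userspace sketch of that decision, using a byte-per-bit array; bm_grow and the values here are made up for illustration, not DRBD code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy model: one byte per bit. Growing the "device" either marks the new
 * area out of sync (1, like bm_memset(..., 0xff, ...) in the hunk above)
 * or clears it (0) when no resync of the new space is wanted. */
static unsigned char *bm_grow(unsigned char *bm, size_t old_bits,
                              size_t new_bits, int set_new_bits)
{
    unsigned char *nbm = realloc(bm, new_bits);
    if (!nbm)
        return NULL;
    if (new_bits > old_bits)
        memset(nbm + old_bits, set_new_bits ? 1 : 0, new_bits - old_bits);
    return nbm;
}

int main(void)
{
    unsigned char *bm = calloc(8, 1);
    size_t set = 0;

    bm = bm_grow(bm, 8, 16, 1);            /* grow and schedule a resync */
    for (size_t i = 0; bm && i < 16; i++)
        set += bm[i];
    printf("bits set after grow: %zu\n", set);   /* prints 8 */
    free(bm);
    return 0;
}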
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e5e86a781820..e9654c8d5b62 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -132,6 +132,7 @@ enum {
132 DRBD_FAULT_DT_RA = 6, /* data read ahead */ 132 DRBD_FAULT_DT_RA = 6, /* data read ahead */
133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ 133 DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
134 DRBD_FAULT_AL_EE = 8, /* alloc ee */ 134 DRBD_FAULT_AL_EE = 8, /* alloc ee */
135 DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
135 136
136 DRBD_FAULT_MAX, 137 DRBD_FAULT_MAX,
137}; 138};
@@ -208,8 +209,11 @@ enum drbd_packets {
208 P_RS_IS_IN_SYNC = 0x22, /* meta socket */ 209 P_RS_IS_IN_SYNC = 0x22, /* meta socket */
209 P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ 210 P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
210 P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ 211 P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
212 /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
213 /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
214 P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
211 215
212 P_MAX_CMD = 0x25, 216 P_MAX_CMD = 0x28,
213 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 217 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
214 P_MAX_OPT_CMD = 0x101, 218 P_MAX_OPT_CMD = 0x101,
215 219
@@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
264 [P_CSUM_RS_REQUEST] = "CsumRSRequest", 268 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
265 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", 269 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
266 [P_COMPRESSED_BITMAP] = "CBitmap", 270 [P_COMPRESSED_BITMAP] = "CBitmap",
271 [P_DELAY_PROBE] = "DelayProbe",
267 [P_MAX_CMD] = NULL, 272 [P_MAX_CMD] = NULL,
268 }; 273 };
269 274
@@ -481,7 +486,8 @@ struct p_sizes {
481 u64 u_size; /* user requested size */ 486 u64 u_size; /* user requested size */
482 u64 c_size; /* current exported size */ 487 u64 c_size; /* current exported size */
483 u32 max_segment_size; /* Maximal size of a BIO */ 488 u32 max_segment_size; /* Maximal size of a BIO */
484 u32 queue_order_type; 489 u16 queue_order_type; /* not yet implemented in DRBD*/
490 u16 dds_flags; /* use enum dds_flags here. */
485} __packed; 491} __packed;
486 492
487struct p_state { 493struct p_state {
@@ -538,6 +544,18 @@ struct p_compressed_bm {
538 u8 code[0]; 544 u8 code[0];
539} __packed; 545} __packed;
540 546
547struct p_delay_probe {
548 struct p_header head;
549 u32 seq_num; /* sequence number to match the two probe packets */
550 u32 offset; /* usecs the probe got sent after the reference time point */
551} __packed;
552
553struct delay_probe {
554 struct list_head list;
555 unsigned int seq_num;
556 struct timeval time;
557};
558
541/* DCBP: Drbd Compressed Bitmap Packet ... */ 559/* DCBP: Drbd Compressed Bitmap Packet ... */
542static inline enum drbd_bitmap_code 560static inline enum drbd_bitmap_code
543DCBP_get_code(struct p_compressed_bm *p) 561DCBP_get_code(struct p_compressed_bm *p)
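Aside: struct p_delay_probe carries only a sequence number and a microsecond offset, both big-endian, and the same probe is sent on the data and the meta socket so the receiver can match the two by seq_num. A hedged userspace sketch of that payload layout; the DRBD packet header is omitted and the struct name below is made up.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the delay-probe payload: two big-endian 32-bit fields
 * following the (omitted) DRBD header. */
struct delay_probe_payload {
    uint32_t seq_num;   /* matches the probe sent on the other socket */
    uint32_t offset;    /* usecs since the shared reference time */
} __attribute__((packed));

int main(void)
{
    struct delay_probe_payload p;

    p.seq_num = htonl(42);        /* cpu_to_be32() in the kernel */
    p.offset  = htonl(1500);
    printf("payload is %zu bytes, seq %u, offset %u usec\n",
           sizeof(p), (unsigned)ntohl(p.seq_num), (unsigned)ntohl(p.offset));
    return 0;
}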
@@ -722,22 +740,6 @@ enum epoch_event {
722 EV_CLEANUP = 32, /* used as flag */ 740 EV_CLEANUP = 32, /* used as flag */
723}; 741};
724 742
725struct drbd_epoch_entry {
726 struct drbd_work w;
727 struct drbd_conf *mdev;
728 struct bio *private_bio;
729 struct hlist_node colision;
730 sector_t sector;
731 unsigned int size;
732 struct drbd_epoch *epoch;
733
734 /* up to here, the struct layout is identical to drbd_request;
735 * we might be able to use that to our advantage... */
736
737 unsigned int flags;
738 u64 block_id;
739};
740
741struct drbd_wq_barrier { 743struct drbd_wq_barrier {
742 struct drbd_work w; 744 struct drbd_work w;
743 struct completion done; 745 struct completion done;
@@ -748,17 +750,49 @@ struct digest_info {
748 void *digest; 750 void *digest;
749}; 751};
750 752
751/* ee flag bits */ 753struct drbd_epoch_entry {
754 struct drbd_work w;
755 struct hlist_node colision;
756 struct drbd_epoch *epoch;
757 struct drbd_conf *mdev;
758 struct page *pages;
759 atomic_t pending_bios;
760 unsigned int size;
761 /* see comments on ee flag bits below */
762 unsigned long flags;
763 sector_t sector;
764 u64 block_id;
765};
766
767/* ee flag bits.
768 * While corresponding bios are in flight, the only modification will be
769 * set_bit WAS_ERROR, which has to be atomic.
770 * If no bios are in flight yet, or all have been completed,
771 * non-atomic modification to ee->flags is ok.
772 */
752enum { 773enum {
753 __EE_CALL_AL_COMPLETE_IO, 774 __EE_CALL_AL_COMPLETE_IO,
754 __EE_CONFLICT_PENDING,
755 __EE_MAY_SET_IN_SYNC, 775 __EE_MAY_SET_IN_SYNC,
776
777 /* This epoch entry closes an epoch using a barrier.
778 * On sucessful completion, the epoch is released,
779 * and the P_BARRIER_ACK send. */
756 __EE_IS_BARRIER, 780 __EE_IS_BARRIER,
781
782 /* In case a barrier failed,
783 * we need to resubmit without the barrier flag. */
784 __EE_RESUBMITTED,
785
786 /* we may have several bios per epoch entry.
787 * if any of those fail, we set this flag atomically
788 * from the endio callback */
789 __EE_WAS_ERROR,
757}; 790};
758#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 791#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
759#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
760#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 792#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
761#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) 793#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
794#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
795#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
762 796
763/* global flag bits */ 797/* global flag bits */
764enum { 798enum {
@@ -908,9 +942,12 @@ struct drbd_conf {
908 unsigned int ko_count; 942 unsigned int ko_count;
909 struct drbd_work resync_work, 943 struct drbd_work resync_work,
910 unplug_work, 944 unplug_work,
911 md_sync_work; 945 md_sync_work,
946 delay_probe_work,
947 uuid_work;
912 struct timer_list resync_timer; 948 struct timer_list resync_timer;
913 struct timer_list md_sync_timer; 949 struct timer_list md_sync_timer;
950 struct timer_list delay_probe_timer;
914 951
915 /* Used after attach while negotiating new disk state. */ 952 /* Used after attach while negotiating new disk state. */
916 union drbd_state new_state_tmp; 953 union drbd_state new_state_tmp;
@@ -1026,6 +1063,13 @@ struct drbd_conf {
1026 u64 ed_uuid; /* UUID of the exposed data */ 1063 u64 ed_uuid; /* UUID of the exposed data */
1027 struct mutex state_mutex; 1064 struct mutex state_mutex;
1028 char congestion_reason; /* Why we where congested... */ 1065 char congestion_reason; /* Why we where congested... */
1066 struct list_head delay_probes; /* protected by peer_seq_lock */
1067 int data_delay; /* Delay of packets on the data-sock behind meta-sock */
1068 unsigned int delay_seq; /* To generate sequence numbers of delay probes */
1069 struct timeval dps_time; /* delay-probes-start-time */
1070 unsigned int dp_volume_last; /* send_cnt of last delay probe */
1071 int c_sync_rate; /* current resync rate after delay_probe magic */
1072 atomic_t new_c_uuid;
1029}; 1073};
1030 1074
1031static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1075static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1081,6 +1125,11 @@ enum chg_state_flags {
1081 CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, 1125 CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
1082}; 1126};
1083 1127
1128enum dds_flags {
1129 DDSF_FORCED = 1,
1130 DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
1131};
1132
1084extern void drbd_init_set_defaults(struct drbd_conf *mdev); 1133extern void drbd_init_set_defaults(struct drbd_conf *mdev);
1085extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, 1134extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
1086 union drbd_state mask, union drbd_state val); 1135 union drbd_state mask, union drbd_state val);
@@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev);
1113extern int drbd_send_uuids(struct drbd_conf *mdev); 1162extern int drbd_send_uuids(struct drbd_conf *mdev);
1114extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); 1163extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
1115extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); 1164extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
1116extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); 1165extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
1117extern int _drbd_send_state(struct drbd_conf *mdev); 1166extern int _drbd_send_state(struct drbd_conf *mdev);
1118extern int drbd_send_state(struct drbd_conf *mdev); 1167extern int drbd_send_state(struct drbd_conf *mdev);
1119extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, 1168extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
@@ -1311,7 +1360,7 @@ struct bm_extent {
1311#define APP_R_HSIZE 15 1360#define APP_R_HSIZE 15
1312 1361
1313extern int drbd_bm_init(struct drbd_conf *mdev); 1362extern int drbd_bm_init(struct drbd_conf *mdev);
1314extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); 1363extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
1315extern void drbd_bm_cleanup(struct drbd_conf *mdev); 1364extern void drbd_bm_cleanup(struct drbd_conf *mdev);
1316extern void drbd_bm_set_all(struct drbd_conf *mdev); 1365extern void drbd_bm_set_all(struct drbd_conf *mdev);
1317extern void drbd_bm_clear_all(struct drbd_conf *mdev); 1366extern void drbd_bm_clear_all(struct drbd_conf *mdev);
@@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
1383extern char *ppsize(char *buf, unsigned long long size); 1432extern char *ppsize(char *buf, unsigned long long size);
1384extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); 1433extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
1385enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1434enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1386extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); 1435extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1387extern void resync_after_online_grow(struct drbd_conf *); 1436extern void resync_after_online_grow(struct drbd_conf *);
1388extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1437extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
1389extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, 1438extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
@@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
1414} 1463}
1415 1464
1416 1465
1417extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); 1466extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
1467extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
1418/* worker callbacks */ 1468/* worker callbacks */
1419extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); 1469extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
1420extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); 1470extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
@@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
1438extern void resync_timer_fn(unsigned long data); 1488extern void resync_timer_fn(unsigned long data);
1439 1489
1440/* drbd_receiver.c */ 1490/* drbd_receiver.c */
1491extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1492 const unsigned rw, const int fault_type);
1441extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); 1493extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
1442extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, 1494extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
1443 u64 id, 1495 u64 id,
@@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
1593 * inline helper functions 1645 * inline helper functions
1594 *************************/ 1646 *************************/
1595 1647
1648/* see also page_chain_add and friends in drbd_receiver.c */
1649static inline struct page *page_chain_next(struct page *page)
1650{
1651 return (struct page *)page_private(page);
1652}
1653#define page_chain_for_each(page) \
1654 for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
1655 page = page_chain_next(page))
1656#define page_chain_for_each_safe(page, n) \
1657 for (; page && ({ n = page_chain_next(page); 1; }); page = n)
1658
1659static inline int drbd_bio_has_active_page(struct bio *bio)
1660{
1661 struct bio_vec *bvec;
1662 int i;
1663
1664 __bio_for_each_segment(bvec, bio, i, 0) {
1665 if (page_count(bvec->bv_page) > 1)
1666 return 1;
1667 }
1668
1669 return 0;
1670}
1671
1672static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
1673{
1674 struct page *page = e->pages;
1675 page_chain_for_each(page) {
1676 if (page_count(page) > 1)
1677 return 1;
1678 }
1679 return 0;
1680}
1681
1682
1596static inline void drbd_state_lock(struct drbd_conf *mdev) 1683static inline void drbd_state_lock(struct drbd_conf *mdev)
1597{ 1684{
1598 wait_event(mdev->misc_wait, 1685 wait_event(mdev->misc_wait,
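Aside: page_chain_next() and the page_chain_for_each() macros above thread pages into a singly linked list through page->private instead of carrying them in a bio. A userspace analogue of that intrusive chain, with a fake_page struct standing in for struct page; none of these names exist in the kernel.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_page {
    uintptr_t private;    /* stores the next pointer, like page->private */
    int id;
};

static struct fake_page *chain_next(struct fake_page *p)
{
    return (struct fake_page *)p->private;
}

#define chain_for_each(p)  for (; (p); (p) = chain_next(p))

int main(void)
{
    struct fake_page *head = NULL, *p;

    for (int i = 0; i < 3; i++) {            /* push three "pages" */
        p = malloc(sizeof(*p));
        p->id = i;
        p->private = (uintptr_t)head;        /* link to previous head */
        head = p;
    }

    p = head;
    chain_for_each(p)
        printf("page %d\n", p->id);          /* prints 2, 1, 0 */

    while (head) {                           /* free the chain */
        p = chain_next(head);
        free(head);
        head = p;
    }
    return 0;
}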
@@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
2132 return 0; 2219 return 0;
2133 if (test_bit(BITMAP_IO, &mdev->flags)) 2220 if (test_bit(BITMAP_IO, &mdev->flags))
2134 return 0; 2221 return 0;
2222 if (atomic_read(&mdev->new_c_uuid))
2223 return 0;
2135 return 1; 2224 return 1;
2136} 2225}
2137 2226
2138/* I'd like to use wait_event_lock_irq, 2227/* I'd like to use wait_event_lock_irq,
2139 * but I'm not sure when it got introduced, 2228 * but I'm not sure when it got introduced,
2140 * and not sure when it has 3 or 4 arguments */ 2229 * and not sure when it has 3 or 4 arguments */
2141static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) 2230static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
2142{ 2231{
2143 /* compare with after_state_ch, 2232 /* compare with after_state_ch,
2144 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ 2233 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
@@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2152 * to avoid races with the reconnect code, 2241 * to avoid races with the reconnect code,
2153 * we need to atomic_inc within the spinlock. */ 2242 * we need to atomic_inc within the spinlock. */
2154 2243
2244 if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
2245 drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
2246
2155 spin_lock_irq(&mdev->req_lock); 2247 spin_lock_irq(&mdev->req_lock);
2156 while (!__inc_ap_bio_cond(mdev)) { 2248 while (!__inc_ap_bio_cond(mdev)) {
2157 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 2249 prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
@@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
2160 finish_wait(&mdev->misc_wait, &wait); 2252 finish_wait(&mdev->misc_wait, &wait);
2161 spin_lock_irq(&mdev->req_lock); 2253 spin_lock_irq(&mdev->req_lock);
2162 } 2254 }
2163 atomic_add(one_or_two, &mdev->ap_bio_cnt); 2255 atomic_add(count, &mdev->ap_bio_cnt);
2164 spin_unlock_irq(&mdev->req_lock); 2256 spin_unlock_irq(&mdev->req_lock);
2165} 2257}
2166 2258
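Aside: new_c_uuid is set to 2 when a new current UUID is wanted; the first inc_ap_bio() caller that sees it decrements it to 1 via atomic_add_unless() and queues uuid_work, later callers fail the add_unless and do nothing, and the worker's final atomic_dec() brings it back to 0. A hedged C11 sketch of that "decrement unless already 1" gate; add_unless below is a userspace stand-in, not the kernel primitive.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Add a to *v unless *v == u; return true if the add happened
 * (same contract as the kernel's atomic_add_unless()). */
static bool add_unless(atomic_int *v, int a, int u)
{
    int c = atomic_load(v);
    while (c != u)
        if (atomic_compare_exchange_weak(v, &c, c + a))
            return true;
    return false;
}

int main(void)
{
    atomic_int new_c_uuid = 2;               /* "new UUID wanted" */

    for (int i = 0; i < 3; i++) {
        if (atomic_load(&new_c_uuid) && add_unless(&new_c_uuid, -1, 1))
            printf("caller %d queues the uuid work\n", i);   /* only i == 0 */
        else
            printf("caller %d does nothing\n", i);
    }
    atomic_fetch_sub(&new_c_uuid, 1);        /* worker finished: 1 -> 0 */
    printf("counter now %d\n", atomic_load(&new_c_uuid));
    return 0;
}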
@@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2251 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2343 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2252 return; 2344 return;
2253 2345
2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); 2346 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
2347 BLKDEV_IFL_WAIT);
2255 if (r) { 2348 if (r) {
2256 set_bit(MD_NO_BARRIER, &mdev->flags); 2349 set_bit(MD_NO_BARRIER, &mdev->flags);
2257 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2350 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 93d1f9b469d4..be2d2da9cdba 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) 684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK; 685 rv = SS_NO_REMOTE_DISK;
686 686
687 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688 rv = SS_NO_UP_TO_DATE_DISK;
689
687 else if ((ns.conn == C_CONNECTED || 690 else if ((ns.conn == C_CONNECTED ||
688 ns.conn == C_WF_BITMAP_S || 691 ns.conn == C_WF_BITMAP_S ||
689 ns.conn == C_SYNC_SOURCE || 692 ns.conn == C_SYNC_SOURCE ||
@@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
840 break; 843 break;
841 case C_WF_BITMAP_S: 844 case C_WF_BITMAP_S:
842 case C_PAUSED_SYNC_S: 845 case C_PAUSED_SYNC_S:
843 ns.pdsk = D_OUTDATED; 846 /* remap any consistent state to D_OUTDATED,
847 * but disallow "upgrade" of not even consistent states.
848 */
849 ns.pdsk =
850 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851 ? os.pdsk : D_OUTDATED;
844 break; 852 break;
845 case C_SYNC_SOURCE: 853 case C_SYNC_SOURCE:
846 ns.pdsk = D_INCONSISTENT; 854 ns.pdsk = D_INCONSISTENT;
@@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1205 && (ns.pdsk < D_INCONSISTENT || 1213 && (ns.pdsk < D_INCONSISTENT ||
1206 ns.pdsk == D_UNKNOWN || 1214 ns.pdsk == D_UNKNOWN ||
1207 ns.pdsk == D_OUTDATED)) { 1215 ns.pdsk == D_OUTDATED)) {
1208 kfree(mdev->p_uuid);
1209 mdev->p_uuid = NULL;
1210 if (get_ldev(mdev)) { 1216 if (get_ldev(mdev)) {
1211 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1217 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1212 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1218 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
1213 drbd_uuid_new_current(mdev); 1219 !atomic_read(&mdev->new_c_uuid))
1214 drbd_send_uuids(mdev); 1220 atomic_set(&mdev->new_c_uuid, 2);
1215 }
1216 put_ldev(mdev); 1221 put_ldev(mdev);
1217 } 1222 }
1218 } 1223 }
1219 1224
1220 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1225 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1221 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1226 /* Diskless peer becomes primary or got connected do diskless, primary peer. */
1222 drbd_uuid_new_current(mdev); 1227 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
1228 !atomic_read(&mdev->new_c_uuid))
1229 atomic_set(&mdev->new_c_uuid, 2);
1223 1230
1224 /* D_DISKLESS Peer becomes secondary */ 1231 /* D_DISKLESS Peer becomes secondary */
1225 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) 1232 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1232 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1239 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1233 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ 1240 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1234 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ 1241 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
1235 drbd_send_sizes(mdev, 0); /* to start sync... */ 1242 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1236 drbd_send_uuids(mdev); 1243 drbd_send_uuids(mdev);
1237 drbd_send_state(mdev); 1244 drbd_send_state(mdev);
1238 } 1245 }
@@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1343 drbd_md_sync(mdev); 1350 drbd_md_sync(mdev);
1344} 1351}
1345 1352
1353static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1354{
1355 if (get_ldev(mdev)) {
1356 if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1357 drbd_uuid_new_current(mdev);
1358 if (get_net_conf(mdev)) {
1359 drbd_send_uuids(mdev);
1360 put_net_conf(mdev);
1361 }
1362 drbd_md_sync(mdev);
1363 }
1364 put_ldev(mdev);
1365 }
1366 atomic_dec(&mdev->new_c_uuid);
1367 wake_up(&mdev->misc_wait);
1368
1369 return 1;
1370}
1346 1371
1347static int drbd_thread_setup(void *arg) 1372static int drbd_thread_setup(void *arg)
1348{ 1373{
@@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1755 (struct p_header *)&p, sizeof(p)); 1780 (struct p_header *)&p, sizeof(p));
1756} 1781}
1757 1782
1758int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) 1783int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1759{ 1784{
1760 struct p_sizes p; 1785 struct p_sizes p;
1761 sector_t d_size, u_size; 1786 sector_t d_size, u_size;
@@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1767 d_size = drbd_get_max_capacity(mdev->ldev); 1792 d_size = drbd_get_max_capacity(mdev->ldev);
1768 u_size = mdev->ldev->dc.disk_size; 1793 u_size = mdev->ldev->dc.disk_size;
1769 q_order_type = drbd_queue_order_type(mdev); 1794 q_order_type = drbd_queue_order_type(mdev);
1770 p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
1771 put_ldev(mdev); 1795 put_ldev(mdev);
1772 } else { 1796 } else {
1773 d_size = 0; 1797 d_size = 0;
@@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
1779 p.u_size = cpu_to_be64(u_size); 1803 p.u_size = cpu_to_be64(u_size);
1780 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 1804 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1781 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); 1805 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1782 p.queue_order_type = cpu_to_be32(q_order_type); 1806 p.queue_order_type = cpu_to_be16(q_order_type);
1807 p.dds_flags = cpu_to_be16(flags);
1783 1808
1784 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, 1809 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1785 (struct p_header *)&p, sizeof(p)); 1810 (struct p_header *)&p, sizeof(p));
@@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2180 return ok; 2205 return ok;
2181} 2206}
2182 2207
2208static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
2209{
2210 struct p_delay_probe dp;
2211 int offset, ok = 0;
2212 struct timeval now;
2213
2214 mutex_lock(&ds->mutex);
2215 if (likely(ds->socket)) {
2216 do_gettimeofday(&now);
2217 offset = now.tv_usec - mdev->dps_time.tv_usec +
2218 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
2219 dp.seq_num = cpu_to_be32(mdev->delay_seq);
2220 dp.offset = cpu_to_be32(offset);
2221
2222 ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
2223 (struct p_header *)&dp, sizeof(dp), 0);
2224 }
2225 mutex_unlock(&ds->mutex);
2226
2227 return ok;
2228}
2229
2230static int drbd_send_delay_probes(struct drbd_conf *mdev)
2231{
2232 int ok;
2233
2234 mdev->delay_seq++;
2235 do_gettimeofday(&mdev->dps_time);
2236 ok = drbd_send_delay_probe(mdev, &mdev->meta);
2237 ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
2238
2239 mdev->dp_volume_last = mdev->send_cnt;
2240 mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
2241
2242 return ok;
2243}
2244
2183/* called on sndtimeo 2245/* called on sndtimeo
2184 * returns FALSE if we should retry, 2246 * returns FALSE if we should retry,
2185 * TRUE if we think connection is dead 2247 * TRUE if we think connection is dead
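Aside: drbd_send_delay_probes() records a reference time in dps_time, and each probe then carries the microseconds elapsed since that reference; comparing the offsets of the matching probes received on the data and meta sockets lets the peer estimate data_delay. A small sketch of just the offset arithmetic, using plain gettimeofday and no DRBD types.

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

/* Microseconds elapsed since a reference timeval, computed the same way
 * as in drbd_send_delay_probe(): usec delta plus sec delta * 1e6. */
static long usec_offset(const struct timeval *ref, const struct timeval *now)
{
    return (now->tv_usec - ref->tv_usec) +
           (now->tv_sec - ref->tv_sec) * 1000000L;
}

int main(void)
{
    struct timeval dps_time, now;

    gettimeofday(&dps_time, NULL);   /* reference: "delay probes start" time */
    usleep(2500);
    gettimeofday(&now, NULL);
    printf("probe offset: %ld usec\n", usec_offset(&dps_time, &now));
    return 0;
}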
@@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2309 return 1; 2371 return 1;
2310} 2372}
2311 2373
2374static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2375{
2376 struct page *page = e->pages;
2377 unsigned len = e->size;
2378 page_chain_for_each(page) {
2379 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2380 if (!_drbd_send_page(mdev, page, 0, l))
2381 return 0;
2382 len -= l;
2383 }
2384 return 1;
2385}
2386
2387static void consider_delay_probes(struct drbd_conf *mdev)
2388{
2389 if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
2390 return;
2391
2392 if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
2393 drbd_send_delay_probes(mdev);
2394}
2395
2396static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
2397{
2398 if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
2399 drbd_send_delay_probes(mdev);
2400
2401 return 1;
2402}
2403
2404static void delay_probe_timer_fn(unsigned long data)
2405{
2406 struct drbd_conf *mdev = (struct drbd_conf *) data;
2407
2408 if (list_empty(&mdev->delay_probe_work.list))
2409 drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
2410}
2411
2312/* Used to send write requests 2412/* Used to send write requests
2313 * R_PRIMARY -> Peer (P_DATA) 2413 * R_PRIMARY -> Peer (P_DATA)
2314 */ 2414 */
@@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2360 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); 2460 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2361 if (ok && dgs) { 2461 if (ok && dgs) {
2362 dgb = mdev->int_dig_out; 2462 dgb = mdev->int_dig_out;
2363 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2463 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2364 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2464 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2365 } 2465 }
2366 if (ok) { 2466 if (ok) {
@@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2371 } 2471 }
2372 2472
2373 drbd_put_data_sock(mdev); 2473 drbd_put_data_sock(mdev);
2474
2475 if (ok)
2476 consider_delay_probes(mdev);
2477
2374 return ok; 2478 return ok;
2375} 2479}
2376 2480
@@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2409 sizeof(p), MSG_MORE); 2513 sizeof(p), MSG_MORE);
2410 if (ok && dgs) { 2514 if (ok && dgs) {
2411 dgb = mdev->int_dig_out; 2515 dgb = mdev->int_dig_out;
2412 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); 2516 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2413 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2517 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2414 } 2518 }
2415 if (ok) 2519 if (ok)
2416 ok = _drbd_send_zc_bio(mdev, e->private_bio); 2520 ok = _drbd_send_zc_ee(mdev, e);
2417 2521
2418 drbd_put_data_sock(mdev); 2522 drbd_put_data_sock(mdev);
2523
2524 if (ok)
2525 consider_delay_probes(mdev);
2526
2419 return ok; 2527 return ok;
2420} 2528}
2421 2529
@@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2600 atomic_set(&mdev->net_cnt, 0); 2708 atomic_set(&mdev->net_cnt, 0);
2601 atomic_set(&mdev->packet_seq, 0); 2709 atomic_set(&mdev->packet_seq, 0);
2602 atomic_set(&mdev->pp_in_use, 0); 2710 atomic_set(&mdev->pp_in_use, 0);
2711 atomic_set(&mdev->new_c_uuid, 0);
2603 2712
2604 mutex_init(&mdev->md_io_mutex); 2713 mutex_init(&mdev->md_io_mutex);
2605 mutex_init(&mdev->data.mutex); 2714 mutex_init(&mdev->data.mutex);
@@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
2628 INIT_LIST_HEAD(&mdev->unplug_work.list); 2737 INIT_LIST_HEAD(&mdev->unplug_work.list);
2629 INIT_LIST_HEAD(&mdev->md_sync_work.list); 2738 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2630 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 2739 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2740 INIT_LIST_HEAD(&mdev->delay_probes);
2741 INIT_LIST_HEAD(&mdev->delay_probe_work.list);
2742 INIT_LIST_HEAD(&mdev->uuid_work.list);
2743
2631 mdev->resync_work.cb = w_resync_inactive; 2744 mdev->resync_work.cb = w_resync_inactive;
2632 mdev->unplug_work.cb = w_send_write_hint; 2745 mdev->unplug_work.cb = w_send_write_hint;
2633 mdev->md_sync_work.cb = w_md_sync; 2746 mdev->md_sync_work.cb = w_md_sync;
2634 mdev->bm_io_work.w.cb = w_bitmap_io; 2747 mdev->bm_io_work.w.cb = w_bitmap_io;
2748 mdev->delay_probe_work.cb = w_delay_probes;
2749 mdev->uuid_work.cb = w_new_current_uuid;
2635 init_timer(&mdev->resync_timer); 2750 init_timer(&mdev->resync_timer);
2636 init_timer(&mdev->md_sync_timer); 2751 init_timer(&mdev->md_sync_timer);
2752 init_timer(&mdev->delay_probe_timer);
2637 mdev->resync_timer.function = resync_timer_fn; 2753 mdev->resync_timer.function = resync_timer_fn;
2638 mdev->resync_timer.data = (unsigned long) mdev; 2754 mdev->resync_timer.data = (unsigned long) mdev;
2639 mdev->md_sync_timer.function = md_sync_timer_fn; 2755 mdev->md_sync_timer.function = md_sync_timer_fn;
2640 mdev->md_sync_timer.data = (unsigned long) mdev; 2756 mdev->md_sync_timer.data = (unsigned long) mdev;
2757 mdev->delay_probe_timer.function = delay_probe_timer_fn;
2758 mdev->delay_probe_timer.data = (unsigned long) mdev;
2759
2641 2760
2642 init_waitqueue_head(&mdev->misc_wait); 2761 init_waitqueue_head(&mdev->misc_wait);
2643 init_waitqueue_head(&mdev->state_wait); 2762 init_waitqueue_head(&mdev->state_wait);
@@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
2680 drbd_set_my_capacity(mdev, 0); 2799 drbd_set_my_capacity(mdev, 0);
2681 if (mdev->bitmap) { 2800 if (mdev->bitmap) {
2682 /* maybe never allocated. */ 2801 /* maybe never allocated. */
2683 drbd_bm_resize(mdev, 0); 2802 drbd_bm_resize(mdev, 0, 1);
2684 drbd_bm_cleanup(mdev); 2803 drbd_bm_cleanup(mdev);
2685 } 2804 }
2686 2805
@@ -3129,7 +3248,7 @@ int __init drbd_init(void)
3129 if (err) 3248 if (err)
3130 goto Enomem; 3249 goto Enomem;
3131 3250
3132 drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); 3251 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3133 if (!drbd_proc) { 3252 if (!drbd_proc) {
3134 printk(KERN_ERR "drbd: unable to register proc file\n"); 3253 printk(KERN_ERR "drbd: unable to register proc file\n");
3135 goto Enomem; 3254 goto Enomem;
@@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) {
3660 [DRBD_FAULT_DT_RD] = "Data read", 3779 [DRBD_FAULT_DT_RD] = "Data read",
3661 [DRBD_FAULT_DT_RA] = "Data read ahead", 3780 [DRBD_FAULT_DT_RA] = "Data read ahead",
3662 [DRBD_FAULT_BM_ALLOC] = "BM allocation", 3781 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3663 [DRBD_FAULT_AL_EE] = "EE allocation" 3782 [DRBD_FAULT_AL_EE] = "EE allocation",
3783 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3664 }; 3784 };
3665 3785
3666 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; 3786 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6429d2b19e06..632e3245d1bb 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
510 * Returns 0 on success, negative return values indicate errors. 510 * Returns 0 on success, negative return values indicate errors.
511 * You should call drbd_md_sync() after calling this function. 511 * You should call drbd_md_sync() after calling this function.
512 */ 512 */
513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) 513enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
514{ 514{
515 sector_t prev_first_sect, prev_size; /* previous meta location */ 515 sector_t prev_first_sect, prev_size; /* previous meta location */
516 sector_t la_size; 516 sector_t la_size;
@@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force
541 /* TODO: should only be some assert here, not (re)init... */ 541 /* TODO: should only be some assert here, not (re)init... */
542 drbd_md_set_sector_offsets(mdev, mdev->ldev); 542 drbd_md_set_sector_offsets(mdev, mdev->ldev);
543 543
544 size = drbd_new_dev_size(mdev, mdev->ldev, force); 544 size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
545 545
546 if (drbd_get_capacity(mdev->this_bdev) != size || 546 if (drbd_get_capacity(mdev->this_bdev) != size ||
547 drbd_bm_capacity(mdev) != size) { 547 drbd_bm_capacity(mdev) != size) {
548 int err; 548 int err;
549 err = drbd_bm_resize(mdev, size); 549 err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
550 if (unlikely(err)) { 550 if (unlikely(err)) {
551 /* currently there is only one error: ENOMEM! */ 551 /* currently there is only one error: ENOMEM! */
552 size = drbd_bm_capacity(mdev)>>1; 552 size = drbd_bm_capacity(mdev)>>1;
@@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 704 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
705 int max_segments = mdev->ldev->dc.max_bio_bvecs; 705 int max_segments = mdev->ldev->dc.max_bio_bvecs;
706 706
707 if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
708 max_seg_s = PAGE_SIZE;
709
710 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); 707 max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
711 708
712 blk_queue_max_hw_sectors(q, max_seg_s >> 9); 709 blk_queue_max_hw_sectors(q, max_seg_s >> 9);
@@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1199 } 1196 }
1200 1197
1201 /* allocation not in the IO path, cqueue thread context */ 1198 /* allocation not in the IO path, cqueue thread context */
1202 new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); 1199 new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
1203 if (!new_conf) { 1200 if (!new_conf) {
1204 retcode = ERR_NOMEM; 1201 retcode = ERR_NOMEM;
1205 goto fail; 1202 goto fail;
1206 } 1203 }
1207 1204
1208 memset(new_conf, 0, sizeof(struct net_conf));
1209 new_conf->timeout = DRBD_TIMEOUT_DEF; 1205 new_conf->timeout = DRBD_TIMEOUT_DEF;
1210 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; 1206 new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
1211 new_conf->ping_int = DRBD_PING_INT_DEF; 1207 new_conf->ping_int = DRBD_PING_INT_DEF;
@@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1477{ 1473{
1478 struct resize rs; 1474 struct resize rs;
1479 int retcode = NO_ERROR; 1475 int retcode = NO_ERROR;
1480 int ldsc = 0; /* local disk size changed */
1481 enum determine_dev_size dd; 1476 enum determine_dev_size dd;
1477 enum dds_flags ddsf;
1482 1478
1483 memset(&rs, 0, sizeof(struct resize)); 1479 memset(&rs, 0, sizeof(struct resize));
1484 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { 1480 if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
@@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1502 goto fail; 1498 goto fail;
1503 } 1499 }
1504 1500
1505 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 1501 if (rs.no_resync && mdev->agreed_pro_version < 93) {
1506 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 1502 retcode = ERR_NEED_APV_93;
1507 ldsc = 1; 1503 goto fail;
1508 } 1504 }
1509 1505
1506 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
1507 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
1508
1510 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; 1509 mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
1511 dd = drbd_determin_dev_size(mdev, rs.resize_force); 1510 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
1511 dd = drbd_determin_dev_size(mdev, ddsf);
1512 drbd_md_sync(mdev); 1512 drbd_md_sync(mdev);
1513 put_ldev(mdev); 1513 put_ldev(mdev);
1514 if (dd == dev_size_error) { 1514 if (dd == dev_size_error) {
@@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
1516 goto fail; 1516 goto fail;
1517 } 1517 }
1518 1518
1519 if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { 1519 if (mdev->state.conn == C_CONNECTED) {
1520 if (dd == grew) 1520 if (dd == grew)
1521 set_bit(RESIZE_PENDING, &mdev->flags); 1521 set_bit(RESIZE_PENDING, &mdev->flags);
1522 1522
1523 drbd_send_uuids(mdev); 1523 drbd_send_uuids(mdev);
1524 drbd_send_sizes(mdev, 1); 1524 drbd_send_sizes(mdev, 1, ddsf);
1525 } 1525 }
1526 1526
1527 fail: 1527 fail:
@@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
1551 sc.rate = DRBD_RATE_DEF; 1551 sc.rate = DRBD_RATE_DEF;
1552 sc.after = DRBD_AFTER_DEF; 1552 sc.after = DRBD_AFTER_DEF;
1553 sc.al_extents = DRBD_AL_EXTENTS_DEF; 1553 sc.al_extents = DRBD_AL_EXTENTS_DEF;
1554 sc.dp_volume = DRBD_DP_VOLUME_DEF;
1555 sc.dp_interval = DRBD_DP_INTERVAL_DEF;
1556 sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
1557 sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
1554 } else 1558 } else
1555 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); 1559 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
1556 1560
@@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2207{ 2211{
2208 struct cn_msg *cn_reply; 2212 struct cn_msg *cn_reply;
2209 struct drbd_nl_cfg_reply *reply; 2213 struct drbd_nl_cfg_reply *reply;
2210 struct bio_vec *bvec;
2211 unsigned short *tl; 2214 unsigned short *tl;
2212 int i; 2215 struct page *page;
2216 unsigned len;
2213 2217
2214 if (!e) 2218 if (!e)
2215 return; 2219 return;
@@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
2247 put_unaligned(T_ee_data, tl++); 2251 put_unaligned(T_ee_data, tl++);
2248 put_unaligned(e->size, tl++); 2252 put_unaligned(e->size, tl++);
2249 2253
2250 __bio_for_each_segment(bvec, e->private_bio, i, 0) { 2254 len = e->size;
2251 void *d = kmap(bvec->bv_page); 2255 page = e->pages;
2252 memcpy(tl, d + bvec->bv_offset, bvec->bv_len); 2256 page_chain_for_each(page) {
2253 kunmap(bvec->bv_page); 2257 void *d = kmap_atomic(page, KM_USER0);
2254 tl=(unsigned short*)((char*)tl + bvec->bv_len); 2258 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2259 memcpy(tl, d, l);
2260 kunmap_atomic(d, KM_USER0);
2261 tl = (unsigned short*)((char*)tl + l);
2262 len -= l;
2255 } 2263 }
2256 put_unaligned(TT_END, tl++); /* Close the tag list */ 2264 put_unaligned(TT_END, tl++); /* Close the tag list */
2257 2265
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..d0f1767ea4c3 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); 73 seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
74 /* if more than 1 GB display in MB */ 74 /* if more than 1 GB display in MB */
75 if (mdev->rs_total > 0x100000L) 75 if (mdev->rs_total > 0x100000L)
76 seq_printf(seq, "(%lu/%lu)M\n\t", 76 seq_printf(seq, "(%lu/%lu)M",
77 (unsigned long) Bit2KB(rs_left >> 10), 77 (unsigned long) Bit2KB(rs_left >> 10),
78 (unsigned long) Bit2KB(mdev->rs_total >> 10)); 78 (unsigned long) Bit2KB(mdev->rs_total >> 10));
79 else 79 else
80 seq_printf(seq, "(%lu/%lu)K\n\t", 80 seq_printf(seq, "(%lu/%lu)K",
81 (unsigned long) Bit2KB(rs_left), 81 (unsigned long) Bit2KB(rs_left),
82 (unsigned long) Bit2KB(mdev->rs_total)); 82 (unsigned long) Bit2KB(mdev->rs_total));
83 83
84 if (mdev->state.conn == C_SYNC_TARGET)
85 seq_printf(seq, " queue_delay: %d.%d ms\n\t",
86 mdev->data_delay / 1000,
87 (mdev->data_delay % 1000) / 100);
88 else if (mdev->state.conn == C_SYNC_SOURCE)
89 seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
90
84 /* see drivers/md/md.c 91 /* see drivers/md/md.c
85 * We do not want to overflow, so the order of operands and 92 * We do not want to overflow, so the order of operands and
86 * the * 100 / 100 trick are important. We do a +1 to be 93 * the * 100 / 100 trick are important. We do a +1 to be
@@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
128 else 135 else
129 seq_printf(seq, " (%ld)", dbdt); 136 seq_printf(seq, " (%ld)", dbdt);
130 137
138 if (mdev->state.conn == C_SYNC_TARGET) {
139 if (mdev->c_sync_rate > 1000)
140 seq_printf(seq, " want: %d,%03d",
141 mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
142 else
143 seq_printf(seq, " want: %d", mdev->c_sync_rate);
144 }
145
131 seq_printf(seq, " K/sec\n"); 146 seq_printf(seq, " K/sec\n");
132} 147}
133 148
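Aside: the new /proc/drbd fields are plain fixed-point formatting: c_sync_rate is K/sec printed as thousands plus a zero-padded remainder, and data_delay is microseconds printed as milliseconds with one decimal. A tiny sketch of that formatting with hypothetical values.

#include <stdio.h>

int main(void)
{
    int c_sync_rate = 25376;   /* K/sec, hypothetical value */
    int data_delay  = 1843;    /* usec,  hypothetical value */

    /* " want: %d,%03d" as in the patch: 25376 -> "25,376" K/sec */
    if (c_sync_rate > 1000)
        printf(" want: %d,%03d K/sec\n",
               c_sync_rate / 1000, c_sync_rate % 1000);
    else
        printf(" want: %d K/sec\n", c_sync_rate);

    /* " queue_delay: %d.%d ms": 1843 usec -> "1.8 ms" */
    printf(" queue_delay: %d.%d ms\n",
           data_delay / 1000, (data_delay % 1000) / 100);
    return 0;
}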
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3f096e7959b4..bc9ab7fb2cc7 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo
80 80
81#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 81#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
82 82
83static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) 83/*
84 * some helper functions to deal with single linked page lists,
85 * page->private being our "next" pointer.
86 */
87
88/* If at least n pages are linked at head, get n pages off.
89 * Otherwise, don't modify head, and return NULL.
90 * Locking is the responsibility of the caller.
91 */
92static struct page *page_chain_del(struct page **head, int n)
93{
94 struct page *page;
95 struct page *tmp;
96
97 BUG_ON(!n);
98 BUG_ON(!head);
99
100 page = *head;
101
102 if (!page)
103 return NULL;
104
105 while (page) {
106 tmp = page_chain_next(page);
107 if (--n == 0)
108 break; /* found sufficient pages */
109 if (tmp == NULL)
110 /* insufficient pages, don't use any of them. */
111 return NULL;
112 page = tmp;
113 }
114
115 /* add end of list marker for the returned list */
116 set_page_private(page, 0);
117 /* actual return value, and adjustment of head */
118 page = *head;
119 *head = tmp;
120 return page;
121}
122
123/* may be used outside of locks to find the tail of a (usually short)
124 * "private" page chain, before adding it back to a global chain head
125 * with page_chain_add() under a spinlock. */
126static struct page *page_chain_tail(struct page *page, int *len)
127{
128 struct page *tmp;
129 int i = 1;
130 while ((tmp = page_chain_next(page)))
131 ++i, page = tmp;
132 if (len)
133 *len = i;
134 return page;
135}
136
137static int page_chain_free(struct page *page)
138{
139 struct page *tmp;
140 int i = 0;
141 page_chain_for_each_safe(page, tmp) {
142 put_page(page);
143 ++i;
144 }
145 return i;
146}
147
148static void page_chain_add(struct page **head,
149 struct page *chain_first, struct page *chain_last)
150{
151#if 1
152 struct page *tmp;
153 tmp = page_chain_tail(chain_first, NULL);
154 BUG_ON(tmp != chain_last);
155#endif
156
157 /* add chain to head */
158 set_page_private(chain_last, (unsigned long)*head);
159 *head = chain_first;
160}
161
162static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
84{ 163{
85 struct page *page = NULL; 164 struct page *page = NULL;
165 struct page *tmp = NULL;
166 int i = 0;
86 167
87 /* Yes, testing drbd_pp_vacant outside the lock is racy. 168 /* Yes, testing drbd_pp_vacant outside the lock is racy.
88 * So what. It saves a spin_lock. */ 169 * So what. It saves a spin_lock. */
89 if (drbd_pp_vacant > 0) { 170 if (drbd_pp_vacant >= number) {
90 spin_lock(&drbd_pp_lock); 171 spin_lock(&drbd_pp_lock);
91 page = drbd_pp_pool; 172 page = page_chain_del(&drbd_pp_pool, number);
92 if (page) { 173 if (page)
93 drbd_pp_pool = (struct page *)page_private(page); 174 drbd_pp_vacant -= number;
94 set_page_private(page, 0); /* just to be polite */
95 drbd_pp_vacant--;
96 }
97 spin_unlock(&drbd_pp_lock); 175 spin_unlock(&drbd_pp_lock);
176 if (page)
177 return page;
98 } 178 }
179
99 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD 180 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
100 * "criss-cross" setup, that might cause write-out on some other DRBD, 181 * "criss-cross" setup, that might cause write-out on some other DRBD,
101 * which in turn might block on the other node at this very place. */ 182 * which in turn might block on the other node at this very place. */
102 if (!page) 183 for (i = 0; i < number; i++) {
103 page = alloc_page(GFP_TRY); 184 tmp = alloc_page(GFP_TRY);
104 if (page) 185 if (!tmp)
105 atomic_inc(&mdev->pp_in_use); 186 break;
106 return page; 187 set_page_private(tmp, (unsigned long)page);
188 page = tmp;
189 }
190
191 if (i == number)
192 return page;
193
194 /* Not enough pages immediately available this time.
195 * No need to jump around here, drbd_pp_alloc will retry this
196 * function "soon". */
197 if (page) {
198 tmp = page_chain_tail(page, NULL);
199 spin_lock(&drbd_pp_lock);
200 page_chain_add(&drbd_pp_pool, page, tmp);
201 drbd_pp_vacant += i;
202 spin_unlock(&drbd_pp_lock);
203 }
204 return NULL;
107} 205}
108 206
109/* kick lower level device, if we have more than (arbitrary number) 207/* kick lower level device, if we have more than (arbitrary number)
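Aside: page_chain_del() above is all-or-nothing: it walks up to n pages from the head and either detaches exactly n (terminating the returned chain) or leaves the pool head untouched when fewer are available. A userspace analogue with a plain linked list; struct node and chain_del are made-up names, and cleanup at exit is omitted for brevity.

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

/* Detach exactly n nodes from *head, or none at all. */
static struct node *chain_del(struct node **head, int n)
{
    struct node *page = *head, *tmp = NULL;

    if (!page)
        return NULL;
    while (page) {
        tmp = page->next;
        if (--n == 0)
            break;                /* found enough nodes */
        if (!tmp)
            return NULL;          /* not enough, leave *head alone */
        page = tmp;
    }
    page->next = NULL;            /* terminate the returned chain */
    page = *head;
    *head = tmp;
    return page;
}

int main(void)
{
    struct node *pool = NULL, *got;

    for (int i = 0; i < 3; i++) {            /* pool of 3 nodes */
        struct node *p = malloc(sizeof(*p));
        p->id = i;
        p->next = pool;
        pool = p;
    }
    printf("take 4: %s\n", chain_del(&pool, 4) ? "ok" : "refused");
    got = chain_del(&pool, 2);
    for (struct node *p = got; p; p = p->next)
        printf("got node %d\n", p->id);      /* prints 2, 1 */
    return 0;
}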
@@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
127 225
128 list_for_each_safe(le, tle, &mdev->net_ee) { 226 list_for_each_safe(le, tle, &mdev->net_ee) {
129 e = list_entry(le, struct drbd_epoch_entry, w.list); 227 e = list_entry(le, struct drbd_epoch_entry, w.list);
130 if (drbd_bio_has_active_page(e->private_bio)) 228 if (drbd_ee_has_active_page(e))
131 break; 229 break;
132 list_move(le, to_be_freed); 230 list_move(le, to_be_freed);
133 } 231 }
@@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
148} 246}
149 247
150/** 248/**
151 * drbd_pp_alloc() - Returns a page, fails only if a signal comes in 249 * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
152 * @mdev: DRBD device. 250 * @mdev: DRBD device.
153 * @retry: whether or not to retry allocation forever (or until signalled) 251 * @number: number of pages requested
252 * @retry: whether to retry, if not enough pages are available right now
253 *
254 * Tries to allocate number pages, first from our own page pool, then from
255 * the kernel, unless this allocation would exceed the max_buffers setting.
256 * Possibly retry until DRBD frees sufficient pages somewhere else.
154 * 257 *
155 * Tries to allocate a page, first from our own page pool, then from the 258 * Returns a page chain linked via page->private.
156 * kernel, unless this allocation would exceed the max_buffers setting.
157 * If @retry is non-zero, retry until DRBD frees a page somewhere else.
158 */ 259 */
159static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) 260static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
160{ 261{
161 struct page *page = NULL; 262 struct page *page = NULL;
162 DEFINE_WAIT(wait); 263 DEFINE_WAIT(wait);
163 264
164 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 265 /* Yes, we may run up to @number over max_buffers. If we
165 page = drbd_pp_first_page_or_try_alloc(mdev); 266 * follow it strictly, the admin will get it wrong anyways. */
166 if (page) 267 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
167 return page; 268 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
168 }
169 269
170 for (;;) { 270 while (page == NULL) {
171 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); 271 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
172 272
173 drbd_kick_lo_and_reclaim_net(mdev); 273 drbd_kick_lo_and_reclaim_net(mdev);
174 274
175 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { 275 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
176 page = drbd_pp_first_page_or_try_alloc(mdev); 276 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
177 if (page) 277 if (page)
178 break; 278 break;
179 } 279 }
@@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
190 } 290 }
191 finish_wait(&drbd_pp_wait, &wait); 291 finish_wait(&drbd_pp_wait, &wait);
192 292
293 if (page)
294 atomic_add(number, &mdev->pp_in_use);
193 return page; 295 return page;
194} 296}
195 297
196/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. 298/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
197 * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ 299 * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
300 * Either links the page chain back to the global pool,
301 * or returns all pages to the system. */
198static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) 302static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
199{ 303{
200 int free_it;
201
202 spin_lock(&drbd_pp_lock);
203 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
204 free_it = 1;
205 } else {
206 set_page_private(page, (unsigned long)drbd_pp_pool);
207 drbd_pp_pool = page;
208 drbd_pp_vacant++;
209 free_it = 0;
210 }
211 spin_unlock(&drbd_pp_lock);
212
213 atomic_dec(&mdev->pp_in_use);
214
215 if (free_it)
216 __free_page(page);
217
218 wake_up(&drbd_pp_wait);
219}
220
221static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
222{
223 struct page *p_to_be_freed = NULL;
224 struct page *page;
225 struct bio_vec *bvec;
226 int i; 304 int i;
227 305 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
228 spin_lock(&drbd_pp_lock); 306 i = page_chain_free(page);
229 __bio_for_each_segment(bvec, bio, i, 0) { 307 else {
230 if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { 308 struct page *tmp;
231 set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); 309 tmp = page_chain_tail(page, &i);
232 p_to_be_freed = bvec->bv_page; 310 spin_lock(&drbd_pp_lock);
233 } else { 311 page_chain_add(&drbd_pp_pool, page, tmp);
234 set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); 312 drbd_pp_vacant += i;
235 drbd_pp_pool = bvec->bv_page; 313 spin_unlock(&drbd_pp_lock);
236 drbd_pp_vacant++;
237 }
238 }
239 spin_unlock(&drbd_pp_lock);
240 atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
241
242 while (p_to_be_freed) {
243 page = p_to_be_freed;
244 p_to_be_freed = (struct page *)page_private(page);
245 set_page_private(page, 0); /* just to be polite */
246 put_page(page);
247 } 314 }
248 315 atomic_sub(i, &mdev->pp_in_use);
316 i = atomic_read(&mdev->pp_in_use);
317 if (i < 0)
318 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
249 wake_up(&drbd_pp_wait); 319 wake_up(&drbd_pp_wait);
250} 320}
251 321
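
The reworked pool above hands out whole chains of pages linked through page->private and takes them back either into the global drbd_pp_pool or straight to the system. The userspace sketch below models that idea; struct fake_page, chain_alloc(), chain_free() and POOL_MAX are made-up illustrative names and limits, not DRBD code -- only the chain-through-a-private-pointer pattern is the point.

/* Userspace model of the page-chain pool: fixed-size "pages" linked through
 * a private pointer, returned to a small global pool before falling back to
 * free(). */
#include <stdio.h>
#include <stdlib.h>

#define POOL_MAX 8

struct fake_page {
	struct fake_page *private;      /* chain link, like page->private */
	char data[4096];
};

static struct fake_page *pool;          /* singly linked free pool */
static int pool_vacant;

static void chain_free(struct fake_page *chain);

static struct fake_page *chain_alloc(unsigned number)
{
	struct fake_page *chain = NULL, *p;

	while (number--) {
		if (pool) {                     /* first try the pool */
			p = pool;
			pool = p->private;
			pool_vacant--;
		} else {                        /* then the "system" */
			p = malloc(sizeof(*p));
			if (!p) {
				chain_free(chain);
				return NULL;
			}
		}
		p->private = chain;             /* push onto the chain */
		chain = p;
	}
	return chain;
}

static void chain_free(struct fake_page *chain)
{
	while (chain) {
		struct fake_page *next = chain->private;

		if (pool_vacant < POOL_MAX) {   /* link back into the pool */
			chain->private = pool;
			pool = chain;
			pool_vacant++;
		} else {                        /* or hand it back */
			free(chain);
		}
		chain = next;
	}
}

int main(void)
{
	struct fake_page *c = chain_alloc(3);
	struct fake_page *p;
	unsigned n = 0;

	for (p = c; p; p = p->private)
		n++;
	printf("allocated a chain of %u pages\n", n);
	chain_free(c);
	printf("pool now holds %d vacant pages\n", pool_vacant);
	return 0;
}
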
@@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
270 unsigned int data_size, 340 unsigned int data_size,
271 gfp_t gfp_mask) __must_hold(local) 341 gfp_t gfp_mask) __must_hold(local)
272{ 342{
273 struct request_queue *q;
274 struct drbd_epoch_entry *e; 343 struct drbd_epoch_entry *e;
275 struct page *page; 344 struct page *page;
276 struct bio *bio; 345 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
277 unsigned int ds;
278 346
279 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) 347 if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
280 return NULL; 348 return NULL;
@@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
286 return NULL; 354 return NULL;
287 } 355 }
288 356
289 bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); 357 page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
290 if (!bio) { 358 if (!page)
291 if (!(gfp_mask & __GFP_NOWARN)) 359 goto fail;
292 dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
293 goto fail1;
294 }
295
296 bio->bi_bdev = mdev->ldev->backing_bdev;
297 bio->bi_sector = sector;
298
299 ds = data_size;
300 while (ds) {
301 page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
302 if (!page) {
303 if (!(gfp_mask & __GFP_NOWARN))
304 dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
305 goto fail2;
306 }
307 if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
308 drbd_pp_free(mdev, page);
309 dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
310 "data_size=%u,ds=%u) failed\n",
311 (unsigned long long)sector, data_size, ds);
312
313 q = bdev_get_queue(bio->bi_bdev);
314 if (q->merge_bvec_fn) {
315 struct bvec_merge_data bvm = {
316 .bi_bdev = bio->bi_bdev,
317 .bi_sector = bio->bi_sector,
318 .bi_size = bio->bi_size,
319 .bi_rw = bio->bi_rw,
320 };
321 int l = q->merge_bvec_fn(q, &bvm,
322 &bio->bi_io_vec[bio->bi_vcnt]);
323 dev_err(DEV, "merge_bvec_fn() = %d\n", l);
324 }
325
326 /* dump more of the bio. */
327 dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
328 dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
329 dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
330 dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
331
332 goto fail2;
333 break;
334 }
335 ds -= min_t(int, ds, PAGE_SIZE);
336 }
337
338 D_ASSERT(data_size == bio->bi_size);
339
340 bio->bi_private = e;
341 e->mdev = mdev;
342 e->sector = sector;
343 e->size = bio->bi_size;
344 360
345 e->private_bio = bio;
346 e->block_id = id;
347 INIT_HLIST_NODE(&e->colision); 361 INIT_HLIST_NODE(&e->colision);
348 e->epoch = NULL; 362 e->epoch = NULL;
363 e->mdev = mdev;
364 e->pages = page;
365 atomic_set(&e->pending_bios, 0);
366 e->size = data_size;
349 e->flags = 0; 367 e->flags = 0;
368 e->sector = sector;
369 e->sector = sector;
370 e->block_id = id;
350 371
351 return e; 372 return e;
352 373
353 fail2: 374 fail:
354 drbd_pp_free_bio_pages(mdev, bio);
355 bio_put(bio);
356 fail1:
357 mempool_free(e, drbd_ee_mempool); 375 mempool_free(e, drbd_ee_mempool);
358
359 return NULL; 376 return NULL;
360} 377}
361 378
362void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 379void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
363{ 380{
364 struct bio *bio = e->private_bio; 381 drbd_pp_free(mdev, e->pages);
365 drbd_pp_free_bio_pages(mdev, bio); 382 D_ASSERT(atomic_read(&e->pending_bios) == 0);
366 bio_put(bio);
367 D_ASSERT(hlist_unhashed(&e->colision)); 383 D_ASSERT(hlist_unhashed(&e->colision));
368 mempool_free(e, drbd_ee_mempool); 384 mempool_free(e, drbd_ee_mempool);
369} 385}
@@ -902,7 +918,7 @@ retry:
902 if (!drbd_send_protocol(mdev)) 918 if (!drbd_send_protocol(mdev))
903 return -1; 919 return -1;
904 drbd_send_sync_param(mdev, &mdev->sync_conf); 920 drbd_send_sync_param(mdev, &mdev->sync_conf);
905 drbd_send_sizes(mdev, 0); 921 drbd_send_sizes(mdev, 0, 0);
906 drbd_send_uuids(mdev); 922 drbd_send_uuids(mdev);
907 drbd_send_state(mdev); 923 drbd_send_state(mdev);
908 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 924 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
@@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
946 int rv; 962 int rv;
947 963
948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 964 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); 965 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
966 NULL, BLKDEV_IFL_WAIT);
950 if (rv) { 967 if (rv) {
951 dev_err(DEV, "local disk flush failed with status %d\n", rv); 968 dev_err(DEV, "local disk flush failed with status %d\n", rv);
952 /* would rather check on EOPNOTSUPP, but that is not reliable. 969 /* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1120,6 +1137,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1120} 1137}
1121 1138
1122/** 1139/**
1140 * drbd_submit_ee()
1141 * @mdev: DRBD device.
1142 * @e: epoch entry
1143 * @rw: flag field, see bio->bi_rw
1144 */
1145/* TODO allocate from our own bio_set. */
1146int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1147 const unsigned rw, const int fault_type)
1148{
1149 struct bio *bios = NULL;
1150 struct bio *bio;
1151 struct page *page = e->pages;
1152 sector_t sector = e->sector;
1153 unsigned ds = e->size;
1154 unsigned n_bios = 0;
1155 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1156
1157 if (atomic_read(&mdev->new_c_uuid)) {
1158 if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
1159 drbd_uuid_new_current(mdev);
1160 drbd_md_sync(mdev);
1161
1162 atomic_dec(&mdev->new_c_uuid);
1163 wake_up(&mdev->misc_wait);
1164 }
1165 wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
1166 }
1167
1168 /* In most cases, we will only need one bio. But in case the lower
1169 * level restrictions happen to be different at this offset on this
1170 * side than those of the sending peer, we may need to submit the
1171 * request in more than one bio. */
1172next_bio:
1173 bio = bio_alloc(GFP_NOIO, nr_pages);
1174 if (!bio) {
1175 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1176 goto fail;
1177 }
1178 /* > e->sector, unless this is the first bio */
1179 bio->bi_sector = sector;
1180 bio->bi_bdev = mdev->ldev->backing_bdev;
1181 /* we special case some flags in the multi-bio case, see below
1182 * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
1183 bio->bi_rw = rw;
1184 bio->bi_private = e;
1185 bio->bi_end_io = drbd_endio_sec;
1186
1187 bio->bi_next = bios;
1188 bios = bio;
1189 ++n_bios;
1190
1191 page_chain_for_each(page) {
1192 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1193 if (!bio_add_page(bio, page, len, 0)) {
1194 /* a single page must always be possible! */
1195 BUG_ON(bio->bi_vcnt == 0);
1196 goto next_bio;
1197 }
1198 ds -= len;
1199 sector += len >> 9;
1200 --nr_pages;
1201 }
1202 D_ASSERT(page == NULL);
1203 D_ASSERT(ds == 0);
1204
1205 atomic_set(&e->pending_bios, n_bios);
1206 do {
1207 bio = bios;
1208 bios = bios->bi_next;
1209 bio->bi_next = NULL;
1210
1211 /* strip off BIO_RW_UNPLUG unless it is the last bio */
1212 if (bios)
1213 bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
1214
1215 drbd_generic_make_request(mdev, fault_type, bio);
1216
1217 /* strip off BIO_RW_BARRIER,
1218 * unless it is the first or last bio */
1219 if (bios && bios->bi_next)
1220 bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
1221 } while (bios);
1222 maybe_kick_lo(mdev);
1223 return 0;
1224
1225fail:
1226 while (bios) {
1227 bio = bios;
1228 bios = bios->bi_next;
1229 bio_put(bio);
1230 }
1231 return -ENOMEM;
1232}
1233
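
drbd_submit_ee() above packs the epoch entry's page chain into as few bios as possible and opens a new bio whenever bio_add_page() refuses another page. The sketch below reproduces only that batching arithmetic; MAX_VECS, the sample sizes and the printed "bios" are invented stand-ins for whatever limit the lower level enforces, not kernel behaviour.

/* Walk a chunk of data page by page and start a new "bio" whenever the
 * current one is full, tracking the starting sector of each batch. */
#include <stdio.h>

#define PAGE_SIZE 4096
#define MAX_VECS  4            /* assumed per-bio page limit */

int main(void)
{
	unsigned data_size = 9 * PAGE_SIZE + 512;  /* size of the epoch entry */
	unsigned long long sector = 2048;
	unsigned ds = data_size;
	unsigned nr_pages = (data_size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned in_bio = 0, n_bios = 0;

	while (nr_pages--) {
		unsigned len = ds < PAGE_SIZE ? ds : PAGE_SIZE;

		if (in_bio == 0 || in_bio == MAX_VECS) {
			/* "bio_add_page() failed": open the next bio here */
			n_bios++;
			in_bio = 0;
			printf("bio %u starts at sector %llu\n", n_bios, sector);
		}
		in_bio++;
		ds -= len;
		sector += len >> 9;
	}
	printf("%u bytes packed into %u bios\n", data_size, n_bios);
	return 0;
}
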
1234/**
1123 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set 1235 * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
1124 * @mdev: DRBD device. 1236 * @mdev: DRBD device.
1125 * @w: work object. 1237 * @w: work object.
@@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1128int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) 1240int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1129{ 1241{
1130 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1242 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1131 struct bio *bio = e->private_bio;
1132
1133 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, 1243 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1134 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) 1244 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1135 so that we can finish that epoch in drbd_may_finish_epoch(). 1245 so that we can finish that epoch in drbd_may_finish_epoch().
@@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
1143 if (previous_epoch(mdev, e->epoch)) 1253 if (previous_epoch(mdev, e->epoch))
1144 dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); 1254 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1145 1255
1146 /* prepare bio for re-submit,
1147 * re-init volatile members */
1148 /* we still have a local reference, 1256 /* we still have a local reference,
1149 * get_ldev was done in receive_Data. */ 1257 * get_ldev was done in receive_Data. */
1150 bio->bi_bdev = mdev->ldev->backing_bdev;
1151 bio->bi_sector = e->sector;
1152 bio->bi_size = e->size;
1153 bio->bi_idx = 0;
1154
1155 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1156 bio->bi_flags |= 1 << BIO_UPTODATE;
1157
1158 /* don't know whether this is necessary: */
1159 bio->bi_phys_segments = 0;
1160 bio->bi_next = NULL;
1161
1162 /* these should be unchanged: */
1163 /* bio->bi_end_io = drbd_endio_write_sec; */
1164 /* bio->bi_vcnt = whatever; */
1165 1258
1166 e->w.cb = e_end_block; 1259 e->w.cb = e_end_block;
1167 1260 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1168 /* This is no longer a barrier request. */ 1261 /* drbd_submit_ee fails for one reason only:
1169 bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); 1262 * if it was not able to allocate sufficient bios.
1170 1263 * requeue, try again later. */
1171 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); 1264 e->w.cb = w_e_reissue;
1172 1265 drbd_queue_work(&mdev->data.work, &e->w);
1266 }
1173 return 1; 1267 return 1;
1174} 1268}
1175 1269
@@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1261static struct drbd_epoch_entry * 1355static struct drbd_epoch_entry *
1262read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) 1356read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1263{ 1357{
1358 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1264 struct drbd_epoch_entry *e; 1359 struct drbd_epoch_entry *e;
1265 struct bio_vec *bvec;
1266 struct page *page; 1360 struct page *page;
1267 struct bio *bio; 1361 int dgs, ds, rr;
1268 int dgs, ds, i, rr;
1269 void *dig_in = mdev->int_dig_in; 1362 void *dig_in = mdev->int_dig_in;
1270 void *dig_vv = mdev->int_dig_vv; 1363 void *dig_vv = mdev->int_dig_vv;
1364 unsigned long *data;
1271 1365
1272 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? 1366 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1273 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; 1367 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
@@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
1286 ERR_IF(data_size & 0x1ff) return NULL; 1380 ERR_IF(data_size & 0x1ff) return NULL;
1287 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; 1381 ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
1288 1382
1383 /* even though we trust our peer,
1384 * we sometimes have to double check. */
1385 if (sector + (data_size>>9) > capacity) {
1386 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1387 (unsigned long long)capacity,
1388 (unsigned long long)sector, data_size);
1389 return NULL;
1390 }
1391
1289 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1392 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1290 * "criss-cross" setup, that might cause write-out on some other DRBD, 1393 * "criss-cross" setup, that might cause write-out on some other DRBD,
1291 * which in turn might block on the other node at this very place. */ 1394 * which in turn might block on the other node at this very place. */
1292 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); 1395 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1293 if (!e) 1396 if (!e)
1294 return NULL; 1397 return NULL;
1295 bio = e->private_bio; 1398
1296 ds = data_size; 1399 ds = data_size;
1297 bio_for_each_segment(bvec, bio, i) { 1400 page = e->pages;
1298 page = bvec->bv_page; 1401 page_chain_for_each(page) {
1299 rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); 1402 unsigned len = min_t(int, ds, PAGE_SIZE);
1403 data = kmap(page);
1404 rr = drbd_recv(mdev, data, len);
1405 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
1406 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1407 data[0] = data[0] ^ (unsigned long)-1;
1408 }
1300 kunmap(page); 1409 kunmap(page);
1301 if (rr != min_t(int, ds, PAGE_SIZE)) { 1410 if (rr != len) {
1302 drbd_free_ee(mdev, e); 1411 drbd_free_ee(mdev, e);
1303 dev_warn(DEV, "short read receiving data: read %d expected %d\n", 1412 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1304 rr, min_t(int, ds, PAGE_SIZE)); 1413 rr, len);
1305 return NULL; 1414 return NULL;
1306 } 1415 }
1307 ds -= rr; 1416 ds -= rr;
1308 } 1417 }
1309 1418
1310 if (dgs) { 1419 if (dgs) {
1311 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1420 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1312 if (memcmp(dig_in, dig_vv, dgs)) { 1421 if (memcmp(dig_in, dig_vv, dgs)) {
1313 dev_err(DEV, "Digest integrity check FAILED.\n"); 1422 dev_err(DEV, "Digest integrity check FAILED.\n");
1314 drbd_bcast_ee(mdev, "digest failed", 1423 drbd_bcast_ee(mdev, "digest failed",
@@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1330 int rr, rv = 1; 1439 int rr, rv = 1;
1331 void *data; 1440 void *data;
1332 1441
1333 page = drbd_pp_alloc(mdev, 1); 1442 if (!data_size)
1443 return TRUE;
1444
1445 page = drbd_pp_alloc(mdev, 1, 1);
1334 1446
1335 data = kmap(page); 1447 data = kmap(page);
1336 while (data_size) { 1448 while (data_size) {
@@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1394 } 1506 }
1395 1507
1396 if (dgs) { 1508 if (dgs) {
1397 drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); 1509 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1398 if (memcmp(dig_in, dig_vv, dgs)) { 1510 if (memcmp(dig_in, dig_vv, dgs)) {
1399 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); 1511 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1400 return 0; 1512 return 0;
@@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
1415 1527
1416 D_ASSERT(hlist_unhashed(&e->colision)); 1528 D_ASSERT(hlist_unhashed(&e->colision));
1417 1529
1418 if (likely(drbd_bio_uptodate(e->private_bio))) { 1530 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1419 drbd_set_in_sync(mdev, sector, e->size); 1531 drbd_set_in_sync(mdev, sector, e->size);
1420 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); 1532 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1421 } else { 1533 } else {
@@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
1434 struct drbd_epoch_entry *e; 1546 struct drbd_epoch_entry *e;
1435 1547
1436 e = read_in_block(mdev, ID_SYNCER, sector, data_size); 1548 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1437 if (!e) { 1549 if (!e)
1438 put_ldev(mdev); 1550 goto fail;
1439 return FALSE;
1440 }
1441 1551
1442 dec_rs_pending(mdev); 1552 dec_rs_pending(mdev);
1443 1553
1444 e->private_bio->bi_end_io = drbd_endio_write_sec;
1445 e->private_bio->bi_rw = WRITE;
1446 e->w.cb = e_end_resync_block;
1447
1448 inc_unacked(mdev); 1554 inc_unacked(mdev);
1449 /* corresponding dec_unacked() in e_end_resync_block() 1555 /* corresponding dec_unacked() in e_end_resync_block()
1450 * respective _drbd_clear_done_ee */ 1556 * respective _drbd_clear_done_ee */
1451 1557
1558 e->w.cb = e_end_resync_block;
1559
1452 spin_lock_irq(&mdev->req_lock); 1560 spin_lock_irq(&mdev->req_lock);
1453 list_add(&e->w.list, &mdev->sync_ee); 1561 list_add(&e->w.list, &mdev->sync_ee);
1454 spin_unlock_irq(&mdev->req_lock); 1562 spin_unlock_irq(&mdev->req_lock);
1455 1563
1456 drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); 1564 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1457 /* accounting done in endio */ 1565 return TRUE;
1458 1566
1459 maybe_kick_lo(mdev); 1567 drbd_free_ee(mdev, e);
1460 return TRUE; 1568fail:
1569 put_ldev(mdev);
1570 return FALSE;
1461} 1571}
1462 1572
1463static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) 1573static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
@@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1552 } 1662 }
1553 1663
1554 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1664 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1555 if (likely(drbd_bio_uptodate(e->private_bio))) { 1665 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1556 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1666 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1557 mdev->state.conn <= C_PAUSED_SYNC_T && 1667 mdev->state.conn <= C_PAUSED_SYNC_T &&
1558 e->flags & EE_MAY_SET_IN_SYNC) ? 1668 e->flags & EE_MAY_SET_IN_SYNC) ?
@@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1698 return FALSE; 1808 return FALSE;
1699 } 1809 }
1700 1810
1701 e->private_bio->bi_end_io = drbd_endio_write_sec;
1702 e->w.cb = e_end_block; 1811 e->w.cb = e_end_block;
1703 1812
1704 spin_lock(&mdev->epoch_lock); 1813 spin_lock(&mdev->epoch_lock);
@@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1894 drbd_al_begin_io(mdev, e->sector); 2003 drbd_al_begin_io(mdev, e->sector);
1895 } 2004 }
1896 2005
1897 e->private_bio->bi_rw = rw; 2006 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
1898 drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); 2007 return TRUE;
1899 /* accounting done in endio */
1900
1901 maybe_kick_lo(mdev);
1902 return TRUE;
1903 2008
1904out_interrupted: 2009out_interrupted:
1905 /* yes, the epoch_size now is imbalanced. 2010 /* yes, the epoch_size now is imbalanced.
@@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1945 "no local data.\n"); 2050 "no local data.\n");
1946 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : 2051 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
1947 P_NEG_RS_DREPLY , p); 2052 P_NEG_RS_DREPLY , p);
1948 return TRUE; 2053 return drbd_drain_block(mdev, h->length - brps);
1949 } 2054 }
1950 2055
1951 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2056 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
1957 return FALSE; 2062 return FALSE;
1958 } 2063 }
1959 2064
1960 e->private_bio->bi_rw = READ;
1961 e->private_bio->bi_end_io = drbd_endio_read_sec;
1962
1963 switch (h->command) { 2065 switch (h->command) {
1964 case P_DATA_REQUEST: 2066 case P_DATA_REQUEST:
1965 e->w.cb = w_e_end_data_req; 2067 e->w.cb = w_e_end_data_req;
@@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2053 2155
2054 inc_unacked(mdev); 2156 inc_unacked(mdev);
2055 2157
2056 drbd_generic_make_request(mdev, fault_type, e->private_bio); 2158 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2057 maybe_kick_lo(mdev); 2159 return TRUE;
2058
2059 return TRUE;
2060 2160
2061out_free_e: 2161out_free_e:
2062 kfree(di); 2162 kfree(di);
@@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2473 hg > 0 ? "source" : "target"); 2573 hg > 0 ? "source" : "target");
2474 } 2574 }
2475 2575
2576 if (abs(hg) == 100)
2577 drbd_khelper(mdev, "initial-split-brain");
2578
2476 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { 2579 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2477 int pcount = (mdev->state.role == R_PRIMARY) 2580 int pcount = (mdev->state.role == R_PRIMARY)
2478 + (peer_role == R_PRIMARY); 2581 + (peer_role == R_PRIMARY);
@@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
2518 * after an attempted attach on a diskless node. 2621 * after an attempted attach on a diskless node.
2519 * We just refuse to attach -- well, we drop the "connection" 2622 * We just refuse to attach -- well, we drop the "connection"
2520 * to that disk, in a way... */ 2623 * to that disk, in a way... */
2521 dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); 2624 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
2522 drbd_khelper(mdev, "split-brain"); 2625 drbd_khelper(mdev, "split-brain");
2523 return C_MASK; 2626 return C_MASK;
2524 } 2627 }
@@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2849 unsigned int max_seg_s; 2952 unsigned int max_seg_s;
2850 sector_t p_size, p_usize, my_usize; 2953 sector_t p_size, p_usize, my_usize;
2851 int ldsc = 0; /* local disk size changed */ 2954 int ldsc = 0; /* local disk size changed */
2852 enum drbd_conns nconn; 2955 enum dds_flags ddsf;
2853 2956
2854 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; 2957 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2855 if (drbd_recv(mdev, h->payload, h->length) != h->length) 2958 if (drbd_recv(mdev, h->payload, h->length) != h->length)
@@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2905 } 3008 }
2906#undef min_not_zero 3009#undef min_not_zero
2907 3010
3011 ddsf = be16_to_cpu(p->dds_flags);
2908 if (get_ldev(mdev)) { 3012 if (get_ldev(mdev)) {
2909 dd = drbd_determin_dev_size(mdev, 0); 3013 dd = drbd_determin_dev_size(mdev, ddsf);
2910 put_ldev(mdev); 3014 put_ldev(mdev);
2911 if (dd == dev_size_error) 3015 if (dd == dev_size_error)
2912 return FALSE; 3016 return FALSE;
@@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2916 drbd_set_my_capacity(mdev, p_size); 3020 drbd_set_my_capacity(mdev, p_size);
2917 } 3021 }
2918 3022
2919 if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
2920 nconn = drbd_sync_handshake(mdev,
2921 mdev->state.peer, mdev->state.pdsk);
2922 put_ldev(mdev);
2923
2924 if (nconn == C_MASK) {
2925 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2926 return FALSE;
2927 }
2928
2929 if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
2930 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2931 return FALSE;
2932 }
2933 }
2934
2935 if (get_ldev(mdev)) { 3023 if (get_ldev(mdev)) {
2936 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 3024 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
2937 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 3025 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2938 ldsc = 1; 3026 ldsc = 1;
2939 } 3027 }
2940 3028
2941 max_seg_s = be32_to_cpu(p->max_segment_size); 3029 if (mdev->agreed_pro_version < 94)
3030 max_seg_s = be32_to_cpu(p->max_segment_size);
3031 else /* drbd 8.3.8 onwards */
3032 max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3033
2942 if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) 3034 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
2943 drbd_setup_queue_param(mdev, max_seg_s); 3035 drbd_setup_queue_param(mdev, max_seg_s);
2944 3036
2945 drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); 3037 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
2946 put_ldev(mdev); 3038 put_ldev(mdev);
2947 } 3039 }
2948 3040
@@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2951 drbd_get_capacity(mdev->this_bdev) || ldsc) { 3043 drbd_get_capacity(mdev->this_bdev) || ldsc) {
2952 /* we have different sizes, probably peer 3044 /* we have different sizes, probably peer
2953 * needs to know my new size... */ 3045 * needs to know my new size... */
2954 drbd_send_sizes(mdev, 0); 3046 drbd_send_sizes(mdev, 0, ddsf);
2955 } 3047 }
2956 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || 3048 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
2957 (dd == grew && mdev->state.conn == C_CONNECTED)) { 3049 (dd == grew && mdev->state.conn == C_CONNECTED)) {
2958 if (mdev->state.pdsk >= D_INCONSISTENT && 3050 if (mdev->state.pdsk >= D_INCONSISTENT &&
2959 mdev->state.disk >= D_INCONSISTENT) 3051 mdev->state.disk >= D_INCONSISTENT) {
2960 resync_after_online_grow(mdev); 3052 if (ddsf & DDSF_NO_RESYNC)
2961 else 3053 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3054 else
3055 resync_after_online_grow(mdev);
3056 } else
2962 set_bit(RESYNC_AFTER_NEG, &mdev->flags); 3057 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
2963 } 3058 }
2964 } 3059 }
@@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3490 return TRUE; 3585 return TRUE;
3491} 3586}
3492 3587
3588static void timeval_sub_us(struct timeval* tv, unsigned int us)
3589{
3590 tv->tv_sec -= us / 1000000;
3591 us = us % 1000000;
3592 if (tv->tv_usec > us) {
3593 tv->tv_usec += 1000000;
3594 tv->tv_sec--;
3595 }
3596 tv->tv_usec -= us;
3597}
3598
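
The delay-probe code timestamps each probe and subtracts the peer-supplied offset, given in microseconds, from a struct timeval. A generic, normalized version of that subtraction is sketched below; tv_sub_us() is an illustrative helper rather than the kernel's, and the borrow is taken whenever tv_usec is smaller than the amount being subtracted so the result stays in the 0..999999 range.

#include <stdio.h>
#include <sys/time.h>

static void tv_sub_us(struct timeval *tv, unsigned int us)
{
	tv->tv_sec -= us / 1000000;
	us %= 1000000;
	if ((unsigned int)tv->tv_usec < us) {   /* not enough usecs: borrow */
		tv->tv_usec += 1000000;
		tv->tv_sec--;
	}
	tv->tv_usec -= us;
}

int main(void)
{
	struct timeval tv = { .tv_sec = 10, .tv_usec = 100 };

	tv_sub_us(&tv, 250);                    /* crosses a second boundary */
	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);  /* 9.999850 */
	return 0;
}
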
3599static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
3600{
3601 struct delay_probe *dp;
3602 struct list_head *le;
3603 struct timeval now;
3604 int seq_num;
3605 int offset;
3606 int data_delay;
3607
3608 seq_num = be32_to_cpu(p->seq_num);
3609 offset = be32_to_cpu(p->offset);
3610
3611 spin_lock(&mdev->peer_seq_lock);
3612 if (!list_empty(&mdev->delay_probes)) {
3613 if (from == USE_DATA_SOCKET)
3614 le = mdev->delay_probes.next;
3615 else
3616 le = mdev->delay_probes.prev;
3617
3618 dp = list_entry(le, struct delay_probe, list);
3619
3620 if (dp->seq_num == seq_num) {
3621 list_del(le);
3622 spin_unlock(&mdev->peer_seq_lock);
3623 do_gettimeofday(&now);
3624 timeval_sub_us(&now, offset);
3625 data_delay =
3626 now.tv_usec - dp->time.tv_usec +
3627 (now.tv_sec - dp->time.tv_sec) * 1000000;
3628
3629 if (data_delay > 0)
3630 mdev->data_delay = data_delay;
3631
3632 kfree(dp);
3633 return;
3634 }
3635
3636 if (dp->seq_num > seq_num) {
3637 spin_unlock(&mdev->peer_seq_lock);
3638 dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
3639 return; /* Do not allocate a struct delay_probe.... */
3640 }
3641 }
3642 spin_unlock(&mdev->peer_seq_lock);
3643
3644 dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
3645 if (!dp) {
3646 dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
3647 return;
3648 }
3649
3650 dp->seq_num = seq_num;
3651 do_gettimeofday(&dp->time);
3652 timeval_sub_us(&dp->time, offset);
3653
3654 spin_lock(&mdev->peer_seq_lock);
3655 if (from == USE_DATA_SOCKET)
3656 list_add(&dp->list, &mdev->delay_probes);
3657 else
3658 list_add_tail(&dp->list, &mdev->delay_probes);
3659 spin_unlock(&mdev->peer_seq_lock);
3660}
3661
3662static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
3663{
3664 struct p_delay_probe *p = (struct p_delay_probe *)h;
3665
3666 ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3667 if (drbd_recv(mdev, h->payload, h->length) != h->length)
3668 return FALSE;
3669
3670 got_delay_probe(mdev, USE_DATA_SOCKET, p);
3671 return TRUE;
3672}
3673
3493typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); 3674typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3494 3675
3495static drbd_cmd_handler_f drbd_default_handler[] = { 3676static drbd_cmd_handler_f drbd_default_handler[] = {
@@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
3513 [P_OV_REQUEST] = receive_DataRequest, 3694 [P_OV_REQUEST] = receive_DataRequest,
3514 [P_OV_REPLY] = receive_DataRequest, 3695 [P_OV_REPLY] = receive_DataRequest,
3515 [P_CSUM_RS_REQUEST] = receive_DataRequest, 3696 [P_CSUM_RS_REQUEST] = receive_DataRequest,
3697 [P_DELAY_PROBE] = receive_delay_probe,
3516 /* anything missing from this table is in 3698 /* anything missing from this table is in
3517 * the asender_tbl, see get_asender_cmd */ 3699 * the asender_tbl, see get_asender_cmd */
3518 [P_MAX_CMD] = NULL, 3700 [P_MAX_CMD] = NULL,
@@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
3739 dev_info(DEV, "net_ee not empty, killed %u entries\n", i); 3921 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3740 i = atomic_read(&mdev->pp_in_use); 3922 i = atomic_read(&mdev->pp_in_use);
3741 if (i) 3923 if (i)
3742 dev_info(DEV, "pp_in_use = %u, expected 0\n", i); 3924 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
3743 3925
3744 D_ASSERT(list_empty(&mdev->read_ee)); 3926 D_ASSERT(list_empty(&mdev->read_ee));
3745 D_ASSERT(list_empty(&mdev->active_ee)); 3927 D_ASSERT(list_empty(&mdev->active_ee));
@@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4232 4414
4233 sector = be64_to_cpu(p->sector); 4415 sector = be64_to_cpu(p->sector);
4234 size = be32_to_cpu(p->blksize); 4416 size = be32_to_cpu(p->blksize);
4235 D_ASSERT(p->block_id == ID_SYNCER);
4236 4417
4237 update_peer_seq(mdev, be32_to_cpu(p->seq_num)); 4418 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4238 4419
@@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4290 return TRUE; 4471 return TRUE;
4291} 4472}
4292 4473
4474static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
4475{
4476 struct p_delay_probe *p = (struct p_delay_probe *)h;
4477
4478 got_delay_probe(mdev, USE_META_SOCKET, p);
4479 return TRUE;
4480}
4481
4293struct asender_cmd { 4482struct asender_cmd {
4294 size_t pkt_size; 4483 size_t pkt_size;
4295 int (*process)(struct drbd_conf *mdev, struct p_header *h); 4484 int (*process)(struct drbd_conf *mdev, struct p_header *h);
@@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
4314 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, 4503 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4315 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, 4504 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4316 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, 4505 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4506 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m },
4317 [P_MAX_CMD] = { 0, NULL }, 4507 [P_MAX_CMD] = { 0, NULL },
4318 }; 4508 };
4319 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) 4509 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index de81ab7b4627..3397f11d0ba9 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
722 struct drbd_request *req; 722 struct drbd_request *req;
723 int local, remote; 723 int local, remote;
724 int err = -EIO; 724 int err = -EIO;
725 int ret = 0;
725 726
726 /* allocate outside of all locks; */ 727 /* allocate outside of all locks; */
727 req = drbd_req_new(mdev, bio); 728 req = drbd_req_new(mdev, bio);
@@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
784 (mdev->state.pdsk == D_INCONSISTENT && 785 (mdev->state.pdsk == D_INCONSISTENT &&
785 mdev->state.conn >= C_CONNECTED)); 786 mdev->state.conn >= C_CONNECTED));
786 787
787 if (!(local || remote)) { 788 if (!(local || remote) && !mdev->state.susp) {
788 dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); 789 dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
789 goto fail_free_complete; 790 goto fail_free_complete;
790 } 791 }
@@ -810,6 +811,16 @@ allocate_barrier:
810 /* GOOD, everything prepared, grab the spin_lock */ 811 /* GOOD, everything prepared, grab the spin_lock */
811 spin_lock_irq(&mdev->req_lock); 812 spin_lock_irq(&mdev->req_lock);
812 813
814 if (mdev->state.susp) {
815 /* If we got suspended, use the retry mechanism of
816 generic_make_request() to restart processing of this
817 bio. In the next call to drbd_make_request_26
818 we sleep in inc_ap_bio() */
819 ret = 1;
820 spin_unlock_irq(&mdev->req_lock);
821 goto fail_free_complete;
822 }
823
813 if (remote) { 824 if (remote) {
814 remote = (mdev->state.pdsk == D_UP_TO_DATE || 825 remote = (mdev->state.pdsk == D_UP_TO_DATE ||
815 (mdev->state.pdsk == D_INCONSISTENT && 826 (mdev->state.pdsk == D_INCONSISTENT &&
@@ -947,12 +958,14 @@ fail_and_free_req:
947 req->private_bio = NULL; 958 req->private_bio = NULL;
948 put_ldev(mdev); 959 put_ldev(mdev);
949 } 960 }
950 bio_endio(bio, err); 961 if (!ret)
962 bio_endio(bio, err);
963
951 drbd_req_free(req); 964 drbd_req_free(req);
952 dec_ap_bio(mdev); 965 dec_ap_bio(mdev);
953 kfree(b); 966 kfree(b);
954 967
955 return 0; 968 return ret;
956} 969}
957 970
958/* helper function for drbd_make_request 971/* helper function for drbd_make_request
@@ -962,11 +975,6 @@ fail_and_free_req:
962 */ 975 */
963static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) 976static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
964{ 977{
965 /* Unconfigured */
966 if (mdev->state.conn == C_DISCONNECTING &&
967 mdev->state.disk == D_DISKLESS)
968 return 1;
969
970 if (mdev->state.role != R_PRIMARY && 978 if (mdev->state.role != R_PRIMARY &&
971 (!allow_oos || is_write)) { 979 (!allow_oos || is_write)) {
972 if (__ratelimit(&drbd_ratelimit_state)) { 980 if (__ratelimit(&drbd_ratelimit_state)) {
@@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
1070 1078
1071 /* we need to get a "reference count" (ap_bio_cnt) 1079 /* we need to get a "reference count" (ap_bio_cnt)
1072 * to avoid races with the disconnect/reconnect/suspend code. 1080 * to avoid races with the disconnect/reconnect/suspend code.
1073 * In case we need to split the bio here, we need to get two references 1081 * In case we need to split the bio here, we need to get three references
1074 * atomically, otherwise we might deadlock when trying to submit the 1082 * atomically, otherwise we might deadlock when trying to submit the
1075 * second one! */ 1083 * second one! */
1076 inc_ap_bio(mdev, 2); 1084 inc_ap_bio(mdev, 3);
1077 1085
1078 D_ASSERT(e_enr == s_enr + 1); 1086 D_ASSERT(e_enr == s_enr + 1);
1079 1087
1080 drbd_make_request_common(mdev, &bp->bio1); 1088 while (drbd_make_request_common(mdev, &bp->bio1))
1081 drbd_make_request_common(mdev, &bp->bio2); 1089 inc_ap_bio(mdev, 1);
1090
1091 while (drbd_make_request_common(mdev, &bp->bio2))
1092 inc_ap_bio(mdev, 1);
1093
1094 dec_ap_bio(mdev);
1095
1082 bio_pair_release(bp); 1096 bio_pair_release(bp);
1083 } 1097 }
1084 return 0; 1098 return 0;
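
When a bio straddles an activity-log extent it is split in two, and the code above now takes three ap_bio references up front: one per half plus a spare, re-grabbing a reference for every retry that drbd_make_request_common() asks for and dropping the spare at the end. The sketch below plays through that bookkeeping with a plain counter; submit_half(), the retry count and ap_bio_cnt are stand-ins invented for illustration, not DRBD functions.

#include <stdio.h>

static int ap_bio_cnt;

static void inc_ap_bio(int n) { ap_bio_cnt += n; }
static void dec_ap_bio(void)  { ap_bio_cnt -= 1; }

/* returns 1 to ask for a retry (e.g. device suspended), 0 when submitted;
 * either way it consumes one reference, like the real request path */
static int submit_half(int *retries_left)
{
	dec_ap_bio();
	if (*retries_left > 0) {
		(*retries_left)--;
		return 1;
	}
	return 0;
}

int main(void)
{
	int retries = 2;               /* pretend the first half needs 2 retries */

	inc_ap_bio(3);                 /* two halves + one spare */

	while (submit_half(&retries))  /* first half */
		inc_ap_bio(1);
	while (submit_half(&retries))  /* second half */
		inc_ap_bio(1);

	dec_ap_bio();                  /* drop the spare */
	printf("ap_bio_cnt = %d (expected 0)\n", ap_bio_cnt);
	return 0;
}
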
@@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1115 } else if (limit && get_ldev(mdev)) { 1129 } else if (limit && get_ldev(mdev)) {
1116 struct request_queue * const b = 1130 struct request_queue * const b =
1117 mdev->ldev->backing_bdev->bd_disk->queue; 1131 mdev->ldev->backing_bdev->bd_disk->queue;
1118 if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { 1132 if (b->merge_bvec_fn) {
1119 backing_limit = b->merge_bvec_fn(b, bvm, bvec); 1133 backing_limit = b->merge_bvec_fn(b, bvm, bvec);
1120 limit = min(limit, backing_limit); 1134 limit = min(limit, backing_limit);
1121 } 1135 }
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 76863e3f05be..85179e1fb50a 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = {
70 70
71static const char *drbd_state_sw_errors[] = { 71static const char *drbd_state_sw_errors[] = {
72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", 72 [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
73 [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", 73 [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", 74 [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", 75 [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", 76 [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d48a1dfd7b24..727ff6339754 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
47 47
48/* defined here: 48/* defined here:
49 drbd_md_io_complete 49 drbd_md_io_complete
50 drbd_endio_write_sec 50 drbd_endio_sec
51 drbd_endio_read_sec
52 drbd_endio_pri 51 drbd_endio_pri
53 52
54 * more endio handlers: 53 * more endio handlers:
@@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error)
85/* reads on behalf of the partner, 84/* reads on behalf of the partner,
86 * "submitted" by the receiver 85 * "submitted" by the receiver
87 */ 86 */
88void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) 87void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
89{ 88{
90 unsigned long flags = 0; 89 unsigned long flags = 0;
91 struct drbd_epoch_entry *e = NULL; 90 struct drbd_conf *mdev = e->mdev;
92 struct drbd_conf *mdev;
93 int uptodate = bio_flagged(bio, BIO_UPTODATE);
94
95 e = bio->bi_private;
96 mdev = e->mdev;
97
98 if (error)
99 dev_warn(DEV, "read: error=%d s=%llus\n", error,
100 (unsigned long long)e->sector);
101 if (!error && !uptodate) {
102 dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
103 (unsigned long long)e->sector);
104 /* strange behavior of some lower level drivers...
105 * fail the request by clearing the uptodate flag,
106 * but do not return any error?! */
107 error = -EIO;
108 }
109 91
110 D_ASSERT(e->block_id != ID_VACANT); 92 D_ASSERT(e->block_id != ID_VACANT);
111 93
@@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
114 list_del(&e->w.list); 96 list_del(&e->w.list);
115 if (list_empty(&mdev->read_ee)) 97 if (list_empty(&mdev->read_ee))
116 wake_up(&mdev->ee_wait); 98 wake_up(&mdev->ee_wait);
99 if (test_bit(__EE_WAS_ERROR, &e->flags))
100 __drbd_chk_io_error(mdev, FALSE);
117 spin_unlock_irqrestore(&mdev->req_lock, flags); 101 spin_unlock_irqrestore(&mdev->req_lock, flags);
118 102
119 drbd_chk_io_error(mdev, error, FALSE);
120 drbd_queue_work(&mdev->data.work, &e->w); 103 drbd_queue_work(&mdev->data.work, &e->w);
121 put_ldev(mdev); 104 put_ldev(mdev);
122} 105}
123 106
107static int is_failed_barrier(int ee_flags)
108{
109 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
110 == (EE_IS_BARRIER|EE_WAS_ERROR);
111}
112
124/* writes on behalf of the partner, or resync writes, 113/* writes on behalf of the partner, or resync writes,
125 * "submitted" by the receiver. 114 * "submitted" by the receiver, final stage. */
126 */ 115static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
127void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
128{ 116{
129 unsigned long flags = 0; 117 unsigned long flags = 0;
130 struct drbd_epoch_entry *e = NULL; 118 struct drbd_conf *mdev = e->mdev;
131 struct drbd_conf *mdev;
132 sector_t e_sector; 119 sector_t e_sector;
133 int do_wake; 120 int do_wake;
134 int is_syncer_req; 121 int is_syncer_req;
135 int do_al_complete_io; 122 int do_al_complete_io;
136 int uptodate = bio_flagged(bio, BIO_UPTODATE);
137 int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
138
139 e = bio->bi_private;
140 mdev = e->mdev;
141 123
142 if (error) 124 /* if this is a failed barrier request, disable use of barriers,
143 dev_warn(DEV, "write: error=%d s=%llus\n", error, 125 * and schedule for resubmission */
144 (unsigned long long)e->sector); 126 if (is_failed_barrier(e->flags)) {
145 if (!error && !uptodate) {
146 dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
147 (unsigned long long)e->sector);
148 /* strange behavior of some lower level drivers...
149 * fail the request by clearing the uptodate flag,
150 * but do not return any error?! */
151 error = -EIO;
152 }
153
154 /* error == -ENOTSUPP would be a better test,
155 * alas it is not reliable */
156 if (error && is_barrier && e->flags & EE_IS_BARRIER) {
157 drbd_bump_write_ordering(mdev, WO_bdev_flush); 127 drbd_bump_write_ordering(mdev, WO_bdev_flush);
158 spin_lock_irqsave(&mdev->req_lock, flags); 128 spin_lock_irqsave(&mdev->req_lock, flags);
159 list_del(&e->w.list); 129 list_del(&e->w.list);
130 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
160 e->w.cb = w_e_reissue; 131 e->w.cb = w_e_reissue;
161 /* put_ldev actually happens below, once we come here again. */ 132 /* put_ldev actually happens below, once we come here again. */
162 __release(local); 133 __release(local);
@@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
167 138
168 D_ASSERT(e->block_id != ID_VACANT); 139 D_ASSERT(e->block_id != ID_VACANT);
169 140
170 spin_lock_irqsave(&mdev->req_lock, flags);
171 mdev->writ_cnt += e->size >> 9;
172 is_syncer_req = is_syncer_block_id(e->block_id);
173
174 /* after we moved e to done_ee, 141 /* after we moved e to done_ee,
175 * we may no longer access it, 142 * we may no longer access it,
176 * it may be freed/reused already! 143 * it may be freed/reused already!
177 * (as soon as we release the req_lock) */ 144 * (as soon as we release the req_lock) */
178 e_sector = e->sector; 145 e_sector = e->sector;
179 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; 146 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
147 is_syncer_req = is_syncer_block_id(e->block_id);
180 148
149 spin_lock_irqsave(&mdev->req_lock, flags);
150 mdev->writ_cnt += e->size >> 9;
181 list_del(&e->w.list); /* has been on active_ee or sync_ee */ 151 list_del(&e->w.list); /* has been on active_ee or sync_ee */
182 list_add_tail(&e->w.list, &mdev->done_ee); 152 list_add_tail(&e->w.list, &mdev->done_ee);
183 153
@@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
190 ? list_empty(&mdev->sync_ee) 160 ? list_empty(&mdev->sync_ee)
191 : list_empty(&mdev->active_ee); 161 : list_empty(&mdev->active_ee);
192 162
193 if (error) 163 if (test_bit(__EE_WAS_ERROR, &e->flags))
194 __drbd_chk_io_error(mdev, FALSE); 164 __drbd_chk_io_error(mdev, FALSE);
195 spin_unlock_irqrestore(&mdev->req_lock, flags); 165 spin_unlock_irqrestore(&mdev->req_lock, flags);
196 166
@@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
205 175
206 wake_asender(mdev); 176 wake_asender(mdev);
207 put_ldev(mdev); 177 put_ldev(mdev);
178}
179
180/* writes on behalf of the partner, or resync writes,
181 * "submitted" by the receiver.
182 */
183void drbd_endio_sec(struct bio *bio, int error)
184{
185 struct drbd_epoch_entry *e = bio->bi_private;
186 struct drbd_conf *mdev = e->mdev;
187 int uptodate = bio_flagged(bio, BIO_UPTODATE);
188 int is_write = bio_data_dir(bio) == WRITE;
189
190 if (error)
191 dev_warn(DEV, "%s: error=%d s=%llus\n",
192 is_write ? "write" : "read", error,
193 (unsigned long long)e->sector);
194 if (!error && !uptodate) {
195 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
196 is_write ? "write" : "read",
197 (unsigned long long)e->sector);
198 /* strange behavior of some lower level drivers...
199 * fail the request by clearing the uptodate flag,
200 * but do not return any error?! */
201 error = -EIO;
202 }
203
204 if (error)
205 set_bit(__EE_WAS_ERROR, &e->flags);
208 206
207 bio_put(bio); /* no need for the bio anymore */
208 if (atomic_dec_and_test(&e->pending_bios)) {
209 if (is_write)
210 drbd_endio_write_sec_final(e);
211 else
212 drbd_endio_read_sec_final(e);
213 }
209} 214}
210 215
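
With drbd_endio_read_sec/drbd_endio_write_sec folded into a single drbd_endio_sec(), every sub-bio records errors in e->flags and only the completion that drops e->pending_bios to zero runs the final read or write path. The same last-one-out pattern, using C11 atomics and threads in place of bios, might look like the sketch below; struct entry, sub_io_done() and final_completion() are illustrative names only.

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

struct entry {
	atomic_int pending;             /* like e->pending_bios */
	atomic_int was_error;           /* like the __EE_WAS_ERROR flag */
};

static struct entry e;

static void final_completion(void)
{
	printf("all sub-I/Os done, error=%d\n", atomic_load(&e.was_error));
}

static void *sub_io_done(void *arg)
{
	int error = (int)(long)arg;

	if (error)
		atomic_store(&e.was_error, 1);
	/* whoever drops the counter to zero finishes the whole entry */
	if (atomic_fetch_sub(&e.pending, 1) == 1)
		final_completion();
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	long i;

	atomic_store(&e.pending, 3);
	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, sub_io_done, (void *)(long)(i == 1));
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
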
211/* read, readA or write requests on R_PRIMARY coming from drbd_make_request 216/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
@@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
295 return 1; /* Simply ignore this! */ 300 return 1; /* Simply ignore this! */
296} 301}
297 302
298void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) 303void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
304{
305 struct hash_desc desc;
306 struct scatterlist sg;
307 struct page *page = e->pages;
308 struct page *tmp;
309 unsigned len;
310
311 desc.tfm = tfm;
312 desc.flags = 0;
313
314 sg_init_table(&sg, 1);
315 crypto_hash_init(&desc);
316
317 while ((tmp = page_chain_next(page))) {
318 /* all but the last page will be fully used */
319 sg_set_page(&sg, page, PAGE_SIZE, 0);
320 crypto_hash_update(&desc, &sg, sg.length);
321 page = tmp;
322 }
323 /* and now the last, possibly only partially used page */
324 len = e->size & (PAGE_SIZE - 1);
325 sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
326 crypto_hash_update(&desc, &sg, sg.length);
327 crypto_hash_final(&desc, digest);
328}
329
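
drbd_csum_ee() feeds the digest with every page of the chain in full except the last one, whose useful length is e->size & (PAGE_SIZE - 1), falling back to a full page when that is zero (the len ?: PAGE_SIZE idiom). A self-contained version of that chunk walk, with a toy FNV-1a hash standing in for the crypto_hash API and a flat buffer standing in for the page chain, could look like this:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

static uint32_t fnv1a(uint32_t h, const unsigned char *p, size_t len)
{
	while (len--) {
		h ^= *p++;
		h *= 16777619u;
	}
	return h;
}

int main(void)
{
	unsigned char buf[2 * PAGE_SIZE + 100];
	size_t size = sizeof(buf);
	size_t off = 0, last;
	uint32_t h = 2166136261u;

	memset(buf, 0xab, size);

	/* all but the last chunk are fully used */
	while (size - off > PAGE_SIZE) {
		h = fnv1a(h, buf + off, PAGE_SIZE);
		off += PAGE_SIZE;
	}
	/* and now the last, possibly only partially used chunk */
	last = size % PAGE_SIZE;
	h = fnv1a(h, buf + off, last ? last : PAGE_SIZE);

	printf("digest over %zu bytes in %zu-byte chunks: %08x\n",
	       size, (size_t)PAGE_SIZE, h);
	return 0;
}
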
330void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
299{ 331{
300 struct hash_desc desc; 332 struct hash_desc desc;
301 struct scatterlist sg; 333 struct scatterlist sg;
@@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel
329 return 1; 361 return 1;
330 } 362 }
331 363
332 if (likely(drbd_bio_uptodate(e->private_bio))) { 364 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
333 digest_size = crypto_hash_digestsize(mdev->csums_tfm); 365 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
334 digest = kmalloc(digest_size, GFP_NOIO); 366 digest = kmalloc(digest_size, GFP_NOIO);
335 if (digest) { 367 if (digest) {
336 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 368 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
337 369
338 inc_rs_pending(mdev); 370 inc_rs_pending(mdev);
339 ok = drbd_send_drequest_csum(mdev, 371 ok = drbd_send_drequest_csum(mdev,
@@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
369 /* GFP_TRY, because if there is no memory available right now, this may 401 /* GFP_TRY, because if there is no memory available right now, this may
370 * be rescheduled for later. It is "only" background resync, after all. */ 402 * be rescheduled for later. It is "only" background resync, after all. */
371 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); 403 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
372 if (!e) { 404 if (!e)
373 put_ldev(mdev); 405 goto fail;
374 return 2;
375 }
376 406
377 spin_lock_irq(&mdev->req_lock); 407 spin_lock_irq(&mdev->req_lock);
378 list_add(&e->w.list, &mdev->read_ee); 408 list_add(&e->w.list, &mdev->read_ee);
379 spin_unlock_irq(&mdev->req_lock); 409 spin_unlock_irq(&mdev->req_lock);
380 410
381 e->private_bio->bi_end_io = drbd_endio_read_sec;
382 e->private_bio->bi_rw = READ;
383 e->w.cb = w_e_send_csum; 411 e->w.cb = w_e_send_csum;
412 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
413 return 1;
384 414
385 mdev->read_cnt += size >> 9; 415 drbd_free_ee(mdev, e);
386 drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); 416fail:
387 417 put_ldev(mdev);
388 return 1; 418 return 2;
389} 419}
390 420
391void resync_timer_fn(unsigned long data) 421void resync_timer_fn(unsigned long data)
@@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data)
414 drbd_queue_work(&mdev->data.work, &mdev->resync_work); 444 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
415} 445}
416 446
447static int calc_resync_rate(struct drbd_conf *mdev)
448{
449 int d = mdev->data_delay / 1000; /* us -> ms */
450 int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
451 int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
452 int cr = mdev->sync_conf.rate;
453
454 return d <= td ? cr :
455 d >= hd ? 0 :
456 cr + (cr * (td - d) / (hd - td));
457}
458
417int w_make_resync_request(struct drbd_conf *mdev, 459int w_make_resync_request(struct drbd_conf *mdev,
418 struct drbd_work *w, int cancel) 460 struct drbd_work *w, int cancel)
419{ 461{
420 unsigned long bit; 462 unsigned long bit;
421 sector_t sector; 463 sector_t sector;
422 const sector_t capacity = drbd_get_capacity(mdev->this_bdev); 464 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
423 int max_segment_size = queue_max_segment_size(mdev->rq_queue); 465 int max_segment_size;
424 int number, i, size, pe, mx; 466 int number, i, size, pe, mx;
425 int align, queued, sndbuf; 467 int align, queued, sndbuf;
426 468
@@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev,
446 return 1; 488 return 1;
447 } 489 }
448 490
449 number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); 491 /* starting with drbd 8.3.8, we can handle multi-bio EEs,
492 * if it should be necessary */
493 max_segment_size = mdev->agreed_pro_version < 94 ?
494 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
495
496 mdev->c_sync_rate = calc_resync_rate(mdev);
497 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
450 pe = atomic_read(&mdev->rs_pending_cnt); 498 pe = atomic_read(&mdev->rs_pending_cnt);
451 499
452 mutex_lock(&mdev->data.mutex); 500 mutex_lock(&mdev->data.mutex);
@@ -509,12 +557,6 @@ next_sector:
509 * 557 *
510 * Additionally always align bigger requests, in order to 558 * Additionally always align bigger requests, in order to
511 * be prepared for all stripe sizes of software RAIDs. 559 * be prepared for all stripe sizes of software RAIDs.
512 *
513 * we _do_ care about the agreed-upon q->max_segment_size
514 * here, as splitting up the requests on the other side is more
515 * difficult. the consequence is, that on lvm and md and other
516 * "indirect" devices, this is dead code, since
517 * q->max_segment_size will be PAGE_SIZE.
518 */ 560 */
519 align = 1; 561 align = 1;
520 for (;;) { 562 for (;;) {
@@ -806,7 +848,7 @@ out:
806/* helper */ 848/* helper */
807static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) 849static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
808{ 850{
809 if (drbd_bio_has_active_page(e->private_bio)) { 851 if (drbd_ee_has_active_page(e)) {
810 /* This might happen if sendpage() has not finished */ 852 /* This might happen if sendpage() has not finished */
811 spin_lock_irq(&mdev->req_lock); 853 spin_lock_irq(&mdev->req_lock);
812 list_add_tail(&e->w.list, &mdev->net_ee); 854 list_add_tail(&e->w.list, &mdev->net_ee);
@@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
832 return 1; 874 return 1;
833 } 875 }
834 876
835 if (likely(drbd_bio_uptodate(e->private_bio))) { 877 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
836 ok = drbd_send_block(mdev, P_DATA_REPLY, e); 878 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
837 } else { 879 } else {
838 if (__ratelimit(&drbd_ratelimit_state)) 880 if (__ratelimit(&drbd_ratelimit_state))
@@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
873 put_ldev(mdev); 915 put_ldev(mdev);
874 } 916 }
875 917
876 if (likely(drbd_bio_uptodate(e->private_bio))) { 918 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
877 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { 919 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
878 inc_rs_pending(mdev); 920 inc_rs_pending(mdev);
879 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); 921 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
921 963
922 di = (struct digest_info *)(unsigned long)e->block_id; 964 di = (struct digest_info *)(unsigned long)e->block_id;
923 965
924 if (likely(drbd_bio_uptodate(e->private_bio))) { 966 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
925 /* quick hack to try to avoid a race against reconfiguration. 967 /* quick hack to try to avoid a race against reconfiguration.
926 * a real fix would be much more involved, 968 * a real fix would be much more involved,
927 * introducing more locking mechanisms */ 969 * introducing more locking mechanisms */
@@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
931 digest = kmalloc(digest_size, GFP_NOIO); 973 digest = kmalloc(digest_size, GFP_NOIO);
932 } 974 }
933 if (digest) { 975 if (digest) {
934 drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); 976 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
935 eq = !memcmp(digest, di->digest, digest_size); 977 eq = !memcmp(digest, di->digest, digest_size);
936 kfree(digest); 978 kfree(digest);
937 } 979 }
@@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
973 if (unlikely(cancel)) 1015 if (unlikely(cancel))
974 goto out; 1016 goto out;
975 1017
976 if (unlikely(!drbd_bio_uptodate(e->private_bio))) 1018 if (unlikely((e->flags & EE_WAS_ERROR) != 0))
977 goto out; 1019 goto out;
978 1020
979 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1021 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
980 /* FIXME if this allocation fails, online verify will not terminate! */ 1022 /* FIXME if this allocation fails, online verify will not terminate! */
981 digest = kmalloc(digest_size, GFP_NOIO); 1023 digest = kmalloc(digest_size, GFP_NOIO);
982 if (digest) { 1024 if (digest) {
983 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 1025 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
984 inc_rs_pending(mdev); 1026 inc_rs_pending(mdev);
985 ok = drbd_send_drequest_csum(mdev, e->sector, e->size, 1027 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
986 digest, digest_size, P_OV_REPLY); 1028 digest, digest_size, P_OV_REPLY);
@@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1029 1071
1030 di = (struct digest_info *)(unsigned long)e->block_id; 1072 di = (struct digest_info *)(unsigned long)e->block_id;
1031 1073
1032 if (likely(drbd_bio_uptodate(e->private_bio))) { 1074 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1033 digest_size = crypto_hash_digestsize(mdev->verify_tfm); 1075 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1034 digest = kmalloc(digest_size, GFP_NOIO); 1076 digest = kmalloc(digest_size, GFP_NOIO);
1035 if (digest) { 1077 if (digest) {
1036 drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); 1078 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
1037 1079
1038 D_ASSERT(digest_size == di->digest_size); 1080 D_ASSERT(digest_size == di->digest_size);
1039 eq = !memcmp(digest, di->digest, digest_size); 1081 eq = !memcmp(digest, di->digest, digest_size);
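The drbd_worker.c hunks above all make the same two substitutions: completion status for peer requests is now read from the epoch entry's own EE_WAS_ERROR flag instead of the removed e->private_bio, and digests are computed with drbd_csum_ee() over the entry itself rather than over a bio. As a rough illustration of what the entry-based digest amounts to, the sketch below hashes a chain of pages attached to the entry; the e->pages field and the page_chain_next() walker are assumptions about the accompanying drbd_receiver.c changes and are not visible in these hunks (e->size and the crypto_hash API are).

	/* sketch: digest over an epoch entry's pages (needs <linux/crypto.h>,
	 * <linux/scatterlist.h> and the drbd_epoch_entry definition) */
	static void csum_ee_sketch(struct crypto_hash *tfm,
				   struct drbd_epoch_entry *e, void *digest)
	{
		struct hash_desc desc = { .tfm = tfm, .flags = 0 };
		struct scatterlist sg;
		struct page *page = e->pages;		/* assumed page chain */
		struct page *next;
		unsigned int last_len;

		sg_init_table(&sg, 1);
		crypto_hash_init(&desc);

		/* every page except the last one is fully used */
		while ((next = page_chain_next(page)) != NULL) {	/* assumed helper */
			sg_set_page(&sg, page, PAGE_SIZE, 0);
			crypto_hash_update(&desc, &sg, sg.length);
			page = next;
		}
		/* the last page may be partial; e->size is the payload length */
		last_len = e->size & (PAGE_SIZE - 1);
		sg_set_page(&sg, page, last_len ?: PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		crypto_hash_final(&desc, digest);
	}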
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index f93fa111ce50..defdb5013ea3 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
18 18
19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) 19#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
20 20
21static inline int drbd_bio_has_active_page(struct bio *bio)
22{
23 struct bio_vec *bvec;
24 int i;
25
26 __bio_for_each_segment(bvec, bio, i, 0) {
27 if (page_count(bvec->bv_page) > 1)
28 return 1;
29 }
30
31 return 0;
32}
33
34/* bi_end_io handlers */ 21/* bi_end_io handlers */
35extern void drbd_md_io_complete(struct bio *bio, int error); 22extern void drbd_md_io_complete(struct bio *bio, int error);
36extern void drbd_endio_read_sec(struct bio *bio, int error); 23extern void drbd_endio_sec(struct bio *bio, int error);
37extern void drbd_endio_write_sec(struct bio *bio, int error);
38extern void drbd_endio_pri(struct bio *bio, int error); 24extern void drbd_endio_pri(struct bio *bio, int error);
39 25
40/* 26/*
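Besides dropping the now-unused drbd_bio_has_active_page() helper, the wrapper header collapses the two secondary (peer-request) completion handlers into a single drbd_endio_sec(). One plausible shape for the merged handler is sketched below, purely as an illustration: the __EE_WAS_ERROR bit name, the e->pending_bios counter and the *_final helpers are assumptions; only the EE_WAS_ERROR flag and the drbd_endio_sec() prototype actually appear in this diff.

	/* sketch: one bi_end_io for both read and write peer requests */
	void drbd_endio_sec(struct bio *bio, int error)
	{
		struct drbd_epoch_entry *e = bio->bi_private;
		int is_write = bio_data_dir(bio) == WRITE;

		if (!error && !bio_flagged(bio, BIO_UPTODATE))
			error = -EIO;			/* not up to date, but no error code given */
		if (error)
			set_bit(__EE_WAS_ERROR, &e->flags);	/* assumed bit name */

		bio_put(bio);
		/* the last completed bio hands the entry to the read or write path */
		if (atomic_dec_and_test(&e->pending_bios)) {	/* assumed counter */
			if (is_write)
				drbd_endio_write_sec_final(e);	/* assumed helpers */
			else
				drbd_endio_read_sec_final(e);
		}
	}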
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2138a7ae050c..83fa09a836ca 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -50,7 +50,7 @@ static void blk_done(struct virtqueue *vq)
50 unsigned long flags; 50 unsigned long flags;
51 51
52 spin_lock_irqsave(&vblk->lock, flags); 52 spin_lock_irqsave(&vblk->lock, flags);
53 while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { 53 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
54 int error; 54 int error;
55 55
56 switch (vbr->status) { 56 switch (vbr->status) {
@@ -70,6 +70,8 @@ static void blk_done(struct virtqueue *vq)
70 vbr->req->sense_len = vbr->in_hdr.sense_len; 70 vbr->req->sense_len = vbr->in_hdr.sense_len;
71 vbr->req->errors = vbr->in_hdr.errors; 71 vbr->req->errors = vbr->in_hdr.errors;
72 } 72 }
73 if (blk_special_request(vbr->req))
74 vbr->req->errors = (error != 0);
73 75
74 __blk_end_request_all(vbr->req, error); 76 __blk_end_request_all(vbr->req, error);
75 list_del(&vbr->list); 77 list_del(&vbr->list);
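This hunk and the add_buf/kick hunks further down swap the old vq->vq_ops indirection for the virtqueue_get_buf()/virtqueue_add_buf()/virtqueue_kick() helpers, and record a pass/fail result in req->errors for REQ_TYPE_SPECIAL requests so that the blk_execute_rq() call in virtblk_get_id() below sees a non-zero req->errors and returns -EIO on failure. Stripped of the driver's bookkeeping, the submit/reap pattern under the helper API looks roughly like the sketch below; my_submit()/my_done() are illustrative names, and the error mapping is simplified compared with the driver's switch on vbr->status.

	/* submission (queue lock held): out + in scatterlist entries, vbr as cookie */
	static bool my_submit(struct virtio_blk *vblk, struct virtblk_req *vbr,
			      unsigned int out, unsigned int in)
	{
		if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0)
			return false;		/* ring full, caller stops the queue */
		virtqueue_kick(vblk->vq);	/* notify the host of the new buffers */
		return true;
	}

	/* completion (virtqueue callback): reap everything the host has finished */
	static void my_done(struct virtio_blk *vblk)
	{
		struct virtblk_req *vbr;
		unsigned int len;

		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL)
			__blk_end_request_all(vbr->req,
					      vbr->status == VIRTIO_BLK_S_OK ? 0 : -EIO);
	}

The real driver batches the kick once per queue run (only when something was issued), rather than kicking for every request as the sketch does.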
@@ -103,6 +105,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
103 vbr->out_hdr.sector = 0; 105 vbr->out_hdr.sector = 0;
104 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); 106 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
105 break; 107 break;
108 case REQ_TYPE_SPECIAL:
109 vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
110 vbr->out_hdr.sector = 0;
111 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
112 break;
106 case REQ_TYPE_LINUX_BLOCK: 113 case REQ_TYPE_LINUX_BLOCK:
107 if (req->cmd[0] == REQ_LB_OP_FLUSH) { 114 if (req->cmd[0] == REQ_LB_OP_FLUSH) {
108 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 115 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
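The new REQ_TYPE_SPECIAL case maps the identify request issued by virtblk_get_id() (added below) onto a VIRTIO_BLK_T_GET_ID command: the out-header carries only the command type and ioprio, the sector stays 0, and the request's single data segment is the caller's id_str buffer, which the host fills with the device serial string. For reference, the per-request out-header has the usual virtio-blk layout; this is reproduced from the virtio_blk.h definition from memory, so treat it as a sketch rather than a quotation of the patch.

	struct virtio_blk_outhdr {
		__u32 type;	/* VIRTIO_BLK_T_IN, _OUT, _FLUSH, _GET_ID, ... */
		__u32 ioprio;	/* I/O priority hint, from req_get_ioprio() */
		__u64 sector;	/* start sector; left at 0 for GET_ID and FLUSH */
	};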
@@ -151,7 +158,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
151 } 158 }
152 } 159 }
153 160
154 if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { 161 if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
155 mempool_free(vbr, vblk->pool); 162 mempool_free(vbr, vblk->pool);
156 return false; 163 return false;
157 } 164 }
@@ -180,7 +187,7 @@ static void do_virtblk_request(struct request_queue *q)
180 } 187 }
181 188
182 if (issued) 189 if (issued)
183 vblk->vq->vq_ops->kick(vblk->vq); 190 virtqueue_kick(vblk->vq);
184} 191}
185 192
186static void virtblk_prepare_flush(struct request_queue *q, struct request *req) 193static void virtblk_prepare_flush(struct request_queue *q, struct request *req)
@@ -189,12 +196,45 @@ static void virtblk_prepare_flush(struct request_queue *q, struct request *req)
189 req->cmd[0] = REQ_LB_OP_FLUSH; 196 req->cmd[0] = REQ_LB_OP_FLUSH;
190} 197}
191 198
199/* return id (s/n) string for *disk to *id_str
200 */
201static int virtblk_get_id(struct gendisk *disk, char *id_str)
202{
203 struct virtio_blk *vblk = disk->private_data;
204 struct request *req;
205 struct bio *bio;
206
207 bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
208 GFP_KERNEL);
209 if (IS_ERR(bio))
210 return PTR_ERR(bio);
211
212 req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
213 if (IS_ERR(req)) {
214 bio_put(bio);
215 return PTR_ERR(req);
216 }
217
218 req->cmd_type = REQ_TYPE_SPECIAL;
219 return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
220}
221
192static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, 222static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
193 unsigned cmd, unsigned long data) 223 unsigned cmd, unsigned long data)
194{ 224{
195 struct gendisk *disk = bdev->bd_disk; 225 struct gendisk *disk = bdev->bd_disk;
196 struct virtio_blk *vblk = disk->private_data; 226 struct virtio_blk *vblk = disk->private_data;
197 227
228 if (cmd == 0x56424944) { /* 'VBID' */
229 void __user *usr_data = (void __user *)data;
230 char id_str[VIRTIO_BLK_ID_BYTES];
231 int err;
232
233 err = virtblk_get_id(disk, id_str);
234 if (!err && copy_to_user(usr_data, id_str, VIRTIO_BLK_ID_BYTES))
235 err = -EFAULT;
236 return err;
237 }
198 /* 238 /*
199 * Only allow the generic SCSI ioctls if the host can support it. 239 * Only allow the generic SCSI ioctls if the host can support it.
200 */ 240 */
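From userspace, the new ioctl can be exercised by passing the 'VBID' magic (0x56424944) and a buffer of at least VIRTIO_BLK_ID_BYTES to the block device node. The sketch below assumes a 20-byte ID length and a /dev/vda device node; both are assumptions rather than anything stated in this hunk.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define VBID_IOCTL          0x56424944	/* 'VBID', matches the driver check */
	#define VIRTIO_BLK_ID_BYTES 20		/* assumed ID length */

	int main(void)
	{
		char id[VIRTIO_BLK_ID_BYTES + 1];
		int fd = open("/dev/vda", O_RDONLY);	/* assumed device node */

		if (fd < 0)
			return 1;
		memset(id, 0, sizeof(id));	/* host data is not NUL-terminated; keep a padding byte */
		if (ioctl(fd, VBID_IOCTL, id) == 0)
			printf("virtio-blk serial: %s\n", id);
		close(fd);
		return 0;
	}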