aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPhilipp Reisner <philipp.reisner@linbit.com>2011-05-20 10:39:13 -0400
committerPhilipp Reisner <philipp.reisner@linbit.com>2011-05-24 04:08:58 -0400
commit99432fcc528d7a5ac8494a4c07ad4726670c96e2 (patch)
tree0b86df2b3e86af1eab14b987e81b7f4a5f88c090
parent21423fa79119a80e335de0c82ec29f67ed59f1bc (diff)
drbd: Take a more conservative approach when deciding max_bio_size
The old (optimistic) implementation could shrink the bio size on a primary device. Shrinking the bio size on a primary device is bad, since we might still get BIOs with the old (bigger) size shortly after we published the new size. The new implementation is more conservative, and eventually increases the max_bio_size on a primary device (which is valid). It does so when it knows the local limit AND the remote limit. We cache the last seen max_bio_size of the peer in the meta data, and rely on that to make the operation of single nodes more efficient. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
-rw-r--r--drivers/block/drbd/drbd_int.h5
-rw-r--r--drivers/block/drbd/drbd_main.c26
-rw-r--r--drivers/block/drbd/drbd_nl.c96
-rw-r--r--drivers/block/drbd/drbd_receiver.c20
4 files changed, 97 insertions, 50 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 5c994739d11e..8aa10391115b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1128,6 +1128,8 @@ struct drbd_conf {
1128 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ 1128 int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
1129 int rs_planed; /* resync sectors already planned */ 1129 int rs_planed; /* resync sectors already planned */
1130 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ 1130 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1131 int peer_max_bio_size;
1132 int local_max_bio_size;
1131}; 1133};
1132 1134
1133static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1135static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1433,6 +1435,7 @@ struct bm_extent {
1433 * hash table. */ 1435 * hash table. */
1434#define HT_SHIFT 8 1436#define HT_SHIFT 8
1435#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) 1437#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
1438#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
1436 1439
1437#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ 1440#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
1438 1441
@@ -1519,7 +1522,7 @@ extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *,
1519enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1522enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
1520extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1523extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
1521extern void resync_after_online_grow(struct drbd_conf *); 1524extern void resync_after_online_grow(struct drbd_conf *);
1522extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); 1525extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
1523extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, 1526extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
1524 enum drbd_role new_role, 1527 enum drbd_role new_role,
1525 int force); 1528 int force);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ce6a764e905b..cfeb13b5a216 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2071,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2071{ 2071{
2072 struct p_sizes p; 2072 struct p_sizes p;
2073 sector_t d_size, u_size; 2073 sector_t d_size, u_size;
2074 int q_order_type; 2074 int q_order_type, max_bio_size;
2075 int ok; 2075 int ok;
2076 2076
2077 if (get_ldev_if_state(mdev, D_NEGOTIATING)) { 2077 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -2079,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
2079 d_size = drbd_get_max_capacity(mdev->ldev); 2079 d_size = drbd_get_max_capacity(mdev->ldev);
2080 u_size = mdev->ldev->dc.disk_size; 2080 u_size = mdev->ldev->dc.disk_size;
2081 q_order_type = drbd_queue_order_type(mdev); 2081 q_order_type = drbd_queue_order_type(mdev);
2082 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2082 put_ldev(mdev); 2084 put_ldev(mdev);
2083 } else { 2085 } else {
2084 d_size = 0; 2086 d_size = 0;
2085 u_size = 0; 2087 u_size = 0;
2086 q_order_type = QUEUE_ORDERED_NONE; 2088 q_order_type = QUEUE_ORDERED_NONE;
2089 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2087 } 2090 }
2088 2091
2089 p.d_size = cpu_to_be64(d_size); 2092 p.d_size = cpu_to_be64(d_size);
2090 p.u_size = cpu_to_be64(u_size); 2093 p.u_size = cpu_to_be64(u_size);
2091 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); 2094 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2092 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9); 2095 p.max_bio_size = cpu_to_be32(max_bio_size);
2093 p.queue_order_type = cpu_to_be16(q_order_type); 2096 p.queue_order_type = cpu_to_be16(q_order_type);
2094 p.dds_flags = cpu_to_be16(flags); 2097 p.dds_flags = cpu_to_be16(flags);
2095 2098
@@ -3048,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
3048 mdev->agreed_pro_version = PRO_VERSION_MAX; 3051 mdev->agreed_pro_version = PRO_VERSION_MAX;
3049 mdev->write_ordering = WO_bdev_flush; 3052 mdev->write_ordering = WO_bdev_flush;
3050 mdev->resync_wenr = LC_FREE; 3053 mdev->resync_wenr = LC_FREE;
3054 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3051} 3056}
3052 3057
3053void drbd_mdev_cleanup(struct drbd_conf *mdev) 3058void drbd_mdev_cleanup(struct drbd_conf *mdev)
@@ -3422,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
3422 q->backing_dev_info.congested_data = mdev; 3427 q->backing_dev_info.congested_data = mdev;
3423 3428
3424 blk_queue_make_request(q, drbd_make_request); 3429 blk_queue_make_request(q, drbd_make_request);
3425 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9); 3430 /* Setting the max_hw_sectors to an odd value of 8kibyte here
3431 This triggers a max_bio_size message upon first attach or connect */
3432 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3426 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 3433 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3427 blk_queue_merge_bvec(q, drbd_merge_bvec); 3434 blk_queue_merge_bvec(q, drbd_merge_bvec);
3428 q->queue_lock = &mdev->req_lock; 3435 q->queue_lock = &mdev->req_lock;
@@ -3634,7 +3641,8 @@ struct meta_data_on_disk {
3634 /* `-- act_log->nr_elements <-- sync_conf.al_extents */ 3641 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3635 u32 bm_offset; /* offset to the bitmap, from here */ 3642 u32 bm_offset; /* offset to the bitmap, from here */
3636 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 3643 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3637 u32 reserved_u32[4]; 3644 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3645 u32 reserved_u32[3];
3638 3646
3639} __packed; 3647} __packed;
3640 3648
@@ -3675,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
3675 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); 3683 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3676 3684
3677 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 3685 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3686 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3678 3687
3679 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 3688 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3680 sector = mdev->ldev->md.md_offset; 3689 sector = mdev->ldev->md.md_offset;
@@ -3758,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3758 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); 3767 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3759 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); 3768 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3760 3769
3770 spin_lock_irq(&mdev->req_lock);
3771 if (mdev->state.conn < C_CONNECTED) {
3772 int peer;
3773 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3774 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3775 mdev->peer_max_bio_size = peer;
3776 }
3777 spin_unlock_irq(&mdev->req_lock);
3778
3761 if (mdev->sync_conf.al_extents < 7) 3779 if (mdev->sync_conf.al_extents < 7)
3762 mdev->sync_conf.al_extents = 127; 3780 mdev->sync_conf.al_extents = 127;
3763 3781
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9dfe58a09625..7c64ec042124 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -278,8 +278,14 @@ static int _try_outdate_peer_async(void *data)
278 278
279 /* Not using 279 /* Not using
280 drbd_request_state(mdev, NS(pdsk, nps)); 280 drbd_request_state(mdev, NS(pdsk, nps));
281 here, because we might were able to re-establish the connection in the 281 here, because we might were able to re-establish the connection
282 meantime. 282 in the meantime. This can only partially be solved in the state's
283 engine is_valid_state() and is_valid_state_transition()
284 functions.
285
286 nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
287 pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
288 therefore we have to have the pre state change check here.
283 */ 289 */
284 spin_lock_irq(&mdev->req_lock); 290 spin_lock_irq(&mdev->req_lock);
285 ns = mdev->state; 291 ns = mdev->state;
@@ -786,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
786 return 0; 792 return 0;
787} 793}
788 794
789void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local) 795static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
790{ 796{
791 struct request_queue * const q = mdev->rq_queue; 797 struct request_queue * const q = mdev->rq_queue;
792 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; 798 int max_hw_sectors = max_bio_size >> 9;
793 int max_segments = mdev->ldev->dc.max_bio_bvecs; 799 int max_segments = 0;
794 int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 800
801 if (get_ldev_if_state(mdev, D_ATTACHING)) {
802 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
803
804 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
805 max_segments = mdev->ldev->dc.max_bio_bvecs;
806 put_ldev(mdev);
807 }
795 808
796 blk_queue_logical_block_size(q, 512); 809 blk_queue_logical_block_size(q, 512);
797 blk_queue_max_hw_sectors(q, max_hw_sectors); 810 blk_queue_max_hw_sectors(q, max_hw_sectors);
798 /* This is the workaround for "bio would need to, but cannot, be split" */ 811 /* This is the workaround for "bio would need to, but cannot, be split" */
799 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); 812 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
800 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); 813 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
801 blk_queue_stack_limits(q, b);
802 814
803 dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9); 815 if (get_ldev_if_state(mdev, D_ATTACHING)) {
816 struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
804 817
805 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { 818 blk_queue_stack_limits(q, b);
806 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", 819
807 q->backing_dev_info.ra_pages, 820 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
808 b->backing_dev_info.ra_pages); 821 dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
809 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 822 q->backing_dev_info.ra_pages,
823 b->backing_dev_info.ra_pages);
824 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
825 }
826 put_ldev(mdev);
810 } 827 }
811} 828}
812 829
830void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
831{
832 int now, new, local, peer;
833
834 now = queue_max_hw_sectors(mdev->rq_queue) << 9;
835 local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */
836 peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */
837
838 if (get_ldev_if_state(mdev, D_ATTACHING)) {
839 local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
840 mdev->local_max_bio_size = local;
841 put_ldev(mdev);
842 }
843
844 /* We may ignore peer limits if the peer is modern enough.
845 Because new from 8.3.8 onwards the peer can use multiple
846 BIOs for a single peer_request */
847 if (mdev->state.conn >= C_CONNECTED) {
848 if (mdev->agreed_pro_version < 94)
849 peer = mdev->peer_max_bio_size;
850 else if (mdev->agreed_pro_version == 94)
851 peer = DRBD_MAX_SIZE_H80_PACKET;
852 else /* drbd 8.3.8 onwards */
853 peer = DRBD_MAX_BIO_SIZE;
854 }
855
856 new = min_t(int, local, peer);
857
858 if (mdev->state.role == R_PRIMARY && new < now)
859 dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);
860
861 if (new != now)
862 dev_info(DEV, "max BIO size = %u\n", new);
863
864 drbd_setup_queue_param(mdev, new);
865}
866
813/* serialize deconfig (worker exiting, doing cleanup) 867/* serialize deconfig (worker exiting, doing cleanup)
814 * and reconfig (drbdsetup disk, drbdsetup net) 868 * and reconfig (drbdsetup disk, drbdsetup net)
815 * 869 *
@@ -878,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
878 struct block_device *bdev; 932 struct block_device *bdev;
879 struct lru_cache *resync_lru = NULL; 933 struct lru_cache *resync_lru = NULL;
880 union drbd_state ns, os; 934 union drbd_state ns, os;
881 unsigned int max_bio_size;
882 enum drbd_state_rv rv; 935 enum drbd_state_rv rv;
883 int cp_discovered = 0; 936 int cp_discovered = 0;
884 int logical_block_size; 937 int logical_block_size;
@@ -1130,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
1130 mdev->read_cnt = 0; 1183 mdev->read_cnt = 0;
1131 mdev->writ_cnt = 0; 1184 mdev->writ_cnt = 0;
1132 1185
1133 max_bio_size = DRBD_MAX_BIO_SIZE; 1186 drbd_reconsider_max_bio_size(mdev);
1134 if (mdev->state.conn == C_CONNECTED) {
1135 /* We are Primary, Connected, and now attach a new local
1136 * backing store. We must not increase the user visible maximum
1137 * bio size on this device to something the peer may not be
1138 * able to handle. */
1139 if (mdev->agreed_pro_version < 94)
1140 max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
1141 else if (mdev->agreed_pro_version == 94)
1142 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
1143 /* else: drbd 8.3.9 and later, stay with default */
1144 }
1145
1146 drbd_setup_queue_param(mdev, max_bio_size);
1147 1187
1148 /* If I am currently not R_PRIMARY, 1188 /* If I am currently not R_PRIMARY,
1149 * but meta data primary indicator is set, 1189 * but meta data primary indicator is set,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b0b0ba345e83..6ea0a4b51ece 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -899,11 +899,6 @@ retry:
899 899
900 drbd_thread_start(&mdev->asender); 900 drbd_thread_start(&mdev->asender);
901 901
902 if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
903 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
904 put_ldev(mdev);
905 }
906
907 if (drbd_send_protocol(mdev) == -1) 902 if (drbd_send_protocol(mdev) == -1)
908 return -1; 903 return -1;
909 drbd_send_sync_param(mdev, &mdev->sync_conf); 904 drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -2939,7 +2934,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
2939{ 2934{
2940 struct p_sizes *p = &mdev->data.rbuf.sizes; 2935 struct p_sizes *p = &mdev->data.rbuf.sizes;
2941 enum determine_dev_size dd = unchanged; 2936 enum determine_dev_size dd = unchanged;
2942 unsigned int max_bio_size;
2943 sector_t p_size, p_usize, my_usize; 2937 sector_t p_size, p_usize, my_usize;
2944 int ldsc = 0; /* local disk size changed */ 2938 int ldsc = 0; /* local disk size changed */
2945 enum dds_flags ddsf; 2939 enum dds_flags ddsf;
@@ -3004,23 +2998,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3004 drbd_set_my_capacity(mdev, p_size); 2998 drbd_set_my_capacity(mdev, p_size);
3005 } 2999 }
3006 3000
3001 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3002 drbd_reconsider_max_bio_size(mdev);
3003
3007 if (get_ldev(mdev)) { 3004 if (get_ldev(mdev)) {
3008 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { 3005 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3009 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 3006 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3010 ldsc = 1; 3007 ldsc = 1;
3011 } 3008 }
3012 3009
3013 if (mdev->agreed_pro_version < 94)
3014 max_bio_size = be32_to_cpu(p->max_bio_size);
3015 else if (mdev->agreed_pro_version == 94)
3016 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
3017 else /* drbd 8.3.8 onwards */
3018 max_bio_size = DRBD_MAX_BIO_SIZE;
3019
3020 if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
3021 drbd_setup_queue_param(mdev, max_bio_size);
3022
3023 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3024 put_ldev(mdev); 3010 put_ldev(mdev);
3025 } 3011 }
3026 3012