path: root/drivers/block/drbd/drbd_req.c
author	Lars Ellenberg <lars.ellenberg@linbit.com>	2012-03-29 11:04:14 -0400
committer	Philipp Reisner <philipp.reisner@linbit.com>	2012-11-08 10:58:35 -0500
commit	5da9c8364443797ece9393670fb7ab69cff055ed (patch)
tree	90a7206ac7e446c46863ad6caefbd014c94ec1d8 /drivers/block/drbd/drbd_req.c
parent	b6dd1a89767bc33e9c98b3195f8925b46c5c95f3 (diff)
drbd: better separate WRITE and READ code paths in drbd_make_request
cherry-picked and adapted from drbd 9 devel branch

READs will be interesting to at most one connection,
WRITEs should be interesting for all established connections.

Introduce some helper functions to hopefully make this easier to follow.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_req.c')
-rw-r--r--	drivers/block/drbd/drbd_req.c	399
1 file changed, 211 insertions(+), 188 deletions(-)
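The routing rule the commit message describes, a READ is served locally or by at most one up-to-date peer, while a WRITE must be replicated to every established connection, can be illustrated with a small standalone model. The following C program is only an illustration of that rule, not DRBD code; all names in it (model_peer, route_read, route_write) are made up for this sketch.

/*
 * Standalone model of the routing policy described above.
 * Not DRBD code; types and helpers are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_peer {
	const char *name;
	bool established;	/* connection usable for replication (WRITE targets) */
	bool uptodate;		/* peer data usable for serving READs */
};

/* READ: pick at most one peer that can actually serve the data. */
static const struct model_peer *route_read(const struct model_peer *peers, int n)
{
	for (int i = 0; i < n; i++)
		if (peers[i].established && peers[i].uptodate)
			return &peers[i];
	return NULL;	/* fall back to the local disk, or fail */
}

/* WRITE: count how many established connections must receive the data. */
static int route_write(const struct model_peer *peers, int n)
{
	int targets = 0;

	for (int i = 0; i < n; i++)
		if (peers[i].established)
			targets++;
	return targets;
}

int main(void)
{
	struct model_peer peers[] = {
		{ "peer0", true,  true  },
		{ "peer1", true,  false },
		{ "peer2", false, false },
	};
	const struct model_peer *rd = route_read(peers, 3);

	printf("READ  -> %s\n", rd ? rd->name : "local only / no data");
	printf("WRITE -> %d connection(s)\n", route_write(peers, 3));
	return 0;
}

Built with any C99 compiler, the sample table prints "READ  -> peer0" and "WRITE -> 2 connection(s)": at most one connection serves the READ, all established connections receive the WRITE.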
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ca28b56b7a2f..d2d61af034ec 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -304,15 +304,21 @@ void req_may_be_completed(struct drbd_request *req, struct bio_and_error *m)
 	/* Update disk stats */
 	_drbd_end_io_acct(mdev, req);
 
-	/* if READ failed,
+	/* If READ failed,
 	 * have it be pushed back to the retry work queue,
-	 * so it will re-enter __drbd_make_request,
+	 * so it will re-enter __drbd_make_request(),
 	 * and be re-assigned to a suitable local or remote path,
 	 * or failed if we do not have access to good data anymore.
-	 * READA may fail.
+	 *
+	 * Unless it was failed early by __drbd_make_request(),
+	 * because no path was available, in which case
+	 * it was not even added to the transfer_log.
+	 *
+	 * READA may fail, and will not be retried.
+	 *
 	 * WRITE should have used all available paths already.
 	 */
-	if (!ok && rw == READ)
+	if (!ok && rw == READ && !list_empty(&req->tl_requests))
 		req->rq_state |= RQ_POSTPONED;
 
 	if (!(req->rq_state & RQ_POSTPONED)) {
@@ -725,19 +731,12 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int
 	return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
 }
 
-static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector)
+static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
+				    enum drbd_read_balancing rbm)
 {
-	enum drbd_read_balancing rbm;
 	struct backing_dev_info *bdi;
 	int stripe_shift;
 
-	if (mdev->state.pdsk < D_UP_TO_DATE)
-		return false;
-
-	rcu_read_lock();
-	rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
-	rcu_read_unlock();
-
 	switch (rbm) {
 	case RB_CONGESTED_REMOTE:
 		bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
@@ -798,17 +797,160 @@ static void complete_conflicting_writes(struct drbd_request *req)
 	finish_wait(&mdev->misc_wait, &wait);
 }
 
+/* called within req_lock and rcu_read_lock() */
+static bool conn_check_congested(struct drbd_conf *mdev)
+{
+	struct drbd_tconn *tconn = mdev->tconn;
+	struct net_conf *nc;
+	bool congested = false;
+	enum drbd_on_congestion on_congestion;
+
+	nc = rcu_dereference(tconn->net_conf);
+	on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+	if (on_congestion == OC_BLOCK ||
+	    tconn->agreed_pro_version < 96)
+		return false;
+
+	if (nc->cong_fill &&
+	    atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
+		dev_info(DEV, "Congestion-fill threshold reached\n");
+		congested = true;
+	}
+
+	if (mdev->act_log->used >= nc->cong_extents) {
+		dev_info(DEV, "Congestion-extents threshold reached\n");
+		congested = true;
+	}
+
+	if (congested) {
+		if (mdev->tconn->current_tle_writes)
+			/* start a new epoch for non-mirrored writes */
+			start_new_tl_epoch(mdev->tconn);
+
+		if (on_congestion == OC_PULL_AHEAD)
+			_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+		else /*nc->on_congestion == OC_DISCONNECT */
+			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+	}
+
+	return congested;
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	enum drbd_read_balancing rbm;
+
+	if (req->private_bio) {
+		if (!drbd_may_do_local_read(mdev,
+					req->i.sector, req->i.size)) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(mdev);
+		}
+	}
+
+	if (mdev->state.pdsk != D_UP_TO_DATE)
+		return false;
+
+	/* TODO: improve read balancing decisions, take into account drbd
+	 * protocol, pending requests etc. */
+
+	rcu_read_lock();
+	rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
+	rcu_read_unlock();
+
+	if (rbm == RB_PREFER_LOCAL && req->private_bio)
+		return false; /* submit locally */
+
+	if (req->private_bio == NULL)
+		return true;
+
+	if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(mdev);
+		}
+		return true;
+	}
+
+	return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	int remote, send_oos;
+
+	rcu_read_lock();
+	remote = drbd_should_do_remote(mdev->state);
+	if (remote) {
+		conn_check_congested(mdev);
+		remote = drbd_should_do_remote(mdev->state);
+	}
+	send_oos = drbd_should_send_out_of_sync(mdev->state);
+	rcu_read_unlock();
+
+	if (!remote && !send_oos)
+		return 0;
+
+	D_ASSERT(!(remote && send_oos));
+
+	if (remote) {
+		_req_mod(req, TO_BE_SENT);
+		_req_mod(req, QUEUE_FOR_NET_WRITE);
+	} else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
+		_req_mod(req, QUEUE_FOR_SEND_OOS);
+
+	return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	struct bio *bio = req->private_bio;
+	const int rw = bio_rw(bio);
+
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+
+	/* State may have changed since we grabbed our reference on the
+	 * ->ldev member. Double check, and short-circuit to endio.
+	 * In case the last activity log transaction failed to get on
+	 * stable storage, and this is a WRITE, we may not even submit
+	 * this bio. */
+	if (get_ldev(mdev)) {
+		if (drbd_insert_fault(mdev,
+				      rw == WRITE ? DRBD_FAULT_DT_WR
+				    : rw == READ ? DRBD_FAULT_DT_RD
+				    : DRBD_FAULT_DT_RA))
+			bio_endio(bio, -EIO);
+		else
+			generic_make_request(bio);
+		put_ldev(mdev);
+	} else
+		bio_endio(bio, -EIO);
+}
+
 int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
 {
 	const int rw = bio_rw(bio);
-	const int size = bio->bi_size;
-	const sector_t sector = bio->bi_sector;
+	struct bio_and_error m = { NULL, };
 	struct drbd_request *req;
-	struct net_conf *nc;
-	int local, remote, send_oos = 0;
-	int err = 0;
-	int ret = 0;
-	union drbd_dev_state s;
+	bool no_remote = false;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -822,70 +964,23 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 	}
 	req->start_time = start_time;
 
-	local = get_ldev(mdev);
-	if (!local) {
-		bio_put(req->private_bio); /* or we get a bio leak */
+	if (!get_ldev(mdev)) {
+		bio_put(req->private_bio);
 		req->private_bio = NULL;
 	}
-	if (rw == WRITE) {
-		remote = 1;
-	} else {
-		/* READ || READA */
-		if (local) {
-			if (!drbd_may_do_local_read(mdev, sector, size) ||
-			    remote_due_to_read_balancing(mdev, sector)) {
-				/* we could kick the syncer to
-				 * sync this extent asap, wait for
-				 * it, then continue locally.
-				 * Or just issue the request remotely.
-				 */
-				local = 0;
-				bio_put(req->private_bio);
-				req->private_bio = NULL;
-				put_ldev(mdev);
-			}
-		}
-		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
-	}
-
-	/* If we have a disk, but a READA request is mapped to remote,
-	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
-	 * Just fail that READA request right here.
-	 *
-	 * THINK: maybe fail all READA when not local?
-	 * or make this configurable...
-	 * if network is slow, READA won't do any good.
-	 */
-	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
-		err = -EWOULDBLOCK;
-		goto fail_and_free_req;
-	}
 
 	/* For WRITES going to the local disk, grab a reference on the target
 	 * extent. This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
 	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	if (rw == WRITE && req->private_bio
+	&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, &req->i);
 	}
 
-	s = mdev->state;
-	remote = remote && drbd_should_do_remote(s);
-	send_oos = rw == WRITE && drbd_should_send_out_of_sync(s);
-	D_ASSERT(!(remote && send_oos));
-
-	if (!(local || remote) && !drbd_suspended(mdev)) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
-		err = -EIO;
-		goto fail_free_complete;
-	}
-
-	/* GOOD, everything prepared, grab the spin_lock */
 	spin_lock_irq(&mdev->tconn->req_lock);
-
 	if (rw == WRITE) {
 		/* This may temporarily give up the req_lock,
 		 * but will re-aquire it before it returns here.
@@ -893,53 +988,28 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 		complete_conflicting_writes(req);
 	}
 
-	if (drbd_suspended(mdev)) {
-		/* If we got suspended, use the retry mechanism in
-		   drbd_make_request() to restart processing of this
-		   bio. In the next call to drbd_make_request
-		   we sleep in inc_ap_bio() */
-		ret = 1;
-		spin_unlock_irq(&mdev->tconn->req_lock);
-		goto fail_free_complete;
-	}
-
-	if (remote || send_oos) {
-		remote = drbd_should_do_remote(mdev->state);
-		send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state);
-		D_ASSERT(!(remote && send_oos));
+	/* no more giving up req_lock from now on! */
 
-		if (!(remote || send_oos))
-			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
-		if (!(local || remote)) {
-			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
-			spin_unlock_irq(&mdev->tconn->req_lock);
-			err = -EIO;
-			goto fail_free_complete;
+	if (drbd_suspended(mdev)) {
+		/* push back and retry: */
+		req->rq_state |= RQ_POSTPONED;
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
 		}
+		goto out;
 	}
 
 	/* Update disk stats */
 	_drbd_start_io_acct(mdev, req, bio);
 
-	/* NOTE
-	 * Actually, 'local' may be wrong here already, since we may have failed
-	 * to write to the meta data, and may become wrong anytime because of
-	 * local io-error for some other request, which would lead to us
-	 * "detaching" the local disk.
-	 *
-	 * 'remote' may become wrong any time because the network could fail.
-	 *
-	 * This is a harmless race condition, though, since it is handled
-	 * correctly at the appropriate places; so it just defers the failure
-	 * of the respective operation.
-	 */
-
-	/* mark them early for readability.
-	 * this just sets some state flags. */
-	if (remote)
-		_req_mod(req, TO_BE_SENT);
-	if (local)
-		_req_mod(req, TO_BE_SUBMITTED);
+	/* We fail READ/READA early, if we can not serve it.
+	 * We must do this before req is registered on any lists.
+	 * Otherwise, req_may_be_completed() will queue failed READ for retry. */
+	if (rw != WRITE) {
+		if (!do_remote_read(req) && !req->private_bio)
+			goto nodata;
+	}
 
 	/* which transfer log epoch does this belong to? */
 	req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
@@ -948,90 +1018,43 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 
 	list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
 
-	/* NOTE remote first: to get the concurrent write detection right,
-	 * we must register the request before start of local IO. */
-	if (remote) {
-		/* either WRITE and C_CONNECTED,
-		 * or READ, and no local disk,
-		 * or READ, but not in sync.
-		 */
-		_req_mod(req, (rw == WRITE)
-				? QUEUE_FOR_NET_WRITE
-				: QUEUE_FOR_NET_READ);
+	if (rw == WRITE) {
+		if (!drbd_process_write_request(req))
+			no_remote = true;
+	} else {
+		/* We either have a private_bio, or we can read from remote.
+		 * Otherwise we had done the goto nodata above. */
+		if (req->private_bio == NULL) {
+			_req_mod(req, TO_BE_SENT);
+			_req_mod(req, QUEUE_FOR_NET_READ);
+		} else
+			no_remote = true;
 	}
-	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
-		_req_mod(req, QUEUE_FOR_SEND_OOS);
 
-	rcu_read_lock();
-	nc = rcu_dereference(mdev->tconn->net_conf);
-	if (remote &&
-	    nc->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) {
-		int congested = 0;
-
-		if (nc->cong_fill &&
-		    atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
-			dev_info(DEV, "Congestion-fill threshold reached\n");
-			congested = 1;
-		}
-
-		if (mdev->act_log->used >= nc->cong_extents) {
-			dev_info(DEV, "Congestion-extents threshold reached\n");
-			congested = 1;
-		}
-
-		if (congested) {
-			if (mdev->tconn->current_tle_writes)
-				/* start a new epoch for non-mirrored writes */
-				start_new_tl_epoch(mdev->tconn);
-
-			if (nc->on_congestion == OC_PULL_AHEAD)
-				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
-			else /*nc->on_congestion == OC_DISCONNECT */
-				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
-		}
+	if (req->private_bio) {
+		/* needs to be marked within the same spinlock */
+		_req_mod(req, TO_BE_SUBMITTED);
+		/* but we need to give up the spinlock to submit */
+		spin_unlock_irq(&mdev->tconn->req_lock);
+		drbd_submit_req_private_bio(req);
+		/* once we have submitted, we must no longer look at req,
+		 * it may already be destroyed. */
+		return 0;
+	} else if (no_remote) {
+nodata:
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		/* A write may have been queued for send_oos, however.
+		 * So we can not simply free it, we must go through req_may_be_completed() */
 	}
-	rcu_read_unlock();
 
+out:
+	req_may_be_completed(req, &m);
 	spin_unlock_irq(&mdev->tconn->req_lock);
 
-	if (local) {
-		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-
-		/* State may have changed since we grabbed our reference on the
-		 * mdev->ldev member. Double check, and short-circuit to endio.
-		 * In case the last activity log transaction failed to get on
-		 * stable storage, and this is a WRITE, we may not even submit
-		 * this bio. */
-		if (get_ldev(mdev)) {
-			if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-					     : rw == READ ? DRBD_FAULT_DT_RD
-					     : DRBD_FAULT_DT_RA))
-				bio_endio(req->private_bio, -EIO);
-			else
-				generic_make_request(req->private_bio);
-			put_ldev(mdev);
-		} else
-			bio_endio(req->private_bio, -EIO);
-	}
-
+	if (m.bio)
+		complete_master_bio(mdev, &m);
 	return 0;
-
-fail_free_complete:
-	if (req->rq_state & RQ_IN_ACT_LOG)
-		drbd_al_complete_io(mdev, &req->i);
-fail_and_free_req:
-	if (local) {
-		bio_put(req->private_bio);
-		req->private_bio = NULL;
-		put_ldev(mdev);
-	}
-	if (!ret)
-		bio_endio(bio, err);
-
-	drbd_req_free(req);
-	dec_ap_bio(mdev);
-
-	return ret;
 }
 
 int drbd_make_request(struct request_queue *q, struct bio *bio)