Diffstat (limited to 'drivers/block/drbd/drbd_req.c')
-rw-r--r--  drivers/block/drbd/drbd_req.c | 192
1 file changed, 161 insertions(+), 31 deletions(-)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2b8303ad63c9..c24379ffd4e3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -34,14 +34,14 @@
 static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
 
 /* Update disk stats at start of I/O request */
-static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
 {
-	const int rw = bio_data_dir(bio);
+	const int rw = bio_data_dir(req->master_bio);
 	int cpu;
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
-	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
 	(void) cpu; /* The macro invocations above want the cpu argument, I do not like
 		       the compiler warning about cpu only assigned but never used... */
 	part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 		else
 			root = &mdev->read_requests;
 		drbd_remove_request_interval(root, req);
-	} else if (!(s & RQ_POSTPONED))
-		D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+	}
 
 	/* Before we can signal completion to the upper layers,
 	 * we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(req->rq_state & RQ_NET_PENDING);
 		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
 		break;
+
+	case QUEUE_AS_DRBD_BARRIER:
+		start_new_tl_epoch(mdev->tconn);
+		mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+		break;
 	};
 
 	return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
 	bool congested = false;
 	enum drbd_on_congestion on_congestion;
 
+	rcu_read_lock();
 	nc = rcu_dereference(tconn->net_conf);
 	on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+	rcu_read_unlock();
 	if (on_congestion == OC_BLOCK ||
 	    tconn->agreed_pro_version < 96)
 		return;
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
 	struct drbd_conf *mdev = req->w.mdev;
 	int remote, send_oos;
 
-	rcu_read_lock();
 	remote = drbd_should_do_remote(mdev->state);
-	if (remote) {
-		maybe_pull_ahead(mdev);
-		remote = drbd_should_do_remote(mdev->state);
-	}
 	send_oos = drbd_should_send_out_of_sync(mdev->state);
-	rcu_read_unlock();
 
 	/* Need to replicate writes. Unless it is an empty flush,
 	 * which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
 		/* The only size==0 bios we expect are empty flushes. */
 		D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
 		if (remote)
-			start_new_tl_epoch(mdev->tconn);
-		return 0;
+			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
+		return remote;
 	}
 
 	if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 		bio_endio(bio, -EIO);
 }
 
-void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
 {
-	const int rw = bio_rw(bio);
-	struct bio_and_error m = { NULL, };
+	spin_lock(&mdev->submit.lock);
+	list_add_tail(&req->tl_requests, &mdev->submit.writes);
+	spin_unlock(&mdev->submit.lock);
+	queue_work(mdev->submit.wq, &mdev->submit.worker);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+struct drbd_request *
+drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+	const int rw = bio_data_dir(bio);
 	struct drbd_request *req;
-	bool no_remote = false;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
 		 * if user cannot handle io errors, that's not our business. */
 		dev_err(DEV, "could not kmalloc() req\n");
 		bio_endio(bio, -ENOMEM);
-		return;
+		return ERR_PTR(-ENOMEM);
 	}
 	req->start_time = start_time;
 
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
 		req->private_bio = NULL;
 	}
 
-	/* For WRITES going to the local disk, grab a reference on the target
-	 * extent. This waits for any resync activity in the corresponding
-	 * resync extent to finish, and, if necessary, pulls in the target
-	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates.
-	 * Empty flushes don't need to go into the activity log, they can only
-	 * flush data for pending writes which are already in there. */
+	/* Update disk stats */
+	_drbd_start_io_acct(mdev, req);
+
 	if (rw == WRITE && req->private_bio && req->i.size
 	    && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+		if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
+			drbd_queue_write(mdev, req);
+			return NULL;
+		}
 		req->rq_state |= RQ_IN_ACT_LOG;
-		drbd_al_begin_io(mdev, &req->i);
 	}
 
+	return req;
+}
+
+static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	const int rw = bio_rw(req->master_bio);
+	struct bio_and_error m = { NULL, };
+	bool no_remote = false;
+
 	spin_lock_irq(&mdev->tconn->req_lock);
 	if (rw == WRITE) {
 		/* This may temporarily give up the req_lock,
 		 * but will re-aquire it before it returns here.
 		 * Needs to be before the check on drbd_suspended() */
 		complete_conflicting_writes(req);
+		/* no more giving up req_lock from now on! */
+
+		/* check for congestion, and potentially stop sending
+		 * full data updates, but start sending "dirty bits" only. */
+		maybe_pull_ahead(mdev);
 	}
 
-	/* no more giving up req_lock from now on! */
 
 	if (drbd_suspended(mdev)) {
 		/* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
 		goto out;
 	}
 
-	/* Update disk stats */
-	_drbd_start_io_acct(mdev, req, bio);
-
 	/* We fail READ/READA early, if we can not serve it.
 	 * We must do this before req is registered on any lists.
 	 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
-	return;
+}
+
+void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+	struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
+	if (IS_ERR_OR_NULL(req))
+		return;
+	drbd_send_and_submit(mdev, req);
+}
+
+static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
+{
+	struct drbd_request *req, *tmp;
+	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+		const int rw = bio_data_dir(req->master_bio);
+
+		if (rw == WRITE /* rw != WRITE should not even end up here! */
+		&& req->private_bio && req->i.size
+		&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
+			if (!drbd_al_begin_io_fastpath(mdev, &req->i))
+				continue;
+
+			req->rq_state |= RQ_IN_ACT_LOG;
+		}
+
+		list_del_init(&req->tl_requests);
+		drbd_send_and_submit(mdev, req);
+	}
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
+					    struct list_head *incoming,
+					    struct list_head *pending)
+{
+	struct drbd_request *req, *tmp;
+	int wake = 0;
+	int err;
+
+	spin_lock_irq(&mdev->al_lock);
+	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+		err = drbd_al_begin_io_nonblock(mdev, &req->i);
+		if (err == -EBUSY)
+			wake = 1;
+		if (err)
+			continue;
+		req->rq_state |= RQ_IN_ACT_LOG;
+		list_move_tail(&req->tl_requests, pending);
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	if (wake)
+		wake_up(&mdev->al_wait);
+
+	return !list_empty(pending);
+}
+
+void do_submit(struct work_struct *ws)
+{
+	struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
+	LIST_HEAD(incoming);
+	LIST_HEAD(pending);
+	struct drbd_request *req, *tmp;
+
+	for (;;) {
+		spin_lock(&mdev->submit.lock);
+		list_splice_tail_init(&mdev->submit.writes, &incoming);
+		spin_unlock(&mdev->submit.lock);
+
+		submit_fast_path(mdev, &incoming);
+		if (list_empty(&incoming))
+			break;
+
+		wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
+		/* Maybe more was queued, while we prepared the transaction?
+		 * Try to stuff them into this transaction as well.
+		 * Be strictly non-blocking here, no wait_event, we already
+		 * have something to commit.
+		 * Stop if we don't make any more progres.
+		 */
+		for (;;) {
+			LIST_HEAD(more_pending);
+			LIST_HEAD(more_incoming);
+			bool made_progress;
+
+			/* It is ok to look outside the lock,
+			 * it's only an optimization anyways */
+			if (list_empty(&mdev->submit.writes))
+				break;
+
+			spin_lock(&mdev->submit.lock);
+			list_splice_tail_init(&mdev->submit.writes, &more_incoming);
+			spin_unlock(&mdev->submit.lock);
+
+			if (list_empty(&more_incoming))
+				break;
+
+			made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
+
+			list_splice_tail_init(&more_pending, &pending);
+			list_splice_tail_init(&more_incoming, &incoming);
+
+			if (!made_progress)
+				break;
+		}
+		drbd_al_begin_io_commit(mdev, false);
+
+		list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+			list_del_init(&req->tl_requests);
+			drbd_send_and_submit(mdev, req);
+		}
+	}
 }
 
 void drbd_make_request(struct request_queue *q, struct bio *bio)
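Note: the queue/submit path introduced above relies on per-device submit machinery (mdev->submit.lock, mdev->submit.writes, mdev->submit.wq and the submit.worker work item that dispatches do_submit()) which is defined and initialized elsewhere in the DRBD tree and is not part of this diff. The sketch below is only an illustration of what that wiring plausibly looks like; the struct name drbd_submit and the helper drbd_init_submit() are assumed names, not the actual drbd code.

/* Hypothetical sketch, not the real drbd_main.c code: the pieces
 * drbd_queue_write() and do_submit() depend on are a spinlock-protected
 * list of queued write requests plus a dedicated single-threaded
 * workqueue whose work item runs do_submit().
 */
struct drbd_submit {
	struct workqueue_struct *wq;	/* used as mdev->submit.wq */
	struct work_struct worker;	/* queued by drbd_queue_write() */
	spinlock_t lock;		/* protects .writes */
	struct list_head writes;	/* links struct drbd_request.tl_requests */
};

static int drbd_init_submit(struct drbd_conf *mdev)	/* assumed helper name */
{
	spin_lock_init(&mdev->submit.lock);
	INIT_LIST_HEAD(&mdev->submit.writes);
	INIT_WORK(&mdev->submit.worker, do_submit);
	mdev->submit.wq = create_singlethread_workqueue("drbd_submit");
	return mdev->submit.wq ? 0 : -ENOMEM;
}

With the work item embedded in the per-device structure like this, do_submit() can recover its device via container_of(ws, struct drbd_conf, submit.worker), which is exactly how the new worker in the diff locates mdev.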