Diffstat (limited to 'drivers/block/drbd/drbd_req.c')
-rw-r--r--  drivers/block/drbd/drbd_req.c | 132
1 file changed, 83 insertions(+), 49 deletions(-)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a0f314086e5..9c5c84946b05 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
 	const int rw = bio_data_dir(bio);
 	int cpu;
 	cpu = part_stat_lock();
+	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
 	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
 	part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 {
 	const unsigned long s = req->rq_state;
 	struct drbd_conf *mdev = req->mdev;
-	/* only WRITES may end up here without a master bio (on barrier ack) */
-	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
 
 	/* we must not complete the master bio, while it is
 	 * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_NET_PENDING)
 		return;
-	if (s & RQ_LOCAL_PENDING)
+	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
 		return;
 
 	if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		req->master_bio = NULL;
 	}
 
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
 	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
 		/* this is disconnected (local only) operation,
 		 * or protocol C P_WRITE_ACK,
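For orientation, here is a minimal userspace model of the completion gate these two hunks establish (illustrative only; the helper names are invented, the RQ_* flag names are DRBD's, and the bit values are arbitrary): a request whose local I/O has been aborted may now complete its master bio even though RQ_LOCAL_PENDING is still set, while the extra check added after the master_bio handling keeps the request object itself alive until the local bio really returns.

/* Illustrative model only -- not drbd code.  Flag values are arbitrary. */
#include <stdbool.h>
#include <stdio.h>

#define RQ_LOCAL_PENDING  (1UL << 0)
#define RQ_LOCAL_ABORTED  (1UL << 1)
#define RQ_NET_PENDING    (1UL << 2)

/* may the master bio be completed towards upper layers? */
static bool may_complete_master_bio(unsigned long s)
{
	if (s & RQ_NET_PENDING)
		return false;
	/* after the patch: aborted local I/O no longer blocks completion */
	if ((s & RQ_LOCAL_PENDING) && !(s & RQ_LOCAL_ABORTED))
		return false;
	return true;
}

/* may the request itself be destroyed? only once local I/O is really done */
static bool may_put_request(unsigned long s)
{
	return may_complete_master_bio(s) && !(s & RQ_LOCAL_PENDING);
}

int main(void)
{
	unsigned long s = RQ_LOCAL_PENDING | RQ_LOCAL_ABORTED;
	printf("complete master bio: %d, put request: %d\n",
	       may_complete_master_bio(s), may_put_request(s));
	return 0;
}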
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case completed_ok:
-		if (bio_data_dir(req->master_bio) == WRITE)
+		if (req->rq_state & RQ_WRITE)
 			mdev->writ_cnt += req->size>>9;
 		else
 			mdev->read_cnt += req->size>>9;
@@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
+		break;
+
+	case abort_disk_io:
+		req->rq_state |= RQ_LOCAL_ABORTED;
+		if (req->rq_state & RQ_WRITE)
+			_req_may_be_done_not_susp(req, m);
+		else
+			goto goto_queue_for_net_read;
 		break;
 
 	case write_completed_with_error:
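A small stand-alone sketch of the dispatch the new abort_disk_io event performs, with the surrounding DRBD machinery stubbed out (the function name and printf placeholders are invented): a write whose local I/O is being aborted immediately tries to complete, whereas an aborted read jumps to the same queue-for-net-read path that read_completed_with_error already uses, so the data can still be fetched from the peer.

/* Illustrative model only -- not drbd code. */
#include <stdio.h>

#define RQ_WRITE          (1UL << 0)
#define RQ_LOCAL_ABORTED  (1UL << 1)

enum req_event { ABORT_DISK_IO, READ_COMPLETED_WITH_ERROR };

static void mod_request(unsigned long *rq_state, enum req_event what)
{
	switch (what) {
	case ABORT_DISK_IO:
		*rq_state |= RQ_LOCAL_ABORTED;
		if (*rq_state & RQ_WRITE)
			printf("aborted write: try to complete now\n");
		else
			goto queue_for_net_read;
		break;

	case READ_COMPLETED_WITH_ERROR:
	queue_for_net_read:
		printf("queue the read to be served by the peer\n");
		break;
	}
}

int main(void)
{
	unsigned long read_req = 0;	/* RQ_WRITE not set: a read */
	mod_request(&read_req, ABORT_DISK_IO);
	return 0;
}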
@@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 		__drbd_chk_io_error(mdev, false);
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_completed_with_error:
@@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
 		__drbd_chk_io_error(mdev, false);
-		put_ldev(mdev);
+
+	goto_queue_for_net_read:
 
 		/* no point in retrying if there is no good remote data,
 		 * or we have no connection. */
@@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		drbd_queue_work(&mdev->data.work, &req->w);
 		break;
 
-	case oos_handed_to_network:
-		/* actually the same */
+	case read_retry_remote_canceled:
 	case send_canceled:
-		/* treat it the same */
 	case send_failed:
 		/* real cleanup will be done from tl_clear. just update flags
 		 * so it is no longer marked as on the worker queue */
@@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		}
 		req->rq_state &= ~RQ_NET_QUEUED;
 		req->rq_state |= RQ_NET_SENT;
-		/* because _drbd_send_zc_bio could sleep, and may want to
-		 * dereference the bio even after the "write_acked_by_peer" and
-		 * "completed_ok" events came in, once we return from
-		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
-		 * whether it is done already, and end it. */
 		_req_may_be_done_not_susp(req, m);
 		break;
 
-	case read_retry_remote_canceled:
+	case oos_handed_to_network:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
 		req->rq_state &= ~RQ_NET_QUEUED;
-		/* fall through, in case we raced with drbd_disconnect */
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
 	case connection_lost_while_pending:
 		/* transfer log cleanup after connection loss */
 		/* assert something? */
@@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
-	case write_acked_by_peer_and_sis:
-		req->rq_state |= RQ_NET_SIS;
 	case conflict_discarded_by_peer:
 		/* for discarded conflicting writes of multiple primaries,
 		 * there is no need to keep anything in the tl, potential
@@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			(unsigned long long)req->sector, req->size);
 		req->rq_state |= RQ_NET_DONE;
 		/* fall through */
+	case write_acked_by_peer_and_sis:
 	case write_acked_by_peer:
+		if (what == write_acked_by_peer_and_sis)
+			req->rq_state |= RQ_NET_SIS;
 		/* protocol C; successfully written on peer.
-		 * Nothing to do here.
+		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
-		 * for volatile write-back caches on lower level devices.
-		 *
-		 * A barrier request is expected to have forced all prior
-		 * requests onto stable storage, so completion of a barrier
-		 * request could set NET_DONE right here, and not wait for the
-		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+		 * for volatile write-back caches on lower level devices. */
 
-		/* this makes it effectively the same as for: */
 	case recv_acked_by_peer:
 		/* protocol B; pretends to be successfully written on peer.
 		 * see also notes above in handed_over_to_network about
@@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	int local, remote, send_oos = 0;
 	int err = -EIO;
 	int ret = 0;
+	union drbd_state s;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		drbd_al_begin_io(mdev, sector);
 	}
 
-	remote = remote && drbd_should_do_remote(mdev->state);
-	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+	s = mdev->state;
+	remote = remote && drbd_should_do_remote(s);
+	send_oos = rw == WRITE && drbd_should_send_oos(s);
 	D_ASSERT(!(remote && send_oos));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
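The point of caching mdev->state in a local union drbd_state is that drbd_should_do_remote() and drbd_should_send_oos() are now evaluated against one consistent snapshot; with two separate reads, a concurrent state change could make the D_ASSERT(!(remote && send_oos)) fire spuriously. A generic sketch of the pattern, with invented state bits standing in for the real predicates:

/* Illustrative model only -- not drbd code. */
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned int shared_state;	/* stand-in for mdev->state */

/* invented bits -- the real predicates look at connection/replication state */
static bool should_do_remote(unsigned int s) { return s & 0x1; }
static bool should_send_oos(unsigned int s)  { return s & 0x2; }

static void decide(bool is_write, bool *remote, bool *send_oos)
{
	unsigned int s = atomic_load(&shared_state);	/* one snapshot */

	*remote   = *remote && should_do_remote(s);
	*send_oos = is_write && should_send_oos(s);
	/* both predicates saw the same value of s, so their combination is
	 * consistent even if shared_state changed while we were deciding */
}

int main(void)
{
	bool remote = true, send_oos = false;
	atomic_store(&shared_state, 0x1);	/* "connected" in this toy model */
	decide(true, &remote, &send_oos);
	return remote && !send_oos ? 0 : 1;
}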
@@ -867,7 +871,7 @@ allocate_barrier:
 
 	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
-		   generic_make_request() to restart processing of this
+		   drbd_make_request() to restart processing of this
 		   bio. In the next call to drbd_make_request
 		   we sleep in inc_ap_bio() */
 		ret = 1;
@@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	 */
 	D_ASSERT(bio->bi_size > 0);
 	D_ASSERT((bio->bi_size & 0x1ff) == 0);
-	D_ASSERT(bio->bi_idx == 0);
 
 	/* to make some things easier, force alignment of requests within the
 	 * granularity of our hash tables */
@@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
 
 	if (likely(s_enr == e_enr)) {
-		inc_ap_bio(mdev, 1);
-		drbd_make_request_common(mdev, bio, start_time);
+		do {
+			inc_ap_bio(mdev, 1);
+		} while (drbd_make_request_common(mdev, bio, start_time));
 		return;
 	}
 
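The single-extent fast path becomes a retry loop: each pass takes a fresh application-bio reference with inc_ap_bio(), and drbd_make_request_common() returning nonzero (the ret = 1 suspended path seen above, which presumably releases its reference before returning) asks the caller to try again. A generic sketch of that contract, with invented names:

/* Illustrative model only -- not drbd code. */
#include <stdio.h>

static int attempts;

static void get_ref(void) { printf("ref++\n"); }
static void put_ref(void) { printf("ref--\n"); }

/* returns 1 -> caller must retry, 0 -> done */
static int submit_once(void)
{
	if (++attempts < 3) {
		put_ref();		/* drop the ref taken for this attempt */
		return 1;		/* "suspended": ask the caller to retry */
	}
	printf("submitted on attempt %d\n", attempts);
	return 0;
}

int main(void)
{
	do {
		get_ref();		/* mirrors inc_ap_bio(mdev, 1) */
	} while (submit_once());	/* mirrors drbd_make_request_common() */
	return 0;
}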
@@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data)
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	struct drbd_request *req; /* oldest request */
 	struct list_head *le;
-	unsigned long et = 0; /* effective timeout = ko_count * timeout */
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
 
 	if (get_net_conf(mdev)) {
-		et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+		if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+			ent = mdev->net_conf->timeout*HZ/10
+				* mdev->net_conf->ko_count;
 		put_net_conf(mdev);
 	}
-	if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+	if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+		put_ldev(mdev);
+	}
+	et = min_not_zero(dt, ent);
+
+	if (!et)
 		return; /* Recurring timer stopped */
 
+	now = jiffies;
+
 	spin_lock_irq(&mdev->req_lock);
 	le = &mdev->oldest_tle->requests;
 	if (list_empty(le)) {
 		spin_unlock_irq(&mdev->req_lock);
-		mod_timer(&mdev->request_timer, jiffies + et);
+		mod_timer(&mdev->request_timer, now + et);
 		return;
 	}
 
 	le = le->prev;
 	req = list_entry(le, struct drbd_request, tl_requests);
-	if (time_is_before_eq_jiffies(req->start_time + et)) {
-		if (req->rq_state & RQ_NET_PENDING) {
-			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
-			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
-		} else {
-			dev_warn(DEV, "Local backing block device frozen?\n");
-			mod_timer(&mdev->request_timer, jiffies + et);
-		}
-	} else {
-		mod_timer(&mdev->request_timer, req->start_time + et);
-	}
 
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   resp. the local disk,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (resp. disk was attached)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is catched by
+	 * !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req->rq_state & RQ_NET_PENDING &&
+	     time_after(now, req->start_time + ent) &&
+	    !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+		dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+	     time_after(now, req->start_time + dt) &&
+	    !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+		dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(mdev, 1);
+	}
+	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
 	spin_unlock_irq(&mdev->req_lock);
+	mod_timer(&mdev->request_timer, nt);
 }
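The timer now derives its effective timeout from two independent knobs: the network limit ent = timeout * ko_count (only counted while the connection is at least C_WF_REPORT_PARAMS) and the new per-disk disk_timeout dt (only while a local disk is attached), combined with min_not_zero() so that a disabled (zero) value never wins. The guard against firing right after a reconnect or re-attach doubles as the 32-bit jiffies wrap-around defence described in the comment. A self-contained model with example numbers (the configuration values are illustrative, not defaults):

/* Illustrative model only -- mimics the effective-timeout selection and the
 * time_in_range() wrap-around guard from request_timer_fn() above. */
#include <stdio.h>

#define HZ 250

/* like the kernel's min_not_zero(): smaller of the two, ignoring zeros */
static unsigned long min_not_zero(unsigned long a, unsigned long b)
{
	if (!a) return b;
	if (!b) return a;
	return a < b ? a : b;
}

/* like time_in_range(now, start, end) for wrapping counters */
static int in_range(unsigned long now, unsigned long start, unsigned long end)
{
	return (unsigned long)(now - start) <= (unsigned long)(end - start);
}

int main(void)
{
	/* example configuration: net timeout 6 (= 0.6s), ko-count 7,
	 * disk-timeout 30 (= 3s); all in units of 0.1s as in drbd.conf */
	unsigned long ent = 6 * HZ / 10 * 7;	/* 1050 jiffies = 4.2s */
	unsigned long dt  = 30 * HZ / 10;	/*  750 jiffies = 3.0s */
	unsigned long et  = min_not_zero(dt, ent);

	printf("effective timeout: %lu jiffies\n", et);	/* 750 */

	/* the guard: a request only times out if the connection/disk has
	 * also been up for longer than the timeout, which doubles as a
	 * cheap defence against 32-bit jiffies wrap-around */
	unsigned long now = 100000, last_reconnect = 99900;
	printf("grace period still running: %d\n",
	       in_range(now, last_reconnect, last_reconnect + ent));	/* 1 */
	return 0;
}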