path: root/drivers/block/drbd/drbd_req.c
author	Lars Ellenberg <lars.ellenberg@linbit.com>	2012-03-29 11:04:14 -0400
committer	Philipp Reisner <philipp.reisner@linbit.com>	2012-11-08 10:58:35 -0500
commit	5da9c8364443797ece9393670fb7ab69cff055ed (patch)
tree	90a7206ac7e446c46863ad6caefbd014c94ec1d8 /drivers/block/drbd/drbd_req.c
parent	b6dd1a89767bc33e9c98b3195f8925b46c5c95f3 (diff)
drbd: better separate WRITE and READ code paths in drbd_make_request
cherry-picked and adapted from drbd 9 devel branch

READs will be interesting to at most one connection,
WRITEs should be interesting for all established connections.

Introduce some helper functions to hopefully make this easier to follow.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_req.c')
-rw-r--r--	drivers/block/drbd/drbd_req.c	399
1 file changed, 211 insertions(+), 188 deletions(-)
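The routing rule the commit message describes, a READ is served locally or by at most one up-to-date peer, while a WRITE must be replicated to every established connection, can be illustrated with a small standalone model. The following C program is only an illustration of that rule, not DRBD code; all names in it (model_peer, route_read, route_write) are made up for this sketch.

/*
 * Standalone model of the routing policy described above.
 * Not DRBD code; types and helpers are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_peer {
	const char *name;
	bool established;	/* connection usable for replication (WRITE targets) */
	bool uptodate;		/* peer data usable for serving READs */
};

/* READ: pick at most one peer that can actually serve the data. */
static const struct model_peer *route_read(const struct model_peer *peers, int n)
{
	for (int i = 0; i < n; i++)
		if (peers[i].established && peers[i].uptodate)
			return &peers[i];
	return NULL;	/* fall back to the local disk, or fail */
}

/* WRITE: count how many established connections must receive the data. */
static int route_write(const struct model_peer *peers, int n)
{
	int targets = 0;

	for (int i = 0; i < n; i++)
		if (peers[i].established)
			targets++;
	return targets;
}

int main(void)
{
	struct model_peer peers[] = {
		{ "peer0", true,  true  },
		{ "peer1", true,  false },
		{ "peer2", false, false },
	};
	const struct model_peer *rd = route_read(peers, 3);

	printf("READ  -> %s\n", rd ? rd->name : "local only / no data");
	printf("WRITE -> %d connection(s)\n", route_write(peers, 3));
	return 0;
}

Built with any C99 compiler, the sample table prints "READ  -> peer0" and "WRITE -> 2 connection(s)": at most one connection serves the READ, all established connections receive the WRITE.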
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ca28b56b7a2f..d2d61af034ec 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -304,15 +304,21 @@ void req_may_be_completed(struct drbd_request *req, struct bio_and_error *m)
 	/* Update disk stats */
 	_drbd_end_io_acct(mdev, req);
 
-	/* if READ failed,
+	/* If READ failed,
 	 * have it be pushed back to the retry work queue,
-	 * so it will re-enter __drbd_make_request,
+	 * so it will re-enter __drbd_make_request(),
 	 * and be re-assigned to a suitable local or remote path,
 	 * or failed if we do not have access to good data anymore.
-	 * READA may fail.
+	 *
+	 * Unless it was failed early by __drbd_make_request(),
+	 * because no path was available, in which case
+	 * it was not even added to the transfer_log.
+	 *
+	 * READA may fail, and will not be retried.
+	 *
 	 * WRITE should have used all available paths already.
 	 */
-	if (!ok && rw == READ)
+	if (!ok && rw == READ && !list_empty(&req->tl_requests))
 		req->rq_state |= RQ_POSTPONED;
 
 	if (!(req->rq_state & RQ_POSTPONED)) {
@@ -725,19 +731,12 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int
 	return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
 }
 
-static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector)
+static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
+				    enum drbd_read_balancing rbm)
 {
-	enum drbd_read_balancing rbm;
 	struct backing_dev_info *bdi;
 	int stripe_shift;
 
-	if (mdev->state.pdsk < D_UP_TO_DATE)
-		return false;
-
-	rcu_read_lock();
-	rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
-	rcu_read_unlock();
-
 	switch (rbm) {
 	case RB_CONGESTED_REMOTE:
 		bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
@@ -798,17 +797,160 @@ static void complete_conflicting_writes(struct drbd_request *req)
 	finish_wait(&mdev->misc_wait, &wait);
 }
 
+/* called within req_lock and rcu_read_lock() */
+static bool conn_check_congested(struct drbd_conf *mdev)
+{
+	struct drbd_tconn *tconn = mdev->tconn;
+	struct net_conf *nc;
+	bool congested = false;
+	enum drbd_on_congestion on_congestion;
+
+	nc = rcu_dereference(tconn->net_conf);
+	on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+	if (on_congestion == OC_BLOCK ||
+	    tconn->agreed_pro_version < 96)
+		return false;
+
+	if (nc->cong_fill &&
+	    atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
+		dev_info(DEV, "Congestion-fill threshold reached\n");
+		congested = true;
+	}
+
+	if (mdev->act_log->used >= nc->cong_extents) {
+		dev_info(DEV, "Congestion-extents threshold reached\n");
+		congested = true;
+	}
+
+	if (congested) {
+		if (mdev->tconn->current_tle_writes)
+			/* start a new epoch for non-mirrored writes */
+			start_new_tl_epoch(mdev->tconn);
+
+		if (on_congestion == OC_PULL_AHEAD)
+			_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+		else /*nc->on_congestion == OC_DISCONNECT */
+			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+	}
+
+	return congested;
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	enum drbd_read_balancing rbm;
+
+	if (req->private_bio) {
+		if (!drbd_may_do_local_read(mdev,
+					req->i.sector, req->i.size)) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(mdev);
+		}
+	}
+
+	if (mdev->state.pdsk != D_UP_TO_DATE)
+		return false;
+
+	/* TODO: improve read balancing decisions, take into account drbd
+	 * protocol, pending requests etc. */
+
+	rcu_read_lock();
+	rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
+	rcu_read_unlock();
+
+	if (rbm == RB_PREFER_LOCAL && req->private_bio)
+		return false; /* submit locally */
+
+	if (req->private_bio == NULL)
+		return true;
+
+	if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(mdev);
+		}
+		return true;
+	}
+
+	return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	int remote, send_oos;
+
+	rcu_read_lock();
+	remote = drbd_should_do_remote(mdev->state);
+	if (remote) {
+		conn_check_congested(mdev);
+		remote = drbd_should_do_remote(mdev->state);
+	}
+	send_oos = drbd_should_send_out_of_sync(mdev->state);
+	rcu_read_unlock();
+
+	if (!remote && !send_oos)
+		return 0;
+
+	D_ASSERT(!(remote && send_oos));
+
+	if (remote) {
+		_req_mod(req, TO_BE_SENT);
+		_req_mod(req, QUEUE_FOR_NET_WRITE);
+	} else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
+		_req_mod(req, QUEUE_FOR_SEND_OOS);
+
+	return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->w.mdev;
+	struct bio *bio = req->private_bio;
+	const int rw = bio_rw(bio);
+
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+
+	/* State may have changed since we grabbed our reference on the
+	 * ->ldev member. Double check, and short-circuit to endio.
+	 * In case the last activity log transaction failed to get on
+	 * stable storage, and this is a WRITE, we may not even submit
+	 * this bio. */
+	if (get_ldev(mdev)) {
+		if (drbd_insert_fault(mdev,
+				      rw == WRITE ? DRBD_FAULT_DT_WR
+				    : rw == READ ? DRBD_FAULT_DT_RD
+				    : DRBD_FAULT_DT_RA))
+			bio_endio(bio, -EIO);
+		else
+			generic_make_request(bio);
+		put_ldev(mdev);
+	} else
+		bio_endio(bio, -EIO);
+}
+
 int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
 {
 	const int rw = bio_rw(bio);
-	const int size = bio->bi_size;
-	const sector_t sector = bio->bi_sector;
+	struct bio_and_error m = { NULL, };
 	struct drbd_request *req;
-	struct net_conf *nc;
-	int local, remote, send_oos = 0;
-	int err = 0;
-	int ret = 0;
-	union drbd_dev_state s;
+	bool no_remote = false;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -822,70 +964,23 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 	}
 	req->start_time = start_time;
 
-	local = get_ldev(mdev);
-	if (!local) {
-		bio_put(req->private_bio); /* or we get a bio leak */
+	if (!get_ldev(mdev)) {
+		bio_put(req->private_bio);
 		req->private_bio = NULL;
 	}
-	if (rw == WRITE) {
-		remote = 1;
-	} else {
-		/* READ || READA */
-		if (local) {
-			if (!drbd_may_do_local_read(mdev, sector, size) ||
-			    remote_due_to_read_balancing(mdev, sector)) {
-				/* we could kick the syncer to
-				 * sync this extent asap, wait for
-				 * it, then continue locally.
-				 * Or just issue the request remotely.
-				 */
-				local = 0;
-				bio_put(req->private_bio);
-				req->private_bio = NULL;
-				put_ldev(mdev);
-			}
-		}
-		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
-	}
-
-	/* If we have a disk, but a READA request is mapped to remote,
-	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
-	 * Just fail that READA request right here.
-	 *
-	 * THINK: maybe fail all READA when not local?
-	 * or make this configurable...
-	 * if network is slow, READA won't do any good.
-	 */
-	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
-		err = -EWOULDBLOCK;
-		goto fail_and_free_req;
-	}
 
 	/* For WRITES going to the local disk, grab a reference on the target
 	 * extent. This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
 	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	if (rw == WRITE && req->private_bio
+	&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, &req->i);
 	}
 
-	s = mdev->state;
-	remote = remote && drbd_should_do_remote(s);
-	send_oos = rw == WRITE && drbd_should_send_out_of_sync(s);
-	D_ASSERT(!(remote && send_oos));
-
-	if (!(local || remote) && !drbd_suspended(mdev)) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
-		err = -EIO;
-		goto fail_free_complete;
-	}
-
-	/* GOOD, everything prepared, grab the spin_lock */
 	spin_lock_irq(&mdev->tconn->req_lock);
-
 	if (rw == WRITE) {
 		/* This may temporarily give up the req_lock,
 		 * but will re-aquire it before it returns here.
@@ -893,53 +988,28 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 		complete_conflicting_writes(req);
 	}
 
-	if (drbd_suspended(mdev)) {
-		/* If we got suspended, use the retry mechanism in
-		   drbd_make_request() to restart processing of this
-		   bio. In the next call to drbd_make_request
-		   we sleep in inc_ap_bio() */
-		ret = 1;
-		spin_unlock_irq(&mdev->tconn->req_lock);
-		goto fail_free_complete;
-	}
-
-	if (remote || send_oos) {
-		remote = drbd_should_do_remote(mdev->state);
-		send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state);
-		D_ASSERT(!(remote && send_oos));
+	/* no more giving up req_lock from now on! */
 
-		if (!(remote || send_oos))
-			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
-		if (!(local || remote)) {
-			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
-			spin_unlock_irq(&mdev->tconn->req_lock);
-			err = -EIO;
-			goto fail_free_complete;
+	if (drbd_suspended(mdev)) {
+		/* push back and retry: */
+		req->rq_state |= RQ_POSTPONED;
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
 		}
+		goto out;
 	}
 
 	/* Update disk stats */
 	_drbd_start_io_acct(mdev, req, bio);
 
-	/* NOTE
-	 * Actually, 'local' may be wrong here already, since we may have failed
-	 * to write to the meta data, and may become wrong anytime because of
-	 * local io-error for some other request, which would lead to us
-	 * "detaching" the local disk.
-	 *
-	 * 'remote' may become wrong any time because the network could fail.
-	 *
-	 * This is a harmless race condition, though, since it is handled
-	 * correctly at the appropriate places; so it just defers the failure
-	 * of the respective operation.
-	 */
-
-	/* mark them early for readability.
-	 * this just sets some state flags. */
-	if (remote)
-		_req_mod(req, TO_BE_SENT);
-	if (local)
-		_req_mod(req, TO_BE_SUBMITTED);
+	/* We fail READ/READA early, if we can not serve it.
+	 * We must do this before req is registered on any lists.
+	 * Otherwise, req_may_be_completed() will queue failed READ for retry. */
+	if (rw != WRITE) {
+		if (!do_remote_read(req) && !req->private_bio)
+			goto nodata;
+	}
 
 	/* which transfer log epoch does this belong to? */
 	req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
@@ -948,90 +1018,43 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 
 	list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
 
-	/* NOTE remote first: to get the concurrent write detection right,
-	 * we must register the request before start of local IO. */
-	if (remote) {
-		/* either WRITE and C_CONNECTED,
-		 * or READ, and no local disk,
-		 * or READ, but not in sync.
-		 */
-		_req_mod(req, (rw == WRITE)
-				? QUEUE_FOR_NET_WRITE
-				: QUEUE_FOR_NET_READ);
+	if (rw == WRITE) {
+		if (!drbd_process_write_request(req))
+			no_remote = true;
+	} else {
+		/* We either have a private_bio, or we can read from remote.
+		 * Otherwise we had done the goto nodata above. */
+		if (req->private_bio == NULL) {
+			_req_mod(req, TO_BE_SENT);
+			_req_mod(req, QUEUE_FOR_NET_READ);
+		} else
+			no_remote = true;
 	}
-	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
-		_req_mod(req, QUEUE_FOR_SEND_OOS);
 
-	rcu_read_lock();
-	nc = rcu_dereference(mdev->tconn->net_conf);
-	if (remote &&
-	    nc->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) {
-		int congested = 0;
-
-		if (nc->cong_fill &&
-		    atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
-			dev_info(DEV, "Congestion-fill threshold reached\n");
-			congested = 1;
-		}
-
-		if (mdev->act_log->used >= nc->cong_extents) {
-			dev_info(DEV, "Congestion-extents threshold reached\n");
-			congested = 1;
-		}
-
-		if (congested) {
-			if (mdev->tconn->current_tle_writes)
-				/* start a new epoch for non-mirrored writes */
-				start_new_tl_epoch(mdev->tconn);
-
-			if (nc->on_congestion == OC_PULL_AHEAD)
-				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
-			else /*nc->on_congestion == OC_DISCONNECT */
-				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
-		}
+	if (req->private_bio) {
+		/* needs to be marked within the same spinlock */
+		_req_mod(req, TO_BE_SUBMITTED);
+		/* but we need to give up the spinlock to submit */
+		spin_unlock_irq(&mdev->tconn->req_lock);
+		drbd_submit_req_private_bio(req);
+		/* once we have submitted, we must no longer look at req,
+		 * it may already be destroyed. */
+		return 0;
+	} else if (no_remote) {
+nodata:
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		/* A write may have been queued for send_oos, however.
+		 * So we can not simply free it, we must go through req_may_be_completed() */
 	}
-	rcu_read_unlock();
 
+out:
+	req_may_be_completed(req, &m);
 	spin_unlock_irq(&mdev->tconn->req_lock);
 
-	if (local) {
-		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-
-		/* State may have changed since we grabbed our reference on the
-		 * mdev->ldev member. Double check, and short-circuit to endio.
-		 * In case the last activity log transaction failed to get on
-		 * stable storage, and this is a WRITE, we may not even submit
-		 * this bio. */
-		if (get_ldev(mdev)) {
-			if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-					     : rw == READ ? DRBD_FAULT_DT_RD
-					     : DRBD_FAULT_DT_RA))
-				bio_endio(req->private_bio, -EIO);
-			else
-				generic_make_request(req->private_bio);
-			put_ldev(mdev);
-		} else
-			bio_endio(req->private_bio, -EIO);
-	}
-
+	if (m.bio)
+		complete_master_bio(mdev, &m);
 	return 0;
-
-fail_free_complete:
-	if (req->rq_state & RQ_IN_ACT_LOG)
-		drbd_al_complete_io(mdev, &req->i);
-fail_and_free_req:
-	if (local) {
-		bio_put(req->private_bio);
-		req->private_bio = NULL;
-		put_ldev(mdev);
-	}
-	if (!ret)
-		bio_endio(bio, err);
-
-	drbd_req_free(req);
-	dec_ap_bio(mdev);
-
-	return ret;
 }
 
 int drbd_make_request(struct request_queue *q, struct bio *bio)