author     Julien Grall <julien.grall@citrix.com>  2015-08-13 08:13:35 -0400
committer  Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>  2016-01-04 12:21:25 -0500
commit     6cc5683390472c450fd69975d1283db79202667f
tree       11ac3de1f8d5547455e37ad7fb37965315fbee88
parent     2e073969d57f60fc0b863985779657624cbd4886
xen/blkfront: Handle non-indirect grant with 64KB pages
The minimal size of a request in the block framework is always PAGE_SIZE. This means that when the guest uses 64KB pages, a request will be at least 64KB. However, if the backend doesn't support indirect descriptors (such as QDISK in QEMU), a ring request can only accommodate 11 segments of 4KB (i.e. 44KB). The current frontend assumes that an I/O request will always fit in a single ring request. This is no longer true when using 64KB page granularity, and the frontend will therefore crash during boot.

On ARM64, the ABI is completely neutral with respect to the page granularity used by the domU. The guest can choose among the page granularities supported by the processor (for instance on ARM64: 4KB, 16KB, 64KB). This can't be enforced by the hypervisor, so it is possible to run guests using different page granularities. We therefore can't mandate that the block backend support indirect descriptors when the frontend uses 64KB page granularity, and have to fix it properly in the frontend.

The solution below modifies the frontend directly rather than asking the block framework to support smaller segment sizes (i.e. < PAGE_SIZE), because the changes in the block framework are not trivial: everything seems to rely on a struct page (see [1]). It may nonetheless be possible for someone to do it in the future, in which case we could switch to that.

Given that a block request may not fit in a single ring request, a second ring request is introduced for the data that cannot fit in the first one. This means that the second ring request should never be used on Linux if the page size is smaller than 44KB.

To support the extra ring request, the block queue size is divided by two, so the ring always contains enough space to accommodate 2 ring requests. While this reduces overall performance, it keeps the implementation more contained. The way forward for better performance is to implement either indirect descriptors or multi-grant rings in the backend.

Note that the blk_queue_max_* helper parameters haven't been updated. The block code will set the minimum size supported, and we may be able to benefit directly from any change in the block framework that lowers the minimal size of a request.

[1] http://lists.xen.org/archives/html/xen-devel/2015-08/msg02200.html

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-rw-r--r--  drivers/block/xen-blkfront.c  228
1 file changed, 212 insertions, 16 deletions
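The 44KB and 88KB figures in the commit message follow directly from the blkif ABI constants. As a rough standalone sketch of that arithmetic (not part of the patch; it assumes the usual values of 11 segments per ring request and 4KB Xen pages, with xen_pfn_per_page standing in for the kernel's XEN_PFN_PER_PAGE):

/*
 * Standalone sketch of the arithmetic behind the 44KB/88KB figures in
 * the commit message. Not part of the patch; the constants below are
 * assumed to match the usual blkif ABI values.
 */
#include <stdio.h>

#define XEN_PAGE_SIZE                  4096u
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11u

int main(void)
{
	unsigned int page_size = 64 * 1024;                        /* 64KB guest pages */
	unsigned int xen_pfn_per_page = page_size / XEN_PAGE_SIZE; /* 16 grants per page */
	/* Data one ring request can carry without indirect descriptors. */
	unsigned int per_req = BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE;

	printf("one ring request carries %u bytes (44KB)\n", per_req);
	printf("extra request needed: %s\n",
	       BLKIF_MAX_SEGMENTS_PER_REQUEST < xen_pfn_per_page ? "yes" : "no");
	printf("two ring requests cover %u bytes (88KB) >= %u\n",
	       2 * per_req, page_size);
	return 0;
}

With 64KB guest pages this reports that an extra request is needed, which is exactly the HAS_EXTRA_REQ condition introduced by the patch below.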
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0b32c90ffc3f..f3d0d4758641 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
 
 #include <asm/xen/hypervisor.h>
 
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
 enum blkif_state {
 	BLKIF_STATE_DISCONNECTED,
 	BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
 	struct list_head node;
 };
 
+enum blk_req_status {
+	REQ_WAITING,
+	REQ_DONE,
+	REQ_ERROR,
+	REQ_EOPNOTSUPP,
+};
+
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
 	struct grant **indirect_grants;
 	struct scatterlist *sg;
 	unsigned int num_sg;
+	enum blk_req_status status;
+
+	#define NO_ASSOCIATED_ID ~0UL
+	/*
+	 * Id of the sibling if we ever need 2 requests when handling a
+	 * block I/O request
+	 */
+	unsigned long associated_id;
 };
 
 struct split_bio {
@@ -492,6 +521,8 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
 
 	id = get_id_from_freelist(rinfo);
 	rinfo->shadow[id].request = req;
+	rinfo->shadow[id].status = REQ_WAITING;
+	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
 
 	(*ring_req)->u.rw.id = id;
 
@@ -533,6 +564,9 @@ struct setup_rw_req {
 	bool need_copy;
 	unsigned int bvec_off;
 	char *bvec_data;
+
+	bool require_extra_req;
+	struct blkif_request *extra_ring_req;
 };
 
 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -546,8 +580,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	unsigned int grant_idx = setup->grant_idx;
 	struct blkif_request *ring_req = setup->ring_req;
 	struct blkfront_ring_info *rinfo = setup->rinfo;
+	/*
+	 * We always use the shadow of the first request to store the list
+	 * of grant associated to the block I/O request. This made the
+	 * completion more easy to handle even if the block I/O request is
+	 * split.
+	 */
 	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
 
+	if (unlikely(setup->require_extra_req &&
+		     grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		/*
+		 * We are using the second request, setup grant_idx
+		 * to be the index of the segment array.
+		 */
+		grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		ring_req = setup->extra_ring_req;
+	}
+
 	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
 	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
 		if (setup->segments)
@@ -562,7 +612,11 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 
 	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
 	ref = gnt_list_entry->gref;
-	shadow->grants_used[grant_idx] = gnt_list_entry;
+	/*
+	 * All the grants are stored in the shadow of the first
+	 * request. Therefore we have to use the global index.
+	 */
+	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
 	if (setup->need_copy) {
 		void *shared_data;
@@ -604,11 +658,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	(setup->grant_idx)++;
 }
 
+static void blkif_setup_extra_req(struct blkif_request *first,
+				  struct blkif_request *second)
+{
+	uint16_t nr_segments = first->u.rw.nr_segments;
+
+	/*
+	 * The second request is only present when the first request uses
+	 * all its segments. It's always the continuity of the first one.
+	 */
+	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	second->u.rw.sector_number = first->u.rw.sector_number +
+		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+	second->u.rw.handle = first->u.rw.handle;
+	second->operation = first->operation;
+}
+
 static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
 	struct blkfront_info *info = rinfo->dev_info;
-	struct blkif_request *ring_req;
-	unsigned long id;
+	struct blkif_request *ring_req, *extra_ring_req = NULL;
+	unsigned long id, extra_id = NO_ASSOCIATED_ID;
+	bool require_extra_req = false;
 	int i;
 	struct setup_rw_req setup = {
 		.grant_idx = 0,
@@ -650,19 +724,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Fill out a communications ring structure. */
 	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
-	BUG_ON(info->max_indirect_segments == 0 &&
-	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	BUG_ON(info->max_indirect_segments &&
-	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
 	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
 	num_grant = 0;
 	/* Calculate the number of grant used */
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 		num_grant += gnttab_count_grant(sg->offset, sg->length);
 
+	require_extra_req = info->max_indirect_segments == 0 &&
+		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
 	rinfo->shadow[id].num_sg = num_sg;
-	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+	    likely(!require_extra_req)) {
 		/*
 		 * The indirect operation can only be a BLKIF_OP_READ or
 		 * BLKIF_OP_WRITE
@@ -702,10 +776,30 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 			}
 		}
 		ring_req->u.rw.nr_segments = num_grant;
+		if (unlikely(require_extra_req)) {
+			extra_id = blkif_ring_get_request(rinfo, req,
+							  &extra_ring_req);
+			/*
+			 * Only the first request contains the scatter-gather
+			 * list.
+			 */
+			rinfo->shadow[extra_id].num_sg = 0;
+
+			blkif_setup_extra_req(ring_req, extra_ring_req);
+
+			/* Link the 2 requests together */
+			rinfo->shadow[extra_id].associated_id = id;
+			rinfo->shadow[id].associated_id = extra_id;
+		}
 	}
 
 	setup.ring_req = ring_req;
 	setup.id = id;
+
+	setup.require_extra_req = require_extra_req;
+	if (unlikely(require_extra_req))
+		setup.extra_ring_req = extra_ring_req;
+
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
 		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -728,6 +822,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
+	if (unlikely(require_extra_req))
+		rinfo->shadow[extra_id].req = *extra_ring_req;
 
 	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
@@ -829,7 +925,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	memset(&info->tag_set, 0, sizeof(info->tag_set));
 	info->tag_set.ops = &blkfront_mq_ops;
 	info->tag_set.nr_hw_queues = info->nr_rings;
-	info->tag_set.queue_depth = BLK_RING_SIZE(info);
+	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+		/*
+		 * When indirect descriptior is not supported, the I/O request
+		 * will be split between multiple request in the ring.
+		 * To avoid problems when sending the request, divide by
+		 * 2 the depth of the queue.
+		 */
+		info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+	} else
+		info->tag_set.queue_depth = BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
 	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	info->tag_set.cmd_size = 0;
@@ -1269,20 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
 	kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+	switch (rsp)
+	{
+	case BLKIF_RSP_OKAY:
+		return REQ_DONE;
+	case BLKIF_RSP_EOPNOTSUPP:
+		return REQ_EOPNOTSUPP;
+	case BLKIF_RSP_ERROR:
+		/* Fallthrough. */
+	default:
+		return REQ_ERROR;
+	}
+}
+
+/*
+ * Get the final status of the block request based on two ring response
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+				  enum blk_req_status s2)
+{
+	BUG_ON(s1 == REQ_WAITING);
+	BUG_ON(s2 == REQ_WAITING);
+
+	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+		return BLKIF_RSP_ERROR;
+	else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+		return BLKIF_RSP_EOPNOTSUPP;
+	return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+			     struct blkfront_ring_info *rinfo,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
 	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *s = &rinfo->shadow[*id];
 	struct copy_from_grant data = {
-		.s = s,
 		.grant_idx = 0,
 	};
 
 	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	/* The I/O request may be split in two. */
+	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+		struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+		/* Keep the status of the current response in shadow. */
+		s->status = blkif_rsp_to_req_status(bret->status);
+
+		/* Wait the second response if not yet here. */
+		if (s2->status == REQ_WAITING)
+			return 0;
+
+		bret->status = blkif_get_final_status(s->status,
+						      s2->status);
+
+		/*
+		 * All the grants is stored in the first shadow in order
+		 * to make the completion code simpler.
+		 */
+		num_grant += s2->req.u.rw.nr_segments;
+
+		/*
+		 * The two responses may not come in order. Only the
+		 * first request will store the scatter-gather list.
+		 */
+		if (s2->num_sg != 0) {
+			/* Update "id" with the ID of the first response. */
+			*id = s->associated_id;
+			s = s2;
+		}
+
+		/*
+		 * We don't need anymore the second request, so recycling
+		 * it now.
+		 */
+		if (add_id_to_freelist(rinfo, s->associated_id))
+			WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+			     info->gd->disk_name, s->associated_id);
+	}
+
+	data.s = s;
 	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1352,6 +1530,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			}
 		}
 	}
+
+	return 1;
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1391,8 +1571,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		}
 		req  = rinfo->shadow[id].request;
 
-		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&rinfo->shadow[id], rinfo, bret);
+		if (bret->operation != BLKIF_OP_DISCARD) {
+			/*
+			 * We may need to wait for an extra response if the
+			 * I/O request is split in 2
+			 */
+			if (!blkif_completion(&id, rinfo, bret))
+				continue;
+		}
 
 		if (add_id_to_freelist(rinfo, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -2017,8 +2203,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 	int err, i;
 	struct blkfront_info *info = rinfo->dev_info;
 
-	if (info->max_indirect_segments == 0)
-		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	if (info->max_indirect_segments == 0) {
+		if (!HAS_EXTRA_REQ)
+			grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		else {
+			/*
+			 * When an extra req is required, the maximum
+			 * grants supported is related to the size of the
+			 * Linux block segment.
+			 */
+			grants = GRANTS_PER_PSEG;
+		}
+	}
 	else
 		grants = info->max_indirect_segments;
 	psegs = grants / GRANTS_PER_PSEG;
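For reference, a small worked example of the split performed by blkif_setup_extra_req() above (a standalone illustration, not code from the patch; it assumes 4KB Xen pages, 11 segments per ring request, and an arbitrary starting sector):

/*
 * Standalone illustration of how blkif_setup_extra_req() divides a 64KB
 * I/O between the two ring requests. Not part of the patch.
 */
#include <stdio.h>

#define XEN_PAGE_SIZE                  4096u
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11u

int main(void)
{
	unsigned int num_grant = (64 * 1024) / XEN_PAGE_SIZE;  /* 16 grants */
	unsigned long long first_sector = 2048;                 /* arbitrary */

	/* The first ring request is filled completely: 11 segments = 44KB. */
	unsigned int first_segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	/* The second carries the remainder: 5 segments = 20KB. */
	unsigned int second_segs = num_grant - BLKIF_MAX_SEGMENTS_PER_REQUEST;
	/* It continues right after the first one: 44KB / 512 = 88 sectors. */
	unsigned long long second_sector = first_sector +
		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;

	printf("first:  %u segments at sector %llu\n", first_segs, first_sector);
	printf("second: %u segments at sector %llu\n", second_segs, second_sector);
	return 0;
}

The completion path then adds the two nr_segments values back together (num_grant += s2->req.u.rw.nr_segments) before unmapping, which is why all grants are kept in the shadow of the first request.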