author     Roger Pau Monne <roger.pau@citrix.com>          2013-04-18 10:06:54 -0400
committer  Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>  2013-04-18 14:16:00 -0400
commit     402b27f9f2c22309d5bb285628765bc27b82fcf5 (patch)
tree       0177eb3c2073f1379674338ede8510e6d1e6d4bf
parent     31552ee32df89f97a61766cee51b8dabb1ae3f4f (diff)
xen-block: implement indirect descriptors
Indirect descriptors introduce a new block operation (BLKIF_OP_INDIRECT) that passes grant references instead of segments in the request. These grant references point to pages filled with arrays of blkif_request_segment_aligned; this way we can send more segments in a request.

The proposed implementation sets the maximum number of indirect grefs (frames filled with blkif_request_segment_aligned) to 256 in the backend and 32 in the frontend. The value in the frontend has been chosen experimentally, and the backend value has been set to a sane value that allows expanding the maximum number of indirect descriptors in the frontend if needed.

The migration code has changed from the previous implementation, which simply remapped the segments on the shared ring. Now the maximum number of segments allowed in a request can change depending on the backend, so we have to requeue all the requests in the ring and in the queue, and split the bios in them if they are bigger than the new maximum number of segments.

[v2: Fixed minor comments by Konrad]
[v1: Added padding to make the indirect request 64bit aligned.
     Added some BUGs, comments; fixed number of indirect pages in
     blkif_get_x86_{32/64}_req. Added description about the indirect
     operation in blkif.h]
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
[v3: Fixed spaces and tabs mix ups]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
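For a sense of scale, the following stand-alone sketch (not part of the patch; it assumes 4 KiB pages and the 8-byte blkif_request_segment_aligned entry defined below in blkif.h) works out how many segments fit in one indirect page and how much data a single request can carry compared with the old 11-segment limit:

#include <stdio.h>

/*
 * Illustrative sketch only: back-of-the-envelope numbers for indirect
 * descriptors, assuming 4 KiB pages and the 8-byte
 * blkif_request_segment_aligned layout introduced by this patch.
 */
#define PAGE_SIZE		4096
#define SEG_ENTRY_SIZE		8	/* gref(4) + first/last_sect(2) + pad(2) */
#define SEGS_PER_INDIRECT_FRAME	(PAGE_SIZE / SEG_ENTRY_SIZE)	/* 512 */
#define INDIRECT_PAGES(segs)	(((segs) + SEGS_PER_INDIRECT_FRAME - 1) / \
				 SEGS_PER_INDIRECT_FRAME)

int main(void)
{
	int backend_max  = 256;	/* MAX_INDIRECT_SEGMENTS in the backend */
	int frontend_max = 32;	/* xen_blkif_max_segments in the frontend */

	printf("segments per indirect page : %d\n", SEGS_PER_INDIRECT_FRAME);
	printf("indirect pages (backend)   : %d\n", INDIRECT_PAGES(backend_max));
	printf("old max request size       : %d KiB\n", 11 * PAGE_SIZE / 1024);
	printf("new max request (frontend) : %d KiB\n",
	       frontend_max * PAGE_SIZE / 1024);
	printf("new max request (backend)  : %d KiB\n",
	       backend_max * PAGE_SIZE / 1024);
	return 0;
}

With 512 entries per page, the backend's 256-segment limit fits in a single indirect page, and the largest request grows from 44 KiB to 128 KiB with the frontend default of 32 segments (and up to 1 MiB if the frontend limit were raised to the backend's 256).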
-rw-r--r--  drivers/block/xen-blkback/blkback.c   134
-rw-r--r--  drivers/block/xen-blkback/common.h     98
-rw-r--r--  drivers/block/xen-blkback/xenbus.c      7
-rw-r--r--  drivers/block/xen-blkfront.c          490
-rw-r--r--  include/xen/interface/io/blkif.h       53
5 files changed, 657 insertions(+), 125 deletions(-)
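The diff below threads the new limit through xenstore: the backend advertises "feature-max-indirect-segments" and the frontend clamps it against its own xen_blkif_max_segments. A minimal stand-alone sketch of that clamping follows (the helper name negotiate_segs is made up for illustration; the real logic lives in blkfront_setup_indirect()):

#include <stdio.h>

#define BLKIF_MAX_SEGMENTS_PER_REQUEST	11

/*
 * Hypothetical helper mirroring the negotiation in blkfront_setup_indirect():
 * if the backend does not advertise feature-max-indirect-segments, fall back
 * to the classic per-request limit; otherwise use the smaller of the backend
 * and frontend limits.
 */
static unsigned int negotiate_segs(int backend_has_feature,
				   unsigned int backend_max,
				   unsigned int frontend_max)
{
	if (!backend_has_feature)
		return BLKIF_MAX_SEGMENTS_PER_REQUEST;
	return backend_max < frontend_max ? backend_max : frontend_max;
}

int main(void)
{
	/* Defaults from this patch: backend 256, frontend 32. */
	printf("new backend: %u\n", negotiate_segs(1, 256, 32));	/* 32 */
	printf("old backend: %u\n", negotiate_segs(0, 0, 32));		/* 11 */
	return 0;
}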
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 356722f65f88..1ebc0aa0f0e4 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -59,7 +59,7 @@
59 * IO workloads. 59 * IO workloads.
60 */ 60 */
61 61
62static int xen_blkif_max_buffer_pages = 704; 62static int xen_blkif_max_buffer_pages = 1024;
63module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); 63module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
64MODULE_PARM_DESC(max_buffer_pages, 64MODULE_PARM_DESC(max_buffer_pages,
65"Maximum number of free pages to keep in each block backend buffer"); 65"Maximum number of free pages to keep in each block backend buffer");
@@ -75,7 +75,7 @@ MODULE_PARM_DESC(max_buffer_pages,
75 * algorithm. 75 * algorithm.
76 */ 76 */
77 77
78static int xen_blkif_max_pgrants = 352; 78static int xen_blkif_max_pgrants = 1056;
79module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); 79module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
80MODULE_PARM_DESC(max_persistent_grants, 80MODULE_PARM_DESC(max_persistent_grants,
81 "Maximum number of grants to map persistently"); 81 "Maximum number of grants to map persistently");
@@ -636,10 +636,6 @@ purge_gnt_list:
636 return 0; 636 return 0;
637} 637}
638 638
639struct seg_buf {
640 unsigned int offset;
641 unsigned int nsec;
642};
643/* 639/*
644 * Unmap the grant references, and also remove the M2P over-rides 640 * Unmap the grant references, and also remove the M2P over-rides
645 * used in the 'pending_req'. 641 * used in the 'pending_req'.
@@ -818,29 +814,69 @@ out_of_memory:
818 return -ENOMEM; 814 return -ENOMEM;
819} 815}
820 816
821static int xen_blkbk_map_seg(struct blkif_request *req, 817static int xen_blkbk_map_seg(struct pending_req *pending_req,
822 struct pending_req *pending_req,
823 struct seg_buf seg[], 818 struct seg_buf seg[],
824 struct page *pages[]) 819 struct page *pages[])
825{ 820{
826 int i, rc; 821 int rc;
827 grant_ref_t grefs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
828 822
829 for (i = 0; i < req->u.rw.nr_segments; i++) 823 rc = xen_blkbk_map(pending_req->blkif, pending_req->grefs,
830 grefs[i] = req->u.rw.seg[i].gref;
831
832 rc = xen_blkbk_map(pending_req->blkif, grefs,
833 pending_req->persistent_gnts, 824 pending_req->persistent_gnts,
834 pending_req->grant_handles, pending_req->pages, 825 pending_req->grant_handles, pending_req->pages,
835 req->u.rw.nr_segments, 826 pending_req->nr_pages,
836 (pending_req->operation != BLKIF_OP_READ)); 827 (pending_req->operation != BLKIF_OP_READ));
837 if (rc)
838 return rc;
839 828
840 for (i = 0; i < req->u.rw.nr_segments; i++) 829 return rc;
841 seg[i].offset = (req->u.rw.seg[i].first_sect << 9); 830}
842 831
843 return 0; 832static int xen_blkbk_parse_indirect(struct blkif_request *req,
833 struct pending_req *pending_req,
834 struct seg_buf seg[],
835 struct phys_req *preq)
836{
837 struct persistent_gnt **persistent =
838 pending_req->indirect_persistent_gnts;
839 struct page **pages = pending_req->indirect_pages;
840 struct xen_blkif *blkif = pending_req->blkif;
841 int indirect_grefs, rc, n, nseg, i;
842 struct blkif_request_segment_aligned *segments = NULL;
843
844 nseg = pending_req->nr_pages;
845 indirect_grefs = INDIRECT_PAGES(nseg);
846 BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
847
848 rc = xen_blkbk_map(blkif, req->u.indirect.indirect_grefs,
849 persistent, pending_req->indirect_handles,
850 pages, indirect_grefs, true);
851 if (rc)
852 goto unmap;
853
854 for (n = 0, i = 0; n < nseg; n++) {
855 if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
856 /* Map indirect segments */
857 if (segments)
858 kunmap_atomic(segments);
859 segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]);
860 }
861 i = n % SEGS_PER_INDIRECT_FRAME;
862 pending_req->grefs[n] = segments[i].gref;
863 seg[n].nsec = segments[i].last_sect -
864 segments[i].first_sect + 1;
865 seg[n].offset = (segments[i].first_sect << 9);
866 if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
867 (segments[i].last_sect < segments[i].first_sect)) {
868 rc = -EINVAL;
869 goto unmap;
870 }
871 preq->nr_sects += seg[n].nsec;
872 }
873
874unmap:
875 if (segments)
876 kunmap_atomic(segments);
877 xen_blkbk_unmap(blkif, pending_req->indirect_handles,
878 pages, persistent, indirect_grefs);
879 return rc;
844} 880}
845 881
846static int dispatch_discard_io(struct xen_blkif *blkif, 882static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -1013,6 +1049,7 @@ __do_block_io_op(struct xen_blkif *blkif)
1013 case BLKIF_OP_WRITE: 1049 case BLKIF_OP_WRITE:
1014 case BLKIF_OP_WRITE_BARRIER: 1050 case BLKIF_OP_WRITE_BARRIER:
1015 case BLKIF_OP_FLUSH_DISKCACHE: 1051 case BLKIF_OP_FLUSH_DISKCACHE:
1052 case BLKIF_OP_INDIRECT:
1016 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1053 if (dispatch_rw_block_io(blkif, &req, pending_req))
1017 goto done; 1054 goto done;
1018 break; 1055 break;
@@ -1059,17 +1096,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1059 struct pending_req *pending_req) 1096 struct pending_req *pending_req)
1060{ 1097{
1061 struct phys_req preq; 1098 struct phys_req preq;
1062 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1099 struct seg_buf *seg = pending_req->seg;
1063 unsigned int nseg; 1100 unsigned int nseg;
1064 struct bio *bio = NULL; 1101 struct bio *bio = NULL;
1065 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1102 struct bio **biolist = pending_req->biolist;
1066 int i, nbio = 0; 1103 int i, nbio = 0;
1067 int operation; 1104 int operation;
1068 struct blk_plug plug; 1105 struct blk_plug plug;
1069 bool drain = false; 1106 bool drain = false;
1070 struct page **pages = pending_req->pages; 1107 struct page **pages = pending_req->pages;
1108 unsigned short req_operation;
1109
1110 req_operation = req->operation == BLKIF_OP_INDIRECT ?
1111 req->u.indirect.indirect_op : req->operation;
1112 if ((req->operation == BLKIF_OP_INDIRECT) &&
1113 (req_operation != BLKIF_OP_READ) &&
1114 (req_operation != BLKIF_OP_WRITE)) {
1115 pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
1116 req_operation);
1117 goto fail_response;
1118 }
1071 1119
1072 switch (req->operation) { 1120 switch (req_operation) {
1073 case BLKIF_OP_READ: 1121 case BLKIF_OP_READ:
1074 blkif->st_rd_req++; 1122 blkif->st_rd_req++;
1075 operation = READ; 1123 operation = READ;
@@ -1091,33 +1139,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1091 } 1139 }
1092 1140
1093 /* Check that the number of segments is sane. */ 1141 /* Check that the number of segments is sane. */
1094 nseg = req->u.rw.nr_segments; 1142 nseg = req->operation == BLKIF_OP_INDIRECT ?
1143 req->u.indirect.nr_segments : req->u.rw.nr_segments;
1095 1144
1096 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || 1145 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
1097 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 1146 unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1147 (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1148 unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1149 (nseg > MAX_INDIRECT_SEGMENTS))) {
1098 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", 1150 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
1099 nseg); 1151 nseg);
1100 /* Haven't submitted any bio's yet. */ 1152 /* Haven't submitted any bio's yet. */
1101 goto fail_response; 1153 goto fail_response;
1102 } 1154 }
1103 1155
1104 preq.sector_number = req->u.rw.sector_number;
1105 preq.nr_sects = 0; 1156 preq.nr_sects = 0;
1106 1157
1107 pending_req->blkif = blkif; 1158 pending_req->blkif = blkif;
1108 pending_req->id = req->u.rw.id; 1159 pending_req->id = req->u.rw.id;
1109 pending_req->operation = req->operation; 1160 pending_req->operation = req_operation;
1110 pending_req->status = BLKIF_RSP_OKAY; 1161 pending_req->status = BLKIF_RSP_OKAY;
1111 pending_req->nr_pages = nseg; 1162 pending_req->nr_pages = nseg;
1112 1163
1113 for (i = 0; i < nseg; i++) { 1164 if (req->operation != BLKIF_OP_INDIRECT) {
1114 seg[i].nsec = req->u.rw.seg[i].last_sect - 1165 preq.dev = req->u.rw.handle;
1115 req->u.rw.seg[i].first_sect + 1; 1166 preq.sector_number = req->u.rw.sector_number;
1116 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || 1167 for (i = 0; i < nseg; i++) {
1117 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) 1168 pending_req->grefs[i] = req->u.rw.seg[i].gref;
1169 seg[i].nsec = req->u.rw.seg[i].last_sect -
1170 req->u.rw.seg[i].first_sect + 1;
1171 seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1172 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
1173 (req->u.rw.seg[i].last_sect <
1174 req->u.rw.seg[i].first_sect))
1175 goto fail_response;
1176 preq.nr_sects += seg[i].nsec;
1177 }
1178 } else {
1179 preq.dev = req->u.indirect.handle;
1180 preq.sector_number = req->u.indirect.sector_number;
1181 if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
1118 goto fail_response; 1182 goto fail_response;
1119 preq.nr_sects += seg[i].nsec;
1120
1121 } 1183 }
1122 1184
1123 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1185 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -1154,7 +1216,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1154 * the hypercall to unmap the grants - that is all done in 1216 * the hypercall to unmap the grants - that is all done in
1155 * xen_blkbk_unmap. 1217 * xen_blkbk_unmap.
1156 */ 1218 */
1157 if (xen_blkbk_map_seg(req, pending_req, seg, pages)) 1219 if (xen_blkbk_map_seg(pending_req, seg, pages))
1158 goto fail_flush; 1220 goto fail_flush;
1159 1221
1160 /* 1222 /*
@@ -1220,7 +1282,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1220 pending_req->nr_pages); 1282 pending_req->nr_pages);
1221 fail_response: 1283 fail_response:
1222 /* Haven't submitted any bio's yet. */ 1284 /* Haven't submitted any bio's yet. */
1223 make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); 1285 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1224 free_req(blkif, pending_req); 1286 free_req(blkif, pending_req);
1225 msleep(1); /* back off a bit */ 1287 msleep(1); /* back off a bit */
1226 return -EIO; 1288 return -EIO;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index e33fafa0facd..1ac53da8410f 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
50 __func__, __LINE__, ##args) 50 __func__, __LINE__, ##args)
51 51
52 52
53/*
54 * This is the maximum number of segments that would be allowed in indirect
55 * requests. This value will also be passed to the frontend.
56 */
57#define MAX_INDIRECT_SEGMENTS 256
58
59#define SEGS_PER_INDIRECT_FRAME \
60 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
61#define MAX_INDIRECT_PAGES \
62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
63#define INDIRECT_PAGES(_segs) \
64 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
65
53/* Not a real protocol. Used to generate ring structs which contain 66/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a 67 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can 68 * compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
83 uint64_t id; /* private guest value, echoed in resp */ 96 uint64_t id; /* private guest value, echoed in resp */
84} __attribute__((__packed__)); 97} __attribute__((__packed__));
85 98
99struct blkif_x86_32_request_indirect {
100 uint8_t indirect_op;
101 uint16_t nr_segments;
102 uint64_t id;
103 blkif_sector_t sector_number;
104 blkif_vdev_t handle;
105 uint16_t _pad1;
106 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
107 /*
108 * The maximum number of indirect segments (and pages) that will
109 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
110 * is also exported to the guest (via xenstore
111 * feature-max-indirect-segments entry), so the frontend knows how
112 * many indirect segments the backend supports.
113 */
114 uint64_t _pad2; /* make it 64 byte aligned */
115} __attribute__((__packed__));
116
86struct blkif_x86_32_request { 117struct blkif_x86_32_request {
87 uint8_t operation; /* BLKIF_OP_??? */ 118 uint8_t operation; /* BLKIF_OP_??? */
88 union { 119 union {
89 struct blkif_x86_32_request_rw rw; 120 struct blkif_x86_32_request_rw rw;
90 struct blkif_x86_32_request_discard discard; 121 struct blkif_x86_32_request_discard discard;
91 struct blkif_x86_32_request_other other; 122 struct blkif_x86_32_request_other other;
123 struct blkif_x86_32_request_indirect indirect;
92 } u; 124 } u;
93} __attribute__((__packed__)); 125} __attribute__((__packed__));
94 126
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
127 uint64_t id; /* private guest value, echoed in resp */ 159 uint64_t id; /* private guest value, echoed in resp */
128} __attribute__((__packed__)); 160} __attribute__((__packed__));
129 161
162struct blkif_x86_64_request_indirect {
163 uint8_t indirect_op;
164 uint16_t nr_segments;
165 uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
166 uint64_t id;
167 blkif_sector_t sector_number;
168 blkif_vdev_t handle;
169 uint16_t _pad2;
170 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
171 /*
172 * The maximum number of indirect segments (and pages) that will
173 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
174 * is also exported to the guest (via xenstore
175 * feature-max-indirect-segments entry), so the frontend knows how
176 * many indirect segments the backend supports.
177 */
178 uint32_t _pad3; /* make it 64 byte aligned */
179} __attribute__((__packed__));
180
130struct blkif_x86_64_request { 181struct blkif_x86_64_request {
131 uint8_t operation; /* BLKIF_OP_??? */ 182 uint8_t operation; /* BLKIF_OP_??? */
132 union { 183 union {
133 struct blkif_x86_64_request_rw rw; 184 struct blkif_x86_64_request_rw rw;
134 struct blkif_x86_64_request_discard discard; 185 struct blkif_x86_64_request_discard discard;
135 struct blkif_x86_64_request_other other; 186 struct blkif_x86_64_request_other other;
187 struct blkif_x86_64_request_indirect indirect;
136 } u; 188 } u;
137} __attribute__((__packed__)); 189} __attribute__((__packed__));
138 190
@@ -266,6 +318,11 @@ struct xen_blkif {
266 wait_queue_head_t waiting_to_free; 318 wait_queue_head_t waiting_to_free;
267}; 319};
268 320
321struct seg_buf {
322 unsigned long offset;
323 unsigned int nsec;
324};
325
269/* 326/*
270 * Each outstanding request that we've passed to the lower device layers has a 327 * Each outstanding request that we've passed to the lower device layers has a
271 * 'pending_req' allocated to it. Each buffer_head that completes decrements 328 * 'pending_req' allocated to it. Each buffer_head that completes decrements
@@ -280,9 +337,16 @@ struct pending_req {
280 unsigned short operation; 337 unsigned short operation;
281 int status; 338 int status;
282 struct list_head free_list; 339 struct list_head free_list;
283 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 340 struct page *pages[MAX_INDIRECT_SEGMENTS];
284 struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 341 struct persistent_gnt *persistent_gnts[MAX_INDIRECT_SEGMENTS];
285 grant_handle_t grant_handles[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 342 grant_handle_t grant_handles[MAX_INDIRECT_SEGMENTS];
343 grant_ref_t grefs[MAX_INDIRECT_SEGMENTS];
344 /* Indirect descriptors */
345 struct persistent_gnt *indirect_persistent_gnts[MAX_INDIRECT_PAGES];
346 struct page *indirect_pages[MAX_INDIRECT_PAGES];
347 grant_handle_t indirect_handles[MAX_INDIRECT_PAGES];
348 struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
349 struct bio *biolist[MAX_INDIRECT_SEGMENTS];
286}; 350};
287 351
288 352
@@ -321,7 +385,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
321static inline void blkif_get_x86_32_req(struct blkif_request *dst, 385static inline void blkif_get_x86_32_req(struct blkif_request *dst,
322 struct blkif_x86_32_request *src) 386 struct blkif_x86_32_request *src)
323{ 387{
324 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 388 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
325 dst->operation = src->operation; 389 dst->operation = src->operation;
326 switch (src->operation) { 390 switch (src->operation) {
327 case BLKIF_OP_READ: 391 case BLKIF_OP_READ:
@@ -344,6 +408,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
344 dst->u.discard.sector_number = src->u.discard.sector_number; 408 dst->u.discard.sector_number = src->u.discard.sector_number;
345 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 409 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
346 break; 410 break;
411 case BLKIF_OP_INDIRECT:
412 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
413 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
414 dst->u.indirect.handle = src->u.indirect.handle;
415 dst->u.indirect.id = src->u.indirect.id;
416 dst->u.indirect.sector_number = src->u.indirect.sector_number;
417 barrier();
418 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
419 for (i = 0; i < j; i++)
420 dst->u.indirect.indirect_grefs[i] =
421 src->u.indirect.indirect_grefs[i];
422 break;
347 default: 423 default:
348 /* 424 /*
349 * Don't know how to translate this op. Only get the 425 * Don't know how to translate this op. Only get the
@@ -357,7 +433,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
357static inline void blkif_get_x86_64_req(struct blkif_request *dst, 433static inline void blkif_get_x86_64_req(struct blkif_request *dst,
358 struct blkif_x86_64_request *src) 434 struct blkif_x86_64_request *src)
359{ 435{
360 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 436 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
361 dst->operation = src->operation; 437 dst->operation = src->operation;
362 switch (src->operation) { 438 switch (src->operation) {
363 case BLKIF_OP_READ: 439 case BLKIF_OP_READ:
@@ -380,6 +456,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
380 dst->u.discard.sector_number = src->u.discard.sector_number; 456 dst->u.discard.sector_number = src->u.discard.sector_number;
381 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 457 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
382 break; 458 break;
459 case BLKIF_OP_INDIRECT:
460 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
461 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
462 dst->u.indirect.handle = src->u.indirect.handle;
463 dst->u.indirect.id = src->u.indirect.id;
464 dst->u.indirect.sector_number = src->u.indirect.sector_number;
465 barrier();
466 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
467 for (i = 0; i < j; i++)
468 dst->u.indirect.indirect_grefs[i] =
469 src->u.indirect.indirect_grefs[i];
470 break;
383 default: 471 default:
384 /* 472 /*
385 * Don't know how to translate this op. Only get the 473 * Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 1f1ade6d6e09..afab208c54e3 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -107,6 +107,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
107 struct xen_blkif *blkif; 107 struct xen_blkif *blkif;
108 int i; 108 int i;
109 109
110 BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
111
110 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); 112 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
111 if (!blkif) 113 if (!blkif)
112 return ERR_PTR(-ENOMEM); 114 return ERR_PTR(-ENOMEM);
@@ -709,6 +711,11 @@ again:
709 dev->nodename); 711 dev->nodename);
710 goto abort; 712 goto abort;
711 } 713 }
714 err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
715 MAX_INDIRECT_SEGMENTS);
716 if (err)
717 dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
718 dev->nodename, err);
712 719
713 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", 720 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
714 (unsigned long long)vbd_sz(&be->blkif->vbd)); 721 (unsigned long long)vbd_sz(&be->blkif->vbd));
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a894f88762d8..82d63d5b1750 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,27 @@ struct grant {
74struct blk_shadow { 74struct blk_shadow {
75 struct blkif_request req; 75 struct blkif_request req;
76 struct request *request; 76 struct request *request;
77 struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 77 struct grant **grants_used;
78 struct grant **indirect_grants;
79};
80
81struct split_bio {
82 struct bio *bio;
83 atomic_t pending;
84 int err;
78}; 85};
79 86
80static DEFINE_MUTEX(blkfront_mutex); 87static DEFINE_MUTEX(blkfront_mutex);
81static const struct block_device_operations xlvbd_block_fops; 88static const struct block_device_operations xlvbd_block_fops;
82 89
90/*
91 * Maximum number of segments in indirect requests, the actual value used by
92 * the frontend driver is the minimum of this value and the value provided
93 * by the backend driver.
94 */
95
96static unsigned int xen_blkif_max_segments = 32;
97
83#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) 98#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
84 99
85/* 100/*
@@ -98,7 +113,7 @@ struct blkfront_info
98 enum blkif_state connected; 113 enum blkif_state connected;
99 int ring_ref; 114 int ring_ref;
100 struct blkif_front_ring ring; 115 struct blkif_front_ring ring;
101 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 116 struct scatterlist *sg;
102 unsigned int evtchn, irq; 117 unsigned int evtchn, irq;
103 struct request_queue *rq; 118 struct request_queue *rq;
104 struct work_struct work; 119 struct work_struct work;
@@ -114,6 +129,7 @@ struct blkfront_info
114 unsigned int discard_granularity; 129 unsigned int discard_granularity;
115 unsigned int discard_alignment; 130 unsigned int discard_alignment;
116 unsigned int feature_persistent:1; 131 unsigned int feature_persistent:1;
132 unsigned int max_indirect_segments;
117 int is_ready; 133 int is_ready;
118}; 134};
119 135
@@ -142,6 +158,13 @@ static DEFINE_SPINLOCK(minor_lock);
142 158
143#define DEV_NAME "xvd" /* name in /dev */ 159#define DEV_NAME "xvd" /* name in /dev */
144 160
161#define SEGS_PER_INDIRECT_FRAME \
162 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
163#define INDIRECT_GREFS(_segs) \
164 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
165
166static int blkfront_setup_indirect(struct blkfront_info *info);
167
145static int get_id_from_freelist(struct blkfront_info *info) 168static int get_id_from_freelist(struct blkfront_info *info)
146{ 169{
147 unsigned long free = info->shadow_free; 170 unsigned long free = info->shadow_free;
@@ -358,7 +381,8 @@ static int blkif_queue_request(struct request *req)
358 struct blkif_request *ring_req; 381 struct blkif_request *ring_req;
359 unsigned long id; 382 unsigned long id;
360 unsigned int fsect, lsect; 383 unsigned int fsect, lsect;
361 int i, ref; 384 int i, ref, n;
385 struct blkif_request_segment_aligned *segments = NULL;
362 386
363 /* 387 /*
364 * Used to store if we are able to queue the request by just using 388 * Used to store if we are able to queue the request by just using
@@ -369,21 +393,27 @@ static int blkif_queue_request(struct request *req)
369 grant_ref_t gref_head; 393 grant_ref_t gref_head;
370 struct grant *gnt_list_entry = NULL; 394 struct grant *gnt_list_entry = NULL;
371 struct scatterlist *sg; 395 struct scatterlist *sg;
396 int nseg, max_grefs;
372 397
373 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 398 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
374 return 1; 399 return 1;
375 400
376 /* Check if we have enought grants to allocate a requests */ 401 max_grefs = info->max_indirect_segments ?
377 if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { 402 info->max_indirect_segments +
403 INDIRECT_GREFS(info->max_indirect_segments) :
404 BLKIF_MAX_SEGMENTS_PER_REQUEST;
405
406 /* Check if we have enough grants to allocate a requests */
407 if (info->persistent_gnts_c < max_grefs) {
378 new_persistent_gnts = 1; 408 new_persistent_gnts = 1;
379 if (gnttab_alloc_grant_references( 409 if (gnttab_alloc_grant_references(
380 BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, 410 max_grefs - info->persistent_gnts_c,
381 &gref_head) < 0) { 411 &gref_head) < 0) {
382 gnttab_request_free_callback( 412 gnttab_request_free_callback(
383 &info->callback, 413 &info->callback,
384 blkif_restart_queue_callback, 414 blkif_restart_queue_callback,
385 info, 415 info,
386 BLKIF_MAX_SEGMENTS_PER_REQUEST); 416 max_grefs);
387 return 1; 417 return 1;
388 } 418 }
389 } else 419 } else
@@ -394,42 +424,67 @@ static int blkif_queue_request(struct request *req)
394 id = get_id_from_freelist(info); 424 id = get_id_from_freelist(info);
395 info->shadow[id].request = req; 425 info->shadow[id].request = req;
396 426
397 ring_req->u.rw.id = id;
398 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
399 ring_req->u.rw.handle = info->handle;
400
401 ring_req->operation = rq_data_dir(req) ?
402 BLKIF_OP_WRITE : BLKIF_OP_READ;
403
404 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
405 /*
406 * Ideally we can do an unordered flush-to-disk. In case the
407 * backend onlysupports barriers, use that. A barrier request
408 * a superset of FUA, so we can implement it the same
409 * way. (It's also a FLUSH+FUA, since it is
410 * guaranteed ordered WRT previous writes.)
411 */
412 ring_req->operation = info->flush_op;
413 }
414
415 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { 427 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
416 /* id, sector_number and handle are set above. */
417 ring_req->operation = BLKIF_OP_DISCARD; 428 ring_req->operation = BLKIF_OP_DISCARD;
418 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 429 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
430 ring_req->u.discard.id = id;
431 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
419 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 432 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
420 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 433 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
421 else 434 else
422 ring_req->u.discard.flag = 0; 435 ring_req->u.discard.flag = 0;
423 } else { 436 } else {
424 ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, 437 BUG_ON(info->max_indirect_segments == 0 &&
425 info->sg); 438 req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
426 BUG_ON(ring_req->u.rw.nr_segments > 439 BUG_ON(info->max_indirect_segments &&
427 BLKIF_MAX_SEGMENTS_PER_REQUEST); 440 req->nr_phys_segments > info->max_indirect_segments);
428 441 nseg = blk_rq_map_sg(req->q, req, info->sg);
429 for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { 442 ring_req->u.rw.id = id;
443 if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
444 /*
445 * The indirect operation can only be a BLKIF_OP_READ or
446 * BLKIF_OP_WRITE
447 */
448 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
449 ring_req->operation = BLKIF_OP_INDIRECT;
450 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
451 BLKIF_OP_WRITE : BLKIF_OP_READ;
452 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
453 ring_req->u.indirect.handle = info->handle;
454 ring_req->u.indirect.nr_segments = nseg;
455 } else {
456 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
457 ring_req->u.rw.handle = info->handle;
458 ring_req->operation = rq_data_dir(req) ?
459 BLKIF_OP_WRITE : BLKIF_OP_READ;
460 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
461 /*
462 * Ideally we can do an unordered flush-to-disk. In case the
463 * backend onlysupports barriers, use that. A barrier request
464 * a superset of FUA, so we can implement it the same
465 * way. (It's also a FLUSH+FUA, since it is
466 * guaranteed ordered WRT previous writes.)
467 */
468 ring_req->operation = info->flush_op;
469 }
470 ring_req->u.rw.nr_segments = nseg;
471 }
472 for_each_sg(info->sg, sg, nseg, i) {
430 fsect = sg->offset >> 9; 473 fsect = sg->offset >> 9;
431 lsect = fsect + (sg->length >> 9) - 1; 474 lsect = fsect + (sg->length >> 9) - 1;
432 475
476 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
477 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
478 if (segments)
479 kunmap_atomic(segments);
480
481 n = i / SEGS_PER_INDIRECT_FRAME;
482 gnt_list_entry = get_grant(&gref_head, info);
483 info->shadow[id].indirect_grants[n] = gnt_list_entry;
484 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
485 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
486 }
487
433 gnt_list_entry = get_grant(&gref_head, info); 488 gnt_list_entry = get_grant(&gref_head, info);
434 ref = gnt_list_entry->gref; 489 ref = gnt_list_entry->gref;
435 490
@@ -441,8 +496,7 @@ static int blkif_queue_request(struct request *req)
441 496
442 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 497 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
443 498
444 shared_data = kmap_atomic( 499 shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
445 pfn_to_page(gnt_list_entry->pfn));
446 bvec_data = kmap_atomic(sg_page(sg)); 500 bvec_data = kmap_atomic(sg_page(sg));
447 501
448 /* 502 /*
@@ -461,13 +515,23 @@ static int blkif_queue_request(struct request *req)
461 kunmap_atomic(bvec_data); 515 kunmap_atomic(bvec_data);
462 kunmap_atomic(shared_data); 516 kunmap_atomic(shared_data);
463 } 517 }
464 518 if (ring_req->operation != BLKIF_OP_INDIRECT) {
465 ring_req->u.rw.seg[i] = 519 ring_req->u.rw.seg[i] =
466 (struct blkif_request_segment) { 520 (struct blkif_request_segment) {
467 .gref = ref, 521 .gref = ref,
468 .first_sect = fsect, 522 .first_sect = fsect,
469 .last_sect = lsect }; 523 .last_sect = lsect };
524 } else {
525 n = i % SEGS_PER_INDIRECT_FRAME;
526 segments[n] =
527 (struct blkif_request_segment_aligned) {
528 .gref = ref,
529 .first_sect = fsect,
530 .last_sect = lsect };
531 }
470 } 532 }
533 if (segments)
534 kunmap_atomic(segments);
471 } 535 }
472 536
473 info->ring.req_prod_pvt++; 537 info->ring.req_prod_pvt++;
@@ -542,7 +606,8 @@ wait:
542 flush_requests(info); 606 flush_requests(info);
543} 607}
544 608
545static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 609static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
610 unsigned int segments)
546{ 611{
547 struct request_queue *rq; 612 struct request_queue *rq;
548 struct blkfront_info *info = gd->private_data; 613 struct blkfront_info *info = gd->private_data;
@@ -571,7 +636,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
571 blk_queue_max_segment_size(rq, PAGE_SIZE); 636 blk_queue_max_segment_size(rq, PAGE_SIZE);
572 637
573 /* Ensure a merged request will fit in a single I/O ring slot. */ 638 /* Ensure a merged request will fit in a single I/O ring slot. */
574 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 639 blk_queue_max_segments(rq, segments);
575 640
576 /* Make sure buffer addresses are sector-aligned. */ 641 /* Make sure buffer addresses are sector-aligned. */
577 blk_queue_dma_alignment(rq, 511); 642 blk_queue_dma_alignment(rq, 511);
@@ -588,13 +653,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
588static void xlvbd_flush(struct blkfront_info *info) 653static void xlvbd_flush(struct blkfront_info *info)
589{ 654{
590 blk_queue_flush(info->rq, info->feature_flush); 655 blk_queue_flush(info->rq, info->feature_flush);
591 printk(KERN_INFO "blkfront: %s: %s: %s %s\n", 656 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
592 info->gd->disk_name, 657 info->gd->disk_name,
593 info->flush_op == BLKIF_OP_WRITE_BARRIER ? 658 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
594 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 659 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
595 "flush diskcache" : "barrier or flush"), 660 "flush diskcache" : "barrier or flush"),
596 info->feature_flush ? "enabled" : "disabled", 661 info->feature_flush ? "enabled;" : "disabled;",
597 info->feature_persistent ? "using persistent grants" : ""); 662 "persistent grants:",
663 info->feature_persistent ? "enabled;" : "disabled;",
664 "indirect descriptors:",
665 info->max_indirect_segments ? "enabled;" : "disabled;");
598} 666}
599 667
600static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 668static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -734,7 +802,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
734 gd->driverfs_dev = &(info->xbdev->dev); 802 gd->driverfs_dev = &(info->xbdev->dev);
735 set_capacity(gd, capacity); 803 set_capacity(gd, capacity);
736 804
737 if (xlvbd_init_blk_queue(gd, sector_size)) { 805 if (xlvbd_init_blk_queue(gd, sector_size,
806 info->max_indirect_segments ? :
807 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
738 del_gendisk(gd); 808 del_gendisk(gd);
739 goto release; 809 goto release;
740 } 810 }
@@ -818,6 +888,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
818{ 888{
819 struct grant *persistent_gnt; 889 struct grant *persistent_gnt;
820 struct grant *n; 890 struct grant *n;
891 int i, j, segs;
821 892
822 /* Prevent new requests being issued until we fix things up. */ 893 /* Prevent new requests being issued until we fix things up. */
823 spin_lock_irq(&info->io_lock); 894 spin_lock_irq(&info->io_lock);
@@ -843,6 +914,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
843 } 914 }
844 BUG_ON(info->persistent_gnts_c != 0); 915 BUG_ON(info->persistent_gnts_c != 0);
845 916
917 kfree(info->sg);
918 info->sg = NULL;
919 for (i = 0; i < BLK_RING_SIZE; i++) {
920 /*
921 * Clear persistent grants present in requests already
922 * on the shared ring
923 */
924 if (!info->shadow[i].request)
925 goto free_shadow;
926
927 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
928 info->shadow[i].req.u.indirect.nr_segments :
929 info->shadow[i].req.u.rw.nr_segments;
930 for (j = 0; j < segs; j++) {
931 persistent_gnt = info->shadow[i].grants_used[j];
932 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
933 __free_page(pfn_to_page(persistent_gnt->pfn));
934 kfree(persistent_gnt);
935 }
936
937 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
938 /*
939 * If this is not an indirect operation don't try to
940 * free indirect segments
941 */
942 goto free_shadow;
943
944 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
945 persistent_gnt = info->shadow[i].indirect_grants[j];
946 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
947 __free_page(pfn_to_page(persistent_gnt->pfn));
948 kfree(persistent_gnt);
949 }
950
951free_shadow:
952 kfree(info->shadow[i].grants_used);
953 info->shadow[i].grants_used = NULL;
954 kfree(info->shadow[i].indirect_grants);
955 info->shadow[i].indirect_grants = NULL;
956 }
957
846 /* No more gnttab callback work. */ 958 /* No more gnttab callback work. */
847 gnttab_cancel_free_callback(&info->callback); 959 gnttab_cancel_free_callback(&info->callback);
848 spin_unlock_irq(&info->io_lock); 960 spin_unlock_irq(&info->io_lock);
@@ -873,6 +985,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
873 char *bvec_data; 985 char *bvec_data;
874 void *shared_data; 986 void *shared_data;
875 unsigned int offset = 0; 987 unsigned int offset = 0;
988 int nseg;
989
990 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
991 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
876 992
877 if (bret->operation == BLKIF_OP_READ) { 993 if (bret->operation == BLKIF_OP_READ) {
878 /* 994 /*
@@ -885,7 +1001,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
885 BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); 1001 BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
886 if (bvec->bv_offset < offset) 1002 if (bvec->bv_offset < offset)
887 i++; 1003 i++;
888 BUG_ON(i >= s->req.u.rw.nr_segments); 1004 BUG_ON(i >= nseg);
889 shared_data = kmap_atomic( 1005 shared_data = kmap_atomic(
890 pfn_to_page(s->grants_used[i]->pfn)); 1006 pfn_to_page(s->grants_used[i]->pfn));
891 bvec_data = bvec_kmap_irq(bvec, &flags); 1007 bvec_data = bvec_kmap_irq(bvec, &flags);
@@ -897,10 +1013,16 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
897 } 1013 }
898 } 1014 }
899 /* Add the persistent grant into the list of free grants */ 1015 /* Add the persistent grant into the list of free grants */
900 for (i = 0; i < s->req.u.rw.nr_segments; i++) { 1016 for (i = 0; i < nseg; i++) {
901 list_add(&s->grants_used[i]->node, &info->persistent_gnts); 1017 list_add(&s->grants_used[i]->node, &info->persistent_gnts);
902 info->persistent_gnts_c++; 1018 info->persistent_gnts_c++;
903 } 1019 }
1020 if (s->req.operation == BLKIF_OP_INDIRECT) {
1021 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1022 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
1023 info->persistent_gnts_c++;
1024 }
1025 }
904} 1026}
905 1027
906static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1028static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1156,6 @@ static int setup_blkring(struct xenbus_device *dev,
1034 SHARED_RING_INIT(sring); 1156 SHARED_RING_INIT(sring);
1035 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); 1157 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1036 1158
1037 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
1038
1039 /* Allocate memory for grants */
1040 err = fill_grant_buffer(info, BLK_RING_SIZE *
1041 BLKIF_MAX_SEGMENTS_PER_REQUEST);
1042 if (err)
1043 goto fail;
1044
1045 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); 1159 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1046 if (err < 0) { 1160 if (err < 0) {
1047 free_page((unsigned long)sring); 1161 free_page((unsigned long)sring);
@@ -1223,13 +1337,84 @@ static int blkfront_probe(struct xenbus_device *dev,
1223 return 0; 1337 return 0;
1224} 1338}
1225 1339
1340/*
1341 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1342 */
1343static void trim_bio(struct bio *bio, int offset, int size)
1344{
1345 /* 'bio' is a cloned bio which we need to trim to match
1346 * the given offset and size.
1347 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1348 */
1349 int i;
1350 struct bio_vec *bvec;
1351 int sofar = 0;
1352
1353 size <<= 9;
1354 if (offset == 0 && size == bio->bi_size)
1355 return;
1356
1357 bio->bi_sector += offset;
1358 bio->bi_size = size;
1359 offset <<= 9;
1360 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1361
1362 while (bio->bi_idx < bio->bi_vcnt &&
1363 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1364 /* remove this whole bio_vec */
1365 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1366 bio->bi_idx++;
1367 }
1368 if (bio->bi_idx < bio->bi_vcnt) {
1369 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1370 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
1371 }
1372 /* avoid any complications with bi_idx being non-zero*/
1373 if (bio->bi_idx) {
1374 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1375 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1376 bio->bi_vcnt -= bio->bi_idx;
1377 bio->bi_idx = 0;
1378 }
1379 /* Make sure vcnt and last bv are not too big */
1380 bio_for_each_segment(bvec, bio, i) {
1381 if (sofar + bvec->bv_len > size)
1382 bvec->bv_len = size - sofar;
1383 if (bvec->bv_len == 0) {
1384 bio->bi_vcnt = i;
1385 break;
1386 }
1387 sofar += bvec->bv_len;
1388 }
1389}
1390
1391static void split_bio_end(struct bio *bio, int error)
1392{
1393 struct split_bio *split_bio = bio->bi_private;
1394
1395 if (error)
1396 split_bio->err = error;
1397
1398 if (atomic_dec_and_test(&split_bio->pending)) {
1399 split_bio->bio->bi_phys_segments = 0;
1400 bio_endio(split_bio->bio, split_bio->err);
1401 kfree(split_bio);
1402 }
1403 bio_put(bio);
1404}
1226 1405
1227static int blkif_recover(struct blkfront_info *info) 1406static int blkif_recover(struct blkfront_info *info)
1228{ 1407{
1229 int i; 1408 int i;
1230 struct blkif_request *req; 1409 struct request *req, *n;
1231 struct blk_shadow *copy; 1410 struct blk_shadow *copy;
1232 int j; 1411 int rc;
1412 struct bio *bio, *cloned_bio;
1413 struct bio_list bio_list, merge_bio;
1414 unsigned int segs, offset;
1415 int pending, size;
1416 struct split_bio *split_bio;
1417 struct list_head requests;
1233 1418
1234 /* Stage 1: Make a safe copy of the shadow state. */ 1419 /* Stage 1: Make a safe copy of the shadow state. */
1235 copy = kmemdup(info->shadow, sizeof(info->shadow), 1420 copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1429,64 @@ static int blkif_recover(struct blkfront_info *info)
1244 info->shadow_free = info->ring.req_prod_pvt; 1429 info->shadow_free = info->ring.req_prod_pvt;
1245 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; 1430 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1246 1431
1247 /* Stage 3: Find pending requests and requeue them. */ 1432 rc = blkfront_setup_indirect(info);
1433 if (rc) {
1434 kfree(copy);
1435 return rc;
1436 }
1437
1438 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1439 blk_queue_max_segments(info->rq, segs);
1440 bio_list_init(&bio_list);
1441 INIT_LIST_HEAD(&requests);
1248 for (i = 0; i < BLK_RING_SIZE; i++) { 1442 for (i = 0; i < BLK_RING_SIZE; i++) {
1249 /* Not in use? */ 1443 /* Not in use? */
1250 if (!copy[i].request) 1444 if (!copy[i].request)
1251 continue; 1445 continue;
1252 1446
1253 /* Grab a request slot and copy shadow state into it. */ 1447 /*
1254 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 1448 * Get the bios in the request so we can re-queue them.
1255 *req = copy[i].req; 1449 */
1256 1450 if (copy[i].request->cmd_flags &
1257 /* We get a new request id, and must reset the shadow state. */ 1451 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1258 req->u.rw.id = get_id_from_freelist(info); 1452 /*
1259 memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i])); 1453 * Flush operations don't contain bios, so
1260 1454 * we need to requeue the whole request
1261 if (req->operation != BLKIF_OP_DISCARD) { 1455 */
1262 /* Rewrite any grant references invalidated by susp/resume. */ 1456 list_add(&copy[i].request->queuelist, &requests);
1263 for (j = 0; j < req->u.rw.nr_segments; j++) 1457 continue;
1264 gnttab_grant_foreign_access_ref(
1265 req->u.rw.seg[j].gref,
1266 info->xbdev->otherend_id,
1267 pfn_to_mfn(copy[i].grants_used[j]->pfn),
1268 0);
1269 } 1458 }
1270 info->shadow[req->u.rw.id].req = *req; 1459 merge_bio.head = copy[i].request->bio;
1271 1460 merge_bio.tail = copy[i].request->biotail;
1272 info->ring.req_prod_pvt++; 1461 bio_list_merge(&bio_list, &merge_bio);
1462 copy[i].request->bio = NULL;
1463 blk_put_request(copy[i].request);
1273 } 1464 }
1274 1465
1275 kfree(copy); 1466 kfree(copy);
1276 1467
1468 /*
1469 * Empty the queue, this is important because we might have
1470 * requests in the queue with more segments than what we
1471 * can handle now.
1472 */
1473 spin_lock_irq(&info->io_lock);
1474 while ((req = blk_fetch_request(info->rq)) != NULL) {
1475 if (req->cmd_flags &
1476 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1477 list_add(&req->queuelist, &requests);
1478 continue;
1479 }
1480 merge_bio.head = req->bio;
1481 merge_bio.tail = req->biotail;
1482 bio_list_merge(&bio_list, &merge_bio);
1483 req->bio = NULL;
1484 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1485 pr_alert("diskcache flush request found!\n");
1486 __blk_put_request(info->rq, req);
1487 }
1488 spin_unlock_irq(&info->io_lock);
1489
1277 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1490 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1278 1491
1279 spin_lock_irq(&info->io_lock); 1492 spin_lock_irq(&info->io_lock);
@@ -1281,14 +1494,50 @@ static int blkif_recover(struct blkfront_info *info)
1281 /* Now safe for us to use the shared ring */ 1494 /* Now safe for us to use the shared ring */
1282 info->connected = BLKIF_STATE_CONNECTED; 1495 info->connected = BLKIF_STATE_CONNECTED;
1283 1496
1284 /* Send off requeued requests */
1285 flush_requests(info);
1286
1287 /* Kick any other new requests queued since we resumed */ 1497 /* Kick any other new requests queued since we resumed */
1288 kick_pending_request_queues(info); 1498 kick_pending_request_queues(info);
1289 1499
1500 list_for_each_entry_safe(req, n, &requests, queuelist) {
1501 /* Requeue pending requests (flush or discard) */
1502 list_del_init(&req->queuelist);
1503 BUG_ON(req->nr_phys_segments > segs);
1504 blk_requeue_request(info->rq, req);
1505 }
1290 spin_unlock_irq(&info->io_lock); 1506 spin_unlock_irq(&info->io_lock);
1291 1507
1508 while ((bio = bio_list_pop(&bio_list)) != NULL) {
1509 /* Traverse the list of pending bios and re-queue them */
1510 if (bio_segments(bio) > segs) {
1511 /*
1512 * This bio has more segments than what we can
1513 * handle, we have to split it.
1514 */
1515 pending = (bio_segments(bio) + segs - 1) / segs;
1516 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1517 BUG_ON(split_bio == NULL);
1518 atomic_set(&split_bio->pending, pending);
1519 split_bio->bio = bio;
1520 for (i = 0; i < pending; i++) {
1521 offset = (i * segs * PAGE_SIZE) >> 9;
1522 size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1523 (unsigned int)(bio->bi_size >> 9) - offset);
1524 cloned_bio = bio_clone(bio, GFP_NOIO);
1525 BUG_ON(cloned_bio == NULL);
1526 trim_bio(cloned_bio, offset, size);
1527 cloned_bio->bi_private = split_bio;
1528 cloned_bio->bi_end_io = split_bio_end;
1529 submit_bio(cloned_bio->bi_rw, cloned_bio);
1530 }
1531 /*
1532 * Now we have to wait for all those smaller bios to
1533 * end, so we can also end the "parent" bio.
1534 */
1535 continue;
1536 }
1537 /* We don't need to split this bio */
1538 submit_bio(bio->bi_rw, bio);
1539 }
1540
1292 return 0; 1541 return 0;
1293} 1542}
1294 1543
@@ -1308,8 +1557,12 @@ static int blkfront_resume(struct xenbus_device *dev)
1308 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 1557 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1309 1558
1310 err = talk_to_blkback(dev, info); 1559 err = talk_to_blkback(dev, info);
1311 if (info->connected == BLKIF_STATE_SUSPENDED && !err) 1560
1312 err = blkif_recover(info); 1561 /*
1562 * We have to wait for the backend to switch to
1563 * connected state, since we want to read which
1564 * features it supports.
1565 */
1313 1566
1314 return err; 1567 return err;
1315} 1568}
@@ -1387,6 +1640,61 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1387 kfree(type); 1640 kfree(type);
1388} 1641}
1389 1642
1643static int blkfront_setup_indirect(struct blkfront_info *info)
1644{
1645 unsigned int indirect_segments, segs;
1646 int err, i;
1647
1648 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1649 "feature-max-indirect-segments", "%u", &indirect_segments,
1650 NULL);
1651 if (err) {
1652 info->max_indirect_segments = 0;
1653 segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1654 } else {
1655 info->max_indirect_segments = min(indirect_segments,
1656 xen_blkif_max_segments);
1657 segs = info->max_indirect_segments;
1658 }
1659 info->sg = kzalloc(sizeof(info->sg[0]) * segs, GFP_KERNEL);
1660 if (info->sg == NULL)
1661 goto out_of_memory;
1662 sg_init_table(info->sg, segs);
1663
1664 err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1665 if (err)
1666 goto out_of_memory;
1667
1668 for (i = 0; i < BLK_RING_SIZE; i++) {
1669 info->shadow[i].grants_used = kzalloc(
1670 sizeof(info->shadow[i].grants_used[0]) * segs,
1671 GFP_NOIO);
1672 if (info->max_indirect_segments)
1673 info->shadow[i].indirect_grants = kzalloc(
1674 sizeof(info->shadow[i].indirect_grants[0]) *
1675 INDIRECT_GREFS(segs),
1676 GFP_NOIO);
1677 if ((info->shadow[i].grants_used == NULL) ||
1678 (info->max_indirect_segments &&
1679 (info->shadow[i].indirect_grants == NULL)))
1680 goto out_of_memory;
1681 }
1682
1683
1684 return 0;
1685
1686out_of_memory:
1687 kfree(info->sg);
1688 info->sg = NULL;
1689 for (i = 0; i < BLK_RING_SIZE; i++) {
1690 kfree(info->shadow[i].grants_used);
1691 info->shadow[i].grants_used = NULL;
1692 kfree(info->shadow[i].indirect_grants);
1693 info->shadow[i].indirect_grants = NULL;
1694 }
1695 return -ENOMEM;
1696}
1697
1390/* 1698/*
1391 * Invoked when the backend is finally 'ready' (and has told produced 1699 * Invoked when the backend is finally 'ready' (and has told produced
1392 * the details about the physical device - #sectors, size, etc). 1700 * the details about the physical device - #sectors, size, etc).
@@ -1414,8 +1722,15 @@ static void blkfront_connect(struct blkfront_info *info)
1414 set_capacity(info->gd, sectors); 1722 set_capacity(info->gd, sectors);
1415 revalidate_disk(info->gd); 1723 revalidate_disk(info->gd);
1416 1724
1417 /* fall through */ 1725 return;
1418 case BLKIF_STATE_SUSPENDED: 1726 case BLKIF_STATE_SUSPENDED:
1727 /*
1728 * If we are recovering from suspension, we need to wait
1729 * for the backend to announce it's features before
1730 * reconnecting, at least we need to know if the backend
1731 * supports indirect descriptors, and how many.
1732 */
1733 blkif_recover(info);
1419 return; 1734 return;
1420 1735
1421 default: 1736 default:
@@ -1483,6 +1798,13 @@ static void blkfront_connect(struct blkfront_info *info)
1483 else 1798 else
1484 info->feature_persistent = persistent; 1799 info->feature_persistent = persistent;
1485 1800
1801 err = blkfront_setup_indirect(info);
1802 if (err) {
1803 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1804 info->xbdev->otherend);
1805 return;
1806 }
1807
1486 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1808 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1487 if (err) { 1809 if (err) {
1488 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1810 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ffd4652de91c..65e12099ef89 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t;
103#define BLKIF_OP_DISCARD 5 103#define BLKIF_OP_DISCARD 5
104 104
105/* 105/*
106 * Recognized if "feature-max-indirect-segments" in present in the backend
107 * xenbus info. The "feature-max-indirect-segments" node contains the maximum
108 * number of segments allowed by the backend per request. If the node is
109 * present, the frontend might use blkif_request_indirect structs in order to
110 * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
111 * maximum number of indirect segments is fixed by the backend, but the
112 * frontend can issue requests with any number of indirect segments as long as
113 * it's less than the number provided by the backend. The indirect_grefs field
114 * in blkif_request_indirect should be filled by the frontend with the
115 * grant references of the pages that are holding the indirect segments.
116 * This pages are filled with an array of blkif_request_segment_aligned
117 * that hold the information about the segments. The number of indirect
118 * pages to use is determined by the maximum number of segments
119 * a indirect request contains. Every indirect page can contain a maximum
120 * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
121 * so to calculate the number of indirect pages to use we have to do
122 * ceil(indirect_segments/512).
123 *
124 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
125 * create the "feature-max-indirect-segments" node!
126 */
127#define BLKIF_OP_INDIRECT 6
128
129/*
106 * Maximum scatter/gather segments per request. 130 * Maximum scatter/gather segments per request.
107 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. 131 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
108 * NB. This could be 12 if the ring indexes weren't stored in the same page. 132 * NB. This could be 12 if the ring indexes weren't stored in the same page.
109 */ 133 */
110#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 134#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
111 135
136#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
137
138struct blkif_request_segment_aligned {
139 grant_ref_t gref; /* reference to I/O buffer frame */
140 /* @first_sect: first sector in frame to transfer (inclusive). */
141 /* @last_sect: last sector in frame to transfer (inclusive). */
142 uint8_t first_sect, last_sect;
143 uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
144} __attribute__((__packed__));
145
112struct blkif_request_rw { 146struct blkif_request_rw {
113 uint8_t nr_segments; /* number of segments */ 147 uint8_t nr_segments; /* number of segments */
114 blkif_vdev_t handle; /* only for read/write requests */ 148 blkif_vdev_t handle; /* only for read/write requests */
@@ -147,12 +181,31 @@ struct blkif_request_other {
147 uint64_t id; /* private guest value, echoed in resp */ 181 uint64_t id; /* private guest value, echoed in resp */
148} __attribute__((__packed__)); 182} __attribute__((__packed__));
149 183
184struct blkif_request_indirect {
185 uint8_t indirect_op;
186 uint16_t nr_segments;
187#ifdef CONFIG_X86_64
188 uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
189#endif
190 uint64_t id;
191 blkif_sector_t sector_number;
192 blkif_vdev_t handle;
193 uint16_t _pad2;
194 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
195#ifdef CONFIG_X86_64
196 uint32_t _pad3; /* make it 64 byte aligned */
197#else
198 uint64_t _pad3; /* make it 64 byte aligned */
199#endif
200} __attribute__((__packed__));
201
150struct blkif_request { 202struct blkif_request {
151 uint8_t operation; /* BLKIF_OP_??? */ 203 uint8_t operation; /* BLKIF_OP_??? */
152 union { 204 union {
153 struct blkif_request_rw rw; 205 struct blkif_request_rw rw;
154 struct blkif_request_discard discard; 206 struct blkif_request_discard discard;
155 struct blkif_request_other other; 207 struct blkif_request_other other;
208 struct blkif_request_indirect indirect;
156 } u; 209 } u;
157} __attribute__((__packed__)); 210} __attribute__((__packed__));
158 211