path: root/drivers/block/xen-blkfront.c
author:    Bob Liu <bob.liu@oracle.com>  2015-06-03 01:40:03 -0400
committer: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>  2015-06-05 21:14:05 -0400
commit:    86839c56dee28c315a4c19b7bfee450ccd84cd25 (patch)
tree:      325db48d042f41fbea511754e7bdf4799d604960 /drivers/block/xen-blkfront.c
parent:    8ab0144a466320cc37c52e7866b5103c5bbd4e90 (diff)
xen/block: add multi-page ring support
Extend xen/block to support multi-page rings, so that more requests can be issued by using more than one page as the request ring between blkfront and the backend. As a result, performance can improve significantly. We saw impressive improvements on our high-end iSCSI storage cluster backend: with a 64-page ring, IOPS increased about 15 times in the throughput test and more than doubled in the latency test. The reason is that the limit on outstanding requests is 32 with a single-page ring, and in our case the iSCSI LUN was spread across about 100 physical drives, so 32 was not nearly enough to keep them busy.

Changes in v2:
- Rebased to 4.0-rc6.
- Document how the multi-page ring feature works in linux io/blkif.h.

Changes in v3:
- Remove changes to linux io/blkif.h and follow the protocol defined in io/blkif.h of the XEN tree.
- Rebased to 4.1-rc3.

Changes in v4:
- Switch to 'ring-page-order' and 'max-ring-page-order'.
- Address a few comments from Roger.

Changes in v5:
- Clarify the 4K granularity in the comment.
- Address more comments from Roger.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
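As a rough back-of-the-envelope check of those numbers (not part of the patch): a 4 KiB page holds 32 blkif ring entries, and each additional ring-page-order step doubles that. The sketch below, with invented names such as negotiated_slots and SLOTS_PER_PAGE, shows how a 64-page ring (order 6) gives roughly 2048 request slots instead of the old limit of 32.

/*
 * Illustrative only: the 32-entries-per-4K-page figure and the
 * min(frontend, backend) negotiation follow the patch; the names
 * below are invented for this example.
 */
#include <stdio.h>

#define SLOTS_PER_PAGE	32	/* blkif ring entries that fit in one 4 KiB page */

static unsigned int negotiated_slots(unsigned int frontend_max_order,
				     unsigned int backend_max_order)
{
	unsigned int order = frontend_max_order < backend_max_order ?
			     frontend_max_order : backend_max_order;

	return SLOTS_PER_PAGE << order;	/* ring capacity doubles per order step */
}

int main(void)
{
	printf("order 0: %u slots\n", negotiated_slots(0, 0));	/* 32, the old single-page limit */
	printf("order 6: %u slots\n", negotiated_slots(6, 6));	/* 2048, a 64-page ring */
	return 0;
}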
Diffstat (limited to 'drivers/block/xen-blkfront.c')
-rw-r--r--  drivers/block/xen-blkfront.c  135
1 file changed, 96 insertions, 39 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 88e23fd8c7f3..d3c1a9523d1f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -98,7 +98,21 @@ static unsigned int xen_blkif_max_segments = 32;
 module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
 
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+static unsigned int xen_blkif_max_ring_order;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
+#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
+/*
+ * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consistent with the backend.
+ */
+#define RINGREF_NAME_LEN (20)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -114,13 +128,14 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
-	int ring_ref;
+	int ring_ref[XENBUS_MAX_RING_PAGES];
+	unsigned int nr_ring_pages;
 	struct blkif_front_ring ring;
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
 	struct gnttab_free_callback callback;
-	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
 	struct list_head grants;
 	struct list_head indirect_pages;
 	unsigned int persistent_gnts_c;
@@ -139,8 +154,6 @@ static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
 
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
 #define GRANT_INVALID_REF	0
 
 #define PARTS_PER_DISK		16
@@ -170,7 +183,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
-	BUG_ON(free >= BLK_RING_SIZE);
+	BUG_ON(free >= BLK_RING_SIZE(info));
 	info->shadow_free = info->shadow[free].req.u.rw.id;
 	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
@@ -983,7 +996,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		}
 	}
 
-	for (i = 0; i < BLK_RING_SIZE; i++) {
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		/*
 		 * Clear persistent grants present in requests already
 		 * on the shared ring
@@ -1033,12 +1046,15 @@ free_shadow:
 	flush_work(&info->work);
 
 	/* Free resources associated with old device channel. */
-	if (info->ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(info->ring_ref, 0,
-					  (unsigned long)info->ring.sring);
-		info->ring_ref = GRANT_INVALID_REF;
-		info->ring.sring = NULL;
-	}
+	for (i = 0; i < info->nr_ring_pages; i++) {
+		if (info->ring_ref[i] != GRANT_INVALID_REF) {
+			gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+			info->ring_ref[i] = GRANT_INVALID_REF;
+		}
+	}
+	free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+	info->ring.sring = NULL;
+
 	if (info->irq)
 		unbind_from_irqhandler(info->irq, info);
 	info->evtchn = info->irq = 0;
@@ -1157,7 +1173,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			 * never have given to it (we stamp it up to BLK_RING_SIZE -
 			 * look in get_id_from_freelist.
 			 */
-			if (id >= BLK_RING_SIZE) {
+			if (id >= BLK_RING_SIZE(info)) {
 				WARN(1, "%s: response to %s has incorrect id (%ld)\n",
 				     info->gd->disk_name, op_name(bret->operation), id);
 				/* We can't safely get the 'struct request' as
@@ -1245,26 +1261,30 @@ static int setup_blkring(struct xenbus_device *dev,
 			 struct blkfront_info *info)
 {
 	struct blkif_sring *sring;
-	grant_ref_t gref;
-	int err;
+	int err, i;
+	unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
+	grant_ref_t gref[XENBUS_MAX_RING_PAGES];
 
-	info->ring_ref = GRANT_INVALID_REF;
+	for (i = 0; i < info->nr_ring_pages; i++)
+		info->ring_ref[i] = GRANT_INVALID_REF;
 
-	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+	sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+						       get_order(ring_size));
 	if (!sring) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
 		return -ENOMEM;
 	}
 	SHARED_RING_INIT(sring);
-	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+	FRONT_RING_INIT(&info->ring, sring, ring_size);
 
-	err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+	err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
 	if (err < 0) {
-		free_page((unsigned long)sring);
+		free_pages((unsigned long)sring, get_order(ring_size));
 		info->ring.sring = NULL;
 		goto fail;
 	}
-	info->ring_ref = gref;
+	for (i = 0; i < info->nr_ring_pages; i++)
+		info->ring_ref[i] = gref[i];
 
 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
 	if (err)
@@ -1292,7 +1312,18 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
 	const char *message = NULL;
 	struct xenbus_transaction xbt;
-	int err;
+	int err, i;
+	unsigned int max_page_order = 0;
+	unsigned int ring_page_order = 0;
+
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "max-ring-page-order", "%u", &max_page_order);
+	if (err != 1)
+		info->nr_ring_pages = 1;
+	else {
+		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+		info->nr_ring_pages = 1 << ring_page_order;
+	}
 
 	/* Create shared ring, alloc event channel. */
 	err = setup_blkring(dev, info);
@@ -1306,11 +1337,32 @@ again:
 		goto destroy_blkring;
 	}
 
-	err = xenbus_printf(xbt, dev->nodename,
-			    "ring-ref", "%u", info->ring_ref);
-	if (err) {
-		message = "writing ring-ref";
-		goto abort_transaction;
+	if (info->nr_ring_pages == 1) {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "ring-ref", "%u", info->ring_ref[0]);
+		if (err) {
+			message = "writing ring-ref";
+			goto abort_transaction;
+		}
+	} else {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "ring-page-order", "%u", ring_page_order);
+		if (err) {
+			message = "writing ring-page-order";
+			goto abort_transaction;
+		}
+
+		for (i = 0; i < info->nr_ring_pages; i++) {
+			char ring_ref_name[RINGREF_NAME_LEN];
+
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+			err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+					    "%u", info->ring_ref[i]);
+			if (err) {
+				message = "writing ring-ref";
+				goto abort_transaction;
+			}
+		}
 	}
 	err = xenbus_printf(xbt, dev->nodename,
 			    "event-channel", "%u", info->evtchn);
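For concreteness (all values invented for illustration): with the hunk above, a frontend that negotiates a four-page ring (ring-page-order = 2) would publish xenstore nodes along these lines under its device directory, whereas a single-page setup keeps only the legacy ring-ref key:

ring-page-order = "2"
ring-ref0 = "768"
ring-ref1 = "769"
ring-ref2 = "770"
ring-ref3 = "771"
event-channel = "10"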
@@ -1338,6 +1390,9 @@ again:
 		goto destroy_blkring;
 	}
 
+	for (i = 0; i < BLK_RING_SIZE(info); i++)
+		info->shadow[i].req.u.rw.id = i+1;
+	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1361,7 +1416,7 @@ again:
 static int blkfront_probe(struct xenbus_device *dev,
 			  const struct xenbus_device_id *id)
 {
-	int err, vdevice, i;
+	int err, vdevice;
 	struct blkfront_info *info;
 
 	/* FIXME: Use dynamic device id if this is not set. */
@@ -1422,10 +1477,6 @@ static int blkfront_probe(struct xenbus_device *dev,
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
-	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.u.rw.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
-
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 	dev_set_drvdata(&dev->dev, info);
@@ -1469,10 +1520,10 @@ static int blkif_recover(struct blkfront_info *info)
 
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
-	for (i = 0; i < BLK_RING_SIZE; i++)
+	for (i = 0; i < BLK_RING_SIZE(info); i++)
 		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 
 	rc = blkfront_setup_indirect(info);
 	if (rc) {
@@ -1484,7 +1535,7 @@ static int blkif_recover(struct blkfront_info *info)
 	blk_queue_max_segments(info->rq, segs);
 	bio_list_init(&bio_list);
 	INIT_LIST_HEAD(&requests);
-	for (i = 0; i < BLK_RING_SIZE; i++) {
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		/* Not in use? */
 		if (!copy[i].request)
 			continue;
@@ -1690,7 +1741,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		segs = info->max_indirect_segments;
 	}
 
-	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info));
 	if (err)
 		goto out_of_memory;
 
@@ -1700,7 +1751,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		 * grants, we need to allocate a set of pages that can be
 		 * used for mapping indirect grefs
 		 */
-		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info);
 
 		BUG_ON(!list_empty(&info->indirect_pages));
 		for (i = 0; i < num; i++) {
@@ -1711,7 +1762,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		}
 	}
 
-	for (i = 0; i < BLK_RING_SIZE; i++) {
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		info->shadow[i].grants_used = kzalloc(
 			sizeof(info->shadow[i].grants_used[0]) * segs,
 			GFP_NOIO);
@@ -1733,7 +1784,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 	return 0;
 
 out_of_memory:
-	for (i = 0; i < BLK_RING_SIZE; i++) {
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		kfree(info->shadow[i].grants_used);
 		info->shadow[i].grants_used = NULL;
 		kfree(info->shadow[i].sg);
@@ -2089,6 +2140,12 @@ static int __init xlblk_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+		xen_blkif_max_ring_order = 0;
+	}
+
 	if (!xen_has_pv_disk_devices())
 		return -ENODEV;
 
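Usage note (an assumption based on the parameter name added above, not something stated in the patch): a larger ring would typically be requested with something like xen_blkfront.max_ring_page_order=6 on the guest kernel command line, or modprobe xen-blkfront max_ring_page_order=6 for a modular build. The hunk above then clamps out-of-range values back to the single-page default, and the value actually used is still capped by the backend's advertised max-ring-page-order.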