author    Roger Pau Monne <roger.pau@citrix.com>    2012-10-24 12:58:45 -0400
committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>    2012-10-30 09:50:04 -0400
commit    0a8704a51f386cab7394e38ff1d66eef924d8ab8 (patch)
tree      8fb3897bad957fa592ff54cacc97de924246c125 /drivers/block/xen-blkfront.c
parent    8f0d8163b50e01f398b14bcd4dc039ac5ab18d64 (diff)
xen/blkback: Persistent grant maps for xen blk drivers
This patch implements persistent grants for the xen-blk{front,back} mechanism. The effect of this change is to reduce the number of unmap operations performed, since they cause a (costly) TLB shootdown. This allows the I/O performance to scale better when a large number of VMs are performing I/O.

Previously, the blkfront driver was supplied a bvec[] from the request queue. This was granted to dom0; dom0 performed the I/O and wrote directly into the grant-mapped memory and unmapped it; blkfront then removed foreign access for that grant. The cost of unmapping scales badly with the number of CPUs in Dom0. An experiment showed that when Dom0 has 24 VCPUs, and guests are performing parallel I/O to a ramdisk, the IPIs from performing unmaps become a bottleneck at 5 guests (at which point 650,000 IOPS are being performed in total). If more than 5 guests are used, the performance declines; by 10 guests, only 400,000 IOPS are being performed.

This patch improves performance by only unmapping when the connection between blkfront and blkback is broken.

On startup blkfront notifies blkback that it is using persistent grants, and blkback will do the same. If blkback is not capable of persistent mapping, blkfront will still use the same grants, since this is compatible with the previous protocol and reduces code complexity in blkfront.

To perform a read in persistent mode, blkfront uses a separate pool of pages that it maps to dom0. When a request comes in, blkfront transmutes the request so that blkback will write into one of these free pages. Blkback keeps note of which grefs it has already mapped. When a new ring request comes to blkback, it looks to see if it has already mapped that page. If so, it will not map it again. If the page hasn't been previously mapped, it is mapped now, and a record is kept of this mapping. Blkback proceeds as usual. When blkfront is notified that blkback has completed a request, it memcpy's from the shared memory into the bvec supplied. A record is kept that the {gref, page} tuple is mapped but not in flight.

Writes are similar, except that the memcpy is performed from the supplied bvecs into the shared pages, before the request is put onto the ring.

Blkback stores a mapping of grefs=>{page mapped to by gref} in a red-black tree. As the grefs are not known a priori, and provide no guarantees on their ordering, we have to perform a search through this tree to find the page for every gref we receive. This operation takes O(log n) time in the worst case. In blkfront, grants are stored in a singly linked list.

The maximum number of grants that blkback will persistently map is currently set to RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, to prevent a malicious guest from attempting a DoS by supplying fresh grefs, causing the Dom0 kernel to map excessively. If a guest is using persistent grants and exceeds the maximum number of grants to map persistently, the newly passed grefs will be mapped and unmapped. Using this approach, we can have requests that mix persistent and non-persistent grants, and we need to handle them correctly. This allows us to set the maximum number of persistent grants to a lower value than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, although doing so will lead to unpredictable performance.

In writing this patch, the question arises as to whether the additional cost of performing memcpys in the guest (to/from the pool of granted pages) outweighs the gains of not performing TLB shootdowns. The answer to that question is `no'.
There appears to be very little, if any, additional cost to the guest of using persistent grants. There is perhaps a small saving from the reduced number of hypercalls performed in granting and ending foreign access.

Signed-off-by: Oliver Chick <oliver.chick@citrix.com>
Signed-off-by: Roger Pau Monne <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
[v1: Fixed up the misuse of bool as int]
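[Editor's note] The blkback half of this change is outside the blkfront-only diff shown below. As a rough sketch, the gref=>page store the message describes amounts to a standard <linux/rbtree.h> map keyed by grant reference; the type and helper names here are illustrative assumptions, not code from this commit:

#include <linux/rbtree.h>
#include <xen/grant_table.h>

/* Illustrative only: blkback's record of one persistently mapped grant. */
struct persistent_gnt {
	struct page *page;	/* page the gref is mapped into */
	grant_ref_t gnt;	/* lookup key: gref supplied by the frontend */
	struct rb_node node;
};

/*
 * O(log n) lookup. Grefs arrive in no particular order, so every segment
 * of every ring request costs one search through the tree.
 */
static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
						 grant_ref_t gref)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct persistent_gnt *p =
			rb_entry(n, struct persistent_gnt, node);

		if (gref < p->gnt)
			n = n->rb_left;
		else if (gref > p->gnt)
			n = n->rb_right;
		else
			return p;	/* already mapped: reuse, do not remap */
	}
	return NULL;	/* first sighting: map it, then insert below */
}

/* Record a freshly mapped gref so later requests can find it. */
static void add_persistent_gnt(struct rb_root *root, struct persistent_gnt *p)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	while (*new) {
		struct persistent_gnt *this =
			rb_entry(*new, struct persistent_gnt, node);

		parent = *new;
		new = p->gnt < this->gnt ? &(*new)->rb_left
					 : &(*new)->rb_right;
	}
	rb_link_node(&p->node, parent, new);
	rb_insert_color(&p->node, root);
}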
Diffstat (limited to 'drivers/block/xen-blkfront.c')
-rw-r--r--    drivers/block/xen-blkfront.c    197
1 file changed, 169 insertions(+), 28 deletions(-)
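[Editor's note] For orientation before reading the diff: on the frontend side, the pool of recycled grants reduces to a lock-free llist plus a counter. The two helpers below are a condensed sketch of the logic the patch adds, not a verbatim excerpt; the names get_grant/put_grant are invented for illustration:

#include <linux/llist.h>
#include <xen/grant_table.h>

struct grant {
	grant_ref_t gref;
	unsigned long pfn;
	struct llist_node node;
};

/*
 * Take a recycled grant if one is free. The page it names is still mapped
 * by blkback, so reusing it avoids both the grant operation and the later
 * unmap (and its TLB shootdown). Returns NULL when the caller must claim
 * a fresh gref and grant a newly allocated page instead.
 */
static struct grant *get_grant(struct llist_head *pool, unsigned int *count)
{
	if (!*count)
		return NULL;
	(*count)--;
	return llist_entry(llist_del_first(pool), struct grant, node);
}

/* On request completion the grant stays mapped; just hand it back. */
static void put_grant(struct llist_head *pool, unsigned int *count,
		      struct grant *gnt)
{
	llist_add(&gnt->node, pool);
	(*count)++;
}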
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 007db8986e84..911d733d21b6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -44,6 +44,7 @@
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
+#include <linux/llist.h>
 
 #include <xen/xen.h>
 #include <xen/xenbus.h>
@@ -64,10 +65,17 @@ enum blkif_state {
 	BLKIF_STATE_SUSPENDED,
 };
 
+struct grant {
+	grant_ref_t gref;
+	unsigned long pfn;
+	struct llist_node node;
+};
+
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
 	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
@@ -97,6 +105,8 @@ struct blkfront_info
 	struct work_struct work;
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct llist_head persistent_gnts;
+	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
 	unsigned int feature_flush;
 	unsigned int flush_op;
@@ -104,6 +114,7 @@ struct blkfront_info
 	unsigned int feature_secdiscard:1;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
+	unsigned int feature_persistent:1;
 	int is_ready;
 };
 
@@ -287,21 +298,36 @@ static int blkif_queue_request(struct request *req)
 	unsigned long id;
 	unsigned int fsect, lsect;
 	int i, ref;
+
+	/*
+	 * Used to store if we are able to queue the request by just using
+	 * existing persistent grants, or if we have to get new grants,
+	 * as there are not sufficiently many free.
+	 */
+	bool new_persistent_gnts;
 	grant_ref_t gref_head;
+	struct page *granted_page;
+	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	if (gnttab_alloc_grant_references(
-		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
-		gnttab_request_free_callback(
-			&info->callback,
-			blkif_restart_queue_callback,
-			info,
-			BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		return 1;
-	}
+	/* Check if we have enough grants to allocate a request */
+	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+		new_persistent_gnts = 1;
+		if (gnttab_alloc_grant_references(
+		    BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
+		    &gref_head) < 0) {
+			gnttab_request_free_callback(
+				&info->callback,
+				blkif_restart_queue_callback,
+				info,
+				BLKIF_MAX_SEGMENTS_PER_REQUEST);
+			return 1;
+		}
+	} else
+		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
@@ -341,18 +367,73 @@ static int blkif_queue_request(struct request *req)
 			BLKIF_MAX_SEGMENTS_PER_REQUEST);
 
 	for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
-			buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
 			fsect = sg->offset >> 9;
 			lsect = fsect + (sg->length >> 9) - 1;
-			/* install a grant reference. */
-			ref = gnttab_claim_grant_reference(&gref_head);
-			BUG_ON(ref == -ENOSPC);
 
-			gnttab_grant_foreign_access_ref(
-					ref,
+			if (info->persistent_gnts_c) {
+				BUG_ON(llist_empty(&info->persistent_gnts));
+				gnt_list_entry = llist_entry(
+					llist_del_first(&info->persistent_gnts),
+					struct grant, node);
+
+				ref = gnt_list_entry->gref;
+				buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
+				info->persistent_gnts_c--;
+			} else {
+				ref = gnttab_claim_grant_reference(&gref_head);
+				BUG_ON(ref == -ENOSPC);
+
+				gnt_list_entry =
+					kmalloc(sizeof(struct grant),
+						GFP_ATOMIC);
+				if (!gnt_list_entry)
+					return -ENOMEM;
+
+				granted_page = alloc_page(GFP_ATOMIC);
+				if (!granted_page) {
+					kfree(gnt_list_entry);
+					return -ENOMEM;
+				}
+
+				gnt_list_entry->pfn =
+					page_to_pfn(granted_page);
+				gnt_list_entry->gref = ref;
+
+				buffer_mfn = pfn_to_mfn(page_to_pfn(
+						granted_page));
+				gnttab_grant_foreign_access_ref(ref,
 					info->xbdev->otherend_id,
-					buffer_mfn,
-					rq_data_dir(req));
+					buffer_mfn, 0);
+			}
+
+			info->shadow[id].grants_used[i] = gnt_list_entry;
+
+			if (rq_data_dir(req)) {
+				char *bvec_data;
+				void *shared_data;
+
+				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+				shared_data = kmap_atomic(
+					pfn_to_page(gnt_list_entry->pfn));
+				bvec_data = kmap_atomic(sg_page(sg));
+
+				/*
+				 * this does not wipe data stored outside the
+				 * range sg->offset..sg->offset+sg->length.
+				 * Therefore, blkback *could* see data from
+				 * previous requests. This is OK as long as
+				 * persistent grants are shared with just one
+				 * domain. It may need refactoring if this
+				 * changes
+				 */
+				memcpy(shared_data + sg->offset,
+				       bvec_data + sg->offset,
+				       sg->length);
+
+				kunmap_atomic(bvec_data);
+				kunmap_atomic(shared_data);
+			}
 
 			info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
 			ring_req->u.rw.seg[i] =
@@ -368,7 +449,8 @@ static int blkif_queue_request(struct request *req)
 	/* Keep a private copy so we can reissue requests when recovering. */
 	info->shadow[id].req = *ring_req;
 
-	gnttab_free_grant_references(gref_head);
+	if (new_persistent_gnts)
+		gnttab_free_grant_references(gref_head);
 
 	return 0;
 }
@@ -480,12 +562,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_flush(info->rq, info->feature_flush);
-	printk(KERN_INFO "blkfront: %s: %s: %s\n",
+	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
 	       info->gd->disk_name,
 	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
 		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
 		"flush diskcache" : "barrier or flush"),
-	       info->feature_flush ? "enabled" : "disabled");
+	       info->feature_flush ? "enabled" : "disabled",
+	       info->feature_persistent ? "using persistent grants" : "");
 }
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -707,6 +790,9 @@ static void blkif_restart_queue(struct work_struct *work)
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+	struct llist_node *all_gnts;
+	struct grant *persistent_gnt;
+
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
@@ -714,6 +800,17 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	/* No more blkif_request(). */
 	if (info->rq)
 		blk_stop_queue(info->rq);
+
+	/* Remove all persistent grants */
+	if (info->persistent_gnts_c) {
+		all_gnts = llist_del_all(&info->persistent_gnts);
+		llist_for_each_entry(persistent_gnt, all_gnts, node) {
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			kfree(persistent_gnt);
+		}
+		info->persistent_gnts_c = 0;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
 	spin_unlock_irq(&info->io_lock);
@@ -734,13 +831,42 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 
 }
 
-static void blkif_completion(struct blk_shadow *s)
+static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+			     struct blkif_response *bret)
 {
 	int i;
-	/* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
-	 * flag. */
-	for (i = 0; i < s->req.u.rw.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
+	struct bio_vec *bvec;
+	struct req_iterator iter;
+	unsigned long flags;
+	char *bvec_data;
+	void *shared_data;
+	unsigned int offset = 0;
+
+	if (bret->operation == BLKIF_OP_READ) {
+		/*
+		 * Copy the data received from the backend into the bvec.
+		 * Since bv_offset can be different than 0, and bv_len different
+		 * than PAGE_SIZE, we have to keep track of the current offset,
+		 * to be sure we are copying the data from the right shared page.
+		 */
+		rq_for_each_segment(bvec, s->request, iter) {
+			BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
+			i = offset >> PAGE_SHIFT;
+			shared_data = kmap_atomic(
+				pfn_to_page(s->grants_used[i]->pfn));
+			bvec_data = bvec_kmap_irq(bvec, &flags);
+			memcpy(bvec_data, shared_data + bvec->bv_offset,
+				bvec->bv_len);
+			bvec_kunmap_irq(bvec_data, &flags);
+			kunmap_atomic(shared_data);
+			offset += bvec->bv_len;
+		}
+	}
+	/* Add the persistent grant into the list of free grants */
+	for (i = 0; i < s->req.u.rw.nr_segments; i++) {
+		llist_add(&s->grants_used[i]->node, &info->persistent_gnts);
+		info->persistent_gnts_c++;
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -783,7 +909,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		req = info->shadow[id].request;
 
 		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&info->shadow[id]);
+			blkif_completion(&info->shadow[id], info, bret);
 
 		if (add_id_to_freelist(info, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -942,6 +1068,11 @@ again:
 		message = "writing protocol";
 		goto abort_transaction;
 	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "feature-persistent-grants", "%u", 1);
+	if (err)
+		dev_warn(&dev->dev,
+			 "writing persistent grants feature to xenbus");
 
 	err = xenbus_transaction_end(xbt, 0);
 	if (err) {
@@ -1029,6 +1160,8 @@ static int blkfront_probe(struct xenbus_device *dev,
 	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
+	init_llist_head(&info->persistent_gnts);
+	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
@@ -1093,7 +1226,7 @@ static int blkif_recover(struct blkfront_info *info)
 				req->u.rw.seg[j].gref,
 				info->xbdev->otherend_id,
 				pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
-				rq_data_dir(info->shadow[req->u.rw.id].request));
+				0);
 		}
 		info->shadow[req->u.rw.id].req = *req;
 
@@ -1225,7 +1358,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned long sector_size;
 	unsigned int binfo;
 	int err;
-	int barrier, flush, discard;
+	int barrier, flush, discard, persistent;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -1303,6 +1436,14 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (!err && discard)
 		blkfront_setup_discard(info);
 
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-persistent", "%u", &persistent,
+			    NULL);
+	if (err)
+		info->feature_persistent = 0;
+	else
+		info->feature_persistent = persistent;
+
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",