author	Roger Pau Monne <roger.pau@citrix.com>	2012-10-24 12:58:45 -0400
committer	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>	2012-10-30 09:50:04 -0400
commit	0a8704a51f386cab7394e38ff1d66eef924d8ab8 (patch)
tree	8fb3897bad957fa592ff54cacc97de924246c125
parent	8f0d8163b50e01f398b14bcd4dc039ac5ab18d64 (diff)
xen/blkback: Persistent grant maps for xen blk drivers
This patch implements persistent grants for the xen-blk{front,back} mechanism. The effect of this change is to reduce the number of unmap operations performed, since they cause a (costly) TLB shootdown. This allows the I/O performance to scale better when a large number of VMs are performing I/O.

Previously, the blkfront driver was supplied a bvec[] from the request queue. This was granted to dom0; dom0 performed the I/O and wrote directly into the grant-mapped memory and unmapped it; blkfront then removed foreign access for that grant. The cost of unmapping scales badly with the number of CPUs in Dom0. An experiment showed that when Dom0 has 24 VCPUs, and guests are performing parallel I/O to a ramdisk, the IPIs from performing unmaps become a bottleneck at 5 guests (at which point 650,000 IOPS are being performed in total). If more than 5 guests are used, the performance declines; by 10 guests, only 400,000 IOPS are being performed.

This patch improves performance by only unmapping when the connection between blkfront and blkback is broken.

On startup, blkfront notifies blkback that it is using persistent grants, and blkback will do the same. If blkback is not capable of persistent mapping, blkfront will still use the same grants, since this is compatible with the previous protocol and simplifies the code in blkfront.

To perform a read in persistent mode, blkfront uses a separate pool of pages that it maps to dom0. When a request comes in, blkfront transmutes the request so that blkback will write into one of these free pages. Blkback keeps note of which grefs it has already mapped. When a new ring request comes to blkback, it looks to see if it has already mapped that page. If so, it will not map it again. If the page hasn't been previously mapped, it is mapped now, and a record is kept of this mapping. Blkback proceeds as usual. When blkfront is notified that blkback has completed a request, it memcpy's from the shared memory into the supplied bvec. A record that the {gref, page} tuple is mapped, and not in flight, is kept.

Writes are similar, except that the memcpy is performed from the supplied bvecs into the shared pages, before the request is put onto the ring.

Blkback stores a mapping of grefs=>{page mapped to by gref} in a red-black tree. As the grefs are not known a priori, and provide no guarantees on their ordering, we have to perform a search through this tree to find the page for every gref we receive. This operation takes O(log n) time in the worst case. In blkfront, grants are stored in a singly linked list.

The maximum number of grants that blkback will persistently map is currently set to RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, to prevent a malicious guest from attempting a DoS by supplying fresh grefs and causing the Dom0 kernel to map excessively. If a guest is using persistent grants and exceeds the maximum number of grants to map persistently, the newly passed grefs will be mapped and unmapped. Using this approach, we can have requests that mix persistent and non-persistent grants, and we need to handle them correctly. This allows us to set the maximum number of persistent grants to a lower value than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, although doing so will lead to unpredictable performance.

In writing this patch, the question arises as to whether the additional cost of performing memcpys in the guest (to/from the pool of granted pages) outweighs the gains of not performing TLB shootdowns. The answer to that question is no: there appears to be very little, if any, additional cost to the guest of using persistent grants. There is perhaps a small saving, from the reduced number of hypercalls performed in granting and ending foreign access.

Signed-off-by: Oliver Chick <oliver.chick@citrix.com>
Signed-off-by: Roger Pau Monne <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
[v1: Fixed up the misuse of bool as int]
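As a rough userspace model of the backend policy described above (not part of this patch), the decision per segment is: look the gref up in a tree, reuse the mapping if it is already there, map and remember it while still under the cap, otherwise fall back to a transient map/unmap. The sketch below is illustrative only; it substitutes POSIX tsearch()/tfind() for the kernel rbtree, malloc() for alloc_page() plus the grant-map hypercall, and uses made-up names such as lookup_or_map().

/*
 * Simplified userspace model of blkback's persistent-grant cache.
 * Assumption: a stand-in for the real kernel code in the diff below.
 */
#include <search.h>
#include <stdio.h>
#include <stdlib.h>

#define RING_SIZE            32
#define SEGS_PER_REQUEST     11
#define MAX_PERSISTENT_GNTS  (RING_SIZE * SEGS_PER_REQUEST)   /* 352 grants */

struct persistent_gnt {
	unsigned int gnt;   /* grant reference supplied by the frontend */
	void *page;         /* stand-in for the dom0 page the gref is mapped to */
};

static void *gnt_root;           /* binary tree managed by tsearch(), keyed by gref */
static unsigned int gnt_count;   /* how many grefs are persistently "mapped" */

static int cmp_gnt(const void *a, const void *b)
{
	const struct persistent_gnt *x = a, *y = b;
	return (x->gnt > y->gnt) - (x->gnt < y->gnt);
}

/* Return the page backing gref; "map" (allocate) and cache it if needed. */
static void *lookup_or_map(unsigned int gref)
{
	struct persistent_gnt key = { .gnt = gref }, *ent;
	void **slot = tfind(&key, &gnt_root, cmp_gnt);

	if (slot)                                /* already persistently mapped: reuse it */
		return ((struct persistent_gnt *)*slot)->page;

	if (gnt_count < MAX_PERSISTENT_GNTS) {   /* room left: map it and remember it */
		ent = malloc(sizeof(*ent));
		if (!ent)
			return NULL;
		ent->gnt = gref;
		ent->page = malloc(4096);        /* stand-in for alloc_page() + grant map */
		tsearch(ent, &gnt_root, cmp_gnt);
		gnt_count++;
		return ent->page;
	}

	/* over the cap: a real backend maps now and unmaps after the request */
	return NULL;
}

int main(void)
{
	void *p1 = lookup_or_map(42);
	void *p2 = lookup_or_map(42);            /* a second request reuses the mapping */

	printf("cap = %d grants, mapping reused: %s\n",
	       MAX_PERSISTENT_GNTS, (p1 && p1 == p2) ? "yes" : "no");
	return 0;
}

The real backend additionally tears the whole cache down when the frontend disconnects, which is what the xen_blkif_schedule() changes in the diff below take care of.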
-rw-r--r--	drivers/block/xen-blkback/blkback.c	292
-rw-r--r--	drivers/block/xen-blkback/common.h	17
-rw-r--r--	drivers/block/xen-blkback/xenbus.c	23
-rw-r--r--	drivers/block/xen-blkfront.c	197
4 files changed, 474 insertions, 55 deletions
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 280a13846e6c..d7dd5cbdac5f 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -39,6 +39,7 @@
 #include <linux/list.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
+#include <linux/bitmap.h>
 
 #include <xen/events.h>
 #include <xen/page.h>
@@ -79,6 +80,7 @@ struct pending_req {
 	unsigned short operation;
 	int status;
 	struct list_head free_list;
+	DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 };
 
 #define BLKBACK_INVALID_HANDLE (~0)
@@ -99,6 +101,36 @@ struct xen_blkbk {
 static struct xen_blkbk *blkbk;
 
 /*
+ * Maximum number of grant pages that can be mapped in blkback.
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
+ * pages that blkback will persistently map.
+ * Currently, this is:
+ * RING_SIZE = 32 (for all known ring types)
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
+ * sizeof(struct persistent_gnt) = 48
+ * So the maximum memory used to store the grants is:
+ * 32 * 11 * 48 = 16896 bytes
+ */
+static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
+{
+	switch (protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+		return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
+			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	case BLKIF_PROTOCOL_X86_32:
+		return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
+			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	case BLKIF_PROTOCOL_X86_64:
+		return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
+			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+
+/*
  * Little helpful macro to figure out the index and virtual address of the
  * pending_pages[..]. For each 'pending_req' we have have up to
  * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
@@ -129,6 +161,57 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 static void make_response(struct xen_blkif *blkif, u64 id,
 			  unsigned short op, int st);
 
+#define foreach_grant(pos, rbtree, node) \
+	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \
+	     &(pos)->node != NULL; \
+	     (pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node))
+
+
+static void add_persistent_gnt(struct rb_root *root,
+			       struct persistent_gnt *persistent_gnt)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct persistent_gnt *this;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		this = container_of(*new, struct persistent_gnt, node);
+
+		parent = *new;
+		if (persistent_gnt->gnt < this->gnt)
+			new = &((*new)->rb_left);
+		else if (persistent_gnt->gnt > this->gnt)
+			new = &((*new)->rb_right);
+		else {
+			pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
+			BUG();
+		}
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&(persistent_gnt->node), parent, new);
+	rb_insert_color(&(persistent_gnt->node), root);
+}
+
+static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
+						 grant_ref_t gref)
+{
+	struct persistent_gnt *data;
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		data = container_of(node, struct persistent_gnt, node);
+
+		if (gref < data->gnt)
+			node = node->rb_left;
+		else if (gref > data->gnt)
+			node = node->rb_right;
+		else
+			return data;
+	}
+	return NULL;
+}
+
 /*
  * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
  */
@@ -275,6 +358,11 @@ int xen_blkif_schedule(void *arg)
 {
 	struct xen_blkif *blkif = arg;
 	struct xen_vbd *vbd = &blkif->vbd;
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt;
+	int ret = 0;
+	int segs_to_unmap = 0;
 
 	xen_blkif_get(blkif);
 
@@ -302,6 +390,36 @@ int xen_blkif_schedule(void *arg)
 			print_stats(blkif);
 	}
 
+	/* Free all persistent grant pages */
+	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) {
+		foreach_grant(persistent_gnt, &blkif->persistent_gnts, node) {
+			BUG_ON(persistent_gnt->handle ==
+				BLKBACK_INVALID_HANDLE);
+			gnttab_set_unmap_op(&unmap[segs_to_unmap],
+				(unsigned long) pfn_to_kaddr(page_to_pfn(
+					persistent_gnt->page)),
+				GNTMAP_host_map,
+				persistent_gnt->handle);
+
+			pages[segs_to_unmap] = persistent_gnt->page;
+			rb_erase(&persistent_gnt->node,
+				 &blkif->persistent_gnts);
+			kfree(persistent_gnt);
+			blkif->persistent_gnt_c--;
+
+			if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
+			    !rb_next(&persistent_gnt->node)) {
+				ret = gnttab_unmap_refs(unmap, NULL, pages,
+					segs_to_unmap);
+				BUG_ON(ret);
+				segs_to_unmap = 0;
+			}
+		}
+	}
+
+	BUG_ON(blkif->persistent_gnt_c != 0);
+	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+
 	if (log_stats)
 		print_stats(blkif);
 
@@ -328,6 +446,8 @@ static void xen_blkbk_unmap(struct pending_req *req)
 	int ret;
 
 	for (i = 0; i < req->nr_pages; i++) {
+		if (!test_bit(i, req->unmap_seg))
+			continue;
 		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
@@ -344,12 +464,26 @@ static void xen_blkbk_unmap(struct pending_req *req)
 
 static int xen_blkbk_map(struct blkif_request *req,
 			 struct pending_req *pending_req,
-			 struct seg_buf seg[])
+			 struct seg_buf seg[],
+			 struct page *pages[])
 {
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int i;
+	struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt = NULL;
+	struct xen_blkif *blkif = pending_req->blkif;
+	phys_addr_t addr = 0;
+	int i, j;
+	bool new_map;
 	int nseg = req->u.rw.nr_segments;
+	int segs_to_map = 0;
 	int ret = 0;
+	int use_persistent_gnts;
+
+	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
+
+	BUG_ON(blkif->persistent_gnt_c >
+	       max_mapped_grant_pages(pending_req->blkif->blk_protocol));
 
 	/*
 	 * Fill out preq.nr_sects with proper amount of sectors, and setup
@@ -359,36 +493,143 @@ static int xen_blkbk_map(struct blkif_request *req,
 	for (i = 0; i < nseg; i++) {
 		uint32_t flags;
 
-		flags = GNTMAP_host_map;
-		if (pending_req->operation != BLKIF_OP_READ)
-			flags |= GNTMAP_readonly;
-		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
-				  req->u.rw.seg[i].gref,
-				  pending_req->blkif->domid);
+		if (use_persistent_gnts)
+			persistent_gnt = get_persistent_gnt(
+				&blkif->persistent_gnts,
+				req->u.rw.seg[i].gref);
+
+		if (persistent_gnt) {
+			/*
+			 * We are using persistent grants and
+			 * the grant is already mapped
+			 */
+			new_map = false;
+		} else if (use_persistent_gnts &&
+			   blkif->persistent_gnt_c <
+			   max_mapped_grant_pages(blkif->blk_protocol)) {
+			/*
+			 * We are using persistent grants, the grant is
+			 * not mapped but we have room for it
+			 */
+			new_map = true;
+			persistent_gnt = kzalloc(
+				sizeof(struct persistent_gnt),
+				GFP_KERNEL);
+			if (!persistent_gnt)
+				return -ENOMEM;
+			persistent_gnt->page = alloc_page(GFP_KERNEL);
+			if (!persistent_gnt->page) {
+				kfree(persistent_gnt);
+				return -ENOMEM;
+			}
+			persistent_gnt->gnt = req->u.rw.seg[i].gref;
+
+			pages_to_gnt[segs_to_map] =
+				persistent_gnt->page;
+			addr = (unsigned long) pfn_to_kaddr(
+				page_to_pfn(persistent_gnt->page));
+
+			add_persistent_gnt(&blkif->persistent_gnts,
+				persistent_gnt);
+			blkif->persistent_gnt_c++;
+			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
+				 persistent_gnt->gnt, blkif->persistent_gnt_c,
+				 max_mapped_grant_pages(blkif->blk_protocol));
+		} else {
+			/*
+			 * We are either using persistent grants and
+			 * hit the maximum limit of grants mapped,
+			 * or we are not using persistent grants.
+			 */
+			if (use_persistent_gnts &&
+			    !blkif->vbd.overflow_max_grants) {
+				blkif->vbd.overflow_max_grants = 1;
+				pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
+					 blkif->domid, blkif->vbd.handle);
+			}
+			new_map = true;
+			pages[i] = blkbk->pending_page(pending_req, i);
+			addr = vaddr(pending_req, i);
+			pages_to_gnt[segs_to_map] =
+				blkbk->pending_page(pending_req, i);
+		}
+
+		if (persistent_gnt) {
+			pages[i] = persistent_gnt->page;
+			persistent_gnts[i] = persistent_gnt;
+		} else {
+			persistent_gnts[i] = NULL;
+		}
+
+		if (new_map) {
+			flags = GNTMAP_host_map;
+			if (!persistent_gnt &&
+			    (pending_req->operation != BLKIF_OP_READ))
+				flags |= GNTMAP_readonly;
+			gnttab_set_map_op(&map[segs_to_map++], addr,
+					  flags, req->u.rw.seg[i].gref,
+					  blkif->domid);
+		}
 	}
 
-	ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
-	BUG_ON(ret);
+	if (segs_to_map) {
+		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
+		BUG_ON(ret);
+	}
 
 	/*
 	 * Now swizzle the MFN in our domain with the MFN from the other domain
 	 * so that when we access vaddr(pending_req,i) it has the contents of
 	 * the page from the other domain.
 	 */
-	for (i = 0; i < nseg; i++) {
-		if (unlikely(map[i].status != 0)) {
-			pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
-			map[i].handle = BLKBACK_INVALID_HANDLE;
-			ret |= 1;
+	bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	for (i = 0, j = 0; i < nseg; i++) {
+		if (!persistent_gnts[i] || !persistent_gnts[i]->handle) {
+			/* This is a newly mapped grant */
+			BUG_ON(j >= segs_to_map);
+			if (unlikely(map[j].status != 0)) {
+				pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
+				map[j].handle = BLKBACK_INVALID_HANDLE;
+				ret |= 1;
+				if (persistent_gnts[i]) {
+					rb_erase(&persistent_gnts[i]->node,
+						 &blkif->persistent_gnts);
+					blkif->persistent_gnt_c--;
+					kfree(persistent_gnts[i]);
+					persistent_gnts[i] = NULL;
+				}
+			}
+		}
+		if (persistent_gnts[i]) {
+			if (!persistent_gnts[i]->handle) {
+				/*
+				 * If this is a new persistent grant
+				 * save the handler
+				 */
+				persistent_gnts[i]->handle = map[j].handle;
+				persistent_gnts[i]->dev_bus_addr =
+					map[j++].dev_bus_addr;
+			}
+			pending_handle(pending_req, i) =
+				persistent_gnts[i]->handle;
+
+			if (ret)
+				continue;
+
+			seg[i].buf = persistent_gnts[i]->dev_bus_addr |
+				(req->u.rw.seg[i].first_sect << 9);
+		} else {
+			pending_handle(pending_req, i) = map[j].handle;
+			bitmap_set(pending_req->unmap_seg, i, 1);
+
+			if (ret) {
+				j++;
+				continue;
+			}
+
+			seg[i].buf = map[j++].dev_bus_addr |
+				(req->u.rw.seg[i].first_sect << 9);
 		}
-
-		pending_handle(pending_req, i) = map[i].handle;
-
-		if (ret)
-			continue;
-
-		seg[i].buf = map[i].dev_bus_addr |
-			(req->u.rw.seg[i].first_sect << 9);
 	}
 	return ret;
 }
@@ -591,6 +832,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	int operation;
 	struct blk_plug plug;
 	bool drain = false;
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 
 	switch (req->operation) {
 	case BLKIF_OP_READ:
@@ -677,7 +919,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	 * the hypercall to unmap the grants - that is all done in
 	 * xen_blkbk_unmap.
 	 */
-	if (xen_blkbk_map(req, pending_req, seg))
+	if (xen_blkbk_map(req, pending_req, seg, pages))
 		goto fail_flush;
 
 	/*
@@ -689,7 +931,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	for (i = 0; i < nseg; i++) {
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     blkbk->pending_page(pending_req, i),
+				     pages[i],
 				     seg[i].nsec << 9,
 				     seg[i].buf & ~PAGE_MASK) == 0)) {
 
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 9ad3b5ec1dc1..ae7951f0e268 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -34,6 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/io.h>
+#include <linux/rbtree.h>
 #include <asm/setup.h>
 #include <asm/pgalloc.h>
 #include <asm/hypervisor.h>
@@ -160,10 +161,22 @@ struct xen_vbd {
 	sector_t size;
 	bool flush_support;
 	bool discard_secure;
+
+	unsigned int feature_gnt_persistent:1;
+	unsigned int overflow_max_grants:1;
 };
 
 struct backend_info;
 
+
+struct persistent_gnt {
+	struct page *page;
+	grant_ref_t gnt;
+	grant_handle_t handle;
+	uint64_t dev_bus_addr;
+	struct rb_node node;
+};
+
 struct xen_blkif {
 	/* Unique identifier for this interface. */
 	domid_t domid;
@@ -190,6 +203,10 @@ struct xen_blkif {
 	struct task_struct *xenblkd;
 	unsigned int waiting_reqs;
 
+	/* tree to store persistent grants */
+	struct rb_root persistent_gnts;
+	unsigned int persistent_gnt_c;
+
 	/* statistics */
 	unsigned long st_print;
 	int st_rd_req;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 4f66171c6683..b2250265308a 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -118,6 +118,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	atomic_set(&blkif->drain, 0);
 	blkif->st_print = jiffies;
 	init_waitqueue_head(&blkif->waiting_to_free);
+	blkif->persistent_gnts.rb_node = NULL;
 
 	return blkif;
 }
@@ -673,6 +674,13 @@ again:
 
 	xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
 
+	err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
+				 dev->nodename);
+		goto abort;
+	}
+
 	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 			    (unsigned long long)vbd_sz(&be->blkif->vbd));
 	if (err) {
@@ -721,6 +729,7 @@ static int connect_ring(struct backend_info *be)
 	struct xenbus_device *dev = be->dev;
 	unsigned long ring_ref;
 	unsigned int evtchn;
+	unsigned int pers_grants;
 	char protocol[64] = "";
 	int err;
 
@@ -750,8 +759,18 @@ static int connect_ring(struct backend_info *be)
 		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
 		return -1;
 	}
-	pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
-		ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			    "feature-persistent-grants", "%u",
+			    &pers_grants, NULL);
+	if (err)
+		pers_grants = 0;
+
+	be->blkif->vbd.feature_gnt_persistent = pers_grants;
+	be->blkif->vbd.overflow_max_grants = 0;
+
+	pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
+		ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+		pers_grants ? "persistent grants" : "");
 
 	/* Map the shared frame, irq etc. */
 	err = xen_blkif_map(be->blkif, ring_ref, evtchn);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 007db8986e84..911d733d21b6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -44,6 +44,7 @@
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
+#include <linux/llist.h>
 
 #include <xen/xen.h>
 #include <xen/xenbus.h>
@@ -64,10 +65,17 @@ enum blkif_state {
 	BLKIF_STATE_SUSPENDED,
 };
 
+struct grant {
+	grant_ref_t gref;
+	unsigned long pfn;
+	struct llist_node node;
+};
+
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
 	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
@@ -97,6 +105,8 @@ struct blkfront_info
 	struct work_struct work;
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct llist_head persistent_gnts;
+	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
 	unsigned int feature_flush;
 	unsigned int flush_op;
@@ -104,6 +114,7 @@ struct blkfront_info
 	unsigned int feature_secdiscard:1;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
+	unsigned int feature_persistent:1;
 	int is_ready;
 };
 
@@ -287,21 +298,36 @@ static int blkif_queue_request(struct request *req)
 	unsigned long id;
 	unsigned int fsect, lsect;
 	int i, ref;
+
+	/*
+	 * Used to store if we are able to queue the request by just using
+	 * existing persistent grants, or if we have to get new grants,
+	 * as there are not sufficiently many free.
+	 */
+	bool new_persistent_gnts;
 	grant_ref_t gref_head;
+	struct page *granted_page;
+	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	if (gnttab_alloc_grant_references(
-		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
-		gnttab_request_free_callback(
-			&info->callback,
-			blkif_restart_queue_callback,
-			info,
-			BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		return 1;
-	}
+	/* Check if we have enought grants to allocate a requests */
+	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+		new_persistent_gnts = 1;
+		if (gnttab_alloc_grant_references(
+		    BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
+		    &gref_head) < 0) {
+			gnttab_request_free_callback(
+				&info->callback,
+				blkif_restart_queue_callback,
+				info,
+				BLKIF_MAX_SEGMENTS_PER_REQUEST);
+			return 1;
+		}
+	} else
+		new_persistent_gnts = 0;
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
@@ -341,18 +367,73 @@ static int blkif_queue_request(struct request *req)
 			BLKIF_MAX_SEGMENTS_PER_REQUEST);
 
 	for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
-		buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
 		fsect = sg->offset >> 9;
 		lsect = fsect + (sg->length >> 9) - 1;
-		/* install a grant reference. */
-		ref = gnttab_claim_grant_reference(&gref_head);
-		BUG_ON(ref == -ENOSPC);
 
-		gnttab_grant_foreign_access_ref(
-				ref,
+		if (info->persistent_gnts_c) {
+			BUG_ON(llist_empty(&info->persistent_gnts));
+			gnt_list_entry = llist_entry(
+				llist_del_first(&info->persistent_gnts),
+				struct grant, node);
+
+			ref = gnt_list_entry->gref;
+			buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
+			info->persistent_gnts_c--;
+		} else {
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			gnt_list_entry =
+				kmalloc(sizeof(struct grant),
+					GFP_ATOMIC);
+			if (!gnt_list_entry)
+				return -ENOMEM;
+
+			granted_page = alloc_page(GFP_ATOMIC);
+			if (!granted_page) {
+				kfree(gnt_list_entry);
+				return -ENOMEM;
+			}
+
+			gnt_list_entry->pfn =
+				page_to_pfn(granted_page);
+			gnt_list_entry->gref = ref;
+
+			buffer_mfn = pfn_to_mfn(page_to_pfn(
+						granted_page));
+			gnttab_grant_foreign_access_ref(ref,
 				info->xbdev->otherend_id,
-				buffer_mfn,
-				rq_data_dir(req));
+				buffer_mfn, 0);
+		}
+
+		info->shadow[id].grants_used[i] = gnt_list_entry;
+
+		if (rq_data_dir(req)) {
+			char *bvec_data;
+			void *shared_data;
+
+			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+			shared_data = kmap_atomic(
+				pfn_to_page(gnt_list_entry->pfn));
+			bvec_data = kmap_atomic(sg_page(sg));
+
+			/*
+			 * this does not wipe data stored outside the
+			 * range sg->offset..sg->offset+sg->length.
+			 * Therefore, blkback *could* see data from
+			 * previous requests. This is OK as long as
+			 * persistent grants are shared with just one
+			 * domain. It may need refactoring if this
+			 * changes
+			 */
+			memcpy(shared_data + sg->offset,
+			       bvec_data + sg->offset,
+			       sg->length);
+
+			kunmap_atomic(bvec_data);
+			kunmap_atomic(shared_data);
+		}
 
 		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
 		ring_req->u.rw.seg[i] =
@@ -368,7 +449,8 @@ static int blkif_queue_request(struct request *req)
 	/* Keep a private copy so we can reissue requests when recovering. */
 	info->shadow[id].req = *ring_req;
 
-	gnttab_free_grant_references(gref_head);
+	if (new_persistent_gnts)
+		gnttab_free_grant_references(gref_head);
 
 	return 0;
 }
@@ -480,12 +562,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_flush(info->rq, info->feature_flush);
-	printk(KERN_INFO "blkfront: %s: %s: %s\n",
+	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
 	       info->gd->disk_name,
 	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
 	       "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
 	       "flush diskcache" : "barrier or flush"),
-	       info->feature_flush ? "enabled" : "disabled");
+	       info->feature_flush ? "enabled" : "disabled",
+	       info->feature_persistent ? "using persistent grants" : "");
 }
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -707,6 +790,9 @@ static void blkif_restart_queue(struct work_struct *work)
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+	struct llist_node *all_gnts;
+	struct grant *persistent_gnt;
+
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
@@ -714,6 +800,17 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	/* No more blkif_request(). */
 	if (info->rq)
 		blk_stop_queue(info->rq);
+
+	/* Remove all persistent grants */
+	if (info->persistent_gnts_c) {
+		all_gnts = llist_del_all(&info->persistent_gnts);
+		llist_for_each_entry(persistent_gnt, all_gnts, node) {
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			kfree(persistent_gnt);
+		}
+		info->persistent_gnts_c = 0;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
 	spin_unlock_irq(&info->io_lock);
@@ -734,13 +831,42 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 
 }
 
-static void blkif_completion(struct blk_shadow *s)
+static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+			     struct blkif_response *bret)
 {
 	int i;
-	/* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
-	 * flag. */
-	for (i = 0; i < s->req.u.rw.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
+	struct bio_vec *bvec;
+	struct req_iterator iter;
+	unsigned long flags;
+	char *bvec_data;
+	void *shared_data;
+	unsigned int offset = 0;
+
+	if (bret->operation == BLKIF_OP_READ) {
+		/*
+		 * Copy the data received from the backend into the bvec.
+		 * Since bv_offset can be different than 0, and bv_len different
+		 * than PAGE_SIZE, we have to keep track of the current offset,
+		 * to be sure we are copying the data from the right shared page.
+		 */
+		rq_for_each_segment(bvec, s->request, iter) {
+			BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
+			i = offset >> PAGE_SHIFT;
+			shared_data = kmap_atomic(
+				pfn_to_page(s->grants_used[i]->pfn));
+			bvec_data = bvec_kmap_irq(bvec, &flags);
+			memcpy(bvec_data, shared_data + bvec->bv_offset,
+				bvec->bv_len);
+			bvec_kunmap_irq(bvec_data, &flags);
+			kunmap_atomic(shared_data);
+			offset += bvec->bv_len;
+		}
+	}
+	/* Add the persistent grant into the list of free grants */
+	for (i = 0; i < s->req.u.rw.nr_segments; i++) {
+		llist_add(&s->grants_used[i]->node, &info->persistent_gnts);
+		info->persistent_gnts_c++;
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -783,7 +909,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		req = info->shadow[id].request;
 
 		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&info->shadow[id]);
+			blkif_completion(&info->shadow[id], info, bret);
 
 		if (add_id_to_freelist(info, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -942,6 +1068,11 @@ again:
 			message = "writing protocol";
 			goto abort_transaction;
 	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "feature-persistent-grants", "%u", 1);
+	if (err)
+		dev_warn(&dev->dev,
+			 "writing persistent grants feature to xenbus");
 
 	err = xenbus_transaction_end(xbt, 0);
 	if (err) {
@@ -1029,6 +1160,8 @@ static int blkfront_probe(struct xenbus_device *dev,
 	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
+	init_llist_head(&info->persistent_gnts);
+	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
@@ -1093,7 +1226,7 @@ static int blkif_recover(struct blkfront_info *info)
 				req->u.rw.seg[j].gref,
 				info->xbdev->otherend_id,
 				pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
-				rq_data_dir(info->shadow[req->u.rw.id].request));
+				0);
 		}
 		info->shadow[req->u.rw.id].req = *req;
 
@@ -1225,7 +1358,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	unsigned long sector_size;
 	unsigned int binfo;
 	int err;
-	int barrier, flush, discard;
+	int barrier, flush, discard, persistent;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -1303,6 +1436,14 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (!err && discard)
 		blkfront_setup_discard(info);
 
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-persistent", "%u", &persistent,
+			    NULL);
+	if (err)
+		info->feature_persistent = 0;
+	else
+		info->feature_persistent = persistent;
+
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",