path: root/drivers/block/xen-blkback/common.h
author    Roger Pau Monne <roger.pau@citrix.com>    2012-10-24 12:58:45 -0400
committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>    2012-10-30 09:50:04 -0400
commit    0a8704a51f386cab7394e38ff1d66eef924d8ab8 (patch)
tree      8fb3897bad957fa592ff54cacc97de924246c125 /drivers/block/xen-blkback/common.h
parent    8f0d8163b50e01f398b14bcd4dc039ac5ab18d64 (diff)
xen/blkback: Persistent grant maps for xen blk drivers
This patch implements persistent grants for the xen-blk{front,back} mechanism. The effect of this change is to reduce the number of unmap operations performed, since they cause a (costly) TLB shootdown. This allows the I/O performance to scale better when a large number of VMs are performing I/O.

Previously, the blkfront driver was supplied a bvec[] from the request queue. This was granted to dom0; dom0 performed the I/O and wrote directly into the grant-mapped memory and unmapped it; blkfront then removed foreign access for that grant. The cost of unmapping scales badly with the number of CPUs in Dom0. An experiment showed that when Dom0 has 24 VCPUs, and guests are performing parallel I/O to a ramdisk, the IPIs from performing unmaps are a bottleneck at 5 guests (at which point 650,000 IOPS are being performed in total). If more than 5 guests are used, the performance declines; by 10 guests, only 400,000 IOPS are being performed.

This patch improves performance by only unmapping when the connection between blkfront and blkback is broken.

On startup blkfront notifies blkback that it is using persistent grants, and blkback will do the same. If blkback is not capable of persistent mapping, blkfront will still use the same grants, since this is compatible with the previous protocol and keeps the code in blkfront simple.

To perform a read in persistent mode, blkfront uses a separate pool of pages that it maps to dom0. When a request comes in, blkfront transmutes the request so that blkback will write into one of these free pages. Blkback keeps note of which grefs it has already mapped. When a new ring request comes to blkback, it looks to see if it has already mapped that page. If so, it will not map it again; if the page hasn't been previously mapped, it is mapped now and a record is kept of this mapping. Blkback then proceeds as usual. When blkfront is notified that blkback has completed a request, it memcpys from the shared memory into the supplied bvec. A record is kept that the {gref, page} tuple is mapped and not in flight.

Writes are similar, except that the memcpy is performed from the supplied bvecs into the shared pages before the request is put onto the ring.

Blkback stores a mapping of grefs => {page mapped to by gref} in a red-black tree. As the grefs are not known a priori, and provide no guarantees on their ordering, we have to search this tree to find the page for every gref we receive. This operation takes O(log n) time in the worst case. In blkfront, grants are stored in a singly linked list.

The maximum number of grants that blkback will persistently map is currently set to RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, to prevent a malicious guest from attempting a DoS by supplying fresh grefs and causing the Dom0 kernel to map excessively. If a guest is using persistent grants and exceeds the maximum number of grants to map persistently, the newly passed grefs will be mapped and unmapped. Using this approach, we can have requests that mix persistent and non-persistent grants, and we need to handle them correctly. This also allows the maximum number of persistent grants to be set to a value lower than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST, although doing so leads to unpredictable performance.

In writing this patch, the question arises as to whether the additional cost of performing memcpys in the guest (to/from the pool of granted pages) outweighs the gains of not performing TLB shootdowns. The answer to that question is 'no'.
There appears to be very little, if any, additional cost to the guest of using persistent grants. There is perhaps a small saving from the reduced number of hypercalls performed in granting and ending foreign access.

Signed-off-by: Oliver Chick <oliver.chick@citrix.com>
Signed-off-by: Roger Pau Monne <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
[v1: Fixed up the misuse of bool as int]
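For reference, the red-black tree bookkeeping described above maps each gref to its persistent_gnt entry. The following is a minimal sketch, using the kernel's <linux/rbtree.h> API, of how such an insert and lookup might look; it assumes the struct persistent_gnt layout added by this patch (see the diff below), and the helper names are illustrative rather than necessarily the exact ones used in blkback.c.

#include <linux/rbtree.h>
#include <xen/grant_table.h>

/* Sketch only: insert a persistent grant into the tree, keyed by its
 * grant reference.  Assumes struct persistent_gnt as defined in
 * common.h below. */
static void add_persistent_gnt(struct rb_root *root,
			       struct persistent_gnt *persistent_gnt)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;
	struct persistent_gnt *this;

	/* Walk down the tree, ordering nodes by gref. */
	while (*new) {
		this = rb_entry(*new, struct persistent_gnt, node);

		parent = *new;
		if (persistent_gnt->gnt < this->gnt)
			new = &(*new)->rb_left;
		else if (persistent_gnt->gnt > this->gnt)
			new = &(*new)->rb_right;
		else
			return; /* gref already tracked; nothing to do */
	}

	/* Link the new node and rebalance. */
	rb_link_node(&persistent_gnt->node, parent, new);
	rb_insert_color(&persistent_gnt->node, root);
}

/* Sketch only: the O(log n) search performed for every gref received
 * on the ring. */
static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
						 grant_ref_t gref)
{
	struct persistent_gnt *data;
	struct rb_node *node = root->rb_node;

	while (node) {
		data = rb_entry(node, struct persistent_gnt, node);

		if (gref < data->gnt)
			node = node->rb_left;
		else if (gref > data->gnt)
			node = node->rb_right;
		else
			return data; /* already mapped: skip the map/unmap */
	}
	return NULL; /* not yet mapped: map it and add it to the tree */
}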
Diffstat (limited to 'drivers/block/xen-blkback/common.h')
-rw-r--r--    drivers/block/xen-blkback/common.h    17
1 file changed, 17 insertions, 0 deletions
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 9ad3b5ec1dc1..ae7951f0e268 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -34,6 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/io.h>
+#include <linux/rbtree.h>
 #include <asm/setup.h>
 #include <asm/pgalloc.h>
 #include <asm/hypervisor.h>
@@ -160,10 +161,22 @@ struct xen_vbd {
 	sector_t		size;
 	bool			flush_support;
 	bool			discard_secure;
+
+	unsigned int		feature_gnt_persistent:1;
+	unsigned int		overflow_max_grants:1;
 };
 
 struct backend_info;
 
+
+struct persistent_gnt {
+	struct page *page;
+	grant_ref_t gnt;
+	grant_handle_t handle;
+	uint64_t dev_bus_addr;
+	struct rb_node node;
+};
+
 struct xen_blkif {
 	/* Unique identifier for this interface. */
 	domid_t			domid;
@@ -190,6 +203,10 @@ struct xen_blkif {
 	struct task_struct	*xenblkd;
 	unsigned int		waiting_reqs;
 
+	/* tree to store persistent grants */
+	struct rb_root		persistent_gnts;
+	unsigned int		persistent_gnt_c;
+
 	/* statistics */
 	unsigned long		st_print;
 	int			st_rd_req;
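The persistent_gnt_c counter and the overflow_max_grants flag added above are what allow blkback to enforce the grant cap described in the commit message. The snippet below is a hedged sketch of such a check, not the exact blkback.c code: the helper name can_map_persistently() is hypothetical, and MAX_PERSISTENT_GNTS stands in for the RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST bound quoted in the commit message.

/* Illustrative sketch: decide whether a freshly supplied gref may be
 * mapped persistently or must fall back to the old map-and-unmap path. */
static bool can_map_persistently(struct xen_blkif *blkif)
{
	if (blkif->persistent_gnt_c < MAX_PERSISTENT_GNTS)
		return true;

	/* Record that the guest tried to exceed the limit;
	 * overflow_max_grants lives in the vbd, per the hunk above. */
	blkif->vbd.overflow_max_grants = 1;
	return false;
}

Requests that hit this limit still work, mixing persistent grants (found in the rb-tree) with ordinary grants that are mapped and unmapped per request, which is why the commit message notes that the cap can be lowered at the cost of less predictable performance.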