aboutsummaryrefslogtreecommitdiffstats
path: root/include/xen
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2013-06-28 10:01:14 -0400
committerJens Axboe <axboe@kernel.dk>2013-06-28 10:01:14 -0400
commitf35546e072a7a86ccb950d4d1508879b0d49e374 (patch)
tree7e581edae814482ac4cfb00b9aea2e76ebe05f6d /include/xen
parent36f988e978f81ffa415df4d77bbcd8887917f25c (diff)
parent1e0f7a21b2fffc70f27cc4a454c60321501045b1 (diff)
Merge branch 'stable/for-jens-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-3.11/drivers
Konrad writes: It has the 'feature-max-indirect-segments' implemented in both backend and frontend. The current problem with the backend and frontend is that the segment size is limited to 11 pages. It means we can at most squeeze in 44kB per request. The ring can hold 32 (next power of two below 36) requests, meaning we can do 1.4M of outstanding requests. Nowadays that is not enough. The problem in the past was addressed in two ways - but neither one went upstream. The first solution to this proposed by Justin from Spectralogic was to negotiate the segment size. This means that the ‘struct blkif_sring_entry’ is now a variable size. It can expand from 112 bytes (cover 11 pages of data - 44kB) to 1580 bytes (256 pages of data - so 1MB). It is a simple extension by just making the array in the request expand from 11 to a variable size negotiated. But it had limits: this extension still limits the number of segments per request to 255 (as the total number must be specified in the request, which only has an 8-bit field for that purpose). The other solution (from Intel - Ronghui) was to create one extra ring that only has the ‘struct blkif_request_segment’ in them. The ‘struct blkif_request’ would be changed to have an index in said ‘segment ring’. There is only one segment ring. This means that the size of the initial ring is still the same. The requests would point to the segment and enumerate out how many of the indexes it wants to use. The limit is of course the size of the segment. If one assumes a one-page segment this means we can in one request cover ~4MB. Those patches were posted as RFC and the author never followed up on the ideas on changing it to be a bit more flexible. There is yet another mechanism that could be employed  (which these patches implement) - and it borrows from VirtIO protocol. And that is the ‘indirect descriptors’. This very similar to what Intel suggests, but with a twist. The twist is to negotiate how many of these 'segment' pages (aka indirect descriptor pages) we want to support (in reality we negotiate how many entries in the segment we want to cover, and we module the number if it is bigger than the segment size). This means that with the existing 36 slots in the ring (single page) we can cover: 32 slots * each blkif_request_indirect covers: 512 * 4096 ~= 64M. Since we ample space in the blkif_request_indirect to span more than one indirect page, that number (64M) can be also multiplied by eight = 512MB. Roger Pau Monne took the idea and implemented them in these patches. They work great and the corner cases (migration between backends with and without this extension) work nicely. The backend has a limit right now off how many indirect entries it can handle: one indirect page, and at maximum 256 entries (out of 512 - so 50% of the page is used). That comes out to 32 slots * 256 entries in a indirect page * 1 indirect page per request * 4096 = 32MB. This is a conservative number that can change in the future. Right now it strikes a good balance between giving excellent performance, memory usage in the backend, and balancing the needs of many guests. In the patchset there is also the split of the blkback structure to be per-VBD. This means that the spinlock contention we had with many guests trying to do I/O and all the blkback threads hitting the same lock has been eliminated. Also there are bug-fixes to deal with oddly sized sectors, insane amounts on th ring, and also a security fix (posted earlier).
Diffstat (limited to 'include/xen')
-rw-r--r--include/xen/interface/io/blkif.h53
-rw-r--r--include/xen/interface/io/ring.h5
2 files changed, 58 insertions, 0 deletions
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ffd4652de91c..65e12099ef89 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t;
103#define BLKIF_OP_DISCARD 5 103#define BLKIF_OP_DISCARD 5
104 104
105/* 105/*
106 * Recognized if "feature-max-indirect-segments" in present in the backend
107 * xenbus info. The "feature-max-indirect-segments" node contains the maximum
108 * number of segments allowed by the backend per request. If the node is
109 * present, the frontend might use blkif_request_indirect structs in order to
110 * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
111 * maximum number of indirect segments is fixed by the backend, but the
112 * frontend can issue requests with any number of indirect segments as long as
113 * it's less than the number provided by the backend. The indirect_grefs field
114 * in blkif_request_indirect should be filled by the frontend with the
115 * grant references of the pages that are holding the indirect segments.
116 * This pages are filled with an array of blkif_request_segment_aligned
117 * that hold the information about the segments. The number of indirect
118 * pages to use is determined by the maximum number of segments
119 * a indirect request contains. Every indirect page can contain a maximum
120 * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
121 * so to calculate the number of indirect pages to use we have to do
122 * ceil(indirect_segments/512).
123 *
124 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
125 * create the "feature-max-indirect-segments" node!
126 */
127#define BLKIF_OP_INDIRECT 6
128
129/*
106 * Maximum scatter/gather segments per request. 130 * Maximum scatter/gather segments per request.
107 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. 131 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
108 * NB. This could be 12 if the ring indexes weren't stored in the same page. 132 * NB. This could be 12 if the ring indexes weren't stored in the same page.
109 */ 133 */
110#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 134#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
111 135
136#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
137
138struct blkif_request_segment_aligned {
139 grant_ref_t gref; /* reference to I/O buffer frame */
140 /* @first_sect: first sector in frame to transfer (inclusive). */
141 /* @last_sect: last sector in frame to transfer (inclusive). */
142 uint8_t first_sect, last_sect;
143 uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
144} __attribute__((__packed__));
145
112struct blkif_request_rw { 146struct blkif_request_rw {
113 uint8_t nr_segments; /* number of segments */ 147 uint8_t nr_segments; /* number of segments */
114 blkif_vdev_t handle; /* only for read/write requests */ 148 blkif_vdev_t handle; /* only for read/write requests */
@@ -147,12 +181,31 @@ struct blkif_request_other {
147 uint64_t id; /* private guest value, echoed in resp */ 181 uint64_t id; /* private guest value, echoed in resp */
148} __attribute__((__packed__)); 182} __attribute__((__packed__));
149 183
184struct blkif_request_indirect {
185 uint8_t indirect_op;
186 uint16_t nr_segments;
187#ifdef CONFIG_X86_64
188 uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
189#endif
190 uint64_t id;
191 blkif_sector_t sector_number;
192 blkif_vdev_t handle;
193 uint16_t _pad2;
194 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
195#ifdef CONFIG_X86_64
196 uint32_t _pad3; /* make it 64 byte aligned */
197#else
198 uint64_t _pad3; /* make it 64 byte aligned */
199#endif
200} __attribute__((__packed__));
201
150struct blkif_request { 202struct blkif_request {
151 uint8_t operation; /* BLKIF_OP_??? */ 203 uint8_t operation; /* BLKIF_OP_??? */
152 union { 204 union {
153 struct blkif_request_rw rw; 205 struct blkif_request_rw rw;
154 struct blkif_request_discard discard; 206 struct blkif_request_discard discard;
155 struct blkif_request_other other; 207 struct blkif_request_other other;
208 struct blkif_request_indirect indirect;
156 } u; 209 } u;
157} __attribute__((__packed__)); 210} __attribute__((__packed__));
158 211
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index 75271b9a8f61..7d28aff605c7 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -188,6 +188,11 @@ struct __name##_back_ring { \
188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ 188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) 189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
190 190
191/* Ill-behaved frontend determination: Can there be this many requests? */
192#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \
193 (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
194
195
191#define RING_PUSH_REQUESTS(_r) do { \ 196#define RING_PUSH_REQUESTS(_r) do { \
192 wmb(); /* back sees requests /before/ updated producer index */ \ 197 wmb(); /* back sees requests /before/ updated producer index */ \
193 (_r)->sring->req_prod = (_r)->req_prod_pvt; \ 198 (_r)->sring->req_prod = (_r)->req_prod_pvt; \