author     Jens Axboe <axboe@kernel.dk>  2013-06-28 10:01:14 -0400
committer  Jens Axboe <axboe@kernel.dk>  2013-06-28 10:01:14 -0400
commit     f35546e072a7a86ccb950d4d1508879b0d49e374 (patch)
tree       7e581edae814482ac4cfb00b9aea2e76ebe05f6d
parent     36f988e978f81ffa415df4d77bbcd8887917f25c (diff)
parent     1e0f7a21b2fffc70f27cc4a454c60321501045b1 (diff)
Merge branch 'stable/for-jens-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-3.11/drivers
Konrad writes:

This has the 'feature-max-indirect-segments' extension implemented in both the backend and the frontend. The current problem with the backend and frontend is that the segment size is limited to 11 pages, which means we can squeeze at most 44kB into a request. The ring can hold 32 requests (the next power of two below 36), so we can have roughly 1.4MB of outstanding I/O. Nowadays that is not enough.

The problem was addressed in the past in two ways, but neither one went upstream. The first solution, proposed by Justin from Spectralogic, was to negotiate the segment size. This makes 'struct blkif_sring_entry' variable-sized: it can expand from 112 bytes (covering 11 pages of data, 44kB) to 1580 bytes (256 pages of data, so 1MB). It is a simple extension that just lets the array in the request grow from 11 entries to a negotiated variable size. But it had limits: the extension still caps the number of segments per request at 255, because the total must be specified in the request, which only has an 8-bit field for that purpose.

The other solution (from Ronghui at Intel) was to create one extra ring that only contains 'struct blkif_request_segment' entries. 'struct blkif_request' would be changed to carry an index into that segment ring. There is only one segment ring, so the size of the initial ring stays the same. Each request points into the segment ring and says how many of the indexes it wants to use. The limit is, of course, the size of the segment ring; assuming a one-page segment ring, a single request can cover ~4MB. Those patches were posted as an RFC and the author never followed up on making the idea more flexible.

There is yet another mechanism that could be employed, which these patches implement, and it borrows from the VirtIO protocol: 'indirect descriptors'. This is very similar to what Intel suggested, but with a twist. The twist is to negotiate how many of these 'segment' pages (aka indirect descriptor pages) we want to support (in reality we negotiate how many entries in a segment page we want to cover, and cap the number if it is bigger than the segment page size). With the existing 32 slots in the single-page ring, each blkif_request_indirect covers 512 * 4096 bytes, so we can have 32 * 512 * 4096 ~= 64MB outstanding. Since there is ample space in blkif_request_indirect to span more than one indirect page, that 64MB can also be multiplied by eight, giving 512MB.

Roger Pau Monne took the idea and implemented it in these patches. They work great, and the corner cases (migration between backends with and without this extension) work nicely. The backend currently limits how many indirect entries it can handle: one indirect page, and at most 256 entries (out of 512, so 50% of the page is used). That comes out to 32 slots * 256 entries per indirect page * 1 indirect page per request * 4096 bytes = 32MB. This is a conservative number that can change in the future; right now it strikes a good balance between excellent performance, memory usage in the backend, and the needs of many guests.

The patch set also splits the blkback structure to be per-VBD. This means the spinlock contention we had with many guests doing I/O and all the blkback threads hitting the same lock has been eliminated. There are also bug fixes for oddly sized sectors and insane request counts on the ring, as well as a security fix (posted earlier).
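For reference, the per-ring arithmetic quoted above can be reproduced with a small standalone program. This is illustration only, not code from the series; the constants simply mirror the numbers in the message (4 KiB pages, 32 ring slots, 11 direct segments, 512 entries per indirect page, 256-entry backend default).

/*
 * Illustrative arithmetic only; not part of the patch set.
 */
#include <stdio.h>

#define RING_SLOTS              32      /* next power of two below 36          */
#define PAGE_SIZE_BYTES         4096UL
#define DIRECT_SEGS_PER_REQ     11      /* old BLKIF_MAX_SEGMENTS_PER_REQUEST  */
#define INDIRECT_SEGS_PER_PAGE  512     /* one page of segment entries         */
#define BACKEND_INDIRECT_SEGS   256     /* conservative backend default        */

int main(void)
{
	/* Old protocol: 11 segments * 4 KiB = 44 KiB per request. */
	unsigned long old_bytes = RING_SLOTS * DIRECT_SEGS_PER_REQ * PAGE_SIZE_BYTES;
	/* One full indirect page per request: 512 * 4 KiB = 2 MiB per request. */
	unsigned long indirect_bytes = RING_SLOTS * INDIRECT_SEGS_PER_PAGE * PAGE_SIZE_BYTES;
	/* Backend default in this series: 256 entries per request. */
	unsigned long backend_bytes = RING_SLOTS * BACKEND_INDIRECT_SEGS * PAGE_SIZE_BYTES;

	printf("old:      %lu KiB outstanding\n", old_bytes >> 10);      /* 1408 KiB ~= 1.4MB */
	printf("indirect: %lu MiB outstanding\n", indirect_bytes >> 20); /* 64 MiB            */
	printf("backend:  %lu MiB outstanding\n", backend_bytes >> 20);  /* 32 MiB            */
	return 0;
}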
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-xen-blkback    17
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-xen-blkfront   10
-rw-r--r--  drivers/block/xen-blkback/blkback.c                   872
-rw-r--r--  drivers/block/xen-blkback/common.h                    147
-rw-r--r--  drivers/block/xen-blkback/xenbus.c                     85
-rw-r--r--  drivers/block/xen-blkfront.c                          532
-rw-r--r--  include/xen/interface/io/blkif.h                       53
-rw-r--r--  include/xen/interface/io/ring.h                         5
8 files changed, 1299 insertions(+), 422 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
new file mode 100644
index 000000000000..8bb43b66eb55
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -0,0 +1,17 @@
+What:           /sys/module/xen_blkback/parameters/max_buffer_pages
+Date:           March 2013
+KernelVersion:  3.11
+Contact:        Roger Pau Monné <roger.pau@citrix.com>
+Description:
+                Maximum number of free pages to keep in each block
+                backend buffer.
+
+What:           /sys/module/xen_blkback/parameters/max_persistent_grants
+Date:           March 2013
+KernelVersion:  3.11
+Contact:        Roger Pau Monné <roger.pau@citrix.com>
+Description:
+                Maximum number of grants to map persistently in
+                blkback. If the frontend tries to use more than
+                max_persistent_grants, the LRU kicks in and starts
+                removing 5% of max_persistent_grants every 100ms.
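The "removing 5% ... every 100ms" behaviour corresponds to LRU_PERCENT_CLEAN and LRU_INTERVAL in the blkback.c changes further down. A rough sketch (not part of the patch) of how purge_persistent_gnt() sizes each purge pass; it assumes persistent_gnt_c >= max_pgrants, which the real code checks before purging:

static unsigned int grants_to_purge(unsigned int persistent_gnt_c,
				    unsigned int max_pgrants)
{
	unsigned int num_clean;

	/* 5% of the configured maximum ... */
	num_clean = (max_pgrants / 100) * 5;	/* LRU_PERCENT_CLEAN */
	/* ... plus however far we are over the limit ... */
	num_clean = persistent_gnt_c - max_pgrants + num_clean;
	/* ... but never more grants than we actually hold. */
	return num_clean < persistent_gnt_c ? num_clean : persistent_gnt_c;
}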
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
new file mode 100644
index 000000000000..c0a6cb7eb314
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -0,0 +1,10 @@
+What:           /sys/module/xen_blkfront/parameters/max
+Date:           June 2013
+KernelVersion:  3.11
+Contact:        Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+                Maximum number of segments that the frontend will negotiate
+                with the backend for indirect descriptors. The default value
+                is 32 - higher value means more potential throughput but more
+                memory usage. The backend picks the minimum of the frontend
+                and its default backend value.
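The negotiation described above amounts to taking the smaller of the two advertised limits. A hedged sketch; the function name and parameters are illustrative and not taken from the patch:

static unsigned int negotiated_indirect_segments(unsigned int frontend_max,
						 unsigned int backend_max)
{
	/* Each side publishes its limit via xenstore; the smaller value wins. */
	return frontend_max < backend_max ? frontend_max : backend_max;
}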
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..bf4b9d282c04 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
50#include "common.h" 50#include "common.h"
51 51
52/* 52/*
53 * These are rather arbitrary. They are fairly large because adjacent requests 53 * Maximum number of unused free pages to keep in the internal buffer.
54 * pulled from a communication ring are quite likely to end up being part of 54 * Setting this to a value too low will reduce memory used in each backend,
55 * the same scatter/gather request at the disc. 55 * but can have a performance penalty.
56 * 56 *
57 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** 57 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
58 * 58 * be set to a lower value that might degrade performance on some intensive
59 * This will increase the chances of being able to write whole tracks. 59 * IO workloads.
60 * 64 should be enough to keep us competitive with Linux.
61 */ 60 */
62static int xen_blkif_reqs = 64;
63module_param_named(reqs, xen_blkif_reqs, int, 0);
64MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
65 61
66/* Run-time switchable: /sys/module/blkback/parameters/ */ 62static int xen_blkif_max_buffer_pages = 1024;
67static unsigned int log_stats; 63module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
68module_param(log_stats, int, 0644); 64MODULE_PARM_DESC(max_buffer_pages,
65"Maximum number of free pages to keep in each block backend buffer");
69 66
70/* 67/*
71 * Each outstanding request that we've passed to the lower device layers has a 68 * Maximum number of grants to map persistently in blkback. For maximum
72 * 'pending_req' allocated to it. Each buffer_head that completes decrements 69 * performance this should be the total numbers of grants that can be used
73 * the pendcnt towards zero. When it hits zero, the specified domain has a 70 * to fill the ring, but since this might become too high, specially with
74 * response queued for it, with the saved 'id' passed back. 71 * the use of indirect descriptors, we set it to a value that provides good
72 * performance without using too much memory.
73 *
74 * When the list of persistent grants is full we clean it up using a LRU
75 * algorithm.
75 */ 76 */
76struct pending_req {
77 struct xen_blkif *blkif;
78 u64 id;
79 int nr_pages;
80 atomic_t pendcnt;
81 unsigned short operation;
82 int status;
83 struct list_head free_list;
84 DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
85};
86 77
87#define BLKBACK_INVALID_HANDLE (~0) 78static int xen_blkif_max_pgrants = 1056;
79module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
80MODULE_PARM_DESC(max_persistent_grants,
81 "Maximum number of grants to map persistently");
88 82
89struct xen_blkbk { 83/*
90 struct pending_req *pending_reqs; 84 * The LRU mechanism to clean the lists of persistent grants needs to
91 /* List of all 'pending_req' available */ 85 * be executed periodically. The time interval between consecutive executions
92 struct list_head pending_free; 86 * of the purge mechanism is set in ms.
93 /* And its spinlock. */ 87 */
94 spinlock_t pending_free_lock; 88#define LRU_INTERVAL 100
95 wait_queue_head_t pending_free_wq;
96 /* The list of all pages that are available. */
97 struct page **pending_pages;
98 /* And the grant handles that are available. */
99 grant_handle_t *pending_grant_handles;
100};
101
102static struct xen_blkbk *blkbk;
103 89
104/* 90/*
105 * Maximum number of grant pages that can be mapped in blkback. 91 * When the persistent grants list is full we will remove unused grants
106 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of 92 * from the list. The percent number of grants to be removed at each LRU
107 * pages that blkback will persistently map. 93 * execution.
108 * Currently, this is:
109 * RING_SIZE = 32 (for all known ring types)
110 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
111 * sizeof(struct persistent_gnt) = 48
112 * So the maximum memory used to store the grants is:
113 * 32 * 11 * 48 = 16896 bytes
114 */ 94 */
115static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) 95#define LRU_PERCENT_CLEAN 5
96
97/* Run-time switchable: /sys/module/blkback/parameters/ */
98static unsigned int log_stats;
99module_param(log_stats, int, 0644);
100
101#define BLKBACK_INVALID_HANDLE (~0)
102
103/* Number of free pages to remove on each call to free_xenballooned_pages */
104#define NUM_BATCH_FREE_PAGES 10
105
106static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
116{ 107{
117 switch (protocol) { 108 unsigned long flags;
118 case BLKIF_PROTOCOL_NATIVE: 109
119 return __CONST_RING_SIZE(blkif, PAGE_SIZE) * 110 spin_lock_irqsave(&blkif->free_pages_lock, flags);
120 BLKIF_MAX_SEGMENTS_PER_REQUEST; 111 if (list_empty(&blkif->free_pages)) {
121 case BLKIF_PROTOCOL_X86_32: 112 BUG_ON(blkif->free_pages_num != 0);
122 return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * 113 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
123 BLKIF_MAX_SEGMENTS_PER_REQUEST; 114 return alloc_xenballooned_pages(1, page, false);
124 case BLKIF_PROTOCOL_X86_64:
125 return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
126 BLKIF_MAX_SEGMENTS_PER_REQUEST;
127 default:
128 BUG();
129 } 115 }
116 BUG_ON(blkif->free_pages_num == 0);
117 page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
118 list_del(&page[0]->lru);
119 blkif->free_pages_num--;
120 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
121
130 return 0; 122 return 0;
131} 123}
132 124
133 125static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
134/* 126 int num)
135 * Little helpful macro to figure out the index and virtual address of the
136 * pending_pages[..]. For each 'pending_req' we have have up to
137 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
138 * 10 and would index in the pending_pages[..].
139 */
140static inline int vaddr_pagenr(struct pending_req *req, int seg)
141{ 127{
142 return (req - blkbk->pending_reqs) * 128 unsigned long flags;
143 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; 129 int i;
144}
145 130
146#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] 131 spin_lock_irqsave(&blkif->free_pages_lock, flags);
132 for (i = 0; i < num; i++)
133 list_add(&page[i]->lru, &blkif->free_pages);
134 blkif->free_pages_num += num;
135 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
136}
147 137
148static inline unsigned long vaddr(struct pending_req *req, int seg) 138static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
149{ 139{
150 unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); 140 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 return (unsigned long)pfn_to_kaddr(pfn); 141 struct page *page[NUM_BATCH_FREE_PAGES];
152} 142 unsigned int num_pages = 0;
143 unsigned long flags;
153 144
154#define pending_handle(_req, _seg) \ 145 spin_lock_irqsave(&blkif->free_pages_lock, flags);
155 (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) 146 while (blkif->free_pages_num > num) {
147 BUG_ON(list_empty(&blkif->free_pages));
148 page[num_pages] = list_first_entry(&blkif->free_pages,
149 struct page, lru);
150 list_del(&page[num_pages]->lru);
151 blkif->free_pages_num--;
152 if (++num_pages == NUM_BATCH_FREE_PAGES) {
153 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
154 free_xenballooned_pages(num_pages, page);
155 spin_lock_irqsave(&blkif->free_pages_lock, flags);
156 num_pages = 0;
157 }
158 }
159 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
160 if (num_pages != 0)
161 free_xenballooned_pages(num_pages, page);
162}
156 163
164#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
157 165
158static int do_block_io_op(struct xen_blkif *blkif); 166static int do_block_io_op(struct xen_blkif *blkif);
159static int dispatch_rw_block_io(struct xen_blkif *blkif, 167static int dispatch_rw_block_io(struct xen_blkif *blkif,
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
170 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) 178 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
171 179
172 180
173static void add_persistent_gnt(struct rb_root *root, 181/*
182 * We don't need locking around the persistent grant helpers
183 * because blkback uses a single-thread for each backed, so we
184 * can be sure that this functions will never be called recursively.
185 *
186 * The only exception to that is put_persistent_grant, that can be called
187 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
188 * bit operations to modify the flags of a persistent grant and to count
189 * the number of used grants.
190 */
191static int add_persistent_gnt(struct xen_blkif *blkif,
174 struct persistent_gnt *persistent_gnt) 192 struct persistent_gnt *persistent_gnt)
175{ 193{
176 struct rb_node **new = &(root->rb_node), *parent = NULL; 194 struct rb_node **new = NULL, *parent = NULL;
177 struct persistent_gnt *this; 195 struct persistent_gnt *this;
178 196
197 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
198 if (!blkif->vbd.overflow_max_grants)
199 blkif->vbd.overflow_max_grants = 1;
200 return -EBUSY;
201 }
179 /* Figure out where to put new node */ 202 /* Figure out where to put new node */
203 new = &blkif->persistent_gnts.rb_node;
180 while (*new) { 204 while (*new) {
181 this = container_of(*new, struct persistent_gnt, node); 205 this = container_of(*new, struct persistent_gnt, node);
182 206
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
186 else if (persistent_gnt->gnt > this->gnt) 210 else if (persistent_gnt->gnt > this->gnt)
187 new = &((*new)->rb_right); 211 new = &((*new)->rb_right);
188 else { 212 else {
189 pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); 213 pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
190 BUG(); 214 return -EINVAL;
191 } 215 }
192 } 216 }
193 217
218 bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
219 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
194 /* Add new node and rebalance tree. */ 220 /* Add new node and rebalance tree. */
195 rb_link_node(&(persistent_gnt->node), parent, new); 221 rb_link_node(&(persistent_gnt->node), parent, new);
196 rb_insert_color(&(persistent_gnt->node), root); 222 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
223 blkif->persistent_gnt_c++;
224 atomic_inc(&blkif->persistent_gnt_in_use);
225 return 0;
197} 226}
198 227
199static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, 228static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
200 grant_ref_t gref) 229 grant_ref_t gref)
201{ 230{
202 struct persistent_gnt *data; 231 struct persistent_gnt *data;
203 struct rb_node *node = root->rb_node; 232 struct rb_node *node = NULL;
204 233
234 node = blkif->persistent_gnts.rb_node;
205 while (node) { 235 while (node) {
206 data = container_of(node, struct persistent_gnt, node); 236 data = container_of(node, struct persistent_gnt, node);
207 237
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
209 node = node->rb_left; 239 node = node->rb_left;
210 else if (gref > data->gnt) 240 else if (gref > data->gnt)
211 node = node->rb_right; 241 node = node->rb_right;
212 else 242 else {
243 if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
244 pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
245 return NULL;
246 }
247 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
248 atomic_inc(&blkif->persistent_gnt_in_use);
213 return data; 249 return data;
250 }
214 } 251 }
215 return NULL; 252 return NULL;
216} 253}
217 254
218static void free_persistent_gnts(struct rb_root *root, unsigned int num) 255static void put_persistent_gnt(struct xen_blkif *blkif,
256 struct persistent_gnt *persistent_gnt)
257{
258 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
259 pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
260 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
261 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
262 atomic_dec(&blkif->persistent_gnt_in_use);
263}
264
265static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
266 unsigned int num)
219{ 267{
220 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 268 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
221 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 269 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
240 ret = gnttab_unmap_refs(unmap, NULL, pages, 288 ret = gnttab_unmap_refs(unmap, NULL, pages,
241 segs_to_unmap); 289 segs_to_unmap);
242 BUG_ON(ret); 290 BUG_ON(ret);
243 free_xenballooned_pages(segs_to_unmap, pages); 291 put_free_pages(blkif, pages, segs_to_unmap);
244 segs_to_unmap = 0; 292 segs_to_unmap = 0;
245 } 293 }
246 294
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
251 BUG_ON(num != 0); 299 BUG_ON(num != 0);
252} 300}
253 301
302static void unmap_purged_grants(struct work_struct *work)
303{
304 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
305 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
306 struct persistent_gnt *persistent_gnt;
307 int ret, segs_to_unmap = 0;
308 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
309
310 while(!list_empty(&blkif->persistent_purge_list)) {
311 persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
312 struct persistent_gnt,
313 remove_node);
314 list_del(&persistent_gnt->remove_node);
315
316 gnttab_set_unmap_op(&unmap[segs_to_unmap],
317 vaddr(persistent_gnt->page),
318 GNTMAP_host_map,
319 persistent_gnt->handle);
320
321 pages[segs_to_unmap] = persistent_gnt->page;
322
323 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
324 ret = gnttab_unmap_refs(unmap, NULL, pages,
325 segs_to_unmap);
326 BUG_ON(ret);
327 put_free_pages(blkif, pages, segs_to_unmap);
328 segs_to_unmap = 0;
329 }
330 kfree(persistent_gnt);
331 }
332 if (segs_to_unmap > 0) {
333 ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
334 BUG_ON(ret);
335 put_free_pages(blkif, pages, segs_to_unmap);
336 }
337}
338
339static void purge_persistent_gnt(struct xen_blkif *blkif)
340{
341 struct persistent_gnt *persistent_gnt;
342 struct rb_node *n;
343 unsigned int num_clean, total;
344 bool scan_used = false, clean_used = false;
345 struct rb_root *root;
346
347 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
348 (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
349 !blkif->vbd.overflow_max_grants)) {
350 return;
351 }
352
353 if (work_pending(&blkif->persistent_purge_work)) {
354 pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
355 return;
356 }
357
358 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
359 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
360 num_clean = min(blkif->persistent_gnt_c, num_clean);
361 if ((num_clean == 0) ||
362 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
363 return;
364
365 /*
366 * At this point, we can assure that there will be no calls
367 * to get_persistent_grant (because we are executing this code from
368 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
369 * which means that the number of currently used grants will go down,
370 * but never up, so we will always be able to remove the requested
371 * number of grants.
372 */
373
374 total = num_clean;
375
376 pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
377
378 INIT_LIST_HEAD(&blkif->persistent_purge_list);
379 root = &blkif->persistent_gnts;
380purge_list:
381 foreach_grant_safe(persistent_gnt, n, root, node) {
382 BUG_ON(persistent_gnt->handle ==
383 BLKBACK_INVALID_HANDLE);
384
385 if (clean_used) {
386 clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
387 continue;
388 }
389
390 if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
391 continue;
392 if (!scan_used &&
393 (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
394 continue;
395
396 rb_erase(&persistent_gnt->node, root);
397 list_add(&persistent_gnt->remove_node,
398 &blkif->persistent_purge_list);
399 if (--num_clean == 0)
400 goto finished;
401 }
402 /*
403 * If we get here it means we also need to start cleaning
404 * grants that were used since last purge in order to cope
405 * with the requested num
406 */
407 if (!scan_used && !clean_used) {
408 pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
409 scan_used = true;
410 goto purge_list;
411 }
412finished:
413 if (!clean_used) {
414 pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
415 clean_used = true;
416 goto purge_list;
417 }
418
419 blkif->persistent_gnt_c -= (total - num_clean);
420 blkif->vbd.overflow_max_grants = 0;
421
422 /* We can defer this work */
423 INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
424 schedule_work(&blkif->persistent_purge_work);
425 pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
426 return;
427}
428
254/* 429/*
255 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 430 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
256 */ 431 */
257static struct pending_req *alloc_req(void) 432static struct pending_req *alloc_req(struct xen_blkif *blkif)
258{ 433{
259 struct pending_req *req = NULL; 434 struct pending_req *req = NULL;
260 unsigned long flags; 435 unsigned long flags;
261 436
262 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 437 spin_lock_irqsave(&blkif->pending_free_lock, flags);
263 if (!list_empty(&blkbk->pending_free)) { 438 if (!list_empty(&blkif->pending_free)) {
264 req = list_entry(blkbk->pending_free.next, struct pending_req, 439 req = list_entry(blkif->pending_free.next, struct pending_req,
265 free_list); 440 free_list);
266 list_del(&req->free_list); 441 list_del(&req->free_list);
267 } 442 }
268 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 443 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
269 return req; 444 return req;
270} 445}
271 446
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
273 * Return the 'pending_req' structure back to the freepool. We also 448 * Return the 'pending_req' structure back to the freepool. We also
274 * wake up the thread if it was waiting for a free page. 449 * wake up the thread if it was waiting for a free page.
275 */ 450 */
276static void free_req(struct pending_req *req) 451static void free_req(struct xen_blkif *blkif, struct pending_req *req)
277{ 452{
278 unsigned long flags; 453 unsigned long flags;
279 int was_empty; 454 int was_empty;
280 455
281 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 456 spin_lock_irqsave(&blkif->pending_free_lock, flags);
282 was_empty = list_empty(&blkbk->pending_free); 457 was_empty = list_empty(&blkif->pending_free);
283 list_add(&req->free_list, &blkbk->pending_free); 458 list_add(&req->free_list, &blkif->pending_free);
284 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 459 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
285 if (was_empty) 460 if (was_empty)
286 wake_up(&blkbk->pending_free_wq); 461 wake_up(&blkif->pending_free_wq);
287} 462}
288 463
289/* 464/*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
382static void print_stats(struct xen_blkif *blkif) 557static void print_stats(struct xen_blkif *blkif)
383{ 558{
384 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 559 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
385 " | ds %4llu\n", 560 " | ds %4llu | pg: %4u/%4d\n",
386 current->comm, blkif->st_oo_req, 561 current->comm, blkif->st_oo_req,
387 blkif->st_rd_req, blkif->st_wr_req, 562 blkif->st_rd_req, blkif->st_wr_req,
388 blkif->st_f_req, blkif->st_ds_req); 563 blkif->st_f_req, blkif->st_ds_req,
564 blkif->persistent_gnt_c,
565 xen_blkif_max_pgrants);
389 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 566 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
390 blkif->st_rd_req = 0; 567 blkif->st_rd_req = 0;
391 blkif->st_wr_req = 0; 568 blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
397{ 574{
398 struct xen_blkif *blkif = arg; 575 struct xen_blkif *blkif = arg;
399 struct xen_vbd *vbd = &blkif->vbd; 576 struct xen_vbd *vbd = &blkif->vbd;
577 unsigned long timeout;
578 int ret;
400 579
401 xen_blkif_get(blkif); 580 xen_blkif_get(blkif);
402 581
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
406 if (unlikely(vbd->size != vbd_sz(vbd))) 585 if (unlikely(vbd->size != vbd_sz(vbd)))
407 xen_vbd_resize(blkif); 586 xen_vbd_resize(blkif);
408 587
409 wait_event_interruptible( 588 timeout = msecs_to_jiffies(LRU_INTERVAL);
589
590 timeout = wait_event_interruptible_timeout(
410 blkif->wq, 591 blkif->wq,
411 blkif->waiting_reqs || kthread_should_stop()); 592 blkif->waiting_reqs || kthread_should_stop(),
412 wait_event_interruptible( 593 timeout);
413 blkbk->pending_free_wq, 594 if (timeout == 0)
414 !list_empty(&blkbk->pending_free) || 595 goto purge_gnt_list;
415 kthread_should_stop()); 596 timeout = wait_event_interruptible_timeout(
597 blkif->pending_free_wq,
598 !list_empty(&blkif->pending_free) ||
599 kthread_should_stop(),
600 timeout);
601 if (timeout == 0)
602 goto purge_gnt_list;
416 603
417 blkif->waiting_reqs = 0; 604 blkif->waiting_reqs = 0;
418 smp_mb(); /* clear flag *before* checking for work */ 605 smp_mb(); /* clear flag *before* checking for work */
419 606
420 if (do_block_io_op(blkif)) 607 ret = do_block_io_op(blkif);
608 if (ret > 0)
421 blkif->waiting_reqs = 1; 609 blkif->waiting_reqs = 1;
610 if (ret == -EACCES)
611 wait_event_interruptible(blkif->shutdown_wq,
612 kthread_should_stop());
613
614purge_gnt_list:
615 if (blkif->vbd.feature_gnt_persistent &&
616 time_after(jiffies, blkif->next_lru)) {
617 purge_persistent_gnt(blkif);
618 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
619 }
620
621 /* Shrink if we have more than xen_blkif_max_buffer_pages */
622 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
422 623
423 if (log_stats && time_after(jiffies, blkif->st_print)) 624 if (log_stats && time_after(jiffies, blkif->st_print))
424 print_stats(blkif); 625 print_stats(blkif);
425 } 626 }
426 627
628 /* Since we are shutting down remove all pages from the buffer */
629 shrink_free_pagepool(blkif, 0 /* All */);
630
427 /* Free all persistent grant pages */ 631 /* Free all persistent grant pages */
428 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 632 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
429 free_persistent_gnts(&blkif->persistent_gnts, 633 free_persistent_gnts(blkif, &blkif->persistent_gnts,
430 blkif->persistent_gnt_c); 634 blkif->persistent_gnt_c);
431 635
432 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 636 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
441 return 0; 645 return 0;
442} 646}
443 647
444struct seg_buf {
445 unsigned int offset;
446 unsigned int nsec;
447};
448/* 648/*
449 * Unmap the grant references, and also remove the M2P over-rides 649 * Unmap the grant references, and also remove the M2P over-rides
450 * used in the 'pending_req'. 650 * used in the 'pending_req'.
451 */ 651 */
452static void xen_blkbk_unmap(struct pending_req *req) 652static void xen_blkbk_unmap(struct xen_blkif *blkif,
653 struct grant_page *pages[],
654 int num)
453{ 655{
454 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 656 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
455 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 657 struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
456 unsigned int i, invcount = 0; 658 unsigned int i, invcount = 0;
457 grant_handle_t handle;
458 int ret; 659 int ret;
459 660
460 for (i = 0; i < req->nr_pages; i++) { 661 for (i = 0; i < num; i++) {
461 if (!test_bit(i, req->unmap_seg)) 662 if (pages[i]->persistent_gnt != NULL) {
663 put_persistent_gnt(blkif, pages[i]->persistent_gnt);
462 continue; 664 continue;
463 handle = pending_handle(req, i); 665 }
464 if (handle == BLKBACK_INVALID_HANDLE) 666 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
465 continue; 667 continue;
466 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), 668 unmap_pages[invcount] = pages[i]->page;
467 GNTMAP_host_map, handle); 669 gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
468 pending_handle(req, i) = BLKBACK_INVALID_HANDLE; 670 GNTMAP_host_map, pages[i]->handle);
469 pages[invcount] = virt_to_page(vaddr(req, i)); 671 pages[i]->handle = BLKBACK_INVALID_HANDLE;
470 invcount++; 672 if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
673 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
674 invcount);
675 BUG_ON(ret);
676 put_free_pages(blkif, unmap_pages, invcount);
677 invcount = 0;
678 }
679 }
680 if (invcount) {
681 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
682 BUG_ON(ret);
683 put_free_pages(blkif, unmap_pages, invcount);
471 } 684 }
472
473 ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
474 BUG_ON(ret);
475} 685}
476 686
477static int xen_blkbk_map(struct blkif_request *req, 687static int xen_blkbk_map(struct xen_blkif *blkif,
478 struct pending_req *pending_req, 688 struct grant_page *pages[],
479 struct seg_buf seg[], 689 int num, bool ro)
480 struct page *pages[])
481{ 690{
482 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 691 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
483 struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
484 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 692 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
485 struct persistent_gnt *persistent_gnt = NULL; 693 struct persistent_gnt *persistent_gnt = NULL;
486 struct xen_blkif *blkif = pending_req->blkif;
487 phys_addr_t addr = 0; 694 phys_addr_t addr = 0;
488 int i, j; 695 int i, seg_idx, new_map_idx;
489 bool new_map;
490 int nseg = req->u.rw.nr_segments;
491 int segs_to_map = 0; 696 int segs_to_map = 0;
492 int ret = 0; 697 int ret = 0;
698 int last_map = 0, map_until = 0;
493 int use_persistent_gnts; 699 int use_persistent_gnts;
494 700
495 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 701 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
496 702
497 BUG_ON(blkif->persistent_gnt_c >
498 max_mapped_grant_pages(pending_req->blkif->blk_protocol));
499
500 /* 703 /*
501 * Fill out preq.nr_sects with proper amount of sectors, and setup 704 * Fill out preq.nr_sects with proper amount of sectors, and setup
502 * assign map[..] with the PFN of the page in our domain with the 705 * assign map[..] with the PFN of the page in our domain with the
503 * corresponding grant reference for each page. 706 * corresponding grant reference for each page.
504 */ 707 */
505 for (i = 0; i < nseg; i++) { 708again:
709 for (i = map_until; i < num; i++) {
506 uint32_t flags; 710 uint32_t flags;
507 711
508 if (use_persistent_gnts) 712 if (use_persistent_gnts)
509 persistent_gnt = get_persistent_gnt( 713 persistent_gnt = get_persistent_gnt(
510 &blkif->persistent_gnts, 714 blkif,
511 req->u.rw.seg[i].gref); 715 pages[i]->gref);
512 716
513 if (persistent_gnt) { 717 if (persistent_gnt) {
514 /* 718 /*
515 * We are using persistent grants and 719 * We are using persistent grants and
516 * the grant is already mapped 720 * the grant is already mapped
517 */ 721 */
518 new_map = false; 722 pages[i]->page = persistent_gnt->page;
519 } else if (use_persistent_gnts && 723 pages[i]->persistent_gnt = persistent_gnt;
520 blkif->persistent_gnt_c <
521 max_mapped_grant_pages(blkif->blk_protocol)) {
522 /*
523 * We are using persistent grants, the grant is
524 * not mapped but we have room for it
525 */
526 new_map = true;
527 persistent_gnt = kmalloc(
528 sizeof(struct persistent_gnt),
529 GFP_KERNEL);
530 if (!persistent_gnt)
531 return -ENOMEM;
532 if (alloc_xenballooned_pages(1, &persistent_gnt->page,
533 false)) {
534 kfree(persistent_gnt);
535 return -ENOMEM;
536 }
537 persistent_gnt->gnt = req->u.rw.seg[i].gref;
538 persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
539
540 pages_to_gnt[segs_to_map] =
541 persistent_gnt->page;
542 addr = (unsigned long) pfn_to_kaddr(
543 page_to_pfn(persistent_gnt->page));
544
545 add_persistent_gnt(&blkif->persistent_gnts,
546 persistent_gnt);
547 blkif->persistent_gnt_c++;
548 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
549 persistent_gnt->gnt, blkif->persistent_gnt_c,
550 max_mapped_grant_pages(blkif->blk_protocol));
551 } else { 724 } else {
552 /* 725 if (get_free_page(blkif, &pages[i]->page))
553 * We are either using persistent grants and 726 goto out_of_memory;
554 * hit the maximum limit of grants mapped, 727 addr = vaddr(pages[i]->page);
555 * or we are not using persistent grants. 728 pages_to_gnt[segs_to_map] = pages[i]->page;
556 */ 729 pages[i]->persistent_gnt = NULL;
557 if (use_persistent_gnts &&
558 !blkif->vbd.overflow_max_grants) {
559 blkif->vbd.overflow_max_grants = 1;
560 pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
561 blkif->domid, blkif->vbd.handle);
562 }
563 new_map = true;
564 pages[i] = blkbk->pending_page(pending_req, i);
565 addr = vaddr(pending_req, i);
566 pages_to_gnt[segs_to_map] =
567 blkbk->pending_page(pending_req, i);
568 }
569
570 if (persistent_gnt) {
571 pages[i] = persistent_gnt->page;
572 persistent_gnts[i] = persistent_gnt;
573 } else {
574 persistent_gnts[i] = NULL;
575 }
576
577 if (new_map) {
578 flags = GNTMAP_host_map; 730 flags = GNTMAP_host_map;
579 if (!persistent_gnt && 731 if (!use_persistent_gnts && ro)
580 (pending_req->operation != BLKIF_OP_READ))
581 flags |= GNTMAP_readonly; 732 flags |= GNTMAP_readonly;
582 gnttab_set_map_op(&map[segs_to_map++], addr, 733 gnttab_set_map_op(&map[segs_to_map++], addr,
583 flags, req->u.rw.seg[i].gref, 734 flags, pages[i]->gref,
584 blkif->domid); 735 blkif->domid);
585 } 736 }
737 map_until = i + 1;
738 if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
739 break;
586 } 740 }
587 741
588 if (segs_to_map) { 742 if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
595 * so that when we access vaddr(pending_req,i) it has the contents of 749 * so that when we access vaddr(pending_req,i) it has the contents of
596 * the page from the other domain. 750 * the page from the other domain.
597 */ 751 */
598 bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); 752 for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
599 for (i = 0, j = 0; i < nseg; i++) { 753 if (!pages[seg_idx]->persistent_gnt) {
600 if (!persistent_gnts[i] ||
601 persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
602 /* This is a newly mapped grant */ 754 /* This is a newly mapped grant */
603 BUG_ON(j >= segs_to_map); 755 BUG_ON(new_map_idx >= segs_to_map);
604 if (unlikely(map[j].status != 0)) { 756 if (unlikely(map[new_map_idx].status != 0)) {
605 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); 757 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
606 map[j].handle = BLKBACK_INVALID_HANDLE; 758 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
607 ret |= 1; 759 ret |= 1;
608 if (persistent_gnts[i]) { 760 goto next;
609 rb_erase(&persistent_gnts[i]->node,
610 &blkif->persistent_gnts);
611 blkif->persistent_gnt_c--;
612 kfree(persistent_gnts[i]);
613 persistent_gnts[i] = NULL;
614 }
615 } 761 }
762 pages[seg_idx]->handle = map[new_map_idx].handle;
763 } else {
764 continue;
616 } 765 }
617 if (persistent_gnts[i]) { 766 if (use_persistent_gnts &&
618 if (persistent_gnts[i]->handle == 767 blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
619 BLKBACK_INVALID_HANDLE) { 768 /*
769 * We are using persistent grants, the grant is
770 * not mapped but we might have room for it.
771 */
772 persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
773 GFP_KERNEL);
774 if (!persistent_gnt) {
620 /* 775 /*
621 * If this is a new persistent grant 776 * If we don't have enough memory to
622 * save the handler 777 * allocate the persistent_gnt struct
778 * map this grant non-persistenly
623 */ 779 */
624 persistent_gnts[i]->handle = map[j++].handle; 780 goto next;
625 } 781 }
626 pending_handle(pending_req, i) = 782 persistent_gnt->gnt = map[new_map_idx].ref;
627 persistent_gnts[i]->handle; 783 persistent_gnt->handle = map[new_map_idx].handle;
784 persistent_gnt->page = pages[seg_idx]->page;
785 if (add_persistent_gnt(blkif,
786 persistent_gnt)) {
787 kfree(persistent_gnt);
788 persistent_gnt = NULL;
789 goto next;
790 }
791 pages[seg_idx]->persistent_gnt = persistent_gnt;
792 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
793 persistent_gnt->gnt, blkif->persistent_gnt_c,
794 xen_blkif_max_pgrants);
795 goto next;
796 }
797 if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
798 blkif->vbd.overflow_max_grants = 1;
799 pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
800 blkif->domid, blkif->vbd.handle);
801 }
802 /*
803 * We could not map this grant persistently, so use it as
804 * a non-persistent grant.
805 */
806next:
807 new_map_idx++;
808 }
809 segs_to_map = 0;
810 last_map = map_until;
811 if (map_until != num)
812 goto again;
628 813
629 if (ret) 814 return ret;
630 continue; 815
631 } else { 816out_of_memory:
632 pending_handle(pending_req, i) = map[j++].handle; 817 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
633 bitmap_set(pending_req->unmap_seg, i, 1); 818 put_free_pages(blkif, pages_to_gnt, segs_to_map);
819 return -ENOMEM;
820}
821
822static int xen_blkbk_map_seg(struct pending_req *pending_req)
823{
824 int rc;
825
826 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
827 pending_req->nr_pages,
828 (pending_req->operation != BLKIF_OP_READ));
829
830 return rc;
831}
634 832
635 if (ret) 833static int xen_blkbk_parse_indirect(struct blkif_request *req,
636 continue; 834 struct pending_req *pending_req,
835 struct seg_buf seg[],
836 struct phys_req *preq)
837{
838 struct grant_page **pages = pending_req->indirect_pages;
839 struct xen_blkif *blkif = pending_req->blkif;
840 int indirect_grefs, rc, n, nseg, i;
841 struct blkif_request_segment_aligned *segments = NULL;
842
843 nseg = pending_req->nr_pages;
844 indirect_grefs = INDIRECT_PAGES(nseg);
845 BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
846
847 for (i = 0; i < indirect_grefs; i++)
848 pages[i]->gref = req->u.indirect.indirect_grefs[i];
849
850 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
851 if (rc)
852 goto unmap;
853
854 for (n = 0, i = 0; n < nseg; n++) {
855 if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
856 /* Map indirect segments */
857 if (segments)
858 kunmap_atomic(segments);
859 segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
860 }
861 i = n % SEGS_PER_INDIRECT_FRAME;
862 pending_req->segments[n]->gref = segments[i].gref;
863 seg[n].nsec = segments[i].last_sect -
864 segments[i].first_sect + 1;
865 seg[n].offset = (segments[i].first_sect << 9);
866 if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
867 (segments[i].last_sect < segments[i].first_sect)) {
868 rc = -EINVAL;
869 goto unmap;
637 } 870 }
638 seg[i].offset = (req->u.rw.seg[i].first_sect << 9); 871 preq->nr_sects += seg[n].nsec;
639 } 872 }
640 return ret; 873
874unmap:
875 if (segments)
876 kunmap_atomic(segments);
877 xen_blkbk_unmap(blkif, pages, indirect_grefs);
878 return rc;
641} 879}
642 880
643static int dispatch_discard_io(struct xen_blkif *blkif, 881static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
647 int status = BLKIF_RSP_OKAY; 885 int status = BLKIF_RSP_OKAY;
648 struct block_device *bdev = blkif->vbd.bdev; 886 struct block_device *bdev = blkif->vbd.bdev;
649 unsigned long secure; 887 unsigned long secure;
888 struct phys_req preq;
889
890 preq.sector_number = req->u.discard.sector_number;
891 preq.nr_sects = req->u.discard.nr_sectors;
650 892
893 err = xen_vbd_translate(&preq, blkif, WRITE);
894 if (err) {
895 pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
896 preq.sector_number,
897 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
898 goto fail_response;
899 }
651 blkif->st_ds_req++; 900 blkif->st_ds_req++;
652 901
653 xen_blkif_get(blkif); 902 xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
658 err = blkdev_issue_discard(bdev, req->u.discard.sector_number, 907 err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
659 req->u.discard.nr_sectors, 908 req->u.discard.nr_sectors,
660 GFP_KERNEL, secure); 909 GFP_KERNEL, secure);
661 910fail_response:
662 if (err == -EOPNOTSUPP) { 911 if (err == -EOPNOTSUPP) {
663 pr_debug(DRV_PFX "discard op failed, not supported\n"); 912 pr_debug(DRV_PFX "discard op failed, not supported\n");
664 status = BLKIF_RSP_EOPNOTSUPP; 913 status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
674 struct blkif_request *req, 923 struct blkif_request *req,
675 struct pending_req *pending_req) 924 struct pending_req *pending_req)
676{ 925{
677 free_req(pending_req); 926 free_req(blkif, pending_req);
678 make_response(blkif, req->u.other.id, req->operation, 927 make_response(blkif, req->u.other.id, req->operation,
679 BLKIF_RSP_EOPNOTSUPP); 928 BLKIF_RSP_EOPNOTSUPP);
680 return -EIO; 929 return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
726 * the proper response on the ring. 975 * the proper response on the ring.
727 */ 976 */
728 if (atomic_dec_and_test(&pending_req->pendcnt)) { 977 if (atomic_dec_and_test(&pending_req->pendcnt)) {
729 xen_blkbk_unmap(pending_req); 978 xen_blkbk_unmap(pending_req->blkif,
979 pending_req->segments,
980 pending_req->nr_pages);
730 make_response(pending_req->blkif, pending_req->id, 981 make_response(pending_req->blkif, pending_req->id,
731 pending_req->operation, pending_req->status); 982 pending_req->operation, pending_req->status);
732 xen_blkif_put(pending_req->blkif); 983 xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
734 if (atomic_read(&pending_req->blkif->drain)) 985 if (atomic_read(&pending_req->blkif->drain))
735 complete(&pending_req->blkif->drain_complete); 986 complete(&pending_req->blkif->drain_complete);
736 } 987 }
737 free_req(pending_req); 988 free_req(pending_req->blkif, pending_req);
738 } 989 }
739} 990}
740 991
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
767 rp = blk_rings->common.sring->req_prod; 1018 rp = blk_rings->common.sring->req_prod;
768 rmb(); /* Ensure we see queued requests up to 'rp'. */ 1019 rmb(); /* Ensure we see queued requests up to 'rp'. */
769 1020
1021 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1022 rc = blk_rings->common.rsp_prod_pvt;
1023 pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1024 rp, rc, rp - rc, blkif->vbd.pdevice);
1025 return -EACCES;
1026 }
770 while (rc != rp) { 1027 while (rc != rp) {
771 1028
772 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) 1029 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
777 break; 1034 break;
778 } 1035 }
779 1036
780 pending_req = alloc_req(); 1037 pending_req = alloc_req(blkif);
781 if (NULL == pending_req) { 1038 if (NULL == pending_req) {
782 blkif->st_oo_req++; 1039 blkif->st_oo_req++;
783 more_to_do = 1; 1040 more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
807 case BLKIF_OP_WRITE: 1064 case BLKIF_OP_WRITE:
808 case BLKIF_OP_WRITE_BARRIER: 1065 case BLKIF_OP_WRITE_BARRIER:
809 case BLKIF_OP_FLUSH_DISKCACHE: 1066 case BLKIF_OP_FLUSH_DISKCACHE:
1067 case BLKIF_OP_INDIRECT:
810 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1068 if (dispatch_rw_block_io(blkif, &req, pending_req))
811 goto done; 1069 goto done;
812 break; 1070 break;
813 case BLKIF_OP_DISCARD: 1071 case BLKIF_OP_DISCARD:
814 free_req(pending_req); 1072 free_req(blkif, pending_req);
815 if (dispatch_discard_io(blkif, &req)) 1073 if (dispatch_discard_io(blkif, &req))
816 goto done; 1074 goto done;
817 break; 1075 break;
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
853 struct pending_req *pending_req) 1111 struct pending_req *pending_req)
854{ 1112{
855 struct phys_req preq; 1113 struct phys_req preq;
856 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1114 struct seg_buf *seg = pending_req->seg;
857 unsigned int nseg; 1115 unsigned int nseg;
858 struct bio *bio = NULL; 1116 struct bio *bio = NULL;
859 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1117 struct bio **biolist = pending_req->biolist;
860 int i, nbio = 0; 1118 int i, nbio = 0;
861 int operation; 1119 int operation;
862 struct blk_plug plug; 1120 struct blk_plug plug;
863 bool drain = false; 1121 bool drain = false;
864 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1122 struct grant_page **pages = pending_req->segments;
1123 unsigned short req_operation;
1124
1125 req_operation = req->operation == BLKIF_OP_INDIRECT ?
1126 req->u.indirect.indirect_op : req->operation;
1127 if ((req->operation == BLKIF_OP_INDIRECT) &&
1128 (req_operation != BLKIF_OP_READ) &&
1129 (req_operation != BLKIF_OP_WRITE)) {
1130 pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
1131 req_operation);
1132 goto fail_response;
1133 }
865 1134
866 switch (req->operation) { 1135 switch (req_operation) {
867 case BLKIF_OP_READ: 1136 case BLKIF_OP_READ:
868 blkif->st_rd_req++; 1137 blkif->st_rd_req++;
869 operation = READ; 1138 operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
885 } 1154 }
886 1155
887 /* Check that the number of segments is sane. */ 1156 /* Check that the number of segments is sane. */
888 nseg = req->u.rw.nr_segments; 1157 nseg = req->operation == BLKIF_OP_INDIRECT ?
1158 req->u.indirect.nr_segments : req->u.rw.nr_segments;
889 1159
890 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || 1160 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
891 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 1161 unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1162 (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1163 unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1164 (nseg > MAX_INDIRECT_SEGMENTS))) {
892 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", 1165 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
893 nseg); 1166 nseg);
894 /* Haven't submitted any bio's yet. */ 1167 /* Haven't submitted any bio's yet. */
895 goto fail_response; 1168 goto fail_response;
896 } 1169 }
897 1170
898 preq.sector_number = req->u.rw.sector_number;
899 preq.nr_sects = 0; 1171 preq.nr_sects = 0;
900 1172
901 pending_req->blkif = blkif; 1173 pending_req->blkif = blkif;
902 pending_req->id = req->u.rw.id; 1174 pending_req->id = req->u.rw.id;
903 pending_req->operation = req->operation; 1175 pending_req->operation = req_operation;
904 pending_req->status = BLKIF_RSP_OKAY; 1176 pending_req->status = BLKIF_RSP_OKAY;
905 pending_req->nr_pages = nseg; 1177 pending_req->nr_pages = nseg;
906 1178
907 for (i = 0; i < nseg; i++) { 1179 if (req->operation != BLKIF_OP_INDIRECT) {
908 seg[i].nsec = req->u.rw.seg[i].last_sect - 1180 preq.dev = req->u.rw.handle;
909 req->u.rw.seg[i].first_sect + 1; 1181 preq.sector_number = req->u.rw.sector_number;
910 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || 1182 for (i = 0; i < nseg; i++) {
911 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) 1183 pages[i]->gref = req->u.rw.seg[i].gref;
1184 seg[i].nsec = req->u.rw.seg[i].last_sect -
1185 req->u.rw.seg[i].first_sect + 1;
1186 seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1187 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
1188 (req->u.rw.seg[i].last_sect <
1189 req->u.rw.seg[i].first_sect))
1190 goto fail_response;
1191 preq.nr_sects += seg[i].nsec;
1192 }
1193 } else {
1194 preq.dev = req->u.indirect.handle;
1195 preq.sector_number = req->u.indirect.sector_number;
1196 if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
912 goto fail_response; 1197 goto fail_response;
913 preq.nr_sects += seg[i].nsec;
914
915 } 1198 }
916 1199
917 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1200 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
948 * the hypercall to unmap the grants - that is all done in 1231 * the hypercall to unmap the grants - that is all done in
949 * xen_blkbk_unmap. 1232 * xen_blkbk_unmap.
950 */ 1233 */
951 if (xen_blkbk_map(req, pending_req, seg, pages)) 1234 if (xen_blkbk_map_seg(pending_req))
952 goto fail_flush; 1235 goto fail_flush;
953 1236
954 /* 1237 /*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
960 for (i = 0; i < nseg; i++) { 1243 for (i = 0; i < nseg; i++) {
961 while ((bio == NULL) || 1244 while ((bio == NULL) ||
962 (bio_add_page(bio, 1245 (bio_add_page(bio,
963 pages[i], 1246 pages[i]->page,
964 seg[i].nsec << 9, 1247 seg[i].nsec << 9,
965 seg[i].offset) == 0)) { 1248 seg[i].offset) == 0)) {
966 1249
967 bio = bio_alloc(GFP_KERNEL, nseg-i); 1250 int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
1251 bio = bio_alloc(GFP_KERNEL, nr_iovecs);
968 if (unlikely(bio == NULL)) 1252 if (unlikely(bio == NULL))
969 goto fail_put_bio; 1253 goto fail_put_bio;
970 1254
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1009 return 0; 1293 return 0;
1010 1294
1011 fail_flush: 1295 fail_flush:
1012 xen_blkbk_unmap(pending_req); 1296 xen_blkbk_unmap(blkif, pending_req->segments,
1297 pending_req->nr_pages);
1013 fail_response: 1298 fail_response:
1014 /* Haven't submitted any bio's yet. */ 1299 /* Haven't submitted any bio's yet. */
1015 make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); 1300 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1016 free_req(pending_req); 1301 free_req(blkif, pending_req);
1017 msleep(1); /* back off a bit */ 1302 msleep(1); /* back off a bit */
1018 return -EIO; 1303 return -EIO;
1019 1304
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1070 1355
1071static int __init xen_blkif_init(void) 1356static int __init xen_blkif_init(void)
1072{ 1357{
1073 int i, mmap_pages;
1074 int rc = 0; 1358 int rc = 0;
1075 1359
1076 if (!xen_domain()) 1360 if (!xen_domain())
1077 return -ENODEV; 1361 return -ENODEV;
1078 1362
1079 blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
1080 if (!blkbk) {
1081 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
1082 return -ENOMEM;
1083 }
1084
1085 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
1086
1087 blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
1088 xen_blkif_reqs, GFP_KERNEL);
1089 blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
1090 mmap_pages, GFP_KERNEL);
1091 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
1092 mmap_pages, GFP_KERNEL);
1093
1094 if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
1095 !blkbk->pending_pages) {
1096 rc = -ENOMEM;
1097 goto out_of_memory;
1098 }
1099
1100 for (i = 0; i < mmap_pages; i++) {
1101 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
1102 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
1103 if (blkbk->pending_pages[i] == NULL) {
1104 rc = -ENOMEM;
1105 goto out_of_memory;
1106 }
1107 }
1108 rc = xen_blkif_interface_init(); 1363 rc = xen_blkif_interface_init();
1109 if (rc) 1364 if (rc)
1110 goto failed_init; 1365 goto failed_init;
1111 1366
1112 INIT_LIST_HEAD(&blkbk->pending_free);
1113 spin_lock_init(&blkbk->pending_free_lock);
1114 init_waitqueue_head(&blkbk->pending_free_wq);
1115
1116 for (i = 0; i < xen_blkif_reqs; i++)
1117 list_add_tail(&blkbk->pending_reqs[i].free_list,
1118 &blkbk->pending_free);
1119
1120 rc = xen_blkif_xenbus_init(); 1367 rc = xen_blkif_xenbus_init();
1121 if (rc) 1368 if (rc)
1122 goto failed_init; 1369 goto failed_init;
1123 1370
1124 return 0;
1125
1126 out_of_memory:
1127 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
1128 failed_init: 1371 failed_init:
1129 kfree(blkbk->pending_reqs);
1130 kfree(blkbk->pending_grant_handles);
1131 if (blkbk->pending_pages) {
1132 for (i = 0; i < mmap_pages; i++) {
1133 if (blkbk->pending_pages[i])
1134 __free_page(blkbk->pending_pages[i]);
1135 }
1136 kfree(blkbk->pending_pages);
1137 }
1138 kfree(blkbk);
1139 blkbk = NULL;
1140 return rc; 1372 return rc;
1141} 1373}
1142 1374
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2517ba..8d8807563d99 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
50 __func__, __LINE__, ##args) 50 __func__, __LINE__, ##args)
51 51
52 52
53/*
54 * This is the maximum number of segments that would be allowed in indirect
55 * requests. This value will also be passed to the frontend.
56 */
57#define MAX_INDIRECT_SEGMENTS 256
58
59#define SEGS_PER_INDIRECT_FRAME \
60 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
61#define MAX_INDIRECT_PAGES \
62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
63#define INDIRECT_PAGES(_segs) \
64 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
65
53/* Not a real protocol. Used to generate ring structs which contain 66/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a 67 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can 68 * compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
83 uint64_t id; /* private guest value, echoed in resp */ 96 uint64_t id; /* private guest value, echoed in resp */
84} __attribute__((__packed__)); 97} __attribute__((__packed__));
85 98
99struct blkif_x86_32_request_indirect {
100 uint8_t indirect_op;
101 uint16_t nr_segments;
102 uint64_t id;
103 blkif_sector_t sector_number;
104 blkif_vdev_t handle;
105 uint16_t _pad1;
106 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
107 /*
108 * The maximum number of indirect segments (and pages) that will
 109 * be used is determined by MAX_INDIRECT_SEGMENTS; this value
110 * is also exported to the guest (via xenstore
111 * feature-max-indirect-segments entry), so the frontend knows how
112 * many indirect segments the backend supports.
113 */
114 uint64_t _pad2; /* make it 64 byte aligned */
115} __attribute__((__packed__));
116
86struct blkif_x86_32_request { 117struct blkif_x86_32_request {
87 uint8_t operation; /* BLKIF_OP_??? */ 118 uint8_t operation; /* BLKIF_OP_??? */
88 union { 119 union {
89 struct blkif_x86_32_request_rw rw; 120 struct blkif_x86_32_request_rw rw;
90 struct blkif_x86_32_request_discard discard; 121 struct blkif_x86_32_request_discard discard;
91 struct blkif_x86_32_request_other other; 122 struct blkif_x86_32_request_other other;
123 struct blkif_x86_32_request_indirect indirect;
92 } u; 124 } u;
93} __attribute__((__packed__)); 125} __attribute__((__packed__));
94 126
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
127 uint64_t id; /* private guest value, echoed in resp */ 159 uint64_t id; /* private guest value, echoed in resp */
128} __attribute__((__packed__)); 160} __attribute__((__packed__));
129 161
162struct blkif_x86_64_request_indirect {
163 uint8_t indirect_op;
164 uint16_t nr_segments;
165 uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
166 uint64_t id;
167 blkif_sector_t sector_number;
168 blkif_vdev_t handle;
169 uint16_t _pad2;
170 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
171 /*
172 * The maximum number of indirect segments (and pages) that will
 173 * be used is determined by MAX_INDIRECT_SEGMENTS; this value
174 * is also exported to the guest (via xenstore
175 * feature-max-indirect-segments entry), so the frontend knows how
176 * many indirect segments the backend supports.
177 */
178 uint32_t _pad3; /* make it 64 byte aligned */
179} __attribute__((__packed__));
180
130struct blkif_x86_64_request { 181struct blkif_x86_64_request {
131 uint8_t operation; /* BLKIF_OP_??? */ 182 uint8_t operation; /* BLKIF_OP_??? */
132 union { 183 union {
133 struct blkif_x86_64_request_rw rw; 184 struct blkif_x86_64_request_rw rw;
134 struct blkif_x86_64_request_discard discard; 185 struct blkif_x86_64_request_discard discard;
135 struct blkif_x86_64_request_other other; 186 struct blkif_x86_64_request_other other;
187 struct blkif_x86_64_request_indirect indirect;
136 } u; 188 } u;
137} __attribute__((__packed__)); 189} __attribute__((__packed__));
138 190
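
The inline comments on _pad1 and _pad3 spell out the layout the padding preserves for the 64-bit guest ABI. A compile-time check along these lines (a sketch only, not part of the patch; it would have to live inside some function, e.g. an init routine) would pin the documented invariant down:

	/* Sketch: enforce the offset documented in the _pad1 comment above. */
	BUILD_BUG_ON(offsetof(struct blkif_x86_64_request, u.indirect.id) != 8);
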
@@ -182,12 +234,26 @@ struct xen_vbd {
182 234
183struct backend_info; 235struct backend_info;
184 236
237/* Number of available flags */
238#define PERSISTENT_GNT_FLAGS_SIZE 2
239/* This persistent grant is currently in use */
240#define PERSISTENT_GNT_ACTIVE 0
241/*
 242 * This persistent grant has been used; this flag is set when we remove the
243 * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
244 */
245#define PERSISTENT_GNT_WAS_ACTIVE 1
246
247/* Number of requests that we can fit in a ring */
248#define XEN_BLKIF_REQS 32
185 249
186struct persistent_gnt { 250struct persistent_gnt {
187 struct page *page; 251 struct page *page;
188 grant_ref_t gnt; 252 grant_ref_t gnt;
189 grant_handle_t handle; 253 grant_handle_t handle;
254 DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
190 struct rb_node node; 255 struct rb_node node;
256 struct list_head remove_node;
191}; 257};
192 258
193struct xen_blkif { 259struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
219 /* tree to store persistent grants */ 285 /* tree to store persistent grants */
220 struct rb_root persistent_gnts; 286 struct rb_root persistent_gnts;
221 unsigned int persistent_gnt_c; 287 unsigned int persistent_gnt_c;
288 atomic_t persistent_gnt_in_use;
289 unsigned long next_lru;
290
 291 /* used by the kworker that offloads work from the persistent purge */
292 struct list_head persistent_purge_list;
293 struct work_struct persistent_purge_work;
294
295 /* buffer of free pages to map grant refs */
296 spinlock_t free_pages_lock;
297 int free_pages_num;
298 struct list_head free_pages;
299
300 /* List of all 'pending_req' available */
301 struct list_head pending_free;
302 /* And its spinlock. */
303 spinlock_t pending_free_lock;
304 wait_queue_head_t pending_free_wq;
222 305
223 /* statistics */ 306 /* statistics */
224 unsigned long st_print; 307 unsigned long st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
231 unsigned long long st_wr_sect; 314 unsigned long long st_wr_sect;
232 315
233 wait_queue_head_t waiting_to_free; 316 wait_queue_head_t waiting_to_free;
317 /* Thread shutdown wait queue. */
318 wait_queue_head_t shutdown_wq;
319};
320
321struct seg_buf {
322 unsigned long offset;
323 unsigned int nsec;
324};
325
326struct grant_page {
327 struct page *page;
328 struct persistent_gnt *persistent_gnt;
329 grant_handle_t handle;
330 grant_ref_t gref;
331};
332
333/*
334 * Each outstanding request that we've passed to the lower device layers has a
335 * 'pending_req' allocated to it. Each buffer_head that completes decrements
336 * the pendcnt towards zero. When it hits zero, the specified domain has a
337 * response queued for it, with the saved 'id' passed back.
338 */
339struct pending_req {
340 struct xen_blkif *blkif;
341 u64 id;
342 int nr_pages;
343 atomic_t pendcnt;
344 unsigned short operation;
345 int status;
346 struct list_head free_list;
347 struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
348 /* Indirect descriptors */
349 struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
350 struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
351 struct bio *biolist[MAX_INDIRECT_SEGMENTS];
234}; 352};
235 353
236 354
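
PERSISTENT_GNT_ACTIVE and PERSISTENT_GNT_WAS_ACTIVE give the backend a small clock-style LRU: the first marks a grant currently mapped into an in-flight request, the second records that it has been used since the last purge scan. A rough sketch of a purge-time decision built on those two bits (illustrative only; the real scan lives in blkback.c, which this hunk does not show, and it assumes linux/bitops.h):

/* Sketch: one LRU-style purge decision per grant, using the two flag
 * bits declared above. Grants that are neither in use nor recently
 * used are candidates for unmapping. */
static bool persistent_gnt_can_be_purged(struct persistent_gnt *gnt)
{
	if (test_bit(PERSISTENT_GNT_ACTIVE, gnt->flags))
		return false;	/* mapped into a request right now */
	if (test_and_clear_bit(PERSISTENT_GNT_WAS_ACTIVE, gnt->flags))
		return false;	/* used recently; give it another round */
	return true;		/* idle since the previous scan */
}
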
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
257 375
258irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 376irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
259int xen_blkif_schedule(void *arg); 377int xen_blkif_schedule(void *arg);
378int xen_blkif_purge_persistent(void *arg);
260 379
261int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 380int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
262 struct backend_info *be, int state); 381 struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
268static inline void blkif_get_x86_32_req(struct blkif_request *dst, 387static inline void blkif_get_x86_32_req(struct blkif_request *dst,
269 struct blkif_x86_32_request *src) 388 struct blkif_x86_32_request *src)
270{ 389{
271 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 390 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
272 dst->operation = src->operation; 391 dst->operation = src->operation;
273 switch (src->operation) { 392 switch (src->operation) {
274 case BLKIF_OP_READ: 393 case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
291 dst->u.discard.sector_number = src->u.discard.sector_number; 410 dst->u.discard.sector_number = src->u.discard.sector_number;
292 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 411 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
293 break; 412 break;
413 case BLKIF_OP_INDIRECT:
414 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
415 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
416 dst->u.indirect.handle = src->u.indirect.handle;
417 dst->u.indirect.id = src->u.indirect.id;
418 dst->u.indirect.sector_number = src->u.indirect.sector_number;
419 barrier();
420 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
421 for (i = 0; i < j; i++)
422 dst->u.indirect.indirect_grefs[i] =
423 src->u.indirect.indirect_grefs[i];
424 break;
294 default: 425 default:
295 /* 426 /*
296 * Don't know how to translate this op. Only get the 427 * Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
304static inline void blkif_get_x86_64_req(struct blkif_request *dst, 435static inline void blkif_get_x86_64_req(struct blkif_request *dst,
305 struct blkif_x86_64_request *src) 436 struct blkif_x86_64_request *src)
306{ 437{
307 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 438 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
308 dst->operation = src->operation; 439 dst->operation = src->operation;
309 switch (src->operation) { 440 switch (src->operation) {
310 case BLKIF_OP_READ: 441 case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
327 dst->u.discard.sector_number = src->u.discard.sector_number; 458 dst->u.discard.sector_number = src->u.discard.sector_number;
328 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 459 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
329 break; 460 break;
461 case BLKIF_OP_INDIRECT:
462 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
463 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
464 dst->u.indirect.handle = src->u.indirect.handle;
465 dst->u.indirect.id = src->u.indirect.id;
466 dst->u.indirect.sector_number = src->u.indirect.sector_number;
467 barrier();
468 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
469 for (i = 0; i < j; i++)
470 dst->u.indirect.indirect_grefs[i] =
471 src->u.indirect.indirect_grefs[i];
472 break;
330 default: 473 default:
331 /* 474 /*
332 * Don't know how to translate this op. Only get the 475 * Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 8bfd1bcf95ec..2e5b69d612ac 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
98 err = PTR_ERR(blkif->xenblkd); 98 err = PTR_ERR(blkif->xenblkd);
99 blkif->xenblkd = NULL; 99 blkif->xenblkd = NULL;
100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
101 return;
101 } 102 }
102} 103}
103 104
104static struct xen_blkif *xen_blkif_alloc(domid_t domid) 105static struct xen_blkif *xen_blkif_alloc(domid_t domid)
105{ 106{
106 struct xen_blkif *blkif; 107 struct xen_blkif *blkif;
108 struct pending_req *req, *n;
109 int i, j;
110
111 BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
107 112
108 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); 113 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
109 if (!blkif) 114 if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
118 blkif->st_print = jiffies; 123 blkif->st_print = jiffies;
119 init_waitqueue_head(&blkif->waiting_to_free); 124 init_waitqueue_head(&blkif->waiting_to_free);
120 blkif->persistent_gnts.rb_node = NULL; 125 blkif->persistent_gnts.rb_node = NULL;
126 spin_lock_init(&blkif->free_pages_lock);
127 INIT_LIST_HEAD(&blkif->free_pages);
128 blkif->free_pages_num = 0;
129 atomic_set(&blkif->persistent_gnt_in_use, 0);
130
131 INIT_LIST_HEAD(&blkif->pending_free);
132
133 for (i = 0; i < XEN_BLKIF_REQS; i++) {
134 req = kzalloc(sizeof(*req), GFP_KERNEL);
135 if (!req)
136 goto fail;
137 list_add_tail(&req->free_list,
138 &blkif->pending_free);
139 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
140 req->segments[j] = kzalloc(sizeof(*req->segments[0]),
141 GFP_KERNEL);
142 if (!req->segments[j])
143 goto fail;
144 }
145 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
146 req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
147 GFP_KERNEL);
148 if (!req->indirect_pages[j])
149 goto fail;
150 }
151 }
152 spin_lock_init(&blkif->pending_free_lock);
153 init_waitqueue_head(&blkif->pending_free_wq);
154 init_waitqueue_head(&blkif->shutdown_wq);
121 155
122 return blkif; 156 return blkif;
157
158fail:
159 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
160 list_del(&req->free_list);
161 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
162 if (!req->segments[j])
163 break;
164 kfree(req->segments[j]);
165 }
166 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
167 if (!req->indirect_pages[j])
168 break;
169 kfree(req->indirect_pages[j]);
170 }
171 kfree(req);
172 }
173
174 kmem_cache_free(xen_blkif_cachep, blkif);
175
176 return ERR_PTR(-ENOMEM);
123} 177}
124 178
125static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, 179static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
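
xen_blkif_alloc() now preallocates XEN_BLKIF_REQS pending_req structures per virtual block device and parks them on blkif->pending_free, so request bookkeeping no longer goes through a single shared pool. A sketch of the consumer side, i.e. how the backend can take one of these requests off the per-device list (the real helper lives in blkback.c and is not part of this hunk):

/* Sketch: take a preallocated request from the per-blkif free list. */
static struct pending_req *alloc_req(struct xen_blkif *blkif)
{
	struct pending_req *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&blkif->pending_free_lock, flags);
	if (!list_empty(&blkif->pending_free)) {
		req = list_first_entry(&blkif->pending_free,
				       struct pending_req, free_list);
		list_del(&req->free_list);
	}
	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
	return req;
}

Returning a request is the mirror image: list_add() under the same lock plus a wake_up() on blkif->pending_free_wq so any waiter on that queue can make progress.
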
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
178{ 232{
179 if (blkif->xenblkd) { 233 if (blkif->xenblkd) {
180 kthread_stop(blkif->xenblkd); 234 kthread_stop(blkif->xenblkd);
235 wake_up(&blkif->shutdown_wq);
181 blkif->xenblkd = NULL; 236 blkif->xenblkd = NULL;
182 } 237 }
183 238
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
198 253
199static void xen_blkif_free(struct xen_blkif *blkif) 254static void xen_blkif_free(struct xen_blkif *blkif)
200{ 255{
256 struct pending_req *req, *n;
257 int i = 0, j;
258
201 if (!atomic_dec_and_test(&blkif->refcnt)) 259 if (!atomic_dec_and_test(&blkif->refcnt))
202 BUG(); 260 BUG();
261
262 /* Check that there is no request in use */
263 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
264 list_del(&req->free_list);
265
266 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
267 kfree(req->segments[j]);
268
269 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
270 kfree(req->indirect_pages[j]);
271
272 kfree(req);
273 i++;
274 }
275
276 WARN_ON(i != XEN_BLKIF_REQS);
277
203 kmem_cache_free(xen_blkif_cachep, blkif); 278 kmem_cache_free(xen_blkif_cachep, blkif);
204} 279}
205 280
@@ -678,6 +753,11 @@ again:
678 dev->nodename); 753 dev->nodename);
679 goto abort; 754 goto abort;
680 } 755 }
756 err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
757 MAX_INDIRECT_SEGMENTS);
758 if (err)
759 dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
760 dev->nodename, err);
681 761
682 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", 762 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
683 (unsigned long long)vbd_sz(&be->blkif->vbd)); 763 (unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
704 dev->nodename); 784 dev->nodename);
705 goto abort; 785 goto abort;
706 } 786 }
787 err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
788 bdev_physical_block_size(be->blkif->vbd.bdev));
789 if (err)
790 xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
791 dev->nodename);
707 792
708 err = xenbus_transaction_end(xbt, 0); 793 err = xenbus_transaction_end(xbt, 0);
709 if (err == -EAGAIN) 794 if (err == -EAGAIN)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86220f4..a4660bbee8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
74struct blk_shadow { 74struct blk_shadow {
75 struct blkif_request req; 75 struct blkif_request req;
76 struct request *request; 76 struct request *request;
77 struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 77 struct grant **grants_used;
78 struct grant **indirect_grants;
79 struct scatterlist *sg;
80};
81
82struct split_bio {
83 struct bio *bio;
84 atomic_t pending;
85 int err;
78}; 86};
79 87
80static DEFINE_MUTEX(blkfront_mutex); 88static DEFINE_MUTEX(blkfront_mutex);
81static const struct block_device_operations xlvbd_block_fops; 89static const struct block_device_operations xlvbd_block_fops;
82 90
91/*
 92 * Maximum number of segments in indirect requests; the actual value used by
93 * the frontend driver is the minimum of this value and the value provided
94 * by the backend driver.
95 */
96
97static unsigned int xen_blkif_max_segments = 32;
98module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
99MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
100
83#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) 101#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
84 102
85/* 103/*
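
The 'max' module parameter only caps what the frontend will ask for; the limit actually used is the smaller of this value and the backend's feature-max-indirect-segments, falling back to BLKIF_MAX_SEGMENTS_PER_REQUEST when the backend offers no indirect support. This is what blkfront_setup_indirect() implements further down; a condensed sketch of the negotiation ('backend_max' is a hypothetical name for the value read from xenstore):

/* Sketch: effective per-request segment limit after negotiation. */
static unsigned int effective_max_segments(unsigned int backend_max)
{
	if (!backend_max)	/* no "feature-max-indirect-segments" key */
		return BLKIF_MAX_SEGMENTS_PER_REQUEST;
	return min(backend_max, xen_blkif_max_segments);
}
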
@@ -98,7 +116,6 @@ struct blkfront_info
98 enum blkif_state connected; 116 enum blkif_state connected;
99 int ring_ref; 117 int ring_ref;
100 struct blkif_front_ring ring; 118 struct blkif_front_ring ring;
101 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
102 unsigned int evtchn, irq; 119 unsigned int evtchn, irq;
103 struct request_queue *rq; 120 struct request_queue *rq;
104 struct work_struct work; 121 struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
114 unsigned int discard_granularity; 131 unsigned int discard_granularity;
115 unsigned int discard_alignment; 132 unsigned int discard_alignment;
116 unsigned int feature_persistent:1; 133 unsigned int feature_persistent:1;
134 unsigned int max_indirect_segments;
117 int is_ready; 135 int is_ready;
118}; 136};
119 137
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
142 160
143#define DEV_NAME "xvd" /* name in /dev */ 161#define DEV_NAME "xvd" /* name in /dev */
144 162
163#define SEGS_PER_INDIRECT_FRAME \
164 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
165#define INDIRECT_GREFS(_segs) \
166 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
167
168static int blkfront_setup_indirect(struct blkfront_info *info);
169
145static int get_id_from_freelist(struct blkfront_info *info) 170static int get_id_from_freelist(struct blkfront_info *info)
146{ 171{
147 unsigned long free = info->shadow_free; 172 unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
358 struct blkif_request *ring_req; 383 struct blkif_request *ring_req;
359 unsigned long id; 384 unsigned long id;
360 unsigned int fsect, lsect; 385 unsigned int fsect, lsect;
361 int i, ref; 386 int i, ref, n;
387 struct blkif_request_segment_aligned *segments = NULL;
362 388
363 /* 389 /*
364 * Used to store if we are able to queue the request by just using 390 * Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
369 grant_ref_t gref_head; 395 grant_ref_t gref_head;
370 struct grant *gnt_list_entry = NULL; 396 struct grant *gnt_list_entry = NULL;
371 struct scatterlist *sg; 397 struct scatterlist *sg;
398 int nseg, max_grefs;
372 399
373 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 400 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
374 return 1; 401 return 1;
375 402
376 /* Check if we have enought grants to allocate a requests */ 403 max_grefs = info->max_indirect_segments ?
377 if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { 404 info->max_indirect_segments +
405 INDIRECT_GREFS(info->max_indirect_segments) :
406 BLKIF_MAX_SEGMENTS_PER_REQUEST;
407
 408 /* Check if we have enough grants to allocate a request */
409 if (info->persistent_gnts_c < max_grefs) {
378 new_persistent_gnts = 1; 410 new_persistent_gnts = 1;
379 if (gnttab_alloc_grant_references( 411 if (gnttab_alloc_grant_references(
380 BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, 412 max_grefs - info->persistent_gnts_c,
381 &gref_head) < 0) { 413 &gref_head) < 0) {
382 gnttab_request_free_callback( 414 gnttab_request_free_callback(
383 &info->callback, 415 &info->callback,
384 blkif_restart_queue_callback, 416 blkif_restart_queue_callback,
385 info, 417 info,
386 BLKIF_MAX_SEGMENTS_PER_REQUEST); 418 max_grefs);
387 return 1; 419 return 1;
388 } 420 }
389 } else 421 } else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
394 id = get_id_from_freelist(info); 426 id = get_id_from_freelist(info);
395 info->shadow[id].request = req; 427 info->shadow[id].request = req;
396 428
397 ring_req->u.rw.id = id;
398 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
399 ring_req->u.rw.handle = info->handle;
400
401 ring_req->operation = rq_data_dir(req) ?
402 BLKIF_OP_WRITE : BLKIF_OP_READ;
403
404 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
405 /*
406 * Ideally we can do an unordered flush-to-disk. In case the
407 * backend onlysupports barriers, use that. A barrier request
408 * a superset of FUA, so we can implement it the same
409 * way. (It's also a FLUSH+FUA, since it is
410 * guaranteed ordered WRT previous writes.)
411 */
412 ring_req->operation = info->flush_op;
413 }
414
415 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { 429 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
416 /* id, sector_number and handle are set above. */
417 ring_req->operation = BLKIF_OP_DISCARD; 430 ring_req->operation = BLKIF_OP_DISCARD;
418 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 431 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
432 ring_req->u.discard.id = id;
433 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
419 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 434 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
420 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 435 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
421 else 436 else
422 ring_req->u.discard.flag = 0; 437 ring_req->u.discard.flag = 0;
423 } else { 438 } else {
424 ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, 439 BUG_ON(info->max_indirect_segments == 0 &&
425 info->sg); 440 req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
426 BUG_ON(ring_req->u.rw.nr_segments > 441 BUG_ON(info->max_indirect_segments &&
427 BLKIF_MAX_SEGMENTS_PER_REQUEST); 442 req->nr_phys_segments > info->max_indirect_segments);
428 443 nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
429 for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { 444 ring_req->u.rw.id = id;
445 if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
446 /*
447 * The indirect operation can only be a BLKIF_OP_READ or
448 * BLKIF_OP_WRITE
449 */
450 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
451 ring_req->operation = BLKIF_OP_INDIRECT;
452 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
453 BLKIF_OP_WRITE : BLKIF_OP_READ;
454 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
455 ring_req->u.indirect.handle = info->handle;
456 ring_req->u.indirect.nr_segments = nseg;
457 } else {
458 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
459 ring_req->u.rw.handle = info->handle;
460 ring_req->operation = rq_data_dir(req) ?
461 BLKIF_OP_WRITE : BLKIF_OP_READ;
462 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
463 /*
464 * Ideally we can do an unordered flush-to-disk. In case the
465 * backend onlysupports barriers, use that. A barrier request
466 * a superset of FUA, so we can implement it the same
467 * way. (It's also a FLUSH+FUA, since it is
468 * guaranteed ordered WRT previous writes.)
469 */
470 ring_req->operation = info->flush_op;
471 }
472 ring_req->u.rw.nr_segments = nseg;
473 }
474 for_each_sg(info->shadow[id].sg, sg, nseg, i) {
430 fsect = sg->offset >> 9; 475 fsect = sg->offset >> 9;
431 lsect = fsect + (sg->length >> 9) - 1; 476 lsect = fsect + (sg->length >> 9) - 1;
432 477
478 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
479 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
480 if (segments)
481 kunmap_atomic(segments);
482
483 n = i / SEGS_PER_INDIRECT_FRAME;
484 gnt_list_entry = get_grant(&gref_head, info);
485 info->shadow[id].indirect_grants[n] = gnt_list_entry;
486 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
487 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
488 }
489
433 gnt_list_entry = get_grant(&gref_head, info); 490 gnt_list_entry = get_grant(&gref_head, info);
434 ref = gnt_list_entry->gref; 491 ref = gnt_list_entry->gref;
435 492
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
441 498
442 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 499 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
443 500
444 shared_data = kmap_atomic( 501 shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
445 pfn_to_page(gnt_list_entry->pfn));
446 bvec_data = kmap_atomic(sg_page(sg)); 502 bvec_data = kmap_atomic(sg_page(sg));
447 503
448 /* 504 /*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
461 kunmap_atomic(bvec_data); 517 kunmap_atomic(bvec_data);
462 kunmap_atomic(shared_data); 518 kunmap_atomic(shared_data);
463 } 519 }
464 520 if (ring_req->operation != BLKIF_OP_INDIRECT) {
465 ring_req->u.rw.seg[i] = 521 ring_req->u.rw.seg[i] =
466 (struct blkif_request_segment) { 522 (struct blkif_request_segment) {
467 .gref = ref, 523 .gref = ref,
468 .first_sect = fsect, 524 .first_sect = fsect,
469 .last_sect = lsect }; 525 .last_sect = lsect };
526 } else {
527 n = i % SEGS_PER_INDIRECT_FRAME;
528 segments[n] =
529 (struct blkif_request_segment_aligned) {
530 .gref = ref,
531 .first_sect = fsect,
532 .last_sect = lsect };
533 }
470 } 534 }
535 if (segments)
536 kunmap_atomic(segments);
471 } 537 }
472 538
473 info->ring.req_prod_pvt++; 539 info->ring.req_prod_pvt++;
@@ -542,7 +608,9 @@ wait:
542 flush_requests(info); 608 flush_requests(info);
543} 609}
544 610
545static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 611static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
612 unsigned int physical_sector_size,
613 unsigned int segments)
546{ 614{
547 struct request_queue *rq; 615 struct request_queue *rq;
548 struct blkfront_info *info = gd->private_data; 616 struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
564 632
565 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 633 /* Hard sector size and max sectors impersonate the equiv. hardware. */
566 blk_queue_logical_block_size(rq, sector_size); 634 blk_queue_logical_block_size(rq, sector_size);
567 blk_queue_max_hw_sectors(rq, 512); 635 blk_queue_physical_block_size(rq, physical_sector_size);
636 blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
568 637
569 /* Each segment in a request is up to an aligned page in size. */ 638 /* Each segment in a request is up to an aligned page in size. */
570 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 639 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
571 blk_queue_max_segment_size(rq, PAGE_SIZE); 640 blk_queue_max_segment_size(rq, PAGE_SIZE);
572 641
573 /* Ensure a merged request will fit in a single I/O ring slot. */ 642 /* Ensure a merged request will fit in a single I/O ring slot. */
574 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 643 blk_queue_max_segments(rq, segments);
575 644
576 /* Make sure buffer addresses are sector-aligned. */ 645 /* Make sure buffer addresses are sector-aligned. */
577 blk_queue_dma_alignment(rq, 511); 646 blk_queue_dma_alignment(rq, 511);
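
With indirect descriptors the hardware-sector cap finally scales with the negotiated segment count. Assuming 4 KiB pages and 512-byte sectors: the old fixed blk_queue_max_hw_sectors(rq, 512) capped any request at 256 KiB, while the new (segments * PAGE_SIZE) / 512 yields 88 sectors (44 KiB) in the legacy 11-segment case and 2048 sectors (1 MiB) once 256 indirect segments have been negotiated.
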
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
588static void xlvbd_flush(struct blkfront_info *info) 657static void xlvbd_flush(struct blkfront_info *info)
589{ 658{
590 blk_queue_flush(info->rq, info->feature_flush); 659 blk_queue_flush(info->rq, info->feature_flush);
591 printk(KERN_INFO "blkfront: %s: %s: %s %s\n", 660 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
592 info->gd->disk_name, 661 info->gd->disk_name,
593 info->flush_op == BLKIF_OP_WRITE_BARRIER ? 662 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
594 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 663 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
595 "flush diskcache" : "barrier or flush"), 664 "flush diskcache" : "barrier or flush"),
596 info->feature_flush ? "enabled" : "disabled", 665 info->feature_flush ? "enabled;" : "disabled;",
597 info->feature_persistent ? "using persistent grants" : ""); 666 "persistent grants:",
667 info->feature_persistent ? "enabled;" : "disabled;",
668 "indirect descriptors:",
669 info->max_indirect_segments ? "enabled;" : "disabled;");
598} 670}
599 671
600static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 672static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
667 739
668static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 740static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
669 struct blkfront_info *info, 741 struct blkfront_info *info,
670 u16 vdisk_info, u16 sector_size) 742 u16 vdisk_info, u16 sector_size,
743 unsigned int physical_sector_size)
671{ 744{
672 struct gendisk *gd; 745 struct gendisk *gd;
673 int nr_minors = 1; 746 int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
734 gd->driverfs_dev = &(info->xbdev->dev); 807 gd->driverfs_dev = &(info->xbdev->dev);
735 set_capacity(gd, capacity); 808 set_capacity(gd, capacity);
736 809
737 if (xlvbd_init_blk_queue(gd, sector_size)) { 810 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
811 info->max_indirect_segments ? :
812 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
738 del_gendisk(gd); 813 del_gendisk(gd);
739 goto release; 814 goto release;
740 } 815 }
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
818{ 893{
819 struct grant *persistent_gnt; 894 struct grant *persistent_gnt;
820 struct grant *n; 895 struct grant *n;
896 int i, j, segs;
821 897
822 /* Prevent new requests being issued until we fix things up. */ 898 /* Prevent new requests being issued until we fix things up. */
823 spin_lock_irq(&info->io_lock); 899 spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
843 } 919 }
844 BUG_ON(info->persistent_gnts_c != 0); 920 BUG_ON(info->persistent_gnts_c != 0);
845 921
922 for (i = 0; i < BLK_RING_SIZE; i++) {
923 /*
924 * Clear persistent grants present in requests already
925 * on the shared ring
926 */
927 if (!info->shadow[i].request)
928 goto free_shadow;
929
930 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
931 info->shadow[i].req.u.indirect.nr_segments :
932 info->shadow[i].req.u.rw.nr_segments;
933 for (j = 0; j < segs; j++) {
934 persistent_gnt = info->shadow[i].grants_used[j];
935 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
936 __free_page(pfn_to_page(persistent_gnt->pfn));
937 kfree(persistent_gnt);
938 }
939
940 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
941 /*
942 * If this is not an indirect operation don't try to
943 * free indirect segments
944 */
945 goto free_shadow;
946
947 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
948 persistent_gnt = info->shadow[i].indirect_grants[j];
949 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
950 __free_page(pfn_to_page(persistent_gnt->pfn));
951 kfree(persistent_gnt);
952 }
953
954free_shadow:
955 kfree(info->shadow[i].grants_used);
956 info->shadow[i].grants_used = NULL;
957 kfree(info->shadow[i].indirect_grants);
958 info->shadow[i].indirect_grants = NULL;
959 kfree(info->shadow[i].sg);
960 info->shadow[i].sg = NULL;
961 }
962
846 /* No more gnttab callback work. */ 963 /* No more gnttab callback work. */
847 gnttab_cancel_free_callback(&info->callback); 964 gnttab_cancel_free_callback(&info->callback);
848 spin_unlock_irq(&info->io_lock); 965 spin_unlock_irq(&info->io_lock);
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
867 struct blkif_response *bret) 984 struct blkif_response *bret)
868{ 985{
869 int i = 0; 986 int i = 0;
870 struct bio_vec *bvec; 987 struct scatterlist *sg;
871 struct req_iterator iter;
872 unsigned long flags;
873 char *bvec_data; 988 char *bvec_data;
874 void *shared_data; 989 void *shared_data;
875 unsigned int offset = 0; 990 int nseg;
991
992 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
993 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
876 994
877 if (bret->operation == BLKIF_OP_READ) { 995 if (bret->operation == BLKIF_OP_READ) {
878 /* 996 /*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
881 * than PAGE_SIZE, we have to keep track of the current offset, 999 * than PAGE_SIZE, we have to keep track of the current offset,
882 * to be sure we are copying the data from the right shared page. 1000 * to be sure we are copying the data from the right shared page.
883 */ 1001 */
884 rq_for_each_segment(bvec, s->request, iter) { 1002 for_each_sg(s->sg, sg, nseg, i) {
885 BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); 1003 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
886 if (bvec->bv_offset < offset)
887 i++;
888 BUG_ON(i >= s->req.u.rw.nr_segments);
889 shared_data = kmap_atomic( 1004 shared_data = kmap_atomic(
890 pfn_to_page(s->grants_used[i]->pfn)); 1005 pfn_to_page(s->grants_used[i]->pfn));
891 bvec_data = bvec_kmap_irq(bvec, &flags); 1006 bvec_data = kmap_atomic(sg_page(sg));
892 memcpy(bvec_data, shared_data + bvec->bv_offset, 1007 memcpy(bvec_data + sg->offset,
893 bvec->bv_len); 1008 shared_data + sg->offset,
894 bvec_kunmap_irq(bvec_data, &flags); 1009 sg->length);
1010 kunmap_atomic(bvec_data);
895 kunmap_atomic(shared_data); 1011 kunmap_atomic(shared_data);
896 offset = bvec->bv_offset + bvec->bv_len;
897 } 1012 }
898 } 1013 }
899 /* Add the persistent grant into the list of free grants */ 1014 /* Add the persistent grant into the list of free grants */
900 for (i = 0; i < s->req.u.rw.nr_segments; i++) { 1015 for (i = 0; i < nseg; i++) {
901 list_add(&s->grants_used[i]->node, &info->persistent_gnts); 1016 list_add(&s->grants_used[i]->node, &info->persistent_gnts);
902 info->persistent_gnts_c++; 1017 info->persistent_gnts_c++;
903 } 1018 }
1019 if (s->req.operation == BLKIF_OP_INDIRECT) {
1020 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1021 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
1022 info->persistent_gnts_c++;
1023 }
1024 }
904} 1025}
905 1026
906static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1027static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
1034 SHARED_RING_INIT(sring); 1155 SHARED_RING_INIT(sring);
1035 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); 1156 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1036 1157
1037 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
1038
1039 /* Allocate memory for grants */
1040 err = fill_grant_buffer(info, BLK_RING_SIZE *
1041 BLKIF_MAX_SEGMENTS_PER_REQUEST);
1042 if (err)
1043 goto fail;
1044
1045 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); 1158 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1046 if (err < 0) { 1159 if (err < 0) {
1047 free_page((unsigned long)sring); 1160 free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
1223 return 0; 1336 return 0;
1224} 1337}
1225 1338
1339/*
1340 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1341 */
1342static void trim_bio(struct bio *bio, int offset, int size)
1343{
1344 /* 'bio' is a cloned bio which we need to trim to match
1345 * the given offset and size.
1346 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1347 */
1348 int i;
1349 struct bio_vec *bvec;
1350 int sofar = 0;
1351
1352 size <<= 9;
1353 if (offset == 0 && size == bio->bi_size)
1354 return;
1355
1356 bio->bi_sector += offset;
1357 bio->bi_size = size;
1358 offset <<= 9;
1359 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1360
1361 while (bio->bi_idx < bio->bi_vcnt &&
1362 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1363 /* remove this whole bio_vec */
1364 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1365 bio->bi_idx++;
1366 }
1367 if (bio->bi_idx < bio->bi_vcnt) {
1368 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1369 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
1370 }
1371 /* avoid any complications with bi_idx being non-zero*/
1372 if (bio->bi_idx) {
1373 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1374 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1375 bio->bi_vcnt -= bio->bi_idx;
1376 bio->bi_idx = 0;
1377 }
1378 /* Make sure vcnt and last bv are not too big */
1379 bio_for_each_segment(bvec, bio, i) {
1380 if (sofar + bvec->bv_len > size)
1381 bvec->bv_len = size - sofar;
1382 if (bvec->bv_len == 0) {
1383 bio->bi_vcnt = i;
1384 break;
1385 }
1386 sofar += bvec->bv_len;
1387 }
1388}
1389
1390static void split_bio_end(struct bio *bio, int error)
1391{
1392 struct split_bio *split_bio = bio->bi_private;
1393
1394 if (error)
1395 split_bio->err = error;
1396
1397 if (atomic_dec_and_test(&split_bio->pending)) {
1398 split_bio->bio->bi_phys_segments = 0;
1399 bio_endio(split_bio->bio, split_bio->err);
1400 kfree(split_bio);
1401 }
1402 bio_put(bio);
1403}
1226 1404
1227static int blkif_recover(struct blkfront_info *info) 1405static int blkif_recover(struct blkfront_info *info)
1228{ 1406{
1229 int i; 1407 int i;
1230 struct blkif_request *req; 1408 struct request *req, *n;
1231 struct blk_shadow *copy; 1409 struct blk_shadow *copy;
1232 int j; 1410 int rc;
1411 struct bio *bio, *cloned_bio;
1412 struct bio_list bio_list, merge_bio;
1413 unsigned int segs, offset;
1414 int pending, size;
1415 struct split_bio *split_bio;
1416 struct list_head requests;
1233 1417
1234 /* Stage 1: Make a safe copy of the shadow state. */ 1418 /* Stage 1: Make a safe copy of the shadow state. */
1235 copy = kmemdup(info->shadow, sizeof(info->shadow), 1419 copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
1244 info->shadow_free = info->ring.req_prod_pvt; 1428 info->shadow_free = info->ring.req_prod_pvt;
1245 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; 1429 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1246 1430
1247 /* Stage 3: Find pending requests and requeue them. */ 1431 rc = blkfront_setup_indirect(info);
1432 if (rc) {
1433 kfree(copy);
1434 return rc;
1435 }
1436
1437 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1438 blk_queue_max_segments(info->rq, segs);
1439 bio_list_init(&bio_list);
1440 INIT_LIST_HEAD(&requests);
1248 for (i = 0; i < BLK_RING_SIZE; i++) { 1441 for (i = 0; i < BLK_RING_SIZE; i++) {
1249 /* Not in use? */ 1442 /* Not in use? */
1250 if (!copy[i].request) 1443 if (!copy[i].request)
1251 continue; 1444 continue;
1252 1445
1253 /* Grab a request slot and copy shadow state into it. */ 1446 /*
1254 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 1447 * Get the bios in the request so we can re-queue them.
1255 *req = copy[i].req; 1448 */
1256 1449 if (copy[i].request->cmd_flags &
1257 /* We get a new request id, and must reset the shadow state. */ 1450 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1258 req->u.rw.id = get_id_from_freelist(info); 1451 /*
1259 memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i])); 1452 * Flush operations don't contain bios, so
1260 1453 * we need to requeue the whole request
1261 if (req->operation != BLKIF_OP_DISCARD) { 1454 */
1262 /* Rewrite any grant references invalidated by susp/resume. */ 1455 list_add(&copy[i].request->queuelist, &requests);
1263 for (j = 0; j < req->u.rw.nr_segments; j++) 1456 continue;
1264 gnttab_grant_foreign_access_ref(
1265 req->u.rw.seg[j].gref,
1266 info->xbdev->otherend_id,
1267 pfn_to_mfn(copy[i].grants_used[j]->pfn),
1268 0);
1269 } 1457 }
1270 info->shadow[req->u.rw.id].req = *req; 1458 merge_bio.head = copy[i].request->bio;
1271 1459 merge_bio.tail = copy[i].request->biotail;
1272 info->ring.req_prod_pvt++; 1460 bio_list_merge(&bio_list, &merge_bio);
1461 copy[i].request->bio = NULL;
1462 blk_put_request(copy[i].request);
1273 } 1463 }
1274 1464
1275 kfree(copy); 1465 kfree(copy);
1276 1466
1467 /*
 1468 * Empty the queue; this is important because we might have
1469 * requests in the queue with more segments than what we
1470 * can handle now.
1471 */
1472 spin_lock_irq(&info->io_lock);
1473 while ((req = blk_fetch_request(info->rq)) != NULL) {
1474 if (req->cmd_flags &
1475 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1476 list_add(&req->queuelist, &requests);
1477 continue;
1478 }
1479 merge_bio.head = req->bio;
1480 merge_bio.tail = req->biotail;
1481 bio_list_merge(&bio_list, &merge_bio);
1482 req->bio = NULL;
1483 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1484 pr_alert("diskcache flush request found!\n");
1485 __blk_put_request(info->rq, req);
1486 }
1487 spin_unlock_irq(&info->io_lock);
1488
1277 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1489 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1278 1490
1279 spin_lock_irq(&info->io_lock); 1491 spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
1281 /* Now safe for us to use the shared ring */ 1493 /* Now safe for us to use the shared ring */
1282 info->connected = BLKIF_STATE_CONNECTED; 1494 info->connected = BLKIF_STATE_CONNECTED;
1283 1495
1284 /* Send off requeued requests */
1285 flush_requests(info);
1286
1287 /* Kick any other new requests queued since we resumed */ 1496 /* Kick any other new requests queued since we resumed */
1288 kick_pending_request_queues(info); 1497 kick_pending_request_queues(info);
1289 1498
1499 list_for_each_entry_safe(req, n, &requests, queuelist) {
1500 /* Requeue pending requests (flush or discard) */
1501 list_del_init(&req->queuelist);
1502 BUG_ON(req->nr_phys_segments > segs);
1503 blk_requeue_request(info->rq, req);
1504 }
1290 spin_unlock_irq(&info->io_lock); 1505 spin_unlock_irq(&info->io_lock);
1291 1506
1507 while ((bio = bio_list_pop(&bio_list)) != NULL) {
1508 /* Traverse the list of pending bios and re-queue them */
1509 if (bio_segments(bio) > segs) {
1510 /*
1511 * This bio has more segments than what we can
 1512 * handle, so we have to split it.
1513 */
1514 pending = (bio_segments(bio) + segs - 1) / segs;
1515 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1516 BUG_ON(split_bio == NULL);
1517 atomic_set(&split_bio->pending, pending);
1518 split_bio->bio = bio;
1519 for (i = 0; i < pending; i++) {
1520 offset = (i * segs * PAGE_SIZE) >> 9;
1521 size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1522 (unsigned int)(bio->bi_size >> 9) - offset);
1523 cloned_bio = bio_clone(bio, GFP_NOIO);
1524 BUG_ON(cloned_bio == NULL);
1525 trim_bio(cloned_bio, offset, size);
1526 cloned_bio->bi_private = split_bio;
1527 cloned_bio->bi_end_io = split_bio_end;
1528 submit_bio(cloned_bio->bi_rw, cloned_bio);
1529 }
1530 /*
1531 * Now we have to wait for all those smaller bios to
1532 * end, so we can also end the "parent" bio.
1533 */
1534 continue;
1535 }
1536 /* We don't need to split this bio */
1537 submit_bio(bio->bi_rw, bio);
1538 }
1539
1292 return 0; 1540 return 0;
1293} 1541}
1294 1542
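
To see what this requeue path does with an oversized bio, consider a resume onto a backend without indirect support (segs = 11) and a 1 MiB bio made of 256 page-sized bio_vecs, assuming 4 KiB pages: pending = ceil(256 / 11) = 24 clones; clone i is trimmed to start at sector i * 88 (11 pages = 88 sectors) and to cover at most 88 sectors, and the last clone carries the remaining 2048 - 23 * 88 = 24 sectors. split_bio_end() then completes the original bio only after all 24 clones have finished.
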
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
1308 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 1556 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1309 1557
1310 err = talk_to_blkback(dev, info); 1558 err = talk_to_blkback(dev, info);
1311 if (info->connected == BLKIF_STATE_SUSPENDED && !err) 1559
1312 err = blkif_recover(info); 1560 /*
1561 * We have to wait for the backend to switch to
1562 * connected state, since we want to read which
1563 * features it supports.
1564 */
1313 1565
1314 return err; 1566 return err;
1315} 1567}
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1387 kfree(type); 1639 kfree(type);
1388} 1640}
1389 1641
1642static int blkfront_setup_indirect(struct blkfront_info *info)
1643{
1644 unsigned int indirect_segments, segs;
1645 int err, i;
1646
1647 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1648 "feature-max-indirect-segments", "%u", &indirect_segments,
1649 NULL);
1650 if (err) {
1651 info->max_indirect_segments = 0;
1652 segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1653 } else {
1654 info->max_indirect_segments = min(indirect_segments,
1655 xen_blkif_max_segments);
1656 segs = info->max_indirect_segments;
1657 }
1658
1659 err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1660 if (err)
1661 goto out_of_memory;
1662
1663 for (i = 0; i < BLK_RING_SIZE; i++) {
1664 info->shadow[i].grants_used = kzalloc(
1665 sizeof(info->shadow[i].grants_used[0]) * segs,
1666 GFP_NOIO);
1667 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
1668 if (info->max_indirect_segments)
1669 info->shadow[i].indirect_grants = kzalloc(
1670 sizeof(info->shadow[i].indirect_grants[0]) *
1671 INDIRECT_GREFS(segs),
1672 GFP_NOIO);
1673 if ((info->shadow[i].grants_used == NULL) ||
1674 (info->shadow[i].sg == NULL) ||
1675 (info->max_indirect_segments &&
1676 (info->shadow[i].indirect_grants == NULL)))
1677 goto out_of_memory;
1678 sg_init_table(info->shadow[i].sg, segs);
1679 }
1680
1681
1682 return 0;
1683
1684out_of_memory:
1685 for (i = 0; i < BLK_RING_SIZE; i++) {
1686 kfree(info->shadow[i].grants_used);
1687 info->shadow[i].grants_used = NULL;
1688 kfree(info->shadow[i].sg);
1689 info->shadow[i].sg = NULL;
1690 kfree(info->shadow[i].indirect_grants);
1691 info->shadow[i].indirect_grants = NULL;
1692 }
1693 return -ENOMEM;
1694}
1695
1390/* 1696/*
1391 * Invoked when the backend is finally 'ready' (and has told produced 1697 * Invoked when the backend is finally 'ready' (and has told produced
1392 * the details about the physical device - #sectors, size, etc). 1698 * the details about the physical device - #sectors, size, etc).
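
The fill_grant_buffer() call in blkfront_setup_indirect() sizes the persistent-grant pool for a full ring of maximal requests. With the usual 32-slot single-page ring and 4 KiB pages (both assumptions for the arithmetic): the non-indirect fallback needs (11 + INDIRECT_GREFS(11)) * 32 = 384 grant pages (about 1.5 MiB), the default negotiated limit of 32 segments needs (32 + 1) * 32 = 1056 (about 4 MiB), and a frontend raised to the backend maximum of 256 segments needs (256 + 1) * 32 = 8224 pages, roughly 32 MiB of granted guest memory.
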
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
1395{ 1701{
1396 unsigned long long sectors; 1702 unsigned long long sectors;
1397 unsigned long sector_size; 1703 unsigned long sector_size;
1704 unsigned int physical_sector_size;
1398 unsigned int binfo; 1705 unsigned int binfo;
1399 int err; 1706 int err;
1400 int barrier, flush, discard, persistent; 1707 int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
1414 set_capacity(info->gd, sectors); 1721 set_capacity(info->gd, sectors);
1415 revalidate_disk(info->gd); 1722 revalidate_disk(info->gd);
1416 1723
1417 /* fall through */ 1724 return;
1418 case BLKIF_STATE_SUSPENDED: 1725 case BLKIF_STATE_SUSPENDED:
1726 /*
1727 * If we are recovering from suspension, we need to wait
 1728 * for the backend to announce its features before
 1729 * reconnecting; at least we need to know if the backend
1730 * supports indirect descriptors, and how many.
1731 */
1732 blkif_recover(info);
1419 return; 1733 return;
1420 1734
1421 default: 1735 default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
1437 return; 1751 return;
1438 } 1752 }
1439 1753
1754 /*
 1755 * physical-sector-size is a newer field, so old backends may not
1756 * provide this. Assume physical sector size to be the same as
1757 * sector_size in that case.
1758 */
1759 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1760 "physical-sector-size", "%u", &physical_sector_size);
1761 if (err != 1)
1762 physical_sector_size = sector_size;
1763
1440 info->feature_flush = 0; 1764 info->feature_flush = 0;
1441 info->flush_op = 0; 1765 info->flush_op = 0;
1442 1766
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
1483 else 1807 else
1484 info->feature_persistent = persistent; 1808 info->feature_persistent = persistent;
1485 1809
1486 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1810 err = blkfront_setup_indirect(info);
1811 if (err) {
1812 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1813 info->xbdev->otherend);
1814 return;
1815 }
1816
1817 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
1818 physical_sector_size);
1487 if (err) { 1819 if (err) {
1488 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1820 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1489 info->xbdev->otherend); 1821 info->xbdev->otherend);
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ffd4652de91c..65e12099ef89 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t;
103#define BLKIF_OP_DISCARD 5 103#define BLKIF_OP_DISCARD 5
104 104
105/* 105/*
 106 * Recognized if "feature-max-indirect-segments" is present in the backend
107 * xenbus info. The "feature-max-indirect-segments" node contains the maximum
108 * number of segments allowed by the backend per request. If the node is
109 * present, the frontend might use blkif_request_indirect structs in order to
110 * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
111 * maximum number of indirect segments is fixed by the backend, but the
112 * frontend can issue requests with any number of indirect segments as long as
113 * it's less than the number provided by the backend. The indirect_grefs field
114 * in blkif_request_indirect should be filled by the frontend with the
115 * grant references of the pages that are holding the indirect segments.
 116 * These pages are filled with an array of blkif_request_segment_aligned
117 * that hold the information about the segments. The number of indirect
118 * pages to use is determined by the maximum number of segments
 119 * an indirect request contains. Every indirect page can contain a maximum
120 * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
121 * so to calculate the number of indirect pages to use we have to do
122 * ceil(indirect_segments/512).
123 *
124 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
125 * create the "feature-max-indirect-segments" node!
126 */
127#define BLKIF_OP_INDIRECT 6
128
129/*
106 * Maximum scatter/gather segments per request. 130 * Maximum scatter/gather segments per request.
107 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. 131 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
108 * NB. This could be 12 if the ring indexes weren't stored in the same page. 132 * NB. This could be 12 if the ring indexes weren't stored in the same page.
109 */ 133 */
110#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 134#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
111 135
136#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
137
138struct blkif_request_segment_aligned {
139 grant_ref_t gref; /* reference to I/O buffer frame */
140 /* @first_sect: first sector in frame to transfer (inclusive). */
141 /* @last_sect: last sector in frame to transfer (inclusive). */
142 uint8_t first_sect, last_sect;
143 uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
144} __attribute__((__packed__));
145
112struct blkif_request_rw { 146struct blkif_request_rw {
113 uint8_t nr_segments; /* number of segments */ 147 uint8_t nr_segments; /* number of segments */
114 blkif_vdev_t handle; /* only for read/write requests */ 148 blkif_vdev_t handle; /* only for read/write requests */
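
In absolute terms, and assuming 4 KiB pages: each indirect page holds 512 blkif_request_segment_aligned entries, and a request may reference up to BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST = 8 such pages, so the wire format can describe 8 * 512 * 4 KiB = 16 MiB per request. A backend advertising feature-max-indirect-segments = 256 restricts that to a single indirect page and 256 * 4 KiB = 1 MiB of data per request.
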
@@ -147,12 +181,31 @@ struct blkif_request_other {
147 uint64_t id; /* private guest value, echoed in resp */ 181 uint64_t id; /* private guest value, echoed in resp */
148} __attribute__((__packed__)); 182} __attribute__((__packed__));
149 183
184struct blkif_request_indirect {
185 uint8_t indirect_op;
186 uint16_t nr_segments;
187#ifdef CONFIG_X86_64
188 uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
189#endif
190 uint64_t id;
191 blkif_sector_t sector_number;
192 blkif_vdev_t handle;
193 uint16_t _pad2;
194 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
195#ifdef CONFIG_X86_64
196 uint32_t _pad3; /* make it 64 byte aligned */
197#else
198 uint64_t _pad3; /* make it 64 byte aligned */
199#endif
200} __attribute__((__packed__));
201
150struct blkif_request { 202struct blkif_request {
151 uint8_t operation; /* BLKIF_OP_??? */ 203 uint8_t operation; /* BLKIF_OP_??? */
152 union { 204 union {
153 struct blkif_request_rw rw; 205 struct blkif_request_rw rw;
154 struct blkif_request_discard discard; 206 struct blkif_request_discard discard;
155 struct blkif_request_other other; 207 struct blkif_request_other other;
208 struct blkif_request_indirect indirect;
156 } u; 209 } u;
157} __attribute__((__packed__)); 210} __attribute__((__packed__));
158 211
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index 75271b9a8f61..7d28aff605c7 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -188,6 +188,11 @@ struct __name##_back_ring { \
188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ 188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) 189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
190 190
191/* Ill-behaved frontend determination: Can there be this many requests? */
192#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \
193 (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
194
195
191#define RING_PUSH_REQUESTS(_r) do { \ 196#define RING_PUSH_REQUESTS(_r) do { \
192 wmb(); /* back sees requests /before/ updated producer index */ \ 197 wmb(); /* back sees requests /before/ updated producer index */ \
193 (_r)->sring->req_prod = (_r)->req_prod_pvt; \ 198 (_r)->sring->req_prod = (_r)->req_prod_pvt; \
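
RING_REQUEST_PROD_OVERFLOW lets a backend reject a producer index that claims more outstanding requests than the ring can hold, i.e. a misbehaving or compromised frontend. A sketch of how a backend request loop might apply it (hypothetical helper; the actual caller sits in the blkback dispatch code, which is not shown here):

/* Sketch: validate the frontend's producer index before consuming
 * requests. 'ring' is assumed to be the backend's native-protocol ring. */
static bool blkif_ring_prod_valid(struct blkif_back_ring *ring)
{
	RING_IDX prod = ring->sring->req_prod;

	rmb();	/* see the requests the producer index covers */
	return !RING_REQUEST_PROD_OVERFLOW(ring, prod);
}
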