diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/xen-blkback/blkback.c | 872 | ||||
-rw-r--r-- | drivers/block/xen-blkback/common.h | 147 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 85 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 532 |
4 files changed, 1214 insertions, 422 deletions
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fed97e9..bf4b9d282c04 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -50,110 +50,118 @@ | |||
50 | #include "common.h" | 50 | #include "common.h" |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * These are rather arbitrary. They are fairly large because adjacent requests | 53 | * Maximum number of unused free pages to keep in the internal buffer. |
54 | * pulled from a communication ring are quite likely to end up being part of | 54 | * Setting this to a value too low will reduce memory used in each backend, |
55 | * the same scatter/gather request at the disc. | 55 | * but can have a performance penalty. |
56 | * | 56 | * |
57 | * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** | 57 | * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can |
58 | * | 58 | * be set to a lower value that might degrade performance on some intensive |
59 | * This will increase the chances of being able to write whole tracks. | 59 | * IO workloads. |
60 | * 64 should be enough to keep us competitive with Linux. | ||
61 | */ | 60 | */ |
62 | static int xen_blkif_reqs = 64; | ||
63 | module_param_named(reqs, xen_blkif_reqs, int, 0); | ||
64 | MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); | ||
65 | 61 | ||
66 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | 62 | static int xen_blkif_max_buffer_pages = 1024; |
67 | static unsigned int log_stats; | 63 | module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); |
68 | module_param(log_stats, int, 0644); | 64 | MODULE_PARM_DESC(max_buffer_pages, |
65 | "Maximum number of free pages to keep in each block backend buffer"); | ||
69 | 66 | ||
70 | /* | 67 | /* |
71 | * Each outstanding request that we've passed to the lower device layers has a | 68 | * Maximum number of grants to map persistently in blkback. For maximum |
72 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | 69 | * performance this should be the total number of grants that can be used |
73 | * the pendcnt towards zero. When it hits zero, the specified domain has a | 70 | * to fill the ring, but since this might become too high, especially with |
74 | * response queued for it, with the saved 'id' passed back. | 71 | * the use of indirect descriptors, we set it to a value that provides good |
72 | * performance without using too much memory. | ||
73 | * | ||
74 | * When the list of persistent grants is full we clean it up using a LRU | ||
75 | * algorithm. | ||
75 | */ | 76 | */ |
76 | struct pending_req { | ||
77 | struct xen_blkif *blkif; | ||
78 | u64 id; | ||
79 | int nr_pages; | ||
80 | atomic_t pendcnt; | ||
81 | unsigned short operation; | ||
82 | int status; | ||
83 | struct list_head free_list; | ||
84 | DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
85 | }; | ||
86 | 77 | ||
87 | #define BLKBACK_INVALID_HANDLE (~0) | 78 | static int xen_blkif_max_pgrants = 1056; |
79 | module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); | ||
80 | MODULE_PARM_DESC(max_persistent_grants, | ||
81 | "Maximum number of grants to map persistently"); | ||
88 | 82 | ||
89 | struct xen_blkbk { | 83 | /* |
90 | struct pending_req *pending_reqs; | 84 | * The LRU mechanism to clean the lists of persistent grants needs to |
91 | /* List of all 'pending_req' available */ | 85 | * be executed periodically. The time interval between consecutive executions |
92 | struct list_head pending_free; | 86 | * of the purge mechanism is set in ms. |
93 | /* And its spinlock. */ | 87 | */ |
94 | spinlock_t pending_free_lock; | 88 | #define LRU_INTERVAL 100 |
95 | wait_queue_head_t pending_free_wq; | ||
96 | /* The list of all pages that are available. */ | ||
97 | struct page **pending_pages; | ||
98 | /* And the grant handles that are available. */ | ||
99 | grant_handle_t *pending_grant_handles; | ||
100 | }; | ||
101 | |||
102 | static struct xen_blkbk *blkbk; | ||
103 | 89 | ||
104 | /* | 90 | /* |
105 | * Maximum number of grant pages that can be mapped in blkback. | 91 | * When the persistent grants list is full we will remove unused grants |
106 | * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of | 92 | * from the list. This is the percentage of grants to be removed at each LRU |
107 | * pages that blkback will persistently map. | 93 | * execution. |
108 | * Currently, this is: | ||
109 | * RING_SIZE = 32 (for all known ring types) | ||
110 | * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 | ||
111 | * sizeof(struct persistent_gnt) = 48 | ||
112 | * So the maximum memory used to store the grants is: | ||
113 | * 32 * 11 * 48 = 16896 bytes | ||
114 | */ | 94 | */ |
115 | static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) | 95 | #define LRU_PERCENT_CLEAN 5 |
96 | |||
97 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | ||
98 | static unsigned int log_stats; | ||
99 | module_param(log_stats, int, 0644); | ||
100 | |||
101 | #define BLKBACK_INVALID_HANDLE (~0) | ||
102 | |||
103 | /* Number of free pages to remove on each call to free_xenballooned_pages */ | ||
104 | #define NUM_BATCH_FREE_PAGES 10 | ||
105 | |||
106 | static inline int get_free_page(struct xen_blkif *blkif, struct page **page) | ||
116 | { | 107 | { |
117 | switch (protocol) { | 108 | unsigned long flags; |
118 | case BLKIF_PROTOCOL_NATIVE: | 109 | |
119 | return __CONST_RING_SIZE(blkif, PAGE_SIZE) * | 110 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
120 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 111 | if (list_empty(&blkif->free_pages)) { |
121 | case BLKIF_PROTOCOL_X86_32: | 112 | BUG_ON(blkif->free_pages_num != 0); |
122 | return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * | 113 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); |
123 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 114 | return alloc_xenballooned_pages(1, page, false); |
124 | case BLKIF_PROTOCOL_X86_64: | ||
125 | return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * | ||
126 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
127 | default: | ||
128 | BUG(); | ||
129 | } | 115 | } |
116 | BUG_ON(blkif->free_pages_num == 0); | ||
117 | page[0] = list_first_entry(&blkif->free_pages, struct page, lru); | ||
118 | list_del(&page[0]->lru); | ||
119 | blkif->free_pages_num--; | ||
120 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
121 | |||
130 | return 0; | 122 | return 0; |
131 | } | 123 | } |
132 | 124 | ||
133 | 125 | static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, | |
134 | /* | 126 | int num) |
135 | * Little helpful macro to figure out the index and virtual address of the | ||
136 | * pending_pages[..]. For each 'pending_req' we have up to | ||
137 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through | ||
138 | * 10 and would index in the pending_pages[..]. | ||
139 | */ | ||
140 | static inline int vaddr_pagenr(struct pending_req *req, int seg) | ||
141 | { | 127 | { |
142 | return (req - blkbk->pending_reqs) * | 128 | unsigned long flags; |
143 | BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; | 129 | int i; |
144 | } | ||
145 | 130 | ||
146 | #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] | 131 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
132 | for (i = 0; i < num; i++) | ||
133 | list_add(&page[i]->lru, &blkif->free_pages); | ||
134 | blkif->free_pages_num += num; | ||
135 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
136 | } | ||
147 | 137 | ||
148 | static inline unsigned long vaddr(struct pending_req *req, int seg) | 138 | static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) |
149 | { | 139 | { |
150 | unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); | 140 | /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ |
151 | return (unsigned long)pfn_to_kaddr(pfn); | 141 | struct page *page[NUM_BATCH_FREE_PAGES]; |
152 | } | 142 | unsigned int num_pages = 0; |
143 | unsigned long flags; | ||
153 | 144 | ||
154 | #define pending_handle(_req, _seg) \ | 145 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
155 | (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) | 146 | while (blkif->free_pages_num > num) { |
147 | BUG_ON(list_empty(&blkif->free_pages)); | ||
148 | page[num_pages] = list_first_entry(&blkif->free_pages, | ||
149 | struct page, lru); | ||
150 | list_del(&page[num_pages]->lru); | ||
151 | blkif->free_pages_num--; | ||
152 | if (++num_pages == NUM_BATCH_FREE_PAGES) { | ||
153 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
154 | free_xenballooned_pages(num_pages, page); | ||
155 | spin_lock_irqsave(&blkif->free_pages_lock, flags); | ||
156 | num_pages = 0; | ||
157 | } | ||
158 | } | ||
159 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
160 | if (num_pages != 0) | ||
161 | free_xenballooned_pages(num_pages, page); | ||
162 | } | ||
156 | 163 | ||
164 | #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) | ||
157 | 165 | ||
158 | static int do_block_io_op(struct xen_blkif *blkif); | 166 | static int do_block_io_op(struct xen_blkif *blkif); |
159 | static int dispatch_rw_block_io(struct xen_blkif *blkif, | 167 | static int dispatch_rw_block_io(struct xen_blkif *blkif, |
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
170 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) | 178 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) |
171 | 179 | ||
172 | 180 | ||
173 | static void add_persistent_gnt(struct rb_root *root, | 181 | /* |
182 | * We don't need locking around the persistent grant helpers | ||
183 | * because blkback uses a single-thread for each backend, so we | ||
184 | * can be sure that these functions will never be called recursively. | ||
185 | * | ||
186 | * The only exception to that is put_persistent_gnt, which can be called | ||
187 | * from interrupt context (by xen_blkbk_unmap), so we have to use atomic | ||
188 | * bit operations to modify the flags of a persistent grant and to count | ||
189 | * the number of used grants. | ||
190 | */ | ||
191 | static int add_persistent_gnt(struct xen_blkif *blkif, | ||
174 | struct persistent_gnt *persistent_gnt) | 192 | struct persistent_gnt *persistent_gnt) |
175 | { | 193 | { |
176 | struct rb_node **new = &(root->rb_node), *parent = NULL; | 194 | struct rb_node **new = NULL, *parent = NULL; |
177 | struct persistent_gnt *this; | 195 | struct persistent_gnt *this; |
178 | 196 | ||
197 | if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { | ||
198 | if (!blkif->vbd.overflow_max_grants) | ||
199 | blkif->vbd.overflow_max_grants = 1; | ||
200 | return -EBUSY; | ||
201 | } | ||
179 | /* Figure out where to put new node */ | 202 | /* Figure out where to put new node */ |
203 | new = &blkif->persistent_gnts.rb_node; | ||
180 | while (*new) { | 204 | while (*new) { |
181 | this = container_of(*new, struct persistent_gnt, node); | 205 | this = container_of(*new, struct persistent_gnt, node); |
182 | 206 | ||
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root, | |||
186 | else if (persistent_gnt->gnt > this->gnt) | 210 | else if (persistent_gnt->gnt > this->gnt) |
187 | new = &((*new)->rb_right); | 211 | new = &((*new)->rb_right); |
188 | else { | 212 | else { |
189 | pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); | 213 | pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n"); |
190 | BUG(); | 214 | return -EINVAL; |
191 | } | 215 | } |
192 | } | 216 | } |
193 | 217 | ||
218 | bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
219 | set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
194 | /* Add new node and rebalance tree. */ | 220 | /* Add new node and rebalance tree. */ |
195 | rb_link_node(&(persistent_gnt->node), parent, new); | 221 | rb_link_node(&(persistent_gnt->node), parent, new); |
196 | rb_insert_color(&(persistent_gnt->node), root); | 222 | rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); |
223 | blkif->persistent_gnt_c++; | ||
224 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
225 | return 0; | ||
197 | } | 226 | } |
198 | 227 | ||
199 | static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | 228 | static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, |
200 | grant_ref_t gref) | 229 | grant_ref_t gref) |
201 | { | 230 | { |
202 | struct persistent_gnt *data; | 231 | struct persistent_gnt *data; |
203 | struct rb_node *node = root->rb_node; | 232 | struct rb_node *node = NULL; |
204 | 233 | ||
234 | node = blkif->persistent_gnts.rb_node; | ||
205 | while (node) { | 235 | while (node) { |
206 | data = container_of(node, struct persistent_gnt, node); | 236 | data = container_of(node, struct persistent_gnt, node); |
207 | 237 | ||
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | |||
209 | node = node->rb_left; | 239 | node = node->rb_left; |
210 | else if (gref > data->gnt) | 240 | else if (gref > data->gnt) |
211 | node = node->rb_right; | 241 | node = node->rb_right; |
212 | else | 242 | else { |
243 | if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) { | ||
244 | pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n"); | ||
245 | return NULL; | ||
246 | } | ||
247 | set_bit(PERSISTENT_GNT_ACTIVE, data->flags); | ||
248 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
213 | return data; | 249 | return data; |
250 | } | ||
214 | } | 251 | } |
215 | return NULL; | 252 | return NULL; |
216 | } | 253 | } |
217 | 254 | ||
218 | static void free_persistent_gnts(struct rb_root *root, unsigned int num) | 255 | static void put_persistent_gnt(struct xen_blkif *blkif, |
256 | struct persistent_gnt *persistent_gnt) | ||
257 | { | ||
258 | if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
259 | pr_alert_ratelimited(DRV_PFX " freeing a grant already unused"); | ||
260 | set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
261 | clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
262 | atomic_dec(&blkif->persistent_gnt_in_use); | ||
263 | } | ||
264 | |||
265 | static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, | ||
266 | unsigned int num) | ||
219 | { | 267 | { |
220 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 268 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
221 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 269 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
240 | ret = gnttab_unmap_refs(unmap, NULL, pages, | 288 | ret = gnttab_unmap_refs(unmap, NULL, pages, |
241 | segs_to_unmap); | 289 | segs_to_unmap); |
242 | BUG_ON(ret); | 290 | BUG_ON(ret); |
243 | free_xenballooned_pages(segs_to_unmap, pages); | 291 | put_free_pages(blkif, pages, segs_to_unmap); |
244 | segs_to_unmap = 0; | 292 | segs_to_unmap = 0; |
245 | } | 293 | } |
246 | 294 | ||
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
251 | BUG_ON(num != 0); | 299 | BUG_ON(num != 0); |
252 | } | 300 | } |
253 | 301 | ||
302 | static void unmap_purged_grants(struct work_struct *work) | ||
303 | { | ||
304 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
305 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
306 | struct persistent_gnt *persistent_gnt; | ||
307 | int ret, segs_to_unmap = 0; | ||
308 | struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); | ||
309 | |||
310 | while(!list_empty(&blkif->persistent_purge_list)) { | ||
311 | persistent_gnt = list_first_entry(&blkif->persistent_purge_list, | ||
312 | struct persistent_gnt, | ||
313 | remove_node); | ||
314 | list_del(&persistent_gnt->remove_node); | ||
315 | |||
316 | gnttab_set_unmap_op(&unmap[segs_to_unmap], | ||
317 | vaddr(persistent_gnt->page), | ||
318 | GNTMAP_host_map, | ||
319 | persistent_gnt->handle); | ||
320 | |||
321 | pages[segs_to_unmap] = persistent_gnt->page; | ||
322 | |||
323 | if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
324 | ret = gnttab_unmap_refs(unmap, NULL, pages, | ||
325 | segs_to_unmap); | ||
326 | BUG_ON(ret); | ||
327 | put_free_pages(blkif, pages, segs_to_unmap); | ||
328 | segs_to_unmap = 0; | ||
329 | } | ||
330 | kfree(persistent_gnt); | ||
331 | } | ||
332 | if (segs_to_unmap > 0) { | ||
333 | ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); | ||
334 | BUG_ON(ret); | ||
335 | put_free_pages(blkif, pages, segs_to_unmap); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | static void purge_persistent_gnt(struct xen_blkif *blkif) | ||
340 | { | ||
341 | struct persistent_gnt *persistent_gnt; | ||
342 | struct rb_node *n; | ||
343 | unsigned int num_clean, total; | ||
344 | bool scan_used = false, clean_used = false; | ||
345 | struct rb_root *root; | ||
346 | |||
347 | if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || | ||
348 | (blkif->persistent_gnt_c == xen_blkif_max_pgrants && | ||
349 | !blkif->vbd.overflow_max_grants)) { | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | if (work_pending(&blkif->persistent_purge_work)) { | ||
354 | pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n"); | ||
355 | return; | ||
356 | } | ||
357 | |||
358 | num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; | ||
359 | num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; | ||
360 | num_clean = min(blkif->persistent_gnt_c, num_clean); | ||
361 | if ((num_clean == 0) || | ||
362 | (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) | ||
363 | return; | ||
364 | |||
365 | /* | ||
366 | * At this point, we can be sure that there will be no calls | ||
367 | * to get_persistent_gnt (because we are executing this code from | ||
368 | * xen_blkif_schedule), there can only be calls to put_persistent_gnt, | ||
369 | * which means that the number of currently used grants will go down, | ||
370 | * but never up, so we will always be able to remove the requested | ||
371 | * number of grants. | ||
372 | */ | ||
373 | |||
374 | total = num_clean; | ||
375 | |||
376 | pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); | ||
377 | |||
378 | INIT_LIST_HEAD(&blkif->persistent_purge_list); | ||
379 | root = &blkif->persistent_gnts; | ||
380 | purge_list: | ||
381 | foreach_grant_safe(persistent_gnt, n, root, node) { | ||
382 | BUG_ON(persistent_gnt->handle == | ||
383 | BLKBACK_INVALID_HANDLE); | ||
384 | |||
385 | if (clean_used) { | ||
386 | clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
387 | continue; | ||
388 | } | ||
389 | |||
390 | if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
391 | continue; | ||
392 | if (!scan_used && | ||
393 | (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags))) | ||
394 | continue; | ||
395 | |||
396 | rb_erase(&persistent_gnt->node, root); | ||
397 | list_add(&persistent_gnt->remove_node, | ||
398 | &blkif->persistent_purge_list); | ||
399 | if (--num_clean == 0) | ||
400 | goto finished; | ||
401 | } | ||
402 | /* | ||
403 | * If we get here it means we also need to start cleaning | ||
404 | * grants that were used since last purge in order to cope | ||
405 | * with the requested num | ||
406 | */ | ||
407 | if (!scan_used && !clean_used) { | ||
408 | pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean); | ||
409 | scan_used = true; | ||
410 | goto purge_list; | ||
411 | } | ||
412 | finished: | ||
413 | if (!clean_used) { | ||
414 | pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n"); | ||
415 | clean_used = true; | ||
416 | goto purge_list; | ||
417 | } | ||
418 | |||
419 | blkif->persistent_gnt_c -= (total - num_clean); | ||
420 | blkif->vbd.overflow_max_grants = 0; | ||
421 | |||
422 | /* We can defer this work */ | ||
423 | INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants); | ||
424 | schedule_work(&blkif->persistent_purge_work); | ||
425 | pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); | ||
426 | return; | ||
427 | } | ||
428 | |||
254 | /* | 429 | /* |
255 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. | 430 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. |
256 | */ | 431 | */ |
257 | static struct pending_req *alloc_req(void) | 432 | static struct pending_req *alloc_req(struct xen_blkif *blkif) |
258 | { | 433 | { |
259 | struct pending_req *req = NULL; | 434 | struct pending_req *req = NULL; |
260 | unsigned long flags; | 435 | unsigned long flags; |
261 | 436 | ||
262 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 437 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
263 | if (!list_empty(&blkbk->pending_free)) { | 438 | if (!list_empty(&blkif->pending_free)) { |
264 | req = list_entry(blkbk->pending_free.next, struct pending_req, | 439 | req = list_entry(blkif->pending_free.next, struct pending_req, |
265 | free_list); | 440 | free_list); |
266 | list_del(&req->free_list); | 441 | list_del(&req->free_list); |
267 | } | 442 | } |
268 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 443 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
269 | return req; | 444 | return req; |
270 | } | 445 | } |
271 | 446 | ||
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void) | |||
273 | * Return the 'pending_req' structure back to the freepool. We also | 448 | * Return the 'pending_req' structure back to the freepool. We also |
274 | * wake up the thread if it was waiting for a free page. | 449 | * wake up the thread if it was waiting for a free page. |
275 | */ | 450 | */ |
276 | static void free_req(struct pending_req *req) | 451 | static void free_req(struct xen_blkif *blkif, struct pending_req *req) |
277 | { | 452 | { |
278 | unsigned long flags; | 453 | unsigned long flags; |
279 | int was_empty; | 454 | int was_empty; |
280 | 455 | ||
281 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 456 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
282 | was_empty = list_empty(&blkbk->pending_free); | 457 | was_empty = list_empty(&blkif->pending_free); |
283 | list_add(&req->free_list, &blkbk->pending_free); | 458 | list_add(&req->free_list, &blkif->pending_free); |
284 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 459 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
285 | if (was_empty) | 460 | if (was_empty) |
286 | wake_up(&blkbk->pending_free_wq); | 461 | wake_up(&blkif->pending_free_wq); |
287 | } | 462 | } |
288 | 463 | ||
289 | /* | 464 | /* |
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) | |||
382 | static void print_stats(struct xen_blkif *blkif) | 557 | static void print_stats(struct xen_blkif *blkif) |
383 | { | 558 | { |
384 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" | 559 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" |
385 | " | ds %4llu\n", | 560 | " | ds %4llu | pg: %4u/%4d\n", |
386 | current->comm, blkif->st_oo_req, | 561 | current->comm, blkif->st_oo_req, |
387 | blkif->st_rd_req, blkif->st_wr_req, | 562 | blkif->st_rd_req, blkif->st_wr_req, |
388 | blkif->st_f_req, blkif->st_ds_req); | 563 | blkif->st_f_req, blkif->st_ds_req, |
564 | blkif->persistent_gnt_c, | ||
565 | xen_blkif_max_pgrants); | ||
389 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); | 566 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); |
390 | blkif->st_rd_req = 0; | 567 | blkif->st_rd_req = 0; |
391 | blkif->st_wr_req = 0; | 568 | blkif->st_wr_req = 0; |
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg) | |||
397 | { | 574 | { |
398 | struct xen_blkif *blkif = arg; | 575 | struct xen_blkif *blkif = arg; |
399 | struct xen_vbd *vbd = &blkif->vbd; | 576 | struct xen_vbd *vbd = &blkif->vbd; |
577 | unsigned long timeout; | ||
578 | int ret; | ||
400 | 579 | ||
401 | xen_blkif_get(blkif); | 580 | xen_blkif_get(blkif); |
402 | 581 | ||
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg) | |||
406 | if (unlikely(vbd->size != vbd_sz(vbd))) | 585 | if (unlikely(vbd->size != vbd_sz(vbd))) |
407 | xen_vbd_resize(blkif); | 586 | xen_vbd_resize(blkif); |
408 | 587 | ||
409 | wait_event_interruptible( | 588 | timeout = msecs_to_jiffies(LRU_INTERVAL); |
589 | |||
590 | timeout = wait_event_interruptible_timeout( | ||
410 | blkif->wq, | 591 | blkif->wq, |
411 | blkif->waiting_reqs || kthread_should_stop()); | 592 | blkif->waiting_reqs || kthread_should_stop(), |
412 | wait_event_interruptible( | 593 | timeout); |
413 | blkbk->pending_free_wq, | 594 | if (timeout == 0) |
414 | !list_empty(&blkbk->pending_free) || | 595 | goto purge_gnt_list; |
415 | kthread_should_stop()); | 596 | timeout = wait_event_interruptible_timeout( |
597 | blkif->pending_free_wq, | ||
598 | !list_empty(&blkif->pending_free) || | ||
599 | kthread_should_stop(), | ||
600 | timeout); | ||
601 | if (timeout == 0) | ||
602 | goto purge_gnt_list; | ||
416 | 603 | ||
417 | blkif->waiting_reqs = 0; | 604 | blkif->waiting_reqs = 0; |
418 | smp_mb(); /* clear flag *before* checking for work */ | 605 | smp_mb(); /* clear flag *before* checking for work */ |
419 | 606 | ||
420 | if (do_block_io_op(blkif)) | 607 | ret = do_block_io_op(blkif); |
608 | if (ret > 0) | ||
421 | blkif->waiting_reqs = 1; | 609 | blkif->waiting_reqs = 1; |
610 | if (ret == -EACCES) | ||
611 | wait_event_interruptible(blkif->shutdown_wq, | ||
612 | kthread_should_stop()); | ||
613 | |||
614 | purge_gnt_list: | ||
615 | if (blkif->vbd.feature_gnt_persistent && | ||
616 | time_after(jiffies, blkif->next_lru)) { | ||
617 | purge_persistent_gnt(blkif); | ||
618 | blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); | ||
619 | } | ||
620 | |||
621 | /* Shrink if we have more than xen_blkif_max_buffer_pages */ | ||
622 | shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); | ||
422 | 623 | ||
423 | if (log_stats && time_after(jiffies, blkif->st_print)) | 624 | if (log_stats && time_after(jiffies, blkif->st_print)) |
424 | print_stats(blkif); | 625 | print_stats(blkif); |
425 | } | 626 | } |
426 | 627 | ||
628 | /* Since we are shutting down remove all pages from the buffer */ | ||
629 | shrink_free_pagepool(blkif, 0 /* All */); | ||
630 | |||
427 | /* Free all persistent grant pages */ | 631 | /* Free all persistent grant pages */ |
428 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) | 632 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) |
429 | free_persistent_gnts(&blkif->persistent_gnts, | 633 | free_persistent_gnts(blkif, &blkif->persistent_gnts, |
430 | blkif->persistent_gnt_c); | 634 | blkif->persistent_gnt_c); |
431 | 635 | ||
432 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); | 636 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); |
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg) | |||
441 | return 0; | 645 | return 0; |
442 | } | 646 | } |
443 | 647 | ||
444 | struct seg_buf { | ||
445 | unsigned int offset; | ||
446 | unsigned int nsec; | ||
447 | }; | ||
448 | /* | 648 | /* |
449 | * Unmap the grant references, and also remove the M2P over-rides | 649 | * Unmap the grant references, and also remove the M2P over-rides |
450 | * used in the 'pending_req'. | 650 | * used in the 'pending_req'. |
451 | */ | 651 | */ |
452 | static void xen_blkbk_unmap(struct pending_req *req) | 652 | static void xen_blkbk_unmap(struct xen_blkif *blkif, |
653 | struct grant_page *pages[], | ||
654 | int num) | ||
453 | { | 655 | { |
454 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 656 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
455 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 657 | struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
456 | unsigned int i, invcount = 0; | 658 | unsigned int i, invcount = 0; |
457 | grant_handle_t handle; | ||
458 | int ret; | 659 | int ret; |
459 | 660 | ||
460 | for (i = 0; i < req->nr_pages; i++) { | 661 | for (i = 0; i < num; i++) { |
461 | if (!test_bit(i, req->unmap_seg)) | 662 | if (pages[i]->persistent_gnt != NULL) { |
663 | put_persistent_gnt(blkif, pages[i]->persistent_gnt); | ||
462 | continue; | 664 | continue; |
463 | handle = pending_handle(req, i); | 665 | } |
464 | if (handle == BLKBACK_INVALID_HANDLE) | 666 | if (pages[i]->handle == BLKBACK_INVALID_HANDLE) |
465 | continue; | 667 | continue; |
466 | gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), | 668 | unmap_pages[invcount] = pages[i]->page; |
467 | GNTMAP_host_map, handle); | 669 | gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page), |
468 | pending_handle(req, i) = BLKBACK_INVALID_HANDLE; | 670 | GNTMAP_host_map, pages[i]->handle); |
469 | pages[invcount] = virt_to_page(vaddr(req, i)); | 671 | pages[i]->handle = BLKBACK_INVALID_HANDLE; |
470 | invcount++; | 672 | if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { |
673 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, | ||
674 | invcount); | ||
675 | BUG_ON(ret); | ||
676 | put_free_pages(blkif, unmap_pages, invcount); | ||
677 | invcount = 0; | ||
678 | } | ||
679 | } | ||
680 | if (invcount) { | ||
681 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); | ||
682 | BUG_ON(ret); | ||
683 | put_free_pages(blkif, unmap_pages, invcount); | ||
471 | } | 684 | } |
472 | |||
473 | ret = gnttab_unmap_refs(unmap, NULL, pages, invcount); | ||
474 | BUG_ON(ret); | ||
475 | } | 685 | } |
476 | 686 | ||
477 | static int xen_blkbk_map(struct blkif_request *req, | 687 | static int xen_blkbk_map(struct xen_blkif *blkif, |
478 | struct pending_req *pending_req, | 688 | struct grant_page *pages[], |
479 | struct seg_buf seg[], | 689 | int num, bool ro) |
480 | struct page *pages[]) | ||
481 | { | 690 | { |
482 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 691 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
483 | struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
484 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 692 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
485 | struct persistent_gnt *persistent_gnt = NULL; | 693 | struct persistent_gnt *persistent_gnt = NULL; |
486 | struct xen_blkif *blkif = pending_req->blkif; | ||
487 | phys_addr_t addr = 0; | 694 | phys_addr_t addr = 0; |
488 | int i, j; | 695 | int i, seg_idx, new_map_idx; |
489 | bool new_map; | ||
490 | int nseg = req->u.rw.nr_segments; | ||
491 | int segs_to_map = 0; | 696 | int segs_to_map = 0; |
492 | int ret = 0; | 697 | int ret = 0; |
698 | int last_map = 0, map_until = 0; | ||
493 | int use_persistent_gnts; | 699 | int use_persistent_gnts; |
494 | 700 | ||
495 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); | 701 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); |
496 | 702 | ||
497 | BUG_ON(blkif->persistent_gnt_c > | ||
498 | max_mapped_grant_pages(pending_req->blkif->blk_protocol)); | ||
499 | |||
500 | /* | 703 | /* |
501 | * Fill out preq.nr_sects with proper amount of sectors, and setup | 704 | * Fill out preq.nr_sects with proper amount of sectors, and setup |
502 | * assign map[..] with the PFN of the page in our domain with the | 705 | * assign map[..] with the PFN of the page in our domain with the |
503 | * corresponding grant reference for each page. | 706 | * corresponding grant reference for each page. |
504 | */ | 707 | */ |
505 | for (i = 0; i < nseg; i++) { | 708 | again: |
709 | for (i = map_until; i < num; i++) { | ||
506 | uint32_t flags; | 710 | uint32_t flags; |
507 | 711 | ||
508 | if (use_persistent_gnts) | 712 | if (use_persistent_gnts) |
509 | persistent_gnt = get_persistent_gnt( | 713 | persistent_gnt = get_persistent_gnt( |
510 | &blkif->persistent_gnts, | 714 | blkif, |
511 | req->u.rw.seg[i].gref); | 715 | pages[i]->gref); |
512 | 716 | ||
513 | if (persistent_gnt) { | 717 | if (persistent_gnt) { |
514 | /* | 718 | /* |
515 | * We are using persistent grants and | 719 | * We are using persistent grants and |
516 | * the grant is already mapped | 720 | * the grant is already mapped |
517 | */ | 721 | */ |
518 | new_map = false; | 722 | pages[i]->page = persistent_gnt->page; |
519 | } else if (use_persistent_gnts && | 723 | pages[i]->persistent_gnt = persistent_gnt; |
520 | blkif->persistent_gnt_c < | ||
521 | max_mapped_grant_pages(blkif->blk_protocol)) { | ||
522 | /* | ||
523 | * We are using persistent grants, the grant is | ||
524 | * not mapped but we have room for it | ||
525 | */ | ||
526 | new_map = true; | ||
527 | persistent_gnt = kmalloc( | ||
528 | sizeof(struct persistent_gnt), | ||
529 | GFP_KERNEL); | ||
530 | if (!persistent_gnt) | ||
531 | return -ENOMEM; | ||
532 | if (alloc_xenballooned_pages(1, &persistent_gnt->page, | ||
533 | false)) { | ||
534 | kfree(persistent_gnt); | ||
535 | return -ENOMEM; | ||
536 | } | ||
537 | persistent_gnt->gnt = req->u.rw.seg[i].gref; | ||
538 | persistent_gnt->handle = BLKBACK_INVALID_HANDLE; | ||
539 | |||
540 | pages_to_gnt[segs_to_map] = | ||
541 | persistent_gnt->page; | ||
542 | addr = (unsigned long) pfn_to_kaddr( | ||
543 | page_to_pfn(persistent_gnt->page)); | ||
544 | |||
545 | add_persistent_gnt(&blkif->persistent_gnts, | ||
546 | persistent_gnt); | ||
547 | blkif->persistent_gnt_c++; | ||
548 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
549 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
550 | max_mapped_grant_pages(blkif->blk_protocol)); | ||
551 | } else { | 724 | } else { |
552 | /* | 725 | if (get_free_page(blkif, &pages[i]->page)) |
553 | * We are either using persistent grants and | 726 | goto out_of_memory; |
554 | * hit the maximum limit of grants mapped, | 727 | addr = vaddr(pages[i]->page); |
555 | * or we are not using persistent grants. | 728 | pages_to_gnt[segs_to_map] = pages[i]->page; |
556 | */ | 729 | pages[i]->persistent_gnt = NULL; |
557 | if (use_persistent_gnts && | ||
558 | !blkif->vbd.overflow_max_grants) { | ||
559 | blkif->vbd.overflow_max_grants = 1; | ||
560 | pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
561 | blkif->domid, blkif->vbd.handle); | ||
562 | } | ||
563 | new_map = true; | ||
564 | pages[i] = blkbk->pending_page(pending_req, i); | ||
565 | addr = vaddr(pending_req, i); | ||
566 | pages_to_gnt[segs_to_map] = | ||
567 | blkbk->pending_page(pending_req, i); | ||
568 | } | ||
569 | |||
570 | if (persistent_gnt) { | ||
571 | pages[i] = persistent_gnt->page; | ||
572 | persistent_gnts[i] = persistent_gnt; | ||
573 | } else { | ||
574 | persistent_gnts[i] = NULL; | ||
575 | } | ||
576 | |||
577 | if (new_map) { | ||
578 | flags = GNTMAP_host_map; | 730 | flags = GNTMAP_host_map; |
579 | if (!persistent_gnt && | 731 | if (!use_persistent_gnts && ro) |
580 | (pending_req->operation != BLKIF_OP_READ)) | ||
581 | flags |= GNTMAP_readonly; | 732 | flags |= GNTMAP_readonly; |
582 | gnttab_set_map_op(&map[segs_to_map++], addr, | 733 | gnttab_set_map_op(&map[segs_to_map++], addr, |
583 | flags, req->u.rw.seg[i].gref, | 734 | flags, pages[i]->gref, |
584 | blkif->domid); | 735 | blkif->domid); |
585 | } | 736 | } |
737 | map_until = i + 1; | ||
738 | if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) | ||
739 | break; | ||
586 | } | 740 | } |
587 | 741 | ||
588 | if (segs_to_map) { | 742 | if (segs_to_map) { |
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req, | |||
595 | * so that when we access vaddr(pending_req,i) it has the contents of | 749 | * so that when we access vaddr(pending_req,i) it has the contents of |
596 | * the page from the other domain. | 750 | * the page from the other domain. |
597 | */ | 751 | */ |
598 | bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 752 | for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) { |
599 | for (i = 0, j = 0; i < nseg; i++) { | 753 | if (!pages[seg_idx]->persistent_gnt) { |
600 | if (!persistent_gnts[i] || | ||
601 | persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { | ||
602 | /* This is a newly mapped grant */ | 754 | /* This is a newly mapped grant */ |
603 | BUG_ON(j >= segs_to_map); | 755 | BUG_ON(new_map_idx >= segs_to_map); |
604 | if (unlikely(map[j].status != 0)) { | 756 | if (unlikely(map[new_map_idx].status != 0)) { |
605 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | 757 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); |
606 | map[j].handle = BLKBACK_INVALID_HANDLE; | 758 | pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; |
607 | ret |= 1; | 759 | ret |= 1; |
608 | if (persistent_gnts[i]) { | 760 | goto next; |
609 | rb_erase(&persistent_gnts[i]->node, | ||
610 | &blkif->persistent_gnts); | ||
611 | blkif->persistent_gnt_c--; | ||
612 | kfree(persistent_gnts[i]); | ||
613 | persistent_gnts[i] = NULL; | ||
614 | } | ||
615 | } | 761 | } |
762 | pages[seg_idx]->handle = map[new_map_idx].handle; | ||
763 | } else { | ||
764 | continue; | ||
616 | } | 765 | } |
617 | if (persistent_gnts[i]) { | 766 | if (use_persistent_gnts && |
618 | if (persistent_gnts[i]->handle == | 767 | blkif->persistent_gnt_c < xen_blkif_max_pgrants) { |
619 | BLKBACK_INVALID_HANDLE) { | 768 | /* |
769 | * We are using persistent grants, the grant is | ||
770 | * not mapped but we might have room for it. | ||
771 | */ | ||
772 | persistent_gnt = kmalloc(sizeof(struct persistent_gnt), | ||
773 | GFP_KERNEL); | ||
774 | if (!persistent_gnt) { | ||
620 | /* | 775 | /* |
621 | * If this is a new persistent grant | 776 | * If we don't have enough memory to |
622 | * save the handler | 777 | * allocate the persistent_gnt struct |
778 | * map this grant non-persistently | ||
623 | */ | 779 | */ |
624 | persistent_gnts[i]->handle = map[j++].handle; | 780 | goto next; |
625 | } | 781 | } |
626 | pending_handle(pending_req, i) = | 782 | persistent_gnt->gnt = map[new_map_idx].ref; |
627 | persistent_gnts[i]->handle; | 783 | persistent_gnt->handle = map[new_map_idx].handle; |
784 | persistent_gnt->page = pages[seg_idx]->page; | ||
785 | if (add_persistent_gnt(blkif, | ||
786 | persistent_gnt)) { | ||
787 | kfree(persistent_gnt); | ||
788 | persistent_gnt = NULL; | ||
789 | goto next; | ||
790 | } | ||
791 | pages[seg_idx]->persistent_gnt = persistent_gnt; | ||
792 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
793 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
794 | xen_blkif_max_pgrants); | ||
795 | goto next; | ||
796 | } | ||
797 | if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { | ||
798 | blkif->vbd.overflow_max_grants = 1; | ||
799 | pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
800 | blkif->domid, blkif->vbd.handle); | ||
801 | } | ||
802 | /* | ||
803 | * We could not map this grant persistently, so use it as | ||
804 | * a non-persistent grant. | ||
805 | */ | ||
806 | next: | ||
807 | new_map_idx++; | ||
808 | } | ||
809 | segs_to_map = 0; | ||
810 | last_map = map_until; | ||
811 | if (map_until != num) | ||
812 | goto again; | ||
628 | 813 | ||
629 | if (ret) | 814 | return ret; |
630 | continue; | 815 | |
631 | } else { | 816 | out_of_memory: |
632 | pending_handle(pending_req, i) = map[j++].handle; | 817 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); |
633 | bitmap_set(pending_req->unmap_seg, i, 1); | 818 | put_free_pages(blkif, pages_to_gnt, segs_to_map); |
819 | return -ENOMEM; | ||
820 | } | ||
821 | |||
822 | static int xen_blkbk_map_seg(struct pending_req *pending_req) | ||
823 | { | ||
824 | int rc; | ||
825 | |||
826 | rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, | ||
827 | pending_req->nr_pages, | ||
828 | (pending_req->operation != BLKIF_OP_READ)); | ||
829 | |||
830 | return rc; | ||
831 | } | ||
634 | 832 | ||
635 | if (ret) | 833 | static int xen_blkbk_parse_indirect(struct blkif_request *req, |
636 | continue; | 834 | struct pending_req *pending_req, |
835 | struct seg_buf seg[], | ||
836 | struct phys_req *preq) | ||
837 | { | ||
838 | struct grant_page **pages = pending_req->indirect_pages; | ||
839 | struct xen_blkif *blkif = pending_req->blkif; | ||
840 | int indirect_grefs, rc, n, nseg, i; | ||
841 | struct blkif_request_segment_aligned *segments = NULL; | ||
842 | |||
843 | nseg = pending_req->nr_pages; | ||
844 | indirect_grefs = INDIRECT_PAGES(nseg); | ||
845 | BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
846 | |||
847 | for (i = 0; i < indirect_grefs; i++) | ||
848 | pages[i]->gref = req->u.indirect.indirect_grefs[i]; | ||
849 | |||
850 | rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); | ||
851 | if (rc) | ||
852 | goto unmap; | ||
853 | |||
854 | for (n = 0, i = 0; n < nseg; n++) { | ||
855 | if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { | ||
856 | /* Map indirect segments */ | ||
857 | if (segments) | ||
858 | kunmap_atomic(segments); | ||
859 | segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); | ||
860 | } | ||
861 | i = n % SEGS_PER_INDIRECT_FRAME; | ||
862 | pending_req->segments[n]->gref = segments[i].gref; | ||
863 | seg[n].nsec = segments[i].last_sect - | ||
864 | segments[i].first_sect + 1; | ||
865 | seg[n].offset = (segments[i].first_sect << 9); | ||
866 | if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
867 | (segments[i].last_sect < segments[i].first_sect)) { | ||
868 | rc = -EINVAL; | ||
869 | goto unmap; | ||
637 | } | 870 | } |
638 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | 871 | preq->nr_sects += seg[n].nsec; |
639 | } | 872 | } |
640 | return ret; | 873 | |
874 | unmap: | ||
875 | if (segments) | ||
876 | kunmap_atomic(segments); | ||
877 | xen_blkbk_unmap(blkif, pages, indirect_grefs); | ||
878 | return rc; | ||
641 | } | 879 | } |
642 | 880 | ||
643 | static int dispatch_discard_io(struct xen_blkif *blkif, | 881 | static int dispatch_discard_io(struct xen_blkif *blkif, |
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
647 | int status = BLKIF_RSP_OKAY; | 885 | int status = BLKIF_RSP_OKAY; |
648 | struct block_device *bdev = blkif->vbd.bdev; | 886 | struct block_device *bdev = blkif->vbd.bdev; |
649 | unsigned long secure; | 887 | unsigned long secure; |
888 | struct phys_req preq; | ||
889 | |||
890 | preq.sector_number = req->u.discard.sector_number; | ||
891 | preq.nr_sects = req->u.discard.nr_sectors; | ||
650 | 892 | ||
893 | err = xen_vbd_translate(&preq, blkif, WRITE); | ||
894 | if (err) { | ||
895 | pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n", | ||
896 | preq.sector_number, | ||
897 | preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); | ||
898 | goto fail_response; | ||
899 | } | ||
651 | blkif->st_ds_req++; | 900 | blkif->st_ds_req++; |
652 | 901 | ||
653 | xen_blkif_get(blkif); | 902 | xen_blkif_get(blkif); |
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
658 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, | 907 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, |
659 | req->u.discard.nr_sectors, | 908 | req->u.discard.nr_sectors, |
660 | GFP_KERNEL, secure); | 909 | GFP_KERNEL, secure); |
661 | 910 | fail_response: | |
662 | if (err == -EOPNOTSUPP) { | 911 | if (err == -EOPNOTSUPP) { |
663 | pr_debug(DRV_PFX "discard op failed, not supported\n"); | 912 | pr_debug(DRV_PFX "discard op failed, not supported\n"); |
664 | status = BLKIF_RSP_EOPNOTSUPP; | 913 | status = BLKIF_RSP_EOPNOTSUPP; |
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif, | |||
674 | struct blkif_request *req, | 923 | struct blkif_request *req, |
675 | struct pending_req *pending_req) | 924 | struct pending_req *pending_req) |
676 | { | 925 | { |
677 | free_req(pending_req); | 926 | free_req(blkif, pending_req); |
678 | make_response(blkif, req->u.other.id, req->operation, | 927 | make_response(blkif, req->u.other.id, req->operation, |
679 | BLKIF_RSP_EOPNOTSUPP); | 928 | BLKIF_RSP_EOPNOTSUPP); |
680 | return -EIO; | 929 | return -EIO; |
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
726 | * the proper response on the ring. | 975 | * the proper response on the ring. |
727 | */ | 976 | */ |
728 | if (atomic_dec_and_test(&pending_req->pendcnt)) { | 977 | if (atomic_dec_and_test(&pending_req->pendcnt)) { |
729 | xen_blkbk_unmap(pending_req); | 978 | xen_blkbk_unmap(pending_req->blkif, |
979 | pending_req->segments, | ||
980 | pending_req->nr_pages); | ||
730 | make_response(pending_req->blkif, pending_req->id, | 981 | make_response(pending_req->blkif, pending_req->id, |
731 | pending_req->operation, pending_req->status); | 982 | pending_req->operation, pending_req->status); |
732 | xen_blkif_put(pending_req->blkif); | 983 | xen_blkif_put(pending_req->blkif); |
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
734 | if (atomic_read(&pending_req->blkif->drain)) | 985 | if (atomic_read(&pending_req->blkif->drain)) |
735 | complete(&pending_req->blkif->drain_complete); | 986 | complete(&pending_req->blkif->drain_complete); |
736 | } | 987 | } |
737 | free_req(pending_req); | 988 | free_req(pending_req->blkif, pending_req); |
738 | } | 989 | } |
739 | } | 990 | } |
740 | 991 | ||
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
767 | rp = blk_rings->common.sring->req_prod; | 1018 | rp = blk_rings->common.sring->req_prod; |
768 | rmb(); /* Ensure we see queued requests up to 'rp'. */ | 1019 | rmb(); /* Ensure we see queued requests up to 'rp'. */ |
769 | 1020 | ||
1021 | if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { | ||
1022 | rc = blk_rings->common.rsp_prod_pvt; | ||
1023 | pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", | ||
1024 | rp, rc, rp - rc, blkif->vbd.pdevice); | ||
1025 | return -EACCES; | ||
1026 | } | ||
770 | while (rc != rp) { | 1027 | while (rc != rp) { |
771 | 1028 | ||
772 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) | 1029 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) |
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
777 | break; | 1034 | break; |
778 | } | 1035 | } |
779 | 1036 | ||
780 | pending_req = alloc_req(); | 1037 | pending_req = alloc_req(blkif); |
781 | if (NULL == pending_req) { | 1038 | if (NULL == pending_req) { |
782 | blkif->st_oo_req++; | 1039 | blkif->st_oo_req++; |
783 | more_to_do = 1; | 1040 | more_to_do = 1; |
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
807 | case BLKIF_OP_WRITE: | 1064 | case BLKIF_OP_WRITE: |
808 | case BLKIF_OP_WRITE_BARRIER: | 1065 | case BLKIF_OP_WRITE_BARRIER: |
809 | case BLKIF_OP_FLUSH_DISKCACHE: | 1066 | case BLKIF_OP_FLUSH_DISKCACHE: |
1067 | case BLKIF_OP_INDIRECT: | ||
810 | if (dispatch_rw_block_io(blkif, &req, pending_req)) | 1068 | if (dispatch_rw_block_io(blkif, &req, pending_req)) |
811 | goto done; | 1069 | goto done; |
812 | break; | 1070 | break; |
813 | case BLKIF_OP_DISCARD: | 1071 | case BLKIF_OP_DISCARD: |
814 | free_req(pending_req); | 1072 | free_req(blkif, pending_req); |
815 | if (dispatch_discard_io(blkif, &req)) | 1073 | if (dispatch_discard_io(blkif, &req)) |
816 | goto done; | 1074 | goto done; |
817 | break; | 1075 | break; |
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
853 | struct pending_req *pending_req) | 1111 | struct pending_req *pending_req) |
854 | { | 1112 | { |
855 | struct phys_req preq; | 1113 | struct phys_req preq; |
856 | struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1114 | struct seg_buf *seg = pending_req->seg; |
857 | unsigned int nseg; | 1115 | unsigned int nseg; |
858 | struct bio *bio = NULL; | 1116 | struct bio *bio = NULL; |
859 | struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1117 | struct bio **biolist = pending_req->biolist; |
860 | int i, nbio = 0; | 1118 | int i, nbio = 0; |
861 | int operation; | 1119 | int operation; |
862 | struct blk_plug plug; | 1120 | struct blk_plug plug; |
863 | bool drain = false; | 1121 | bool drain = false; |
864 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1122 | struct grant_page **pages = pending_req->segments; |
1123 | unsigned short req_operation; | ||
1124 | |||
1125 | req_operation = req->operation == BLKIF_OP_INDIRECT ? | ||
1126 | req->u.indirect.indirect_op : req->operation; | ||
1127 | if ((req->operation == BLKIF_OP_INDIRECT) && | ||
1128 | (req_operation != BLKIF_OP_READ) && | ||
1129 | (req_operation != BLKIF_OP_WRITE)) { | ||
1130 | pr_debug(DRV_PFX "Invalid indirect operation (%u)\n", | ||
1131 | req_operation); | ||
1132 | goto fail_response; | ||
1133 | } | ||
865 | 1134 | ||
866 | switch (req->operation) { | 1135 | switch (req_operation) { |
867 | case BLKIF_OP_READ: | 1136 | case BLKIF_OP_READ: |
868 | blkif->st_rd_req++; | 1137 | blkif->st_rd_req++; |
869 | operation = READ; | 1138 | operation = READ; |
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
885 | } | 1154 | } |
886 | 1155 | ||
887 | /* Check that the number of segments is sane. */ | 1156 | /* Check that the number of segments is sane. */ |
888 | nseg = req->u.rw.nr_segments; | 1157 | nseg = req->operation == BLKIF_OP_INDIRECT ? |
1158 | req->u.indirect.nr_segments : req->u.rw.nr_segments; | ||
889 | 1159 | ||
890 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || | 1160 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || |
891 | unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | 1161 | unlikely((req->operation != BLKIF_OP_INDIRECT) && |
1162 | (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || | ||
1163 | unlikely((req->operation == BLKIF_OP_INDIRECT) && | ||
1164 | (nseg > MAX_INDIRECT_SEGMENTS))) { | ||
892 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", | 1165 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", |
893 | nseg); | 1166 | nseg); |
894 | /* Haven't submitted any bio's yet. */ | 1167 | /* Haven't submitted any bio's yet. */ |
895 | goto fail_response; | 1168 | goto fail_response; |
896 | } | 1169 | } |
897 | 1170 | ||
898 | preq.sector_number = req->u.rw.sector_number; | ||
899 | preq.nr_sects = 0; | 1171 | preq.nr_sects = 0; |
900 | 1172 | ||
901 | pending_req->blkif = blkif; | 1173 | pending_req->blkif = blkif; |
902 | pending_req->id = req->u.rw.id; | 1174 | pending_req->id = req->u.rw.id; |
903 | pending_req->operation = req->operation; | 1175 | pending_req->operation = req_operation; |
904 | pending_req->status = BLKIF_RSP_OKAY; | 1176 | pending_req->status = BLKIF_RSP_OKAY; |
905 | pending_req->nr_pages = nseg; | 1177 | pending_req->nr_pages = nseg; |
906 | 1178 | ||
907 | for (i = 0; i < nseg; i++) { | 1179 | if (req->operation != BLKIF_OP_INDIRECT) { |
908 | seg[i].nsec = req->u.rw.seg[i].last_sect - | 1180 | preq.dev = req->u.rw.handle; |
909 | req->u.rw.seg[i].first_sect + 1; | 1181 | preq.sector_number = req->u.rw.sector_number; |
910 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | 1182 | for (i = 0; i < nseg; i++) { |
911 | (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) | 1183 | pages[i]->gref = req->u.rw.seg[i].gref; |
1184 | seg[i].nsec = req->u.rw.seg[i].last_sect - | ||
1185 | req->u.rw.seg[i].first_sect + 1; | ||
1186 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | ||
1187 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
1188 | (req->u.rw.seg[i].last_sect < | ||
1189 | req->u.rw.seg[i].first_sect)) | ||
1190 | goto fail_response; | ||
1191 | preq.nr_sects += seg[i].nsec; | ||
1192 | } | ||
1193 | } else { | ||
1194 | preq.dev = req->u.indirect.handle; | ||
1195 | preq.sector_number = req->u.indirect.sector_number; | ||
1196 | if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq)) | ||
912 | goto fail_response; | 1197 | goto fail_response; |
913 | preq.nr_sects += seg[i].nsec; | ||
914 | |||
915 | } | 1198 | } |
916 | 1199 | ||
917 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { | 1200 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { |
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
948 | * the hypercall to unmap the grants - that is all done in | 1231 | * the hypercall to unmap the grants - that is all done in |
949 | * xen_blkbk_unmap. | 1232 | * xen_blkbk_unmap. |
950 | */ | 1233 | */ |
951 | if (xen_blkbk_map(req, pending_req, seg, pages)) | 1234 | if (xen_blkbk_map_seg(pending_req)) |
952 | goto fail_flush; | 1235 | goto fail_flush; |
953 | 1236 | ||
954 | /* | 1237 | /* |
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
960 | for (i = 0; i < nseg; i++) { | 1243 | for (i = 0; i < nseg; i++) { |
961 | while ((bio == NULL) || | 1244 | while ((bio == NULL) || |
962 | (bio_add_page(bio, | 1245 | (bio_add_page(bio, |
963 | pages[i], | 1246 | pages[i]->page, |
964 | seg[i].nsec << 9, | 1247 | seg[i].nsec << 9, |
965 | seg[i].offset) == 0)) { | 1248 | seg[i].offset) == 0)) { |
966 | 1249 | ||
967 | bio = bio_alloc(GFP_KERNEL, nseg-i); | 1250 | int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); |
1251 | bio = bio_alloc(GFP_KERNEL, nr_iovecs); | ||
968 | if (unlikely(bio == NULL)) | 1252 | if (unlikely(bio == NULL)) |
969 | goto fail_put_bio; | 1253 | goto fail_put_bio; |
970 | 1254 | ||
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
1009 | return 0; | 1293 | return 0; |
1010 | 1294 | ||
1011 | fail_flush: | 1295 | fail_flush: |
1012 | xen_blkbk_unmap(pending_req); | 1296 | xen_blkbk_unmap(blkif, pending_req->segments, |
1297 | pending_req->nr_pages); | ||
1013 | fail_response: | 1298 | fail_response: |
1014 | /* Haven't submitted any bio's yet. */ | 1299 | /* Haven't submitted any bio's yet. */ |
1015 | make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); | 1300 | make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); |
1016 | free_req(pending_req); | 1301 | free_req(blkif, pending_req); |
1017 | msleep(1); /* back off a bit */ | 1302 | msleep(1); /* back off a bit */ |
1018 | return -EIO; | 1303 | return -EIO; |
1019 | 1304 | ||
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
1070 | 1355 | ||
1071 | static int __init xen_blkif_init(void) | 1356 | static int __init xen_blkif_init(void) |
1072 | { | 1357 | { |
1073 | int i, mmap_pages; | ||
1074 | int rc = 0; | 1358 | int rc = 0; |
1075 | 1359 | ||
1076 | if (!xen_domain()) | 1360 | if (!xen_domain()) |
1077 | return -ENODEV; | 1361 | return -ENODEV; |
1078 | 1362 | ||
1079 | blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); | ||
1080 | if (!blkbk) { | ||
1081 | pr_alert(DRV_PFX "%s: out of memory!\n", __func__); | ||
1082 | return -ENOMEM; | ||
1083 | } | ||
1084 | |||
1085 | mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1086 | |||
1087 | blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * | ||
1088 | xen_blkif_reqs, GFP_KERNEL); | ||
1089 | blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * | ||
1090 | mmap_pages, GFP_KERNEL); | ||
1091 | blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * | ||
1092 | mmap_pages, GFP_KERNEL); | ||
1093 | |||
1094 | if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || | ||
1095 | !blkbk->pending_pages) { | ||
1096 | rc = -ENOMEM; | ||
1097 | goto out_of_memory; | ||
1098 | } | ||
1099 | |||
1100 | for (i = 0; i < mmap_pages; i++) { | ||
1101 | blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; | ||
1102 | blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); | ||
1103 | if (blkbk->pending_pages[i] == NULL) { | ||
1104 | rc = -ENOMEM; | ||
1105 | goto out_of_memory; | ||
1106 | } | ||
1107 | } | ||
1108 | rc = xen_blkif_interface_init(); | 1363 | rc = xen_blkif_interface_init(); |
1109 | if (rc) | 1364 | if (rc) |
1110 | goto failed_init; | 1365 | goto failed_init; |
1111 | 1366 | ||
1112 | INIT_LIST_HEAD(&blkbk->pending_free); | ||
1113 | spin_lock_init(&blkbk->pending_free_lock); | ||
1114 | init_waitqueue_head(&blkbk->pending_free_wq); | ||
1115 | |||
1116 | for (i = 0; i < xen_blkif_reqs; i++) | ||
1117 | list_add_tail(&blkbk->pending_reqs[i].free_list, | ||
1118 | &blkbk->pending_free); | ||
1119 | |||
1120 | rc = xen_blkif_xenbus_init(); | 1367 | rc = xen_blkif_xenbus_init(); |
1121 | if (rc) | 1368 | if (rc) |
1122 | goto failed_init; | 1369 | goto failed_init; |
1123 | 1370 | ||
1124 | return 0; | ||
1125 | |||
1126 | out_of_memory: | ||
1127 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); | ||
1128 | failed_init: | 1371 | failed_init: |
1129 | kfree(blkbk->pending_reqs); | ||
1130 | kfree(blkbk->pending_grant_handles); | ||
1131 | if (blkbk->pending_pages) { | ||
1132 | for (i = 0; i < mmap_pages; i++) { | ||
1133 | if (blkbk->pending_pages[i]) | ||
1134 | __free_page(blkbk->pending_pages[i]); | ||
1135 | } | ||
1136 | kfree(blkbk->pending_pages); | ||
1137 | } | ||
1138 | kfree(blkbk); | ||
1139 | blkbk = NULL; | ||
1140 | return rc; | 1372 | return rc; |
1141 | } | 1373 | } |
1142 | 1374 | ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 60103e2517ba..8d8807563d99 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h | |||
@@ -50,6 +50,19 @@ | |||
50 | __func__, __LINE__, ##args) | 50 | __func__, __LINE__, ##args) |
51 | 51 | ||
52 | 52 | ||
53 | /* | ||
54 | * This is the maximum number of segments that would be allowed in indirect | ||
55 | * requests. This value will also be passed to the frontend. | ||
56 | */ | ||
57 | #define MAX_INDIRECT_SEGMENTS 256 | ||
58 | |||
59 | #define SEGS_PER_INDIRECT_FRAME \ | ||
60 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
61 | #define MAX_INDIRECT_PAGES \ | ||
62 | ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
63 | #define INDIRECT_PAGES(_segs) \ | ||
64 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
65 | |||
53 | /* Not a real protocol. Used to generate ring structs which contain | 66 | /* Not a real protocol. Used to generate ring structs which contain |
54 | * the elements common to all protocols only. This way we get a | 67 | * the elements common to all protocols only. This way we get a |
55 | * compiler-checkable way to use common struct elements, so we can | 68 | * compiler-checkable way to use common struct elements, so we can |
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other { | |||
83 | uint64_t id; /* private guest value, echoed in resp */ | 96 | uint64_t id; /* private guest value, echoed in resp */ |
84 | } __attribute__((__packed__)); | 97 | } __attribute__((__packed__)); |
85 | 98 | ||
/*
 * Indirect request as laid out by the x86_32 variant of the ring protocol.
 * blkif_get_x86_32_req() copies indirect_op, nr_segments, handle, id,
 * sector_number and the used entries of indirect_grefs[] into the native
 * struct blkif_request. The structure is packed: member order and widths
 * define the layout, so members must not be reordered or resized.
 */
99 | struct blkif_x86_32_request_indirect { | ||
100 | uint8_t indirect_op; | ||
101 | uint16_t nr_segments; | ||
102 | uint64_t id; | ||
103 | blkif_sector_t sector_number; | ||
104 | blkif_vdev_t handle; | ||
105 | uint16_t _pad1; | ||
/* Grant references of the pages that hold the segment descriptors. */
106 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
107 | /* | ||
108 | * The maximum number of indirect segments (and pages) that will | ||
109 | * be used is determined by MAX_INDIRECT_SEGMENTS, this value | ||
110 | * is also exported to the guest (via xenstore | ||
111 | * feature-max-indirect-segments entry), so the frontend knows how | ||
112 | * many indirect segments the backend supports. | ||
113 | */ | ||
114 | uint64_t _pad2; /* make it 64 byte aligned */ | ||
115 | } __attribute__((__packed__)); | ||
116 | |||
86 | struct blkif_x86_32_request { | 117 | struct blkif_x86_32_request { |
87 | uint8_t operation; /* BLKIF_OP_??? */ | 118 | uint8_t operation; /* BLKIF_OP_??? */ |
88 | union { | 119 | union { |
89 | struct blkif_x86_32_request_rw rw; | 120 | struct blkif_x86_32_request_rw rw; |
90 | struct blkif_x86_32_request_discard discard; | 121 | struct blkif_x86_32_request_discard discard; |
91 | struct blkif_x86_32_request_other other; | 122 | struct blkif_x86_32_request_other other; |
123 | struct blkif_x86_32_request_indirect indirect; | ||
92 | } u; | 124 | } u; |
93 | } __attribute__((__packed__)); | 125 | } __attribute__((__packed__)); |
94 | 126 | ||
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other { | |||
127 | uint64_t id; /* private guest value, echoed in resp */ | 159 | uint64_t id; /* private guest value, echoed in resp */ |
128 | } __attribute__((__packed__)); | 160 | } __attribute__((__packed__)); |
129 | 161 | ||
/*
 * Indirect request as laid out by the x86_64 variant of the ring protocol.
 * blkif_get_x86_64_req() copies indirect_op, nr_segments, handle, id,
 * sector_number and the used entries of indirect_grefs[] into the native
 * struct blkif_request. The structure is packed: member order and widths
 * define the layout, so members must not be reordered or resized.
 */
162 | struct blkif_x86_64_request_indirect { | ||
163 | uint8_t indirect_op; | ||
164 | uint16_t nr_segments; | ||
165 | uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */ | ||
166 | uint64_t id; | ||
167 | blkif_sector_t sector_number; | ||
168 | blkif_vdev_t handle; | ||
169 | uint16_t _pad2; | ||
/* Grant references of the pages that hold the segment descriptors. */
170 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
171 | /* | ||
172 | * The maximum number of indirect segments (and pages) that will | ||
173 | * be used is determined by MAX_INDIRECT_SEGMENTS, this value | ||
174 | * is also exported to the guest (via xenstore | ||
175 | * feature-max-indirect-segments entry), so the frontend knows how | ||
176 | * many indirect segments the backend supports. | ||
177 | */ | ||
178 | uint32_t _pad3; /* make it 64 byte aligned */ | ||
179 | } __attribute__((__packed__)); | ||
180 | |||
130 | struct blkif_x86_64_request { | 181 | struct blkif_x86_64_request { |
131 | uint8_t operation; /* BLKIF_OP_??? */ | 182 | uint8_t operation; /* BLKIF_OP_??? */ |
132 | union { | 183 | union { |
133 | struct blkif_x86_64_request_rw rw; | 184 | struct blkif_x86_64_request_rw rw; |
134 | struct blkif_x86_64_request_discard discard; | 185 | struct blkif_x86_64_request_discard discard; |
135 | struct blkif_x86_64_request_other other; | 186 | struct blkif_x86_64_request_other other; |
187 | struct blkif_x86_64_request_indirect indirect; | ||
136 | } u; | 188 | } u; |
137 | } __attribute__((__packed__)); | 189 | } __attribute__((__packed__)); |
138 | 190 | ||
@@ -182,12 +234,26 @@ struct xen_vbd { | |||
182 | 234 | ||
183 | struct backend_info; | 235 | struct backend_info; |
184 | 236 | ||
237 | /* Number of available flags */ | ||
238 | #define PERSISTENT_GNT_FLAGS_SIZE 2 | ||
239 | /* This persistent grant is currently in use */ | ||
240 | #define PERSISTENT_GNT_ACTIVE 0 | ||
241 | /* | ||
242 | * This persistent grant has been used, this flag is set when we remove the | ||
243 | * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently. | ||
244 | */ | ||
245 | #define PERSISTENT_GNT_WAS_ACTIVE 1 | ||
246 | |||
247 | /* Number of requests that we can fit in a ring */ | ||
248 | #define XEN_BLKIF_REQS 32 | ||
185 | 249 | ||
186 | struct persistent_gnt { | 250 | struct persistent_gnt { |
187 | struct page *page; | 251 | struct page *page; |
188 | grant_ref_t gnt; | 252 | grant_ref_t gnt; |
189 | grant_handle_t handle; | 253 | grant_handle_t handle; |
254 | DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
190 | struct rb_node node; | 255 | struct rb_node node; |
256 | struct list_head remove_node; | ||
191 | }; | 257 | }; |
192 | 258 | ||
193 | struct xen_blkif { | 259 | struct xen_blkif { |
@@ -219,6 +285,23 @@ struct xen_blkif { | |||
219 | /* tree to store persistent grants */ | 285 | /* tree to store persistent grants */ |
220 | struct rb_root persistent_gnts; | 286 | struct rb_root persistent_gnts; |
221 | unsigned int persistent_gnt_c; | 287 | unsigned int persistent_gnt_c; |
288 | atomic_t persistent_gnt_in_use; | ||
289 | unsigned long next_lru; | ||
290 | |||
291 | /* used by the kworker that offload work from the persistent purge */ | ||
292 | struct list_head persistent_purge_list; | ||
293 | struct work_struct persistent_purge_work; | ||
294 | |||
295 | /* buffer of free pages to map grant refs */ | ||
296 | spinlock_t free_pages_lock; | ||
297 | int free_pages_num; | ||
298 | struct list_head free_pages; | ||
299 | |||
300 | /* List of all 'pending_req' available */ | ||
301 | struct list_head pending_free; | ||
302 | /* And its spinlock. */ | ||
303 | spinlock_t pending_free_lock; | ||
304 | wait_queue_head_t pending_free_wq; | ||
222 | 305 | ||
223 | /* statistics */ | 306 | /* statistics */ |
224 | unsigned long st_print; | 307 | unsigned long st_print; |
@@ -231,6 +314,41 @@ struct xen_blkif { | |||
231 | unsigned long long st_wr_sect; | 314 | unsigned long long st_wr_sect; |
232 | 315 | ||
233 | wait_queue_head_t waiting_to_free; | 316 | wait_queue_head_t waiting_to_free; |
317 | /* Thread shutdown wait queue. */ | ||
318 | wait_queue_head_t shutdown_wq; | ||
319 | }; | ||
320 | |||
321 | struct seg_buf { | ||
322 | unsigned long offset; | ||
323 | unsigned int nsec; | ||
324 | }; | ||
325 | |||
326 | struct grant_page { | ||
327 | struct page *page; | ||
328 | struct persistent_gnt *persistent_gnt; | ||
329 | grant_handle_t handle; | ||
330 | grant_ref_t gref; | ||
331 | }; | ||
332 | |||
333 | /* | ||
334 | * Each outstanding request that we've passed to the lower device layers has a | ||
335 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | ||
336 | * the pendcnt towards zero. When it hits zero, the specified domain has a | ||
337 | * response queued for it, with the saved 'id' passed back. | ||
338 | */ | ||
339 | struct pending_req { | ||
340 | struct xen_blkif *blkif; | ||
341 | u64 id; | ||
342 | int nr_pages; | ||
343 | atomic_t pendcnt; | ||
344 | unsigned short operation; | ||
345 | int status; | ||
/* Links this request into blkif->pending_free (see xen_blkif_alloc()). */
346 | struct list_head free_list; | ||
/*
 * Every entry is a separately allocated struct grant_page: allocated in
 * xen_blkif_alloc() and released in xen_blkif_free().
 */
347 | struct grant_page *segments[MAX_INDIRECT_SEGMENTS]; | ||
348 | /* Indirect descriptors */ | ||
/* Entries are allocated and released the same way as segments[]. */
349 | struct grant_page *indirect_pages[MAX_INDIRECT_PAGES]; | ||
350 | struct seg_buf seg[MAX_INDIRECT_SEGMENTS]; | ||
351 | struct bio *biolist[MAX_INDIRECT_SEGMENTS]; | ||
234 | }; | 352 | }; |
235 | 353 | ||
236 | 354 | ||
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void); | |||
257 | 375 | ||
258 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); | 376 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); |
259 | int xen_blkif_schedule(void *arg); | 377 | int xen_blkif_schedule(void *arg); |
378 | int xen_blkif_purge_persistent(void *arg); | ||
260 | 379 | ||
261 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, | 380 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, |
262 | struct backend_info *be, int state); | 381 | struct backend_info *be, int state); |
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); | |||
268 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, | 387 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, |
269 | struct blkif_x86_32_request *src) | 388 | struct blkif_x86_32_request *src) |
270 | { | 389 | { |
271 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 390 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
272 | dst->operation = src->operation; | 391 | dst->operation = src->operation; |
273 | switch (src->operation) { | 392 | switch (src->operation) { |
274 | case BLKIF_OP_READ: | 393 | case BLKIF_OP_READ: |
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
291 | dst->u.discard.sector_number = src->u.discard.sector_number; | 410 | dst->u.discard.sector_number = src->u.discard.sector_number; |
292 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 411 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
293 | break; | 412 | break; |
413 | case BLKIF_OP_INDIRECT: | ||
414 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
415 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
416 | dst->u.indirect.handle = src->u.indirect.handle; | ||
417 | dst->u.indirect.id = src->u.indirect.id; | ||
418 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
419 | barrier(); | ||
420 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
421 | for (i = 0; i < j; i++) | ||
422 | dst->u.indirect.indirect_grefs[i] = | ||
423 | src->u.indirect.indirect_grefs[i]; | ||
424 | break; | ||
294 | default: | 425 | default: |
295 | /* | 426 | /* |
296 | * Don't know how to translate this op. Only get the | 427 | * Don't know how to translate this op. Only get the |
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
304 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, | 435 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, |
305 | struct blkif_x86_64_request *src) | 436 | struct blkif_x86_64_request *src) |
306 | { | 437 | { |
307 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 438 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
308 | dst->operation = src->operation; | 439 | dst->operation = src->operation; |
309 | switch (src->operation) { | 440 | switch (src->operation) { |
310 | case BLKIF_OP_READ: | 441 | case BLKIF_OP_READ: |
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, | |||
327 | dst->u.discard.sector_number = src->u.discard.sector_number; | 458 | dst->u.discard.sector_number = src->u.discard.sector_number; |
328 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 459 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
329 | break; | 460 | break; |
461 | case BLKIF_OP_INDIRECT: | ||
462 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
463 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
464 | dst->u.indirect.handle = src->u.indirect.handle; | ||
465 | dst->u.indirect.id = src->u.indirect.id; | ||
466 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
467 | barrier(); | ||
468 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
469 | for (i = 0; i < j; i++) | ||
470 | dst->u.indirect.indirect_grefs[i] = | ||
471 | src->u.indirect.indirect_grefs[i]; | ||
472 | break; | ||
330 | default: | 473 | default: |
331 | /* | 474 | /* |
332 | * Don't know how to translate this op. Only get the | 475 | * Don't know how to translate this op. Only get the |
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 8bfd1bcf95ec..2e5b69d612ac 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c | |||
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) | |||
98 | err = PTR_ERR(blkif->xenblkd); | 98 | err = PTR_ERR(blkif->xenblkd); |
99 | blkif->xenblkd = NULL; | 99 | blkif->xenblkd = NULL; |
100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); | 100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); |
101 | return; | ||
101 | } | 102 | } |
102 | } | 103 | } |
103 | 104 | ||
104 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) | 105 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) |
105 | { | 106 | { |
106 | struct xen_blkif *blkif; | 107 | struct xen_blkif *blkif; |
108 | struct pending_req *req, *n; | ||
109 | int i, j; | ||
110 | |||
111 | BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
107 | 112 | ||
108 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); | 113 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); |
109 | if (!blkif) | 114 | if (!blkif) |
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
118 | blkif->st_print = jiffies; | 123 | blkif->st_print = jiffies; |
119 | init_waitqueue_head(&blkif->waiting_to_free); | 124 | init_waitqueue_head(&blkif->waiting_to_free); |
120 | blkif->persistent_gnts.rb_node = NULL; | 125 | blkif->persistent_gnts.rb_node = NULL; |
126 | spin_lock_init(&blkif->free_pages_lock); | ||
127 | INIT_LIST_HEAD(&blkif->free_pages); | ||
128 | blkif->free_pages_num = 0; | ||
129 | atomic_set(&blkif->persistent_gnt_in_use, 0); | ||
130 | |||
131 | INIT_LIST_HEAD(&blkif->pending_free); | ||
132 | |||
133 | for (i = 0; i < XEN_BLKIF_REQS; i++) { | ||
134 | req = kzalloc(sizeof(*req), GFP_KERNEL); | ||
135 | if (!req) | ||
136 | goto fail; | ||
137 | list_add_tail(&req->free_list, | ||
138 | &blkif->pending_free); | ||
139 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
140 | req->segments[j] = kzalloc(sizeof(*req->segments[0]), | ||
141 | GFP_KERNEL); | ||
142 | if (!req->segments[j]) | ||
143 | goto fail; | ||
144 | } | ||
145 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
146 | req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), | ||
147 | GFP_KERNEL); | ||
148 | if (!req->indirect_pages[j]) | ||
149 | goto fail; | ||
150 | } | ||
151 | } | ||
152 | spin_lock_init(&blkif->pending_free_lock); | ||
153 | init_waitqueue_head(&blkif->pending_free_wq); | ||
154 | init_waitqueue_head(&blkif->shutdown_wq); | ||
121 | 155 | ||
122 | return blkif; | 156 | return blkif; |
157 | |||
158 | fail: | ||
159 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
160 | list_del(&req->free_list); | ||
161 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
162 | if (!req->segments[j]) | ||
163 | break; | ||
164 | kfree(req->segments[j]); | ||
165 | } | ||
166 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
167 | if (!req->indirect_pages[j]) | ||
168 | break; | ||
169 | kfree(req->indirect_pages[j]); | ||
170 | } | ||
171 | kfree(req); | ||
172 | } | ||
173 | |||
174 | kmem_cache_free(xen_blkif_cachep, blkif); | ||
175 | |||
176 | return ERR_PTR(-ENOMEM); | ||
123 | } | 177 | } |
124 | 178 | ||
125 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, | 179 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, |
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
178 | { | 232 | { |
179 | if (blkif->xenblkd) { | 233 | if (blkif->xenblkd) { |
180 | kthread_stop(blkif->xenblkd); | 234 | kthread_stop(blkif->xenblkd); |
235 | wake_up(&blkif->shutdown_wq); | ||
181 | blkif->xenblkd = NULL; | 236 | blkif->xenblkd = NULL; |
182 | } | 237 | } |
183 | 238 | ||
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
198 | 253 | ||
199 | static void xen_blkif_free(struct xen_blkif *blkif) | 254 | static void xen_blkif_free(struct xen_blkif *blkif) |
200 | { | 255 | { |
256 | struct pending_req *req, *n; | ||
257 | int i = 0, j; | ||
258 | |||
201 | if (!atomic_dec_and_test(&blkif->refcnt)) | 259 | if (!atomic_dec_and_test(&blkif->refcnt)) |
202 | BUG(); | 260 | BUG(); |
261 | |||
262 | /* Check that there is no request in use */ | ||
263 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
264 | list_del(&req->free_list); | ||
265 | |||
266 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) | ||
267 | kfree(req->segments[j]); | ||
268 | |||
269 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) | ||
270 | kfree(req->indirect_pages[j]); | ||
271 | |||
272 | kfree(req); | ||
273 | i++; | ||
274 | } | ||
275 | |||
276 | WARN_ON(i != XEN_BLKIF_REQS); | ||
277 | |||
203 | kmem_cache_free(xen_blkif_cachep, blkif); | 278 | kmem_cache_free(xen_blkif_cachep, blkif); |
204 | } | 279 | } |
205 | 280 | ||
@@ -678,6 +753,11 @@ again: | |||
678 | dev->nodename); | 753 | dev->nodename); |
679 | goto abort; | 754 | goto abort; |
680 | } | 755 | } |
756 | err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", | ||
757 | MAX_INDIRECT_SEGMENTS); | ||
758 | if (err) | ||
759 | dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", | ||
760 | dev->nodename, err); | ||
681 | 761 | ||
682 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", | 762 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", |
683 | (unsigned long long)vbd_sz(&be->blkif->vbd)); | 763 | (unsigned long long)vbd_sz(&be->blkif->vbd)); |
@@ -704,6 +784,11 @@ again: | |||
704 | dev->nodename); | 784 | dev->nodename); |
705 | goto abort; | 785 | goto abort; |
706 | } | 786 | } |
787 | err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", | ||
788 | bdev_physical_block_size(be->blkif->vbd.bdev)); | ||
789 | if (err) | ||
790 | xenbus_dev_error(dev, err, "writing %s/physical-sector-size", | ||
791 | dev->nodename); | ||
707 | 792 | ||
708 | err = xenbus_transaction_end(xbt, 0); | 793 | err = xenbus_transaction_end(xbt, 0); |
709 | if (err == -EAGAIN) | 794 | if (err == -EAGAIN) |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d89ef86220f4..a4660bbee8a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -74,12 +74,30 @@ struct grant { | |||
74 | struct blk_shadow { | 74 | struct blk_shadow { |
75 | struct blkif_request req; | 75 | struct blkif_request req; |
76 | struct request *request; | 76 | struct request *request; |
77 | struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 77 | struct grant **grants_used; |
78 | struct grant **indirect_grants; | ||
79 | struct scatterlist *sg; | ||
80 | }; | ||
81 | |||
82 | struct split_bio { | ||
83 | struct bio *bio; | ||
84 | atomic_t pending; | ||
85 | int err; | ||
78 | }; | 86 | }; |
79 | 87 | ||
80 | static DEFINE_MUTEX(blkfront_mutex); | 88 | static DEFINE_MUTEX(blkfront_mutex); |
81 | static const struct block_device_operations xlvbd_block_fops; | 89 | static const struct block_device_operations xlvbd_block_fops; |
82 | 90 | ||
91 | /* | ||
92 | * Maximum number of segments in indirect requests, the actual value used by | ||
93 | * the frontend driver is the minimum of this value and the value provided | ||
94 | * by the backend driver. | ||
95 | */ | ||
96 | |||
97 | static unsigned int xen_blkif_max_segments = 32; | ||
98 | module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); | ||
99 | MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); | ||
100 | |||
83 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) | 101 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) |
84 | 102 | ||
85 | /* | 103 | /* |
@@ -98,7 +116,6 @@ struct blkfront_info | |||
98 | enum blkif_state connected; | 116 | enum blkif_state connected; |
99 | int ring_ref; | 117 | int ring_ref; |
100 | struct blkif_front_ring ring; | 118 | struct blkif_front_ring ring; |
101 | struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
102 | unsigned int evtchn, irq; | 119 | unsigned int evtchn, irq; |
103 | struct request_queue *rq; | 120 | struct request_queue *rq; |
104 | struct work_struct work; | 121 | struct work_struct work; |
@@ -114,6 +131,7 @@ struct blkfront_info | |||
114 | unsigned int discard_granularity; | 131 | unsigned int discard_granularity; |
115 | unsigned int discard_alignment; | 132 | unsigned int discard_alignment; |
116 | unsigned int feature_persistent:1; | 133 | unsigned int feature_persistent:1; |
134 | unsigned int max_indirect_segments; | ||
117 | int is_ready; | 135 | int is_ready; |
118 | }; | 136 | }; |
119 | 137 | ||
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock); | |||
142 | 160 | ||
143 | #define DEV_NAME "xvd" /* name in /dev */ | 161 | #define DEV_NAME "xvd" /* name in /dev */ |
144 | 162 | ||
163 | #define SEGS_PER_INDIRECT_FRAME \ | ||
164 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
165 | #define INDIRECT_GREFS(_segs) \ | ||
166 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
167 | |||
168 | static int blkfront_setup_indirect(struct blkfront_info *info); | ||
169 | |||
145 | static int get_id_from_freelist(struct blkfront_info *info) | 170 | static int get_id_from_freelist(struct blkfront_info *info) |
146 | { | 171 | { |
147 | unsigned long free = info->shadow_free; | 172 | unsigned long free = info->shadow_free; |
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req) | |||
358 | struct blkif_request *ring_req; | 383 | struct blkif_request *ring_req; |
359 | unsigned long id; | 384 | unsigned long id; |
360 | unsigned int fsect, lsect; | 385 | unsigned int fsect, lsect; |
361 | int i, ref; | 386 | int i, ref, n; |
387 | struct blkif_request_segment_aligned *segments = NULL; | ||
362 | 388 | ||
363 | /* | 389 | /* |
364 | * Used to store if we are able to queue the request by just using | 390 | * Used to store if we are able to queue the request by just using |
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req) | |||
369 | grant_ref_t gref_head; | 395 | grant_ref_t gref_head; |
370 | struct grant *gnt_list_entry = NULL; | 396 | struct grant *gnt_list_entry = NULL; |
371 | struct scatterlist *sg; | 397 | struct scatterlist *sg; |
398 | int nseg, max_grefs; | ||
372 | 399 | ||
373 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 400 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
374 | return 1; | 401 | return 1; |
375 | 402 | ||
376 | /* Check if we have enought grants to allocate a requests */ | 403 | max_grefs = info->max_indirect_segments ? |
377 | if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { | 404 | info->max_indirect_segments + |
405 | INDIRECT_GREFS(info->max_indirect_segments) : | ||
406 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
407 | |||
408 | /* Check if we have enough grants to allocate a request */ | ||
409 | if (info->persistent_gnts_c < max_grefs) { | ||
378 | new_persistent_gnts = 1; | 410 | new_persistent_gnts = 1; |
379 | if (gnttab_alloc_grant_references( | 411 | if (gnttab_alloc_grant_references( |
380 | BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, | 412 | max_grefs - info->persistent_gnts_c, |
381 | &gref_head) < 0) { | 413 | &gref_head) < 0) { |
382 | gnttab_request_free_callback( | 414 | gnttab_request_free_callback( |
383 | &info->callback, | 415 | &info->callback, |
384 | blkif_restart_queue_callback, | 416 | blkif_restart_queue_callback, |
385 | info, | 417 | info, |
386 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 418 | max_grefs); |
387 | return 1; | 419 | return 1; |
388 | } | 420 | } |
389 | } else | 421 | } else |
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req) | |||
394 | id = get_id_from_freelist(info); | 426 | id = get_id_from_freelist(info); |
395 | info->shadow[id].request = req; | 427 | info->shadow[id].request = req; |
396 | 428 | ||
397 | ring_req->u.rw.id = id; | ||
398 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
399 | ring_req->u.rw.handle = info->handle; | ||
400 | |||
401 | ring_req->operation = rq_data_dir(req) ? | ||
402 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
403 | |||
404 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
405 | /* | ||
406 | * Ideally we can do an unordered flush-to-disk. In case the | ||
407 | * backend onlysupports barriers, use that. A barrier request | ||
408 | * a superset of FUA, so we can implement it the same | ||
409 | * way. (It's also a FLUSH+FUA, since it is | ||
410 | * guaranteed ordered WRT previous writes.) | ||
411 | */ | ||
412 | ring_req->operation = info->flush_op; | ||
413 | } | ||
414 | |||
415 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { | 429 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { |
416 | /* id, sector_number and handle are set above. */ | ||
417 | ring_req->operation = BLKIF_OP_DISCARD; | 430 | ring_req->operation = BLKIF_OP_DISCARD; |
418 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); | 431 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); |
432 | ring_req->u.discard.id = id; | ||
433 | ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
419 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) | 434 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) |
420 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; | 435 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; |
421 | else | 436 | else |
422 | ring_req->u.discard.flag = 0; | 437 | ring_req->u.discard.flag = 0; |
423 | } else { | 438 | } else { |
424 | ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, | 439 | BUG_ON(info->max_indirect_segments == 0 && |
425 | info->sg); | 440 | req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); |
426 | BUG_ON(ring_req->u.rw.nr_segments > | 441 | BUG_ON(info->max_indirect_segments && |
427 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 442 | req->nr_phys_segments > info->max_indirect_segments); |
428 | 443 | nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); | |
429 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { | 444 | ring_req->u.rw.id = id; |
445 | if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
446 | /* | ||
447 | * The indirect operation can only be a BLKIF_OP_READ or | ||
448 | * BLKIF_OP_WRITE | ||
449 | */ | ||
450 | BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); | ||
451 | ring_req->operation = BLKIF_OP_INDIRECT; | ||
452 | ring_req->u.indirect.indirect_op = rq_data_dir(req) ? | ||
453 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
454 | ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
455 | ring_req->u.indirect.handle = info->handle; | ||
456 | ring_req->u.indirect.nr_segments = nseg; | ||
457 | } else { | ||
458 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
459 | ring_req->u.rw.handle = info->handle; | ||
460 | ring_req->operation = rq_data_dir(req) ? | ||
461 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
462 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
463 | /* | ||
464 | * Ideally we can do an unordered flush-to-disk. In case the | ||
465 | * backend only supports barriers, use that. A barrier request is | ||
466 | * a superset of FUA, so we can implement it the same | ||
467 | * way. (It's also a FLUSH+FUA, since it is | ||
468 | * guaranteed ordered WRT previous writes.) | ||
469 | */ | ||
470 | ring_req->operation = info->flush_op; | ||
471 | } | ||
472 | ring_req->u.rw.nr_segments = nseg; | ||
473 | } | ||
474 | for_each_sg(info->shadow[id].sg, sg, nseg, i) { | ||
430 | fsect = sg->offset >> 9; | 475 | fsect = sg->offset >> 9; |
431 | lsect = fsect + (sg->length >> 9) - 1; | 476 | lsect = fsect + (sg->length >> 9) - 1; |
432 | 477 | ||
478 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && | ||
479 | (i % SEGS_PER_INDIRECT_FRAME == 0)) { | ||
480 | if (segments) | ||
481 | kunmap_atomic(segments); | ||
482 | |||
483 | n = i / SEGS_PER_INDIRECT_FRAME; | ||
484 | gnt_list_entry = get_grant(&gref_head, info); | ||
485 | info->shadow[id].indirect_grants[n] = gnt_list_entry; | ||
486 | segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); | ||
487 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; | ||
488 | } | ||
489 | |||
433 | gnt_list_entry = get_grant(&gref_head, info); | 490 | gnt_list_entry = get_grant(&gref_head, info); |
434 | ref = gnt_list_entry->gref; | 491 | ref = gnt_list_entry->gref; |
435 | 492 | ||
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req) | |||
441 | 498 | ||
442 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); | 499 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
443 | 500 | ||
444 | shared_data = kmap_atomic( | 501 | shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); |
445 | pfn_to_page(gnt_list_entry->pfn)); | ||
446 | bvec_data = kmap_atomic(sg_page(sg)); | 502 | bvec_data = kmap_atomic(sg_page(sg)); |
447 | 503 | ||
448 | /* | 504 | /* |
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req) | |||
461 | kunmap_atomic(bvec_data); | 517 | kunmap_atomic(bvec_data); |
462 | kunmap_atomic(shared_data); | 518 | kunmap_atomic(shared_data); |
463 | } | 519 | } |
464 | 520 | if (ring_req->operation != BLKIF_OP_INDIRECT) { | |
465 | ring_req->u.rw.seg[i] = | 521 | ring_req->u.rw.seg[i] = |
466 | (struct blkif_request_segment) { | 522 | (struct blkif_request_segment) { |
467 | .gref = ref, | 523 | .gref = ref, |
468 | .first_sect = fsect, | 524 | .first_sect = fsect, |
469 | .last_sect = lsect }; | 525 | .last_sect = lsect }; |
526 | } else { | ||
527 | n = i % SEGS_PER_INDIRECT_FRAME; | ||
528 | segments[n] = | ||
529 | (struct blkif_request_segment_aligned) { | ||
530 | .gref = ref, | ||
531 | .first_sect = fsect, | ||
532 | .last_sect = lsect }; | ||
533 | } | ||
470 | } | 534 | } |
535 | if (segments) | ||
536 | kunmap_atomic(segments); | ||
471 | } | 537 | } |
472 | 538 | ||
473 | info->ring.req_prod_pvt++; | 539 | info->ring.req_prod_pvt++; |
@@ -542,7 +608,9 @@ wait: | |||
542 | flush_requests(info); | 608 | flush_requests(info); |
543 | } | 609 | } |
544 | 610 | ||
545 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | 611 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, |
612 | unsigned int physical_sector_size, | ||
613 | unsigned int segments) | ||
546 | { | 614 | { |
547 | struct request_queue *rq; | 615 | struct request_queue *rq; |
548 | struct blkfront_info *info = gd->private_data; | 616 | struct blkfront_info *info = gd->private_data; |
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
564 | 632 | ||
565 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ | 633 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ |
566 | blk_queue_logical_block_size(rq, sector_size); | 634 | blk_queue_logical_block_size(rq, sector_size); |
567 | blk_queue_max_hw_sectors(rq, 512); | 635 | blk_queue_physical_block_size(rq, physical_sector_size); |
636 | blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); | ||
568 | 637 | ||
569 | /* Each segment in a request is up to an aligned page in size. */ | 638 | /* Each segment in a request is up to an aligned page in size. */ |
570 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); | 639 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); |
571 | blk_queue_max_segment_size(rq, PAGE_SIZE); | 640 | blk_queue_max_segment_size(rq, PAGE_SIZE); |
572 | 641 | ||
573 | /* Ensure a merged request will fit in a single I/O ring slot. */ | 642 | /* Ensure a merged request will fit in a single I/O ring slot. */ |
574 | blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 643 | blk_queue_max_segments(rq, segments); |
575 | 644 | ||
576 | /* Make sure buffer addresses are sector-aligned. */ | 645 | /* Make sure buffer addresses are sector-aligned. */ |
577 | blk_queue_dma_alignment(rq, 511); | 646 | blk_queue_dma_alignment(rq, 511); |
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
588 | static void xlvbd_flush(struct blkfront_info *info) | 657 | static void xlvbd_flush(struct blkfront_info *info) |
589 | { | 658 | { |
590 | blk_queue_flush(info->rq, info->feature_flush); | 659 | blk_queue_flush(info->rq, info->feature_flush); |
591 | printk(KERN_INFO "blkfront: %s: %s: %s %s\n", | 660 | printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", |
592 | info->gd->disk_name, | 661 | info->gd->disk_name, |
593 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? | 662 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? |
594 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? | 663 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? |
595 | "flush diskcache" : "barrier or flush"), | 664 | "flush diskcache" : "barrier or flush"), |
596 | info->feature_flush ? "enabled" : "disabled", | 665 | info->feature_flush ? "enabled;" : "disabled;", |
597 | info->feature_persistent ? "using persistent grants" : ""); | 666 | "persistent grants:", |
667 | info->feature_persistent ? "enabled;" : "disabled;", | ||
668 | "indirect descriptors:", | ||
669 | info->max_indirect_segments ? "enabled;" : "disabled;"); | ||
598 | } | 670 | } |
599 | 671 | ||
600 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) | 672 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) | |||
667 | 739 | ||
668 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | 740 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, |
669 | struct blkfront_info *info, | 741 | struct blkfront_info *info, |
670 | u16 vdisk_info, u16 sector_size) | 742 | u16 vdisk_info, u16 sector_size, |
743 | unsigned int physical_sector_size) | ||
671 | { | 744 | { |
672 | struct gendisk *gd; | 745 | struct gendisk *gd; |
673 | int nr_minors = 1; | 746 | int nr_minors = 1; |
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | |||
734 | gd->driverfs_dev = &(info->xbdev->dev); | 807 | gd->driverfs_dev = &(info->xbdev->dev); |
735 | set_capacity(gd, capacity); | 808 | set_capacity(gd, capacity); |
736 | 809 | ||
737 | if (xlvbd_init_blk_queue(gd, sector_size)) { | 810 | if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, |
811 | info->max_indirect_segments ? : | ||
812 | BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | ||
738 | del_gendisk(gd); | 813 | del_gendisk(gd); |
739 | goto release; | 814 | goto release; |
740 | } | 815 | } |
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
818 | { | 893 | { |
819 | struct grant *persistent_gnt; | 894 | struct grant *persistent_gnt; |
820 | struct grant *n; | 895 | struct grant *n; |
896 | int i, j, segs; | ||
821 | 897 | ||
822 | /* Prevent new requests being issued until we fix things up. */ | 898 | /* Prevent new requests being issued until we fix things up. */ |
823 | spin_lock_irq(&info->io_lock); | 899 | spin_lock_irq(&info->io_lock); |
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
843 | } | 919 | } |
844 | BUG_ON(info->persistent_gnts_c != 0); | 920 | BUG_ON(info->persistent_gnts_c != 0); |
845 | 921 | ||
922 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
923 | /* | ||
924 | * Clear persistent grants present in requests already | ||
925 | * on the shared ring | ||
926 | */ | ||
927 | if (!info->shadow[i].request) | ||
928 | goto free_shadow; | ||
929 | |||
930 | segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? | ||
931 | info->shadow[i].req.u.indirect.nr_segments : | ||
932 | info->shadow[i].req.u.rw.nr_segments; | ||
933 | for (j = 0; j < segs; j++) { | ||
934 | persistent_gnt = info->shadow[i].grants_used[j]; | ||
935 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
936 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
937 | kfree(persistent_gnt); | ||
938 | } | ||
939 | |||
940 | if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) | ||
941 | /* | ||
942 | * If this is not an indirect operation don't try to | ||
943 | * free indirect segments | ||
944 | */ | ||
945 | goto free_shadow; | ||
946 | |||
947 | for (j = 0; j < INDIRECT_GREFS(segs); j++) { | ||
948 | persistent_gnt = info->shadow[i].indirect_grants[j]; | ||
949 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
950 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
951 | kfree(persistent_gnt); | ||
952 | } | ||
953 | |||
954 | free_shadow: | ||
955 | kfree(info->shadow[i].grants_used); | ||
956 | info->shadow[i].grants_used = NULL; | ||
957 | kfree(info->shadow[i].indirect_grants); | ||
958 | info->shadow[i].indirect_grants = NULL; | ||
959 | kfree(info->shadow[i].sg); | ||
960 | info->shadow[i].sg = NULL; | ||
961 | } | ||
962 | |||
846 | /* No more gnttab callback work. */ | 963 | /* No more gnttab callback work. */ |
847 | gnttab_cancel_free_callback(&info->callback); | 964 | gnttab_cancel_free_callback(&info->callback); |
848 | spin_unlock_irq(&info->io_lock); | 965 | spin_unlock_irq(&info->io_lock); |
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
867 | struct blkif_response *bret) | 984 | struct blkif_response *bret) |
868 | { | 985 | { |
869 | int i = 0; | 986 | int i = 0; |
870 | struct bio_vec *bvec; | 987 | struct scatterlist *sg; |
871 | struct req_iterator iter; | ||
872 | unsigned long flags; | ||
873 | char *bvec_data; | 988 | char *bvec_data; |
874 | void *shared_data; | 989 | void *shared_data; |
875 | unsigned int offset = 0; | 990 | int nseg; |
991 | |||
992 | nseg = s->req.operation == BLKIF_OP_INDIRECT ? | ||
993 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; | ||
876 | 994 | ||
877 | if (bret->operation == BLKIF_OP_READ) { | 995 | if (bret->operation == BLKIF_OP_READ) { |
878 | /* | 996 | /* |
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
881 | * than PAGE_SIZE, we have to keep track of the current offset, | 999 | * than PAGE_SIZE, we have to keep track of the current offset, |
882 | * to be sure we are copying the data from the right shared page. | 1000 | * to be sure we are copying the data from the right shared page. |
883 | */ | 1001 | */ |
884 | rq_for_each_segment(bvec, s->request, iter) { | 1002 | for_each_sg(s->sg, sg, nseg, i) { |
885 | BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); | 1003 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
886 | if (bvec->bv_offset < offset) | ||
887 | i++; | ||
888 | BUG_ON(i >= s->req.u.rw.nr_segments); | ||
889 | shared_data = kmap_atomic( | 1004 | shared_data = kmap_atomic( |
890 | pfn_to_page(s->grants_used[i]->pfn)); | 1005 | pfn_to_page(s->grants_used[i]->pfn)); |
891 | bvec_data = bvec_kmap_irq(bvec, &flags); | 1006 | bvec_data = kmap_atomic(sg_page(sg)); |
892 | memcpy(bvec_data, shared_data + bvec->bv_offset, | 1007 | memcpy(bvec_data + sg->offset, |
893 | bvec->bv_len); | 1008 | shared_data + sg->offset, |
894 | bvec_kunmap_irq(bvec_data, &flags); | 1009 | sg->length); |
1010 | kunmap_atomic(bvec_data); | ||
895 | kunmap_atomic(shared_data); | 1011 | kunmap_atomic(shared_data); |
896 | offset = bvec->bv_offset + bvec->bv_len; | ||
897 | } | 1012 | } |
898 | } | 1013 | } |
899 | /* Add the persistent grant into the list of free grants */ | 1014 | /* Add the persistent grant into the list of free grants */ |
900 | for (i = 0; i < s->req.u.rw.nr_segments; i++) { | 1015 | for (i = 0; i < nseg; i++) { |
901 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); | 1016 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); |
902 | info->persistent_gnts_c++; | 1017 | info->persistent_gnts_c++; |
903 | } | 1018 | } |
1019 | if (s->req.operation == BLKIF_OP_INDIRECT) { | ||
1020 | for (i = 0; i < INDIRECT_GREFS(nseg); i++) { | ||
1021 | list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); | ||
1022 | info->persistent_gnts_c++; | ||
1023 | } | ||
1024 | } | ||
904 | } | 1025 | } |
905 | 1026 | ||
906 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 1027 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev, | |||
1034 | SHARED_RING_INIT(sring); | 1155 | SHARED_RING_INIT(sring); |
1035 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); | 1156 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); |
1036 | 1157 | ||
1037 | sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1038 | |||
1039 | /* Allocate memory for grants */ | ||
1040 | err = fill_grant_buffer(info, BLK_RING_SIZE * | ||
1041 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1042 | if (err) | ||
1043 | goto fail; | ||
1044 | |||
1045 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); | 1158 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); |
1046 | if (err < 0) { | 1159 | if (err < 0) { |
1047 | free_page((unsigned long)sring); | 1160 | free_page((unsigned long)sring); |
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1223 | return 0; | 1336 | return 0; |
1224 | } | 1337 | } |
1225 | 1338 | ||
1339 | /* | ||
1340 | * This is a clone of md_trim_bio, used to split a bio into smaller ones | ||
1341 | */ | ||
1342 | static void trim_bio(struct bio *bio, int offset, int size) | ||
1343 | { | ||
1344 | /* 'bio' is a cloned bio which we need to trim to match | ||
1345 | * the given offset and size. | ||
1346 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
1347 | */ | ||
1348 | int i; | ||
1349 | struct bio_vec *bvec; | ||
1350 | int sofar = 0; | ||
1351 | |||
1352 | size <<= 9; | ||
1353 | if (offset == 0 && size == bio->bi_size) | ||
1354 | return; | ||
1355 | |||
1356 | bio->bi_sector += offset; | ||
1357 | bio->bi_size = size; | ||
1358 | offset <<= 9; | ||
1359 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
1360 | |||
1361 | while (bio->bi_idx < bio->bi_vcnt && | ||
1362 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
1363 | /* remove this whole bio_vec */ | ||
1364 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
1365 | bio->bi_idx++; | ||
1366 | } | ||
1367 | if (bio->bi_idx < bio->bi_vcnt) { | ||
1368 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
1369 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
1370 | } | ||
1371 | /* avoid any complications with bi_idx being non-zero*/ | ||
1372 | if (bio->bi_idx) { | ||
1373 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
1374 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
1375 | bio->bi_vcnt -= bio->bi_idx; | ||
1376 | bio->bi_idx = 0; | ||
1377 | } | ||
1378 | /* Make sure vcnt and last bv are not too big */ | ||
1379 | bio_for_each_segment(bvec, bio, i) { | ||
1380 | if (sofar + bvec->bv_len > size) | ||
1381 | bvec->bv_len = size - sofar; | ||
1382 | if (bvec->bv_len == 0) { | ||
1383 | bio->bi_vcnt = i; | ||
1384 | break; | ||
1385 | } | ||
1386 | sofar += bvec->bv_len; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void split_bio_end(struct bio *bio, int error) | ||
1391 | { | ||
1392 | struct split_bio *split_bio = bio->bi_private; | ||
1393 | |||
1394 | if (error) | ||
1395 | split_bio->err = error; | ||
1396 | |||
1397 | if (atomic_dec_and_test(&split_bio->pending)) { | ||
1398 | split_bio->bio->bi_phys_segments = 0; | ||
1399 | bio_endio(split_bio->bio, split_bio->err); | ||
1400 | kfree(split_bio); | ||
1401 | } | ||
1402 | bio_put(bio); | ||
1403 | } | ||
1226 | 1404 | ||
1227 | static int blkif_recover(struct blkfront_info *info) | 1405 | static int blkif_recover(struct blkfront_info *info) |
1228 | { | 1406 | { |
1229 | int i; | 1407 | int i; |
1230 | struct blkif_request *req; | 1408 | struct request *req, *n; |
1231 | struct blk_shadow *copy; | 1409 | struct blk_shadow *copy; |
1232 | int j; | 1410 | int rc; |
1411 | struct bio *bio, *cloned_bio; | ||
1412 | struct bio_list bio_list, merge_bio; | ||
1413 | unsigned int segs, offset; | ||
1414 | int pending, size; | ||
1415 | struct split_bio *split_bio; | ||
1416 | struct list_head requests; | ||
1233 | 1417 | ||
1234 | /* Stage 1: Make a safe copy of the shadow state. */ | 1418 | /* Stage 1: Make a safe copy of the shadow state. */ |
1235 | copy = kmemdup(info->shadow, sizeof(info->shadow), | 1419 | copy = kmemdup(info->shadow, sizeof(info->shadow), |
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info) | |||
1244 | info->shadow_free = info->ring.req_prod_pvt; | 1428 | info->shadow_free = info->ring.req_prod_pvt; |
1245 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; | 1429 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; |
1246 | 1430 | ||
1247 | /* Stage 3: Find pending requests and requeue them. */ | 1431 | rc = blkfront_setup_indirect(info); |
1432 | if (rc) { | ||
1433 | kfree(copy); | ||
1434 | return rc; | ||
1435 | } | ||
1436 | |||
1437 | segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1438 | blk_queue_max_segments(info->rq, segs); | ||
1439 | bio_list_init(&bio_list); | ||
1440 | INIT_LIST_HEAD(&requests); | ||
1248 | for (i = 0; i < BLK_RING_SIZE; i++) { | 1441 | for (i = 0; i < BLK_RING_SIZE; i++) { |
1249 | /* Not in use? */ | 1442 | /* Not in use? */ |
1250 | if (!copy[i].request) | 1443 | if (!copy[i].request) |
1251 | continue; | 1444 | continue; |
1252 | 1445 | ||
1253 | /* Grab a request slot and copy shadow state into it. */ | 1446 | /* |
1254 | req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | 1447 | * Get the bios in the request so we can re-queue them. |
1255 | *req = copy[i].req; | 1448 | */ |
1256 | 1449 | if (copy[i].request->cmd_flags & | |
1257 | /* We get a new request id, and must reset the shadow state. */ | 1450 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { |
1258 | req->u.rw.id = get_id_from_freelist(info); | 1451 | /* |
1259 | memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i])); | 1452 | * Flush operations don't contain bios, so |
1260 | 1453 | * we need to requeue the whole request | |
1261 | if (req->operation != BLKIF_OP_DISCARD) { | 1454 | */ |
1262 | /* Rewrite any grant references invalidated by susp/resume. */ | 1455 | list_add(&copy[i].request->queuelist, &requests); |
1263 | for (j = 0; j < req->u.rw.nr_segments; j++) | 1456 | continue; |
1264 | gnttab_grant_foreign_access_ref( | ||
1265 | req->u.rw.seg[j].gref, | ||
1266 | info->xbdev->otherend_id, | ||
1267 | pfn_to_mfn(copy[i].grants_used[j]->pfn), | ||
1268 | 0); | ||
1269 | } | 1457 | } |
1270 | info->shadow[req->u.rw.id].req = *req; | 1458 | merge_bio.head = copy[i].request->bio; |
1271 | 1459 | merge_bio.tail = copy[i].request->biotail; | |
1272 | info->ring.req_prod_pvt++; | 1460 | bio_list_merge(&bio_list, &merge_bio); |
1461 | copy[i].request->bio = NULL; | ||
1462 | blk_put_request(copy[i].request); | ||
1273 | } | 1463 | } |
1274 | 1464 | ||
1275 | kfree(copy); | 1465 | kfree(copy); |
1276 | 1466 | ||
1467 | /* | ||
1468 | * Empty the queue, this is important because we might have | ||
1469 | * requests in the queue with more segments than what we | ||
1470 | * can handle now. | ||
1471 | */ | ||
1472 | spin_lock_irq(&info->io_lock); | ||
1473 | while ((req = blk_fetch_request(info->rq)) != NULL) { | ||
1474 | if (req->cmd_flags & | ||
1475 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { | ||
1476 | list_add(&req->queuelist, &requests); | ||
1477 | continue; | ||
1478 | } | ||
1479 | merge_bio.head = req->bio; | ||
1480 | merge_bio.tail = req->biotail; | ||
1481 | bio_list_merge(&bio_list, &merge_bio); | ||
1482 | req->bio = NULL; | ||
1483 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
1484 | pr_alert("diskcache flush request found!\n"); | ||
1485 | __blk_put_request(info->rq, req); | ||
1486 | } | ||
1487 | spin_unlock_irq(&info->io_lock); | ||
1488 | |||
1277 | xenbus_switch_state(info->xbdev, XenbusStateConnected); | 1489 | xenbus_switch_state(info->xbdev, XenbusStateConnected); |
1278 | 1490 | ||
1279 | spin_lock_irq(&info->io_lock); | 1491 | spin_lock_irq(&info->io_lock); |
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info) | |||
1281 | /* Now safe for us to use the shared ring */ | 1493 | /* Now safe for us to use the shared ring */ |
1282 | info->connected = BLKIF_STATE_CONNECTED; | 1494 | info->connected = BLKIF_STATE_CONNECTED; |
1283 | 1495 | ||
1284 | /* Send off requeued requests */ | ||
1285 | flush_requests(info); | ||
1286 | |||
1287 | /* Kick any other new requests queued since we resumed */ | 1496 | /* Kick any other new requests queued since we resumed */ |
1288 | kick_pending_request_queues(info); | 1497 | kick_pending_request_queues(info); |
1289 | 1498 | ||
1499 | list_for_each_entry_safe(req, n, &requests, queuelist) { | ||
1500 | /* Requeue pending requests (flush or discard) */ | ||
1501 | list_del_init(&req->queuelist); | ||
1502 | BUG_ON(req->nr_phys_segments > segs); | ||
1503 | blk_requeue_request(info->rq, req); | ||
1504 | } | ||
1290 | spin_unlock_irq(&info->io_lock); | 1505 | spin_unlock_irq(&info->io_lock); |
1291 | 1506 | ||
1507 | while ((bio = bio_list_pop(&bio_list)) != NULL) { | ||
1508 | /* Traverse the list of pending bios and re-queue them */ | ||
1509 | if (bio_segments(bio) > segs) { | ||
1510 | /* | ||
1511 | * This bio has more segments than what we can | ||
1512 | * handle, we have to split it. | ||
1513 | */ | ||
1514 | pending = (bio_segments(bio) + segs - 1) / segs; | ||
1515 | split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); | ||
1516 | BUG_ON(split_bio == NULL); | ||
1517 | atomic_set(&split_bio->pending, pending); | ||
1518 | split_bio->bio = bio; | ||
1519 | for (i = 0; i < pending; i++) { | ||
1520 | offset = (i * segs * PAGE_SIZE) >> 9; | ||
1521 | size = min((unsigned int)(segs * PAGE_SIZE) >> 9, | ||
1522 | (unsigned int)(bio->bi_size >> 9) - offset); | ||
1523 | cloned_bio = bio_clone(bio, GFP_NOIO); | ||
1524 | BUG_ON(cloned_bio == NULL); | ||
1525 | trim_bio(cloned_bio, offset, size); | ||
1526 | cloned_bio->bi_private = split_bio; | ||
1527 | cloned_bio->bi_end_io = split_bio_end; | ||
1528 | submit_bio(cloned_bio->bi_rw, cloned_bio); | ||
1529 | } | ||
1530 | /* | ||
1531 | * Now we have to wait for all those smaller bios to | ||
1532 | * end, so we can also end the "parent" bio. | ||
1533 | */ | ||
1534 | continue; | ||
1535 | } | ||
1536 | /* We don't need to split this bio */ | ||
1537 | submit_bio(bio->bi_rw, bio); | ||
1538 | } | ||
1539 | |||
1292 | return 0; | 1540 | return 0; |
1293 | } | 1541 | } |
1294 | 1542 | ||
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev) | |||
1308 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); | 1556 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); |
1309 | 1557 | ||
1310 | err = talk_to_blkback(dev, info); | 1558 | err = talk_to_blkback(dev, info); |
1311 | if (info->connected == BLKIF_STATE_SUSPENDED && !err) | 1559 | |
1312 | err = blkif_recover(info); | 1560 | /* |
1561 | * We have to wait for the backend to switch to | ||
1562 | * connected state, since we want to read which | ||
1563 | * features it supports. | ||
1564 | */ | ||
1313 | 1565 | ||
1314 | return err; | 1566 | return err; |
1315 | } | 1567 | } |
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info) | |||
1387 | kfree(type); | 1639 | kfree(type); |
1388 | } | 1640 | } |
1389 | 1641 | ||
1642 | static int blkfront_setup_indirect(struct blkfront_info *info) | ||
1643 | { | ||
1644 | unsigned int indirect_segments, segs; | ||
1645 | int err, i; | ||
1646 | |||
1647 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
1648 | "feature-max-indirect-segments", "%u", &indirect_segments, | ||
1649 | NULL); | ||
1650 | if (err) { | ||
1651 | info->max_indirect_segments = 0; | ||
1652 | segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1653 | } else { | ||
1654 | info->max_indirect_segments = min(indirect_segments, | ||
1655 | xen_blkif_max_segments); | ||
1656 | segs = info->max_indirect_segments; | ||
1657 | } | ||
1658 | |||
1659 | err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); | ||
1660 | if (err) | ||
1661 | goto out_of_memory; | ||
1662 | |||
1663 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1664 | info->shadow[i].grants_used = kzalloc( | ||
1665 | sizeof(info->shadow[i].grants_used[0]) * segs, | ||
1666 | GFP_NOIO); | ||
1667 | info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); | ||
1668 | if (info->max_indirect_segments) | ||
1669 | info->shadow[i].indirect_grants = kzalloc( | ||
1670 | sizeof(info->shadow[i].indirect_grants[0]) * | ||
1671 | INDIRECT_GREFS(segs), | ||
1672 | GFP_NOIO); | ||
1673 | if ((info->shadow[i].grants_used == NULL) || | ||
1674 | (info->shadow[i].sg == NULL) || | ||
1675 | (info->max_indirect_segments && | ||
1676 | (info->shadow[i].indirect_grants == NULL))) | ||
1677 | goto out_of_memory; | ||
1678 | sg_init_table(info->shadow[i].sg, segs); | ||
1679 | } | ||
1680 | |||
1681 | |||
1682 | return 0; | ||
1683 | |||
1684 | out_of_memory: | ||
1685 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1686 | kfree(info->shadow[i].grants_used); | ||
1687 | info->shadow[i].grants_used = NULL; | ||
1688 | kfree(info->shadow[i].sg); | ||
1689 | info->shadow[i].sg = NULL; | ||
1690 | kfree(info->shadow[i].indirect_grants); | ||
1691 | info->shadow[i].indirect_grants = NULL; | ||
1692 | } | ||
1693 | return -ENOMEM; | ||
1694 | } | ||
1695 | |||
1390 | /* | 1696 | /* |
1391 | * Invoked when the backend is finally 'ready' (and has produced | 1697 | * Invoked when the backend is finally 'ready' (and has produced |
1392 | * the details about the physical device - #sectors, size, etc). | 1698 | * the details about the physical device - #sectors, size, etc). |
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1395 | { | 1701 | { |
1396 | unsigned long long sectors; | 1702 | unsigned long long sectors; |
1397 | unsigned long sector_size; | 1703 | unsigned long sector_size; |
1704 | unsigned int physical_sector_size; | ||
1398 | unsigned int binfo; | 1705 | unsigned int binfo; |
1399 | int err; | 1706 | int err; |
1400 | int barrier, flush, discard, persistent; | 1707 | int barrier, flush, discard, persistent; |
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1414 | set_capacity(info->gd, sectors); | 1721 | set_capacity(info->gd, sectors); |
1415 | revalidate_disk(info->gd); | 1722 | revalidate_disk(info->gd); |
1416 | 1723 | ||
1417 | /* fall through */ | 1724 | return; |
1418 | case BLKIF_STATE_SUSPENDED: | 1725 | case BLKIF_STATE_SUSPENDED: |
1726 | /* | ||
1727 | * If we are recovering from suspension, we need to wait | ||
1728 | * for the backend to announce its features before | ||
1729 | * reconnecting; at least we need to know if the backend | ||
1730 | * supports indirect descriptors, and how many. | ||
1731 | */ | ||
1732 | blkif_recover(info); | ||
1419 | return; | 1733 | return; |
1420 | 1734 | ||
1421 | default: | 1735 | default: |
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1437 | return; | 1751 | return; |
1438 | } | 1752 | } |
1439 | 1753 | ||
1754 | /* | ||
1755 | * physical-sector-size is a newer field, so old backends may not | ||
1756 | * provide this. Assume physical sector size to be the same as | ||
1757 | * sector_size in that case. | ||
1758 | */ | ||
1759 | err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, | ||
1760 | "physical-sector-size", "%u", &physical_sector_size); | ||
1761 | if (err != 1) | ||
1762 | physical_sector_size = sector_size; | ||
1763 | |||
1440 | info->feature_flush = 0; | 1764 | info->feature_flush = 0; |
1441 | info->flush_op = 0; | 1765 | info->flush_op = 0; |
1442 | 1766 | ||
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1483 | else | 1807 | else |
1484 | info->feature_persistent = persistent; | 1808 | info->feature_persistent = persistent; |
1485 | 1809 | ||
1486 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); | 1810 | err = blkfront_setup_indirect(info); |
1811 | if (err) { | ||
1812 | xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", | ||
1813 | info->xbdev->otherend); | ||
1814 | return; | ||
1815 | } | ||
1816 | |||
1817 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, | ||
1818 | physical_sector_size); | ||
1487 | if (err) { | 1819 | if (err) { |
1488 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | 1820 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |
1489 | info->xbdev->otherend); | 1821 | info->xbdev->otherend); |