Diffstat (limited to 'block/bio.c')
-rw-r--r-- | block/bio.c | 2038
1 file changed, 2038 insertions, 0 deletions
diff --git a/block/bio.c b/block/bio.c
new file mode 100644
index 000000000000..96d28eee8a1e
--- /dev/null
+++ b/block/bio.c
@@ -0,0 +1,2038 @@
1 | /* | ||
2 | * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. | ||
16 | * | ||
17 | */ | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/swap.h> | ||
20 | #include <linux/bio.h> | ||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/uio.h> | ||
23 | #include <linux/iocontext.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/kernel.h> | ||
27 | #include <linux/export.h> | ||
28 | #include <linux/mempool.h> | ||
29 | #include <linux/workqueue.h> | ||
30 | #include <linux/cgroup.h> | ||
31 | #include <scsi/sg.h> /* for struct sg_iovec */ | ||
32 | |||
33 | #include <trace/events/block.h> | ||
34 | |||
35 | /* | ||
36 | * Test patch to inline a certain number of bi_io_vec's inside the bio | ||
37 | * itself, to shrink a bio data allocation from two mempool calls to one | ||
38 | */ | ||
39 | #define BIO_INLINE_VECS 4 | ||
40 | |||
41 | /* | ||
42 | * if you change this list, also change bvec_alloc or things will | ||
43 | * break badly! cannot be bigger than what you can fit into an | ||
44 | * unsigned short | ||
45 | */ | ||
46 | #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } | ||
47 | static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { | ||
48 | BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), | ||
49 | }; | ||
50 | #undef BV | ||
51 | |||
52 | /* | ||
53 | * fs_bio_set is the bio_set containing bio and iovec memory pools used by | ||
54 | * IO code that does not need private memory pools. | ||
55 | */ | ||
56 | struct bio_set *fs_bio_set; | ||
57 | EXPORT_SYMBOL(fs_bio_set); | ||
58 | |||
59 | /* | ||
60 | * Our slab pool management | ||
61 | */ | ||
62 | struct bio_slab { | ||
63 | struct kmem_cache *slab; | ||
64 | unsigned int slab_ref; | ||
65 | unsigned int slab_size; | ||
66 | char name[8]; | ||
67 | }; | ||
68 | static DEFINE_MUTEX(bio_slab_lock); | ||
69 | static struct bio_slab *bio_slabs; | ||
70 | static unsigned int bio_slab_nr, bio_slab_max; | ||
71 | |||
72 | static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) | ||
73 | { | ||
74 | unsigned int sz = sizeof(struct bio) + extra_size; | ||
75 | struct kmem_cache *slab = NULL; | ||
76 | struct bio_slab *bslab, *new_bio_slabs; | ||
77 | unsigned int new_bio_slab_max; | ||
78 | unsigned int i, entry = -1; | ||
79 | |||
80 | mutex_lock(&bio_slab_lock); | ||
81 | |||
82 | i = 0; | ||
83 | while (i < bio_slab_nr) { | ||
84 | bslab = &bio_slabs[i]; | ||
85 | |||
86 | if (!bslab->slab && entry == -1) | ||
87 | entry = i; | ||
88 | else if (bslab->slab_size == sz) { | ||
89 | slab = bslab->slab; | ||
90 | bslab->slab_ref++; | ||
91 | break; | ||
92 | } | ||
93 | i++; | ||
94 | } | ||
95 | |||
96 | if (slab) | ||
97 | goto out_unlock; | ||
98 | |||
99 | if (bio_slab_nr == bio_slab_max && entry == -1) { | ||
100 | new_bio_slab_max = bio_slab_max << 1; | ||
101 | new_bio_slabs = krealloc(bio_slabs, | ||
102 | new_bio_slab_max * sizeof(struct bio_slab), | ||
103 | GFP_KERNEL); | ||
104 | if (!new_bio_slabs) | ||
105 | goto out_unlock; | ||
106 | bio_slab_max = new_bio_slab_max; | ||
107 | bio_slabs = new_bio_slabs; | ||
108 | } | ||
109 | if (entry == -1) | ||
110 | entry = bio_slab_nr++; | ||
111 | |||
112 | bslab = &bio_slabs[entry]; | ||
113 | |||
114 | snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); | ||
115 | slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); | ||
116 | if (!slab) | ||
117 | goto out_unlock; | ||
118 | |||
119 | bslab->slab = slab; | ||
120 | bslab->slab_ref = 1; | ||
121 | bslab->slab_size = sz; | ||
122 | out_unlock: | ||
123 | mutex_unlock(&bio_slab_lock); | ||
124 | return slab; | ||
125 | } | ||
126 | |||
127 | static void bio_put_slab(struct bio_set *bs) | ||
128 | { | ||
129 | struct bio_slab *bslab = NULL; | ||
130 | unsigned int i; | ||
131 | |||
132 | mutex_lock(&bio_slab_lock); | ||
133 | |||
134 | for (i = 0; i < bio_slab_nr; i++) { | ||
135 | if (bs->bio_slab == bio_slabs[i].slab) { | ||
136 | bslab = &bio_slabs[i]; | ||
137 | break; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) | ||
142 | goto out; | ||
143 | |||
144 | WARN_ON(!bslab->slab_ref); | ||
145 | |||
146 | if (--bslab->slab_ref) | ||
147 | goto out; | ||
148 | |||
149 | kmem_cache_destroy(bslab->slab); | ||
150 | bslab->slab = NULL; | ||
151 | |||
152 | out: | ||
153 | mutex_unlock(&bio_slab_lock); | ||
154 | } | ||
155 | |||
156 | unsigned int bvec_nr_vecs(unsigned short idx) | ||
157 | { | ||
158 | return bvec_slabs[idx].nr_vecs; | ||
159 | } | ||
160 | |||
161 | void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) | ||
162 | { | ||
163 | BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); | ||
164 | |||
165 | if (idx == BIOVEC_MAX_IDX) | ||
166 | mempool_free(bv, pool); | ||
167 | else { | ||
168 | struct biovec_slab *bvs = bvec_slabs + idx; | ||
169 | |||
170 | kmem_cache_free(bvs->slab, bv); | ||
171 | } | ||
172 | } | ||
173 | |||
174 | struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, | ||
175 | mempool_t *pool) | ||
176 | { | ||
177 | struct bio_vec *bvl; | ||
178 | |||
179 | /* | ||
180 | * see comment near bvec_array define! | ||
181 | */ | ||
182 | switch (nr) { | ||
183 | case 1: | ||
184 | *idx = 0; | ||
185 | break; | ||
186 | case 2 ... 4: | ||
187 | *idx = 1; | ||
188 | break; | ||
189 | case 5 ... 16: | ||
190 | *idx = 2; | ||
191 | break; | ||
192 | case 17 ... 64: | ||
193 | *idx = 3; | ||
194 | break; | ||
195 | case 65 ... 128: | ||
196 | *idx = 4; | ||
197 | break; | ||
198 | case 129 ... BIO_MAX_PAGES: | ||
199 | *idx = 5; | ||
200 | break; | ||
201 | default: | ||
202 | return NULL; | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * idx now points to the pool we want to allocate from. only the | ||
207 | * 1-vec entry pool is mempool backed. | ||
208 | */ | ||
209 | if (*idx == BIOVEC_MAX_IDX) { | ||
210 | fallback: | ||
211 | bvl = mempool_alloc(pool, gfp_mask); | ||
212 | } else { | ||
213 | struct biovec_slab *bvs = bvec_slabs + *idx; | ||
214 | gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); | ||
215 | |||
216 | /* | ||
217 | * Make this allocation restricted and don't dump info on | ||
218 | * allocation failures, since we'll fallback to the mempool | ||
219 | * in case of failure. | ||
220 | */ | ||
221 | __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; | ||
222 | |||
223 | /* | ||
224 | * Try a slab allocation. If this fails and __GFP_WAIT | ||
225 | * is set, retry with the 1-entry mempool | ||
226 | */ | ||
227 | bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); | ||
228 | if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { | ||
229 | *idx = BIOVEC_MAX_IDX; | ||
230 | goto fallback; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | return bvl; | ||
235 | } | ||
236 | |||
237 | static void __bio_free(struct bio *bio) | ||
238 | { | ||
239 | bio_disassociate_task(bio); | ||
240 | |||
241 | if (bio_integrity(bio)) | ||
242 | bio_integrity_free(bio); | ||
243 | } | ||
244 | |||
245 | static void bio_free(struct bio *bio) | ||
246 | { | ||
247 | struct bio_set *bs = bio->bi_pool; | ||
248 | void *p; | ||
249 | |||
250 | __bio_free(bio); | ||
251 | |||
252 | if (bs) { | ||
253 | if (bio_flagged(bio, BIO_OWNS_VEC)) | ||
254 | bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); | ||
255 | |||
256 | /* | ||
257 | * If we have front padding, adjust the bio pointer before freeing | ||
258 | */ | ||
259 | p = bio; | ||
260 | p -= bs->front_pad; | ||
261 | |||
262 | mempool_free(p, bs->bio_pool); | ||
263 | } else { | ||
264 | /* Bio was allocated by bio_kmalloc() */ | ||
265 | kfree(bio); | ||
266 | } | ||
267 | } | ||
268 | |||
269 | void bio_init(struct bio *bio) | ||
270 | { | ||
271 | memset(bio, 0, sizeof(*bio)); | ||
272 | bio->bi_flags = 1 << BIO_UPTODATE; | ||
273 | atomic_set(&bio->bi_remaining, 1); | ||
274 | atomic_set(&bio->bi_cnt, 1); | ||
275 | } | ||
276 | EXPORT_SYMBOL(bio_init); | ||
277 | |||
278 | /** | ||
279 | * bio_reset - reinitialize a bio | ||
280 | * @bio: bio to reset | ||
281 | * | ||
282 | * Description: | ||
283 | * After calling bio_reset(), @bio will be in the same state as a freshly | ||
284 | * allocated bio returned by bio_alloc_bioset() - the only fields that are | ||
285 | * preserved are the ones that are initialized by bio_alloc_bioset(). See | ||
286 | * comment in struct bio. | ||
287 | */ | ||
288 | void bio_reset(struct bio *bio) | ||
289 | { | ||
290 | unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); | ||
291 | |||
292 | __bio_free(bio); | ||
293 | |||
294 | memset(bio, 0, BIO_RESET_BYTES); | ||
295 | bio->bi_flags = flags|(1 << BIO_UPTODATE); | ||
296 | atomic_set(&bio->bi_remaining, 1); | ||
297 | } | ||
298 | EXPORT_SYMBOL(bio_reset); | ||
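/*
 * Illustrative sketch (my_* names are hypothetical): a caller that
 * owns a long-lived bio can reset and rebuild it instead of
 * reallocating. Only the fields set up by bio_alloc_bioset() survive
 * the reset; everything else must be redone by hand.
 */
static void my_reinit_bio(struct bio *bio, struct block_device *bdev,
			  sector_t sector)
{
	bio_reset(bio);			/* back to freshly-allocated state */
	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	/* caller re-adds pages and sets bi_end_io before resubmitting */
}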
299 | |||
300 | static void bio_chain_endio(struct bio *bio, int error) | ||
301 | { | ||
302 | bio_endio(bio->bi_private, error); | ||
303 | bio_put(bio); | ||
304 | } | ||
305 | |||
306 | /** | ||
307 | * bio_chain - chain bio completions | ||
308 | * @bio: the target bio | ||
309 | * @parent: the @bio's parent bio | ||
310 | * | ||
311 | * The caller won't have a bi_end_io called when @bio completes - instead, | ||
312 | * @parent's bi_end_io won't be called until both @parent and @bio have | ||
313 | * completed; the chained bio will also be freed when it completes. | ||
314 | * | ||
315 | * The caller must not set bi_private or bi_end_io in @bio. | ||
316 | */ | ||
317 | void bio_chain(struct bio *bio, struct bio *parent) | ||
318 | { | ||
319 | BUG_ON(bio->bi_private || bio->bi_end_io); | ||
320 | |||
321 | bio->bi_private = parent; | ||
322 | bio->bi_end_io = bio_chain_endio; | ||
323 | atomic_inc(&parent->bi_remaining); | ||
324 | } | ||
325 | EXPORT_SYMBOL(bio_chain); | ||
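/*
 * Illustrative sketch (hypothetical helper; assumes @split was cloned
 * from @parent and both are fully set up): chaining makes @parent's
 * bi_end_io fire only once both halves have completed.
 */
static void my_submit_split(struct bio *parent, struct bio *split)
{
	bio_chain(split, parent);	/* split now completes into parent */
	generic_make_request(split);
	generic_make_request(parent);
}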
326 | |||
327 | static void bio_alloc_rescue(struct work_struct *work) | ||
328 | { | ||
329 | struct bio_set *bs = container_of(work, struct bio_set, rescue_work); | ||
330 | struct bio *bio; | ||
331 | |||
332 | while (1) { | ||
333 | spin_lock(&bs->rescue_lock); | ||
334 | bio = bio_list_pop(&bs->rescue_list); | ||
335 | spin_unlock(&bs->rescue_lock); | ||
336 | |||
337 | if (!bio) | ||
338 | break; | ||
339 | |||
340 | generic_make_request(bio); | ||
341 | } | ||
342 | } | ||
343 | |||
344 | static void punt_bios_to_rescuer(struct bio_set *bs) | ||
345 | { | ||
346 | struct bio_list punt, nopunt; | ||
347 | struct bio *bio; | ||
348 | |||
349 | /* | ||
350 | * In order to guarantee forward progress we must punt only bios that | ||
351 | * were allocated from this bio_set; otherwise, if there was a bio on | ||
352 | * there for a stacking driver higher up in the stack, processing it | ||
353 | * could require allocating bios from this bio_set, and doing that from | ||
354 | * our own rescuer would be bad. | ||
355 | * | ||
356 | * Since bio lists are singly linked, pop them all instead of trying to | ||
357 | * remove from the middle of the list: | ||
358 | */ | ||
359 | |||
360 | bio_list_init(&punt); | ||
361 | bio_list_init(&nopunt); | ||
362 | |||
363 | while ((bio = bio_list_pop(current->bio_list))) | ||
364 | bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); | ||
365 | |||
366 | *current->bio_list = nopunt; | ||
367 | |||
368 | spin_lock(&bs->rescue_lock); | ||
369 | bio_list_merge(&bs->rescue_list, &punt); | ||
370 | spin_unlock(&bs->rescue_lock); | ||
371 | |||
372 | queue_work(bs->rescue_workqueue, &bs->rescue_work); | ||
373 | } | ||
374 | |||
375 | /** | ||
376 | * bio_alloc_bioset - allocate a bio for I/O | ||
377 | * @gfp_mask: the GFP_ mask given to the slab allocator | ||
378 | * @nr_iovecs: number of iovecs to pre-allocate | ||
379 | * @bs: the bio_set to allocate from. | ||
380 | * | ||
381 | * Description: | ||
382 | * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is | ||
383 | * backed by the @bs's mempool. | ||
384 | * | ||
385 | * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be | ||
386 | * able to allocate a bio. This is due to the mempool guarantees. To make this | ||
387 | * work, callers must never allocate more than 1 bio at a time from this pool. | ||
388 | * Callers that need to allocate more than 1 bio must always submit the | ||
389 | * previously allocated bio for IO before attempting to allocate a new one. | ||
390 | * Failure to do so can cause deadlocks under memory pressure. | ||
391 | * | ||
392 | * Note that when running under generic_make_request() (i.e. any block | ||
393 | * driver), bios are not submitted until after you return - see the code in | ||
394 | * generic_make_request() that converts recursion into iteration, to prevent | ||
395 | * stack overflows. | ||
396 | * | ||
397 | * This would normally mean allocating multiple bios under | ||
398 | * generic_make_request() would be susceptible to deadlocks, but we have | ||
399 | * deadlock avoidance code that resubmits any blocked bios from a rescuer | ||
400 | * thread. | ||
401 | * | ||
402 | * However, we do not guarantee forward progress for allocations from other | ||
403 | * mempools. Doing multiple allocations from the same mempool under | ||
404 | * generic_make_request() should be avoided - instead, use bio_set's front_pad | ||
405 | * for per bio allocations. | ||
406 | * | ||
407 | * RETURNS: | ||
408 | * Pointer to new bio on success, NULL on failure. | ||
409 | */ | ||
410 | struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) | ||
411 | { | ||
412 | gfp_t saved_gfp = gfp_mask; | ||
413 | unsigned front_pad; | ||
414 | unsigned inline_vecs; | ||
415 | unsigned long idx = BIO_POOL_NONE; | ||
416 | struct bio_vec *bvl = NULL; | ||
417 | struct bio *bio; | ||
418 | void *p; | ||
419 | |||
420 | if (!bs) { | ||
421 | if (nr_iovecs > UIO_MAXIOV) | ||
422 | return NULL; | ||
423 | |||
424 | p = kmalloc(sizeof(struct bio) + | ||
425 | nr_iovecs * sizeof(struct bio_vec), | ||
426 | gfp_mask); | ||
427 | front_pad = 0; | ||
428 | inline_vecs = nr_iovecs; | ||
429 | } else { | ||
430 | /* | ||
431 | * generic_make_request() converts recursion to iteration; this | ||
432 | * means if we're running beneath it, any bios we allocate and | ||
433 | * submit will not be submitted (and thus freed) until after we | ||
434 | * return. | ||
435 | * | ||
436 | * This exposes us to a potential deadlock if we allocate | ||
437 | * multiple bios from the same bio_set while running | ||
438 | * underneath generic_make_request(). If we were to allocate | ||
439 | * multiple bios (say a stacking block driver that was splitting | ||
440 | * bios), we would deadlock if we exhausted the mempool's | ||
441 | * reserve. | ||
442 | * | ||
443 | * We solve this, and guarantee forward progress, with a rescuer | ||
444 | * workqueue per bio_set. If we go to allocate and there are | ||
445 | * bios on current->bio_list, we first try the allocation | ||
446 | * without __GFP_WAIT; if that fails, we punt those bios we | ||
447 | * would be blocking to the rescuer workqueue before we retry | ||
448 | * with the original gfp_flags. | ||
449 | */ | ||
450 | |||
451 | if (current->bio_list && !bio_list_empty(current->bio_list)) | ||
452 | gfp_mask &= ~__GFP_WAIT; | ||
453 | |||
454 | p = mempool_alloc(bs->bio_pool, gfp_mask); | ||
455 | if (!p && gfp_mask != saved_gfp) { | ||
456 | punt_bios_to_rescuer(bs); | ||
457 | gfp_mask = saved_gfp; | ||
458 | p = mempool_alloc(bs->bio_pool, gfp_mask); | ||
459 | } | ||
460 | |||
461 | front_pad = bs->front_pad; | ||
462 | inline_vecs = BIO_INLINE_VECS; | ||
463 | } | ||
464 | |||
465 | if (unlikely(!p)) | ||
466 | return NULL; | ||
467 | |||
468 | bio = p + front_pad; | ||
469 | bio_init(bio); | ||
470 | |||
471 | if (nr_iovecs > inline_vecs) { | ||
472 | bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); | ||
473 | if (!bvl && gfp_mask != saved_gfp) { | ||
474 | punt_bios_to_rescuer(bs); | ||
475 | gfp_mask = saved_gfp; | ||
476 | bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); | ||
477 | } | ||
478 | |||
479 | if (unlikely(!bvl)) | ||
480 | goto err_free; | ||
481 | |||
482 | bio->bi_flags |= 1 << BIO_OWNS_VEC; | ||
483 | } else if (nr_iovecs) { | ||
484 | bvl = bio->bi_inline_vecs; | ||
485 | } | ||
486 | |||
487 | bio->bi_pool = bs; | ||
488 | bio->bi_flags |= idx << BIO_POOL_OFFSET; | ||
489 | bio->bi_max_vecs = nr_iovecs; | ||
490 | bio->bi_io_vec = bvl; | ||
491 | return bio; | ||
492 | |||
493 | err_free: | ||
494 | mempool_free(p, bs->bio_pool); | ||
495 | return NULL; | ||
496 | } | ||
497 | EXPORT_SYMBOL(bio_alloc_bioset); | ||
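/*
 * Illustrative usage sketch (my_* names are hypothetical): a driver
 * with a private bio_set, obeying the one-bio-in-flight rule above.
 */
static struct bio_set *my_bs;	/* set up once with bioset_create(4, 0) */

static void my_write_page(struct block_device *bdev, struct page *page,
			  sector_t sector)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, my_bs);

	/* GFP_NOIO includes __GFP_WAIT, so the mempool cannot fail here */
	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(WRITE, bio);	/* submit before allocating the next bio */
}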
498 | |||
499 | void zero_fill_bio(struct bio *bio) | ||
500 | { | ||
501 | unsigned long flags; | ||
502 | struct bio_vec bv; | ||
503 | struct bvec_iter iter; | ||
504 | |||
505 | bio_for_each_segment(bv, bio, iter) { | ||
506 | char *data = bvec_kmap_irq(&bv, &flags); | ||
507 | memset(data, 0, bv.bv_len); | ||
508 | flush_dcache_page(bv.bv_page); | ||
509 | bvec_kunmap_irq(data, &flags); | ||
510 | } | ||
511 | } | ||
512 | EXPORT_SYMBOL(zero_fill_bio); | ||
513 | |||
514 | /** | ||
515 | * bio_put - release a reference to a bio | ||
516 | * @bio: bio to release reference to | ||
517 | * | ||
518 | * Description: | ||
519 | * Put a reference to a &struct bio, either one you have gotten with | ||
520 | * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. | ||
521 | **/ | ||
522 | void bio_put(struct bio *bio) | ||
523 | { | ||
524 | BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); | ||
525 | |||
526 | /* | ||
527 | * last put frees it | ||
528 | */ | ||
529 | if (atomic_dec_and_test(&bio->bi_cnt)) | ||
530 | bio_free(bio); | ||
531 | } | ||
532 | EXPORT_SYMBOL(bio_put); | ||
533 | |||
534 | inline int bio_phys_segments(struct request_queue *q, struct bio *bio) | ||
535 | { | ||
536 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | ||
537 | blk_recount_segments(q, bio); | ||
538 | |||
539 | return bio->bi_phys_segments; | ||
540 | } | ||
541 | EXPORT_SYMBOL(bio_phys_segments); | ||
542 | |||
543 | /** | ||
544 | * __bio_clone_fast - clone a bio that shares the original bio's biovec | ||
545 | * @bio: destination bio | ||
546 | * @bio_src: bio to clone | ||
547 | * | ||
548 | * Clone a &bio. Caller will own the returned bio, but not | ||
549 | * the actual data it points to. Reference count of returned | ||
550 | * bio will be one. | ||
551 | * | ||
552 | * Caller must ensure that @bio_src is not freed before @bio. | ||
553 | */ | ||
554 | void __bio_clone_fast(struct bio *bio, struct bio *bio_src) | ||
555 | { | ||
556 | BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); | ||
557 | |||
558 | /* | ||
559 | * most users will be overriding ->bi_bdev with a new target, | ||
560 | * so we don't set nor calculate new physical/hw segment counts here | ||
561 | */ | ||
562 | bio->bi_bdev = bio_src->bi_bdev; | ||
563 | bio->bi_flags |= 1 << BIO_CLONED; | ||
564 | bio->bi_rw = bio_src->bi_rw; | ||
565 | bio->bi_iter = bio_src->bi_iter; | ||
566 | bio->bi_io_vec = bio_src->bi_io_vec; | ||
567 | } | ||
568 | EXPORT_SYMBOL(__bio_clone_fast); | ||
569 | |||
570 | /** | ||
571 | * bio_clone_fast - clone a bio that shares the original bio's biovec | ||
572 | * @bio: bio to clone | ||
573 | * @gfp_mask: allocation priority | ||
574 | * @bs: bio_set to allocate from | ||
575 | * | ||
576 | * Like __bio_clone_fast, only also allocates the returned bio | ||
577 | */ | ||
578 | struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) | ||
579 | { | ||
580 | struct bio *b; | ||
581 | |||
582 | b = bio_alloc_bioset(gfp_mask, 0, bs); | ||
583 | if (!b) | ||
584 | return NULL; | ||
585 | |||
586 | __bio_clone_fast(b, bio); | ||
587 | |||
588 | if (bio_integrity(bio)) { | ||
589 | int ret; | ||
590 | |||
591 | ret = bio_integrity_clone(b, bio, gfp_mask); | ||
592 | |||
593 | if (ret < 0) { | ||
594 | bio_put(b); | ||
595 | return NULL; | ||
596 | } | ||
597 | } | ||
598 | |||
599 | return b; | ||
600 | } | ||
601 | EXPORT_SYMBOL(bio_clone_fast); | ||
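/*
 * Illustrative sketch of the usual stacking-driver pattern (my_*
 * names are hypothetical): the clone shares @bio's biovec, so only
 * the target device, sector and completion are changed.
 */
static void my_clone_endio(struct bio *clone, int error)
{
	struct bio *orig = clone->bi_private;

	bio_put(clone);
	bio_endio(orig, error);		/* propagate completion upward */
}

static void my_remap(struct bio *bio, struct block_device *backing,
		     sector_t offset, struct bio_set *bs)
{
	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);

	clone->bi_bdev = backing;
	clone->bi_iter.bi_sector += offset;
	clone->bi_end_io = my_clone_endio;
	clone->bi_private = bio;
	generic_make_request(clone);
}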
602 | |||
603 | /** | ||
604 | * bio_clone_bioset - clone a bio | ||
605 | * @bio_src: bio to clone | ||
606 | * @gfp_mask: allocation priority | ||
607 | * @bs: bio_set to allocate from | ||
608 | * | ||
609 | * Clone bio. Caller will own the returned bio, but not the actual data it | ||
610 | * points to. Reference count of returned bio will be one. | ||
611 | */ | ||
612 | struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | ||
613 | struct bio_set *bs) | ||
614 | { | ||
615 | struct bvec_iter iter; | ||
616 | struct bio_vec bv; | ||
617 | struct bio *bio; | ||
618 | |||
619 | /* | ||
620 | * Pre immutable biovecs, __bio_clone() used to just do a memcpy from | ||
621 | * bio_src->bi_io_vec to bio->bi_io_vec. | ||
622 | * | ||
623 | * We can't do that anymore, because: | ||
624 | * | ||
625 | * - The point of cloning the biovec is to produce a bio with a biovec | ||
626 | * the caller can modify: bi_idx and bi_bvec_done should be 0. | ||
627 | * | ||
628 | * - The original bio could've had more than BIO_MAX_PAGES biovecs; if | ||
629 | * we tried to clone the whole thing bio_alloc_bioset() would fail. | ||
630 | * But the clone should succeed as long as the number of biovecs we | ||
631 | * actually need to allocate is fewer than BIO_MAX_PAGES. | ||
632 | * | ||
633 | * - Lastly, bi_vcnt should not be looked at or relied upon by code | ||
634 | * that does not own the bio - reason being drivers don't use it for | ||
635 | * iterating over the biovec anymore, so expecting it to be kept up | ||
636 | * to date (i.e. for clones that share the parent biovec) is just | ||
637 | * asking for trouble and would force extra work on | ||
638 | * __bio_clone_fast() anyways. | ||
639 | */ | ||
640 | |||
641 | bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); | ||
642 | if (!bio) | ||
643 | return NULL; | ||
644 | |||
645 | bio->bi_bdev = bio_src->bi_bdev; | ||
646 | bio->bi_rw = bio_src->bi_rw; | ||
647 | bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; | ||
648 | bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; | ||
649 | |||
650 | if (bio->bi_rw & REQ_DISCARD) | ||
651 | goto integrity_clone; | ||
652 | |||
653 | if (bio->bi_rw & REQ_WRITE_SAME) { | ||
654 | bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; | ||
655 | goto integrity_clone; | ||
656 | } | ||
657 | |||
658 | bio_for_each_segment(bv, bio_src, iter) | ||
659 | bio->bi_io_vec[bio->bi_vcnt++] = bv; | ||
660 | |||
661 | integrity_clone: | ||
662 | if (bio_integrity(bio_src)) { | ||
663 | int ret; | ||
664 | |||
665 | ret = bio_integrity_clone(bio, bio_src, gfp_mask); | ||
666 | if (ret < 0) { | ||
667 | bio_put(bio); | ||
668 | return NULL; | ||
669 | } | ||
670 | } | ||
671 | |||
672 | return bio; | ||
673 | } | ||
674 | EXPORT_SYMBOL(bio_clone_bioset); | ||
675 | |||
676 | /** | ||
677 | * bio_get_nr_vecs - return approx number of vecs | ||
678 | * @bdev: I/O target | ||
679 | * | ||
680 | * Return the approximate number of pages we can send to this target. | ||
681 | * There's no guarantee that you will be able to fit this number of pages | ||
682 | * into a bio, it does not account for dynamic restrictions that vary | ||
683 | * on offset. | ||
684 | */ | ||
685 | int bio_get_nr_vecs(struct block_device *bdev) | ||
686 | { | ||
687 | struct request_queue *q = bdev_get_queue(bdev); | ||
688 | int nr_pages; | ||
689 | |||
690 | nr_pages = min_t(unsigned, | ||
691 | queue_max_segments(q), | ||
692 | queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); | ||
693 | |||
694 | return min_t(unsigned, nr_pages, BIO_MAX_PAGES); | ||
695 | |||
696 | } | ||
697 | EXPORT_SYMBOL(bio_get_nr_vecs); | ||
698 | |||
699 | static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page | ||
700 | *page, unsigned int len, unsigned int offset, | ||
701 | unsigned int max_sectors) | ||
702 | { | ||
703 | int retried_segments = 0; | ||
704 | struct bio_vec *bvec; | ||
705 | |||
706 | /* | ||
707 | * cloned bio must not modify vec list | ||
708 | */ | ||
709 | if (unlikely(bio_flagged(bio, BIO_CLONED))) | ||
710 | return 0; | ||
711 | |||
712 | if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) | ||
713 | return 0; | ||
714 | |||
715 | /* | ||
716 | * For filesystems with a blocksize smaller than the pagesize | ||
717 | * we will often be called with the same page as last time and | ||
718 | * a consecutive offset. Optimize this special case. | ||
719 | */ | ||
720 | if (bio->bi_vcnt > 0) { | ||
721 | struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; | ||
722 | |||
723 | if (page == prev->bv_page && | ||
724 | offset == prev->bv_offset + prev->bv_len) { | ||
725 | unsigned int prev_bv_len = prev->bv_len; | ||
726 | prev->bv_len += len; | ||
727 | |||
728 | if (q->merge_bvec_fn) { | ||
729 | struct bvec_merge_data bvm = { | ||
730 | /* prev_bvec is already charged in | ||
731 | bi_size, discharge it in order to | ||
732 | simulate merging updated prev_bvec | ||
733 | as new bvec. */ | ||
734 | .bi_bdev = bio->bi_bdev, | ||
735 | .bi_sector = bio->bi_iter.bi_sector, | ||
736 | .bi_size = bio->bi_iter.bi_size - | ||
737 | prev_bv_len, | ||
738 | .bi_rw = bio->bi_rw, | ||
739 | }; | ||
740 | |||
741 | if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { | ||
742 | prev->bv_len -= len; | ||
743 | return 0; | ||
744 | } | ||
745 | } | ||
746 | |||
747 | goto done; | ||
748 | } | ||
749 | } | ||
750 | |||
751 | if (bio->bi_vcnt >= bio->bi_max_vecs) | ||
752 | return 0; | ||
753 | |||
754 | /* | ||
755 | * we might lose a segment or two here, but rather that than | ||
756 | * make this too complex. | ||
757 | */ | ||
758 | |||
759 | while (bio->bi_phys_segments >= queue_max_segments(q)) { | ||
760 | |||
761 | if (retried_segments) | ||
762 | return 0; | ||
763 | |||
764 | retried_segments = 1; | ||
765 | blk_recount_segments(q, bio); | ||
766 | } | ||
767 | |||
768 | /* | ||
769 | * setup the new entry, we might clear it again later if we | ||
770 | * cannot add the page | ||
771 | */ | ||
772 | bvec = &bio->bi_io_vec[bio->bi_vcnt]; | ||
773 | bvec->bv_page = page; | ||
774 | bvec->bv_len = len; | ||
775 | bvec->bv_offset = offset; | ||
776 | |||
777 | /* | ||
778 | * if queue has other restrictions (eg varying max sector size | ||
779 | * depending on offset), it can specify a merge_bvec_fn in the | ||
780 | * queue to get further control | ||
781 | */ | ||
782 | if (q->merge_bvec_fn) { | ||
783 | struct bvec_merge_data bvm = { | ||
784 | .bi_bdev = bio->bi_bdev, | ||
785 | .bi_sector = bio->bi_iter.bi_sector, | ||
786 | .bi_size = bio->bi_iter.bi_size, | ||
787 | .bi_rw = bio->bi_rw, | ||
788 | }; | ||
789 | |||
790 | /* | ||
791 | * merge_bvec_fn() returns number of bytes it can accept | ||
792 | * at this offset | ||
793 | */ | ||
794 | if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { | ||
795 | bvec->bv_page = NULL; | ||
796 | bvec->bv_len = 0; | ||
797 | bvec->bv_offset = 0; | ||
798 | return 0; | ||
799 | } | ||
800 | } | ||
801 | |||
802 | /* If we may be able to merge these biovecs, force a recount */ | ||
803 | if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) | ||
804 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
805 | |||
806 | bio->bi_vcnt++; | ||
807 | bio->bi_phys_segments++; | ||
808 | done: | ||
809 | bio->bi_iter.bi_size += len; | ||
810 | return len; | ||
811 | } | ||
812 | |||
813 | /** | ||
814 | * bio_add_pc_page - attempt to add page to bio | ||
815 | * @q: the target queue | ||
816 | * @bio: destination bio | ||
817 | * @page: page to add | ||
818 | * @len: vec entry length | ||
819 | * @offset: vec entry offset | ||
820 | * | ||
821 | * Attempt to add a page to the bio_vec maplist. This can fail for a | ||
822 | * number of reasons, such as the bio being full or target block device | ||
823 | * limitations. The target block device must allow bio's up to PAGE_SIZE, | ||
824 | * so it is always possible to add a single page to an empty bio. | ||
825 | * | ||
826 | * This should only be used by REQ_PC bios. | ||
827 | */ | ||
828 | int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, | ||
829 | unsigned int len, unsigned int offset) | ||
830 | { | ||
831 | return __bio_add_page(q, bio, page, len, offset, | ||
832 | queue_max_hw_sectors(q)); | ||
833 | } | ||
834 | EXPORT_SYMBOL(bio_add_pc_page); | ||
835 | |||
836 | /** | ||
837 | * bio_add_page - attempt to add page to bio | ||
838 | * @bio: destination bio | ||
839 | * @page: page to add | ||
840 | * @len: vec entry length | ||
841 | * @offset: vec entry offset | ||
842 | * | ||
843 | * Attempt to add a page to the bio_vec maplist. This can fail for a | ||
844 | * number of reasons, such as the bio being full or target block device | ||
845 | * limitations. The target block device must allow bio's up to PAGE_SIZE, | ||
846 | * so it is always possible to add a single page to an empty bio. | ||
847 | */ | ||
848 | int bio_add_page(struct bio *bio, struct page *page, unsigned int len, | ||
849 | unsigned int offset) | ||
850 | { | ||
851 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | ||
852 | return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); | ||
853 | } | ||
854 | EXPORT_SYMBOL(bio_add_page); | ||
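/*
 * Illustrative sketch (hypothetical helper): since bio_add_page()
 * returns the number of bytes actually added, callers loop and stop
 * on a short add (bio full or a queue limit was hit).
 */
static int my_fill_bio(struct bio *bio, struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
			break;	/* submit this bio, put the rest in a new one */
	return i;		/* pages actually added */
}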
855 | |||
856 | struct submit_bio_ret { | ||
857 | struct completion event; | ||
858 | int error; | ||
859 | }; | ||
860 | |||
861 | static void submit_bio_wait_endio(struct bio *bio, int error) | ||
862 | { | ||
863 | struct submit_bio_ret *ret = bio->bi_private; | ||
864 | |||
865 | ret->error = error; | ||
866 | complete(&ret->event); | ||
867 | } | ||
868 | |||
869 | /** | ||
870 | * submit_bio_wait - submit a bio, and wait until it completes | ||
871 | * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) | ||
872 | * @bio: The &struct bio which describes the I/O | ||
873 | * | ||
874 | * Simple wrapper around submit_bio(). Returns 0 on success, or the error from | ||
875 | * bio_endio() on failure. | ||
876 | */ | ||
877 | int submit_bio_wait(int rw, struct bio *bio) | ||
878 | { | ||
879 | struct submit_bio_ret ret; | ||
880 | |||
881 | rw |= REQ_SYNC; | ||
882 | init_completion(&ret.event); | ||
883 | bio->bi_private = &ret; | ||
884 | bio->bi_end_io = submit_bio_wait_endio; | ||
885 | submit_bio(rw, bio); | ||
886 | wait_for_completion(&ret.event); | ||
887 | |||
888 | return ret.error; | ||
889 | } | ||
890 | EXPORT_SYMBOL(submit_bio_wait); | ||
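/*
 * Illustrative sketch (hypothetical helper): a synchronous one-page
 * read built on submit_bio_wait().
 */
static int my_read_page_sync(struct block_device *bdev, sector_t sector,
			     struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);	/* mempool backed */
	int ret;

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	ret = submit_bio_wait(READ, bio);
	bio_put(bio);
	return ret;
}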
891 | |||
892 | /** | ||
893 | * bio_advance - increment/complete a bio by some number of bytes | ||
894 | * @bio: bio to advance | ||
895 | * @bytes: number of bytes to complete | ||
896 | * | ||
897 | * This updates bi_sector, bi_size and bi_idx; if the number of bytes to | ||
898 | * complete doesn't align with a bvec boundary, then bv_len and bv_offset will | ||
899 | * be updated on the last bvec as well. | ||
900 | * | ||
901 | * @bio will then represent the remaining, uncompleted portion of the io. | ||
902 | */ | ||
903 | void bio_advance(struct bio *bio, unsigned bytes) | ||
904 | { | ||
905 | if (bio_integrity(bio)) | ||
906 | bio_integrity_advance(bio, bytes); | ||
907 | |||
908 | bio_advance_iter(bio, &bio->bi_iter, bytes); | ||
909 | } | ||
910 | EXPORT_SYMBOL(bio_advance); | ||
911 | |||
912 | /** | ||
913 | * bio_alloc_pages - allocates a single page for each bvec in a bio | ||
914 | * @bio: bio to allocate pages for | ||
915 | * @gfp_mask: flags for allocation | ||
916 | * | ||
917 | * Allocates pages up to @bio->bi_vcnt. | ||
918 | * | ||
919 | * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are | ||
920 | * freed. | ||
921 | */ | ||
922 | int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) | ||
923 | { | ||
924 | int i; | ||
925 | struct bio_vec *bv; | ||
926 | |||
927 | bio_for_each_segment_all(bv, bio, i) { | ||
928 | bv->bv_page = alloc_page(gfp_mask); | ||
929 | if (!bv->bv_page) { | ||
930 | while (--bv >= bio->bi_io_vec) | ||
931 | __free_page(bv->bv_page); | ||
932 | return -ENOMEM; | ||
933 | } | ||
934 | } | ||
935 | |||
936 | return 0; | ||
937 | } | ||
938 | EXPORT_SYMBOL(bio_alloc_pages); | ||
939 | |||
940 | /** | ||
941 | * bio_copy_data - copy contents of data buffers from one chain of bios to | ||
942 | * another | ||
943 | * @src: source bio list | ||
944 | * @dst: destination bio list | ||
945 | * | ||
946 | * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats | ||
947 | * @src and @dst as linked lists of bios. | ||
948 | * | ||
949 | * Stops when it reaches the end of either @src or @dst - that is, copies | ||
950 | * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). | ||
951 | */ | ||
952 | void bio_copy_data(struct bio *dst, struct bio *src) | ||
953 | { | ||
954 | struct bvec_iter src_iter, dst_iter; | ||
955 | struct bio_vec src_bv, dst_bv; | ||
956 | void *src_p, *dst_p; | ||
957 | unsigned bytes; | ||
958 | |||
959 | src_iter = src->bi_iter; | ||
960 | dst_iter = dst->bi_iter; | ||
961 | |||
962 | while (1) { | ||
963 | if (!src_iter.bi_size) { | ||
964 | src = src->bi_next; | ||
965 | if (!src) | ||
966 | break; | ||
967 | |||
968 | src_iter = src->bi_iter; | ||
969 | } | ||
970 | |||
971 | if (!dst_iter.bi_size) { | ||
972 | dst = dst->bi_next; | ||
973 | if (!dst) | ||
974 | break; | ||
975 | |||
976 | dst_iter = dst->bi_iter; | ||
977 | } | ||
978 | |||
979 | src_bv = bio_iter_iovec(src, src_iter); | ||
980 | dst_bv = bio_iter_iovec(dst, dst_iter); | ||
981 | |||
982 | bytes = min(src_bv.bv_len, dst_bv.bv_len); | ||
983 | |||
984 | src_p = kmap_atomic(src_bv.bv_page); | ||
985 | dst_p = kmap_atomic(dst_bv.bv_page); | ||
986 | |||
987 | memcpy(dst_p + dst_bv.bv_offset, | ||
988 | src_p + src_bv.bv_offset, | ||
989 | bytes); | ||
990 | |||
991 | kunmap_atomic(dst_p); | ||
992 | kunmap_atomic(src_p); | ||
993 | |||
994 | bio_advance_iter(src, &src_iter, bytes); | ||
995 | bio_advance_iter(dst, &dst_iter, bytes); | ||
996 | } | ||
997 | } | ||
998 | EXPORT_SYMBOL(bio_copy_data); | ||
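/*
 * Illustrative sketch (hypothetical helper): the bounce pattern these
 * helpers enable - clone a write, back the clone with fresh pages and
 * copy the payload so the original pages can be released early.
 */
static struct bio *my_bounce_write(struct bio *src, struct bio_set *bs)
{
	struct bio *dst = bio_clone_bioset(src, GFP_NOIO, bs);

	if (!dst)
		return NULL;
	if (bio_alloc_pages(dst, GFP_NOIO)) {	/* swap in our own pages */
		bio_put(dst);
		return NULL;
	}
	bio_copy_data(dst, src);	/* dst now holds an independent copy */
	return dst;
}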
999 | |||
1000 | struct bio_map_data { | ||
1001 | int nr_sgvecs; | ||
1002 | int is_our_pages; | ||
1003 | struct sg_iovec sgvecs[]; | ||
1004 | }; | ||
1005 | |||
1006 | static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, | ||
1007 | const struct sg_iovec *iov, int iov_count, | ||
1008 | int is_our_pages) | ||
1009 | { | ||
1010 | memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); | ||
1011 | bmd->nr_sgvecs = iov_count; | ||
1012 | bmd->is_our_pages = is_our_pages; | ||
1013 | bio->bi_private = bmd; | ||
1014 | } | ||
1015 | |||
1016 | static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, | ||
1017 | gfp_t gfp_mask) | ||
1018 | { | ||
1019 | if (iov_count > UIO_MAXIOV) | ||
1020 | return NULL; | ||
1021 | |||
1022 | return kmalloc(sizeof(struct bio_map_data) + | ||
1023 | sizeof(struct sg_iovec) * iov_count, gfp_mask); | ||
1024 | } | ||
1025 | |||
1026 | static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, | ||
1027 | int to_user, int from_user, int do_free_page) | ||
1028 | { | ||
1029 | int ret = 0, i; | ||
1030 | struct bio_vec *bvec; | ||
1031 | int iov_idx = 0; | ||
1032 | unsigned int iov_off = 0; | ||
1033 | |||
1034 | bio_for_each_segment_all(bvec, bio, i) { | ||
1035 | char *bv_addr = page_address(bvec->bv_page); | ||
1036 | unsigned int bv_len = bvec->bv_len; | ||
1037 | |||
1038 | while (bv_len && iov_idx < iov_count) { | ||
1039 | unsigned int bytes; | ||
1040 | char __user *iov_addr; | ||
1041 | |||
1042 | bytes = min_t(unsigned int, | ||
1043 | iov[iov_idx].iov_len - iov_off, bv_len); | ||
1044 | iov_addr = iov[iov_idx].iov_base + iov_off; | ||
1045 | |||
1046 | if (!ret) { | ||
1047 | if (to_user) | ||
1048 | ret = copy_to_user(iov_addr, bv_addr, | ||
1049 | bytes); | ||
1050 | |||
1051 | if (from_user) | ||
1052 | ret = copy_from_user(bv_addr, iov_addr, | ||
1053 | bytes); | ||
1054 | |||
1055 | if (ret) | ||
1056 | ret = -EFAULT; | ||
1057 | } | ||
1058 | |||
1059 | bv_len -= bytes; | ||
1060 | bv_addr += bytes; | ||
1061 | iov_addr += bytes; | ||
1062 | iov_off += bytes; | ||
1063 | |||
1064 | if (iov[iov_idx].iov_len == iov_off) { | ||
1065 | iov_idx++; | ||
1066 | iov_off = 0; | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | if (do_free_page) | ||
1071 | __free_page(bvec->bv_page); | ||
1072 | } | ||
1073 | |||
1074 | return ret; | ||
1075 | } | ||
1076 | |||
1077 | /** | ||
1078 | * bio_uncopy_user - finish previously mapped bio | ||
1079 | * @bio: bio being terminated | ||
1080 | * | ||
1081 | * Free pages allocated from bio_copy_user() and write back data | ||
1082 | * to user space in case of a read. | ||
1083 | */ | ||
1084 | int bio_uncopy_user(struct bio *bio) | ||
1085 | { | ||
1086 | struct bio_map_data *bmd = bio->bi_private; | ||
1087 | struct bio_vec *bvec; | ||
1088 | int ret = 0, i; | ||
1089 | |||
1090 | if (!bio_flagged(bio, BIO_NULL_MAPPED)) { | ||
1091 | /* | ||
1092 | * if we're in a workqueue, the request is orphaned, so | ||
1093 | * don't copy into a random user address space, just free. | ||
1094 | */ | ||
1095 | if (current->mm) | ||
1096 | ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, | ||
1097 | bio_data_dir(bio) == READ, | ||
1098 | 0, bmd->is_our_pages); | ||
1099 | else if (bmd->is_our_pages) | ||
1100 | bio_for_each_segment_all(bvec, bio, i) | ||
1101 | __free_page(bvec->bv_page); | ||
1102 | } | ||
1103 | kfree(bmd); | ||
1104 | bio_put(bio); | ||
1105 | return ret; | ||
1106 | } | ||
1107 | EXPORT_SYMBOL(bio_uncopy_user); | ||
1108 | |||
1109 | /** | ||
1110 | * bio_copy_user_iov - copy user data to bio | ||
1111 | * @q: destination block queue | ||
1112 | * @map_data: pointer to the rq_map_data holding pages (if necessary) | ||
1113 | * @iov: the iovec. | ||
1114 | * @iov_count: number of elements in the iovec | ||
1115 | * @write_to_vm: bool indicating writing to pages or not | ||
1116 | * @gfp_mask: memory allocation flags | ||
1117 | * | ||
1118 | * Prepares and returns a bio for indirect user io, bouncing data | ||
1119 | * to/from kernel pages as necessary. Must be paired with | ||
1120 | * call bio_uncopy_user() on io completion. | ||
1121 | */ | ||
1122 | struct bio *bio_copy_user_iov(struct request_queue *q, | ||
1123 | struct rq_map_data *map_data, | ||
1124 | const struct sg_iovec *iov, int iov_count, | ||
1125 | int write_to_vm, gfp_t gfp_mask) | ||
1126 | { | ||
1127 | struct bio_map_data *bmd; | ||
1128 | struct bio_vec *bvec; | ||
1129 | struct page *page; | ||
1130 | struct bio *bio; | ||
1131 | int i, ret; | ||
1132 | int nr_pages = 0; | ||
1133 | unsigned int len = 0; | ||
1134 | unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; | ||
1135 | |||
1136 | for (i = 0; i < iov_count; i++) { | ||
1137 | unsigned long uaddr; | ||
1138 | unsigned long end; | ||
1139 | unsigned long start; | ||
1140 | |||
1141 | uaddr = (unsigned long)iov[i].iov_base; | ||
1142 | end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1143 | start = uaddr >> PAGE_SHIFT; | ||
1144 | |||
1145 | /* | ||
1146 | * Overflow, abort | ||
1147 | */ | ||
1148 | if (end < start) | ||
1149 | return ERR_PTR(-EINVAL); | ||
1150 | |||
1151 | nr_pages += end - start; | ||
1152 | len += iov[i].iov_len; | ||
1153 | } | ||
1154 | |||
1155 | if (offset) | ||
1156 | nr_pages++; | ||
1157 | |||
1158 | bmd = bio_alloc_map_data(iov_count, gfp_mask); | ||
1159 | if (!bmd) | ||
1160 | return ERR_PTR(-ENOMEM); | ||
1161 | |||
1162 | ret = -ENOMEM; | ||
1163 | bio = bio_kmalloc(gfp_mask, nr_pages); | ||
1164 | if (!bio) | ||
1165 | goto out_bmd; | ||
1166 | |||
1167 | if (!write_to_vm) | ||
1168 | bio->bi_rw |= REQ_WRITE; | ||
1169 | |||
1170 | ret = 0; | ||
1171 | |||
1172 | if (map_data) { | ||
1173 | nr_pages = 1 << map_data->page_order; | ||
1174 | i = map_data->offset / PAGE_SIZE; | ||
1175 | } | ||
1176 | while (len) { | ||
1177 | unsigned int bytes = PAGE_SIZE; | ||
1178 | |||
1179 | bytes -= offset; | ||
1180 | |||
1181 | if (bytes > len) | ||
1182 | bytes = len; | ||
1183 | |||
1184 | if (map_data) { | ||
1185 | if (i == map_data->nr_entries * nr_pages) { | ||
1186 | ret = -ENOMEM; | ||
1187 | break; | ||
1188 | } | ||
1189 | |||
1190 | page = map_data->pages[i / nr_pages]; | ||
1191 | page += (i % nr_pages); | ||
1192 | |||
1193 | i++; | ||
1194 | } else { | ||
1195 | page = alloc_page(q->bounce_gfp | gfp_mask); | ||
1196 | if (!page) { | ||
1197 | ret = -ENOMEM; | ||
1198 | break; | ||
1199 | } | ||
1200 | } | ||
1201 | |||
1202 | if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) | ||
1203 | break; | ||
1204 | |||
1205 | len -= bytes; | ||
1206 | offset = 0; | ||
1207 | } | ||
1208 | |||
1209 | if (ret) | ||
1210 | goto cleanup; | ||
1211 | |||
1212 | /* | ||
1213 | * success | ||
1214 | */ | ||
1215 | if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || | ||
1216 | (map_data && map_data->from_user)) { | ||
1217 | ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); | ||
1218 | if (ret) | ||
1219 | goto cleanup; | ||
1220 | } | ||
1221 | |||
1222 | bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); | ||
1223 | return bio; | ||
1224 | cleanup: | ||
1225 | if (!map_data) | ||
1226 | bio_for_each_segment_all(bvec, bio, i) | ||
1227 | __free_page(bvec->bv_page); | ||
1228 | |||
1229 | bio_put(bio); | ||
1230 | out_bmd: | ||
1231 | kfree(bmd); | ||
1232 | return ERR_PTR(ret); | ||
1233 | } | ||
1234 | |||
1235 | /** | ||
1236 | * bio_copy_user - copy user data to bio | ||
1237 | * @q: destination block queue | ||
1238 | * @map_data: pointer to the rq_map_data holding pages (if necessary) | ||
1239 | * @uaddr: start of user address | ||
1240 | * @len: length in bytes | ||
1241 | * @write_to_vm: bool indicating writing to pages or not | ||
1242 | * @gfp_mask: memory allocation flags | ||
1243 | * | ||
1244 | * Prepares and returns a bio for indirect user io, bouncing data | ||
1245 | * to/from kernel pages as necessary. Must be paired with | ||
1246 | * call bio_uncopy_user() on io completion. | ||
1247 | */ | ||
1248 | struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, | ||
1249 | unsigned long uaddr, unsigned int len, | ||
1250 | int write_to_vm, gfp_t gfp_mask) | ||
1251 | { | ||
1252 | struct sg_iovec iov; | ||
1253 | |||
1254 | iov.iov_base = (void __user *)uaddr; | ||
1255 | iov.iov_len = len; | ||
1256 | |||
1257 | return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); | ||
1258 | } | ||
1259 | EXPORT_SYMBOL(bio_copy_user); | ||
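/*
 * Illustrative sketch only (hypothetical helper; real callers
 * normally go through blk_rq_map_user() rather than submitting such
 * a bio by hand): the bio_copy_user()/bio_uncopy_user() pairing.
 */
static int my_write_from_user(struct request_queue *q,
			      struct block_device *bdev,
			      unsigned long uaddr, unsigned int len)
{
	struct bio *bio = bio_copy_user(q, NULL, uaddr, len, 0, GFP_KERNEL);
	int ret;

	if (IS_ERR(bio))
		return PTR_ERR(bio);
	bio->bi_bdev = bdev;	/* bio_copy_user() does not set the target */
	ret = submit_bio_wait(WRITE, bio);
	bio_uncopy_user(bio);	/* frees bounce pages; copies back on reads */
	return ret;
}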
1260 | |||
1261 | static struct bio *__bio_map_user_iov(struct request_queue *q, | ||
1262 | struct block_device *bdev, | ||
1263 | const struct sg_iovec *iov, int iov_count, | ||
1264 | int write_to_vm, gfp_t gfp_mask) | ||
1265 | { | ||
1266 | int i, j; | ||
1267 | int nr_pages = 0; | ||
1268 | struct page **pages; | ||
1269 | struct bio *bio; | ||
1270 | int cur_page = 0; | ||
1271 | int ret, offset; | ||
1272 | |||
1273 | for (i = 0; i < iov_count; i++) { | ||
1274 | unsigned long uaddr = (unsigned long)iov[i].iov_base; | ||
1275 | unsigned long len = iov[i].iov_len; | ||
1276 | unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1277 | unsigned long start = uaddr >> PAGE_SHIFT; | ||
1278 | |||
1279 | /* | ||
1280 | * Overflow, abort | ||
1281 | */ | ||
1282 | if (end < start) | ||
1283 | return ERR_PTR(-EINVAL); | ||
1284 | |||
1285 | nr_pages += end - start; | ||
1286 | /* | ||
1287 | * buffer must be aligned to at least hardsector size for now | ||
1288 | */ | ||
1289 | if (uaddr & queue_dma_alignment(q)) | ||
1290 | return ERR_PTR(-EINVAL); | ||
1291 | } | ||
1292 | |||
1293 | if (!nr_pages) | ||
1294 | return ERR_PTR(-EINVAL); | ||
1295 | |||
1296 | bio = bio_kmalloc(gfp_mask, nr_pages); | ||
1297 | if (!bio) | ||
1298 | return ERR_PTR(-ENOMEM); | ||
1299 | |||
1300 | ret = -ENOMEM; | ||
1301 | pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); | ||
1302 | if (!pages) | ||
1303 | goto out; | ||
1304 | |||
1305 | for (i = 0; i < iov_count; i++) { | ||
1306 | unsigned long uaddr = (unsigned long)iov[i].iov_base; | ||
1307 | unsigned long len = iov[i].iov_len; | ||
1308 | unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1309 | unsigned long start = uaddr >> PAGE_SHIFT; | ||
1310 | const int local_nr_pages = end - start; | ||
1311 | const int page_limit = cur_page + local_nr_pages; | ||
1312 | |||
1313 | ret = get_user_pages_fast(uaddr, local_nr_pages, | ||
1314 | write_to_vm, &pages[cur_page]); | ||
1315 | if (ret < local_nr_pages) { | ||
1316 | ret = -EFAULT; | ||
1317 | goto out_unmap; | ||
1318 | } | ||
1319 | |||
1320 | offset = uaddr & ~PAGE_MASK; | ||
1321 | for (j = cur_page; j < page_limit; j++) { | ||
1322 | unsigned int bytes = PAGE_SIZE - offset; | ||
1323 | |||
1324 | if (len <= 0) | ||
1325 | break; | ||
1326 | |||
1327 | if (bytes > len) | ||
1328 | bytes = len; | ||
1329 | |||
1330 | /* | ||
1331 | * sorry... | ||
1332 | */ | ||
1333 | if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < | ||
1334 | bytes) | ||
1335 | break; | ||
1336 | |||
1337 | len -= bytes; | ||
1338 | offset = 0; | ||
1339 | } | ||
1340 | |||
1341 | cur_page = j; | ||
1342 | /* | ||
1343 | * release the pages we didn't map into the bio, if any | ||
1344 | */ | ||
1345 | while (j < page_limit) | ||
1346 | page_cache_release(pages[j++]); | ||
1347 | } | ||
1348 | |||
1349 | kfree(pages); | ||
1350 | |||
1351 | /* | ||
1352 | * set data direction, and check if mapped pages need bouncing | ||
1353 | */ | ||
1354 | if (!write_to_vm) | ||
1355 | bio->bi_rw |= REQ_WRITE; | ||
1356 | |||
1357 | bio->bi_bdev = bdev; | ||
1358 | bio->bi_flags |= (1 << BIO_USER_MAPPED); | ||
1359 | return bio; | ||
1360 | |||
1361 | out_unmap: | ||
1362 | for (i = 0; i < nr_pages; i++) { | ||
1363 | if (!pages[i]) | ||
1364 | break; | ||
1365 | page_cache_release(pages[i]); | ||
1366 | } | ||
1367 | out: | ||
1368 | kfree(pages); | ||
1369 | bio_put(bio); | ||
1370 | return ERR_PTR(ret); | ||
1371 | } | ||
1372 | |||
1373 | /** | ||
1374 | * bio_map_user - map user address into bio | ||
1375 | * @q: the struct request_queue for the bio | ||
1376 | * @bdev: destination block device | ||
1377 | * @uaddr: start of user address | ||
1378 | * @len: length in bytes | ||
1379 | * @write_to_vm: bool indicating writing to pages or not | ||
1380 | * @gfp_mask: memory allocation flags | ||
1381 | * | ||
1382 | * Map the user space address into a bio suitable for io to a block | ||
1383 | * device. Returns an error pointer in case of error. | ||
1384 | */ | ||
1385 | struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, | ||
1386 | unsigned long uaddr, unsigned int len, int write_to_vm, | ||
1387 | gfp_t gfp_mask) | ||
1388 | { | ||
1389 | struct sg_iovec iov; | ||
1390 | |||
1391 | iov.iov_base = (void __user *)uaddr; | ||
1392 | iov.iov_len = len; | ||
1393 | |||
1394 | return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); | ||
1395 | } | ||
1396 | EXPORT_SYMBOL(bio_map_user); | ||
1397 | |||
1398 | /** | ||
1399 | * bio_map_user_iov - map user sg_iovec table into bio | ||
1400 | * @q: the struct request_queue for the bio | ||
1401 | * @bdev: destination block device | ||
1402 | * @iov: the iovec. | ||
1403 | * @iov_count: number of elements in the iovec | ||
1404 | * @write_to_vm: bool indicating writing to pages or not | ||
1405 | * @gfp_mask: memory allocation flags | ||
1406 | * | ||
1407 | * Map the user space address into a bio suitable for io to a block | ||
1408 | * device. Returns an error pointer in case of error. | ||
1409 | */ | ||
1410 | struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, | ||
1411 | const struct sg_iovec *iov, int iov_count, | ||
1412 | int write_to_vm, gfp_t gfp_mask) | ||
1413 | { | ||
1414 | struct bio *bio; | ||
1415 | |||
1416 | bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, | ||
1417 | gfp_mask); | ||
1418 | if (IS_ERR(bio)) | ||
1419 | return bio; | ||
1420 | |||
1421 | /* | ||
1422 | * subtle -- if __bio_map_user() ended up bouncing a bio, | ||
1423 | * it would normally disappear when its bi_end_io is run. | ||
1424 | * however, we need it for the unmap, so grab an extra | ||
1425 | * reference to it | ||
1426 | */ | ||
1427 | bio_get(bio); | ||
1428 | |||
1429 | return bio; | ||
1430 | } | ||
1431 | |||
1432 | static void __bio_unmap_user(struct bio *bio) | ||
1433 | { | ||
1434 | struct bio_vec *bvec; | ||
1435 | int i; | ||
1436 | |||
1437 | /* | ||
1438 | * make sure we dirty pages we wrote to | ||
1439 | */ | ||
1440 | bio_for_each_segment_all(bvec, bio, i) { | ||
1441 | if (bio_data_dir(bio) == READ) | ||
1442 | set_page_dirty_lock(bvec->bv_page); | ||
1443 | |||
1444 | page_cache_release(bvec->bv_page); | ||
1445 | } | ||
1446 | |||
1447 | bio_put(bio); | ||
1448 | } | ||
1449 | |||
1450 | /** | ||
1451 | * bio_unmap_user - unmap a bio | ||
1452 | * @bio: the bio being unmapped | ||
1453 | * | ||
1454 | * Unmap a bio previously mapped by bio_map_user(). Must be called with | ||
1455 | * a process context. | ||
1456 | * | ||
1457 | * bio_unmap_user() may sleep. | ||
1458 | */ | ||
1459 | void bio_unmap_user(struct bio *bio) | ||
1460 | { | ||
1461 | __bio_unmap_user(bio); | ||
1462 | bio_put(bio); | ||
1463 | } | ||
1464 | EXPORT_SYMBOL(bio_unmap_user); | ||
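/*
 * Illustrative sketch (hypothetical helper) of the full lifecycle:
 * write_to_vm is 1 here because a READ makes the device write into
 * the pinned user pages.
 */
static int my_read_to_user(struct request_queue *q, struct block_device *bdev,
			   unsigned long uaddr, unsigned int len)
{
	struct bio *bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
	int ret;

	if (IS_ERR(bio))
		return PTR_ERR(bio);
	ret = submit_bio_wait(READ, bio);
	bio_unmap_user(bio);	/* dirties and releases the pinned pages */
	return ret;
}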
1465 | |||
1466 | static void bio_map_kern_endio(struct bio *bio, int err) | ||
1467 | { | ||
1468 | bio_put(bio); | ||
1469 | } | ||
1470 | |||
1471 | static struct bio *__bio_map_kern(struct request_queue *q, void *data, | ||
1472 | unsigned int len, gfp_t gfp_mask) | ||
1473 | { | ||
1474 | unsigned long kaddr = (unsigned long)data; | ||
1475 | unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
1476 | unsigned long start = kaddr >> PAGE_SHIFT; | ||
1477 | const int nr_pages = end - start; | ||
1478 | int offset, i; | ||
1479 | struct bio *bio; | ||
1480 | |||
1481 | bio = bio_kmalloc(gfp_mask, nr_pages); | ||
1482 | if (!bio) | ||
1483 | return ERR_PTR(-ENOMEM); | ||
1484 | |||
1485 | offset = offset_in_page(kaddr); | ||
1486 | for (i = 0; i < nr_pages; i++) { | ||
1487 | unsigned int bytes = PAGE_SIZE - offset; | ||
1488 | |||
1489 | if (len <= 0) | ||
1490 | break; | ||
1491 | |||
1492 | if (bytes > len) | ||
1493 | bytes = len; | ||
1494 | |||
1495 | if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, | ||
1496 | offset) < bytes) | ||
1497 | break; | ||
1498 | |||
1499 | data += bytes; | ||
1500 | len -= bytes; | ||
1501 | offset = 0; | ||
1502 | } | ||
1503 | |||
1504 | bio->bi_end_io = bio_map_kern_endio; | ||
1505 | return bio; | ||
1506 | } | ||
1507 | |||
1508 | /** | ||
1509 | * bio_map_kern - map kernel address into bio | ||
1510 | * @q: the struct request_queue for the bio | ||
1511 | * @data: pointer to buffer to map | ||
1512 | * @len: length in bytes | ||
1513 | * @gfp_mask: allocation flags for bio allocation | ||
1514 | * | ||
1515 | * Map the kernel address into a bio suitable for io to a block | ||
1516 | * device. Returns an error pointer in case of error. | ||
1517 | */ | ||
1518 | struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, | ||
1519 | gfp_t gfp_mask) | ||
1520 | { | ||
1521 | struct bio *bio; | ||
1522 | |||
1523 | bio = __bio_map_kern(q, data, len, gfp_mask); | ||
1524 | if (IS_ERR(bio)) | ||
1525 | return bio; | ||
1526 | |||
1527 | if (bio->bi_iter.bi_size == len) | ||
1528 | return bio; | ||
1529 | |||
1530 | /* | ||
1531 | * Don't support partial mappings. | ||
1532 | */ | ||
1533 | bio_put(bio); | ||
1534 | return ERR_PTR(-EINVAL); | ||
1535 | } | ||
1536 | EXPORT_SYMBOL(bio_map_kern); | ||
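/*
 * Illustrative sketch (hypothetical helper): wrapping an existing
 * kernel buffer for a consumer that takes bios. bio_map_kern() does
 * not set a data direction, so the caller marks it.
 */
static struct bio *my_wrap_kernel_buf(struct request_queue *q, void *buf,
				      unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	if (IS_ERR(bio))
		return NULL;
	bio->bi_rw |= REQ_WRITE;	/* this will be a write to the device */
	return bio;
}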
1537 | |||
1538 | static void bio_copy_kern_endio(struct bio *bio, int err) | ||
1539 | { | ||
1540 | struct bio_vec *bvec; | ||
1541 | const int read = bio_data_dir(bio) == READ; | ||
1542 | struct bio_map_data *bmd = bio->bi_private; | ||
1543 | int i; | ||
1544 | char *p = bmd->sgvecs[0].iov_base; | ||
1545 | |||
1546 | bio_for_each_segment_all(bvec, bio, i) { | ||
1547 | char *addr = page_address(bvec->bv_page); | ||
1548 | |||
1549 | if (read) | ||
1550 | memcpy(p, addr, bvec->bv_len); | ||
1551 | |||
1552 | __free_page(bvec->bv_page); | ||
1553 | p += bvec->bv_len; | ||
1554 | } | ||
1555 | |||
1556 | kfree(bmd); | ||
1557 | bio_put(bio); | ||
1558 | } | ||
1559 | |||
1560 | /** | ||
1561 | * bio_copy_kern - copy kernel address into bio | ||
1562 | * @q: the struct request_queue for the bio | ||
1563 | * @data: pointer to buffer to copy | ||
1564 | * @len: length in bytes | ||
1565 | * @gfp_mask: allocation flags for bio and page allocation | ||
1566 | * @reading: data direction is READ | ||
1567 | * | ||
1568 | * copy the kernel address into a bio suitable for io to a block | ||
1569 | * device. Returns an error pointer in case of error. | ||
1570 | */ | ||
1571 | struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, | ||
1572 | gfp_t gfp_mask, int reading) | ||
1573 | { | ||
1574 | struct bio *bio; | ||
1575 | struct bio_vec *bvec; | ||
1576 | int i; | ||
1577 | |||
1578 | bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); | ||
1579 | if (IS_ERR(bio)) | ||
1580 | return bio; | ||
1581 | |||
1582 | if (!reading) { | ||
1583 | void *p = data; | ||
1584 | |||
1585 | bio_for_each_segment_all(bvec, bio, i) { | ||
1586 | char *addr = page_address(bvec->bv_page); | ||
1587 | |||
1588 | memcpy(addr, p, bvec->bv_len); | ||
1589 | p += bvec->bv_len; | ||
1590 | } | ||
1591 | } | ||
1592 | |||
1593 | bio->bi_end_io = bio_copy_kern_endio; | ||
1594 | |||
1595 | return bio; | ||
1596 | } | ||
1597 | EXPORT_SYMBOL(bio_copy_kern); | ||
1598 | |||
1599 | /* | ||
1600 | * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions | ||
1601 | * for performing direct-IO in BIOs. | ||
1602 | * | ||
1603 | * The problem is that we cannot run set_page_dirty() from interrupt context | ||
1604 | * because the required locks are not interrupt-safe. So what we can do is to | ||
1605 | * mark the pages dirty _before_ performing IO. And in interrupt context, | ||
1606 | * check that the pages are still dirty. If so, fine. If not, redirty them | ||
1607 | * in process context. | ||
1608 | * | ||
1609 | * We special-case compound pages here: normally this means reads into hugetlb | ||
1610 | * pages. The logic in here doesn't really work right for compound pages | ||
1611 | * because the VM does not uniformly chase down the head page in all cases. | ||
1612 | * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't | ||
1613 | * handle them at all. So we skip compound pages here at an early stage. | ||
1614 | * | ||
1615 | * Note that this code is very hard to test under normal circumstances because | ||
1616 | * direct-io pins the pages with get_user_pages(). This makes | ||
1617 | * is_page_cache_freeable return false, and the VM will not clean the pages. | ||
1618 | * But other code (e.g., flusher threads) could clean the pages if they are mapped | ||
1619 | * pagecache. | ||
1620 | * | ||
1621 | * Simply disabling the call to bio_set_pages_dirty() is a good way to test the | ||
1622 | * deferred bio dirtying paths. | ||
1623 | */ | ||
1624 | |||
1625 | /* | ||
1626 | * bio_set_pages_dirty() will mark all the bio's pages as dirty. | ||
1627 | */ | ||
1628 | void bio_set_pages_dirty(struct bio *bio) | ||
1629 | { | ||
1630 | struct bio_vec *bvec; | ||
1631 | int i; | ||
1632 | |||
1633 | bio_for_each_segment_all(bvec, bio, i) { | ||
1634 | struct page *page = bvec->bv_page; | ||
1635 | |||
1636 | if (page && !PageCompound(page)) | ||
1637 | set_page_dirty_lock(page); | ||
1638 | } | ||
1639 | } | ||
1640 | |||
1641 | static void bio_release_pages(struct bio *bio) | ||
1642 | { | ||
1643 | struct bio_vec *bvec; | ||
1644 | int i; | ||
1645 | |||
1646 | bio_for_each_segment_all(bvec, bio, i) { | ||
1647 | struct page *page = bvec->bv_page; | ||
1648 | |||
1649 | if (page) | ||
1650 | put_page(page); | ||
1651 | } | ||
1652 | } | ||
1653 | |||
1654 | /* | ||
1655 | * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. | ||
1656 | * If they are, then fine. If, however, some pages are clean then they must | ||
1657 | * have been written out during the direct-IO read. So we take another ref on | ||
1658 | * the BIO and the offending pages and re-dirty the pages in process context. | ||
1659 | * | ||
1660 | * It is expected that bio_check_pages_dirty() will wholly own the BIO from | ||
1661 | * here on. It will run one page_cache_release() against each page and will | ||
1662 | * run one bio_put() against the BIO. | ||
1663 | */ | ||
1664 | |||
1665 | static void bio_dirty_fn(struct work_struct *work); | ||
1666 | |||
1667 | static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); | ||
1668 | static DEFINE_SPINLOCK(bio_dirty_lock); | ||
1669 | static struct bio *bio_dirty_list; | ||
1670 | |||
1671 | /* | ||
1672 | * This runs in process context | ||
1673 | */ | ||
1674 | static void bio_dirty_fn(struct work_struct *work) | ||
1675 | { | ||
1676 | unsigned long flags; | ||
1677 | struct bio *bio; | ||
1678 | |||
1679 | spin_lock_irqsave(&bio_dirty_lock, flags); | ||
1680 | bio = bio_dirty_list; | ||
1681 | bio_dirty_list = NULL; | ||
1682 | spin_unlock_irqrestore(&bio_dirty_lock, flags); | ||
1683 | |||
1684 | while (bio) { | ||
1685 | struct bio *next = bio->bi_private; | ||
1686 | |||
1687 | bio_set_pages_dirty(bio); | ||
1688 | bio_release_pages(bio); | ||
1689 | bio_put(bio); | ||
1690 | bio = next; | ||
1691 | } | ||
1692 | } | ||
1693 | |||
1694 | void bio_check_pages_dirty(struct bio *bio) | ||
1695 | { | ||
1696 | struct bio_vec *bvec; | ||
1697 | int nr_clean_pages = 0; | ||
1698 | int i; | ||
1699 | |||
1700 | bio_for_each_segment_all(bvec, bio, i) { | ||
1701 | struct page *page = bvec->bv_page; | ||
1702 | |||
1703 | if (PageDirty(page) || PageCompound(page)) { | ||
1704 | page_cache_release(page); | ||
1705 | bvec->bv_page = NULL; | ||
1706 | } else { | ||
1707 | nr_clean_pages++; | ||
1708 | } | ||
1709 | } | ||
1710 | |||
1711 | if (nr_clean_pages) { | ||
1712 | unsigned long flags; | ||
1713 | |||
1714 | spin_lock_irqsave(&bio_dirty_lock, flags); | ||
1715 | bio->bi_private = bio_dirty_list; | ||
1716 | bio_dirty_list = bio; | ||
1717 | spin_unlock_irqrestore(&bio_dirty_lock, flags); | ||
1718 | schedule_work(&bio_dirty_work); | ||
1719 | } else { | ||
1720 | bio_put(bio); | ||
1721 | } | ||
1722 | } | ||
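/*
 * Usage sketch, not part of this file: how the two helpers above pair
 * up in a direct-IO read path (fs/direct-io.c is the real in-tree
 * user).  The "example_" names are illustrative.
 */
static void example_dio_read_end_io(struct bio *bio, int error)
{
	/* re-dirties any page the VM cleaned while the read was in
	 * flight, releases the pages and drops the last bio ref */
	bio_check_pages_dirty(bio);
}

static void example_dio_submit_read(struct bio *bio)
{
	bio->bi_end_io = example_dio_read_end_io;
	bio_set_pages_dirty(bio);	/* dirty _before_ the IO */
	submit_bio(READ, bio);
}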
1723 | |||
1724 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE | ||
1725 | void bio_flush_dcache_pages(struct bio *bi) | ||
1726 | { | ||
1727 | struct bio_vec bvec; | ||
1728 | struct bvec_iter iter; | ||
1729 | |||
1730 | bio_for_each_segment(bvec, bi, iter) | ||
1731 | flush_dcache_page(bvec.bv_page); | ||
1732 | } | ||
1733 | EXPORT_SYMBOL(bio_flush_dcache_pages); | ||
1734 | #endif | ||
1735 | |||
1736 | /** | ||
1737 | * bio_endio - end I/O on a bio | ||
1738 | * @bio: bio | ||
1739 | * @error: error, if any | ||
1740 | * | ||
1741 | * Description: | ||
1742 | * bio_endio() will end I/O on the whole bio. bio_endio() is the | ||
1743 | * preferred way to end I/O on a bio; it takes care of clearing | ||
1744 | * BIO_UPTODATE on error. @error is 0 on success, and one of the | ||
1745 | * established -Exxxx (-EIO, for instance) error values in case | ||
1746 | * something went wrong. No one should call bi_end_io() directly on a | ||
1747 | * bio unless they own it and thus know that it has an end_io | ||
1748 | * function. | ||
1749 | **/ | ||
1750 | void bio_endio(struct bio *bio, int error) | ||
1751 | { | ||
1752 | while (bio) { | ||
1753 | BUG_ON(atomic_read(&bio->bi_remaining) <= 0); | ||
1754 | |||
1755 | if (error) | ||
1756 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1757 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
1758 | error = -EIO; | ||
1759 | |||
1760 | if (!atomic_dec_and_test(&bio->bi_remaining)) | ||
1761 | return; | ||
1762 | |||
1763 | /* | ||
1764 | * Need to have a real endio function for chained bios, | ||
1765 | * otherwise various corner cases will break (like stacking | ||
1766 | * block devices that save/restore bi_end_io) - however, we want | ||
1767 | * to avoid unbounded recursion and blowing the stack. Tail call | ||
1768 | * optimization would handle this, but compiling with frame | ||
1769 | * pointers also disables gcc's sibling call optimization. | ||
1770 | */ | ||
1771 | if (bio->bi_end_io == bio_chain_endio) { | ||
1772 | struct bio *parent = bio->bi_private; | ||
1773 | bio_put(bio); | ||
1774 | bio = parent; | ||
1775 | } else { | ||
1776 | if (bio->bi_end_io) | ||
1777 | bio->bi_end_io(bio, error); | ||
1778 | bio = NULL; | ||
1779 | } | ||
1780 | } | ||
1781 | } | ||
1782 | EXPORT_SYMBOL(bio_endio); | ||
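/*
 * Usage sketch, not part of this file: a typical bi_end_io consumer of
 * bio_endio(), waking a synchronous waiter.  Assumes
 * <linux/completion.h>; the "example_" names are illustrative.
 */
struct example_sync_io {
	struct completion done;
	int error;
};

static void example_sync_end_io(struct bio *bio, int error)
{
	struct example_sync_io *ret = bio->bi_private;

	ret->error = error;	/* 0 or a negative -Exxx value */
	complete(&ret->done);	/* the submitter does the bio_put() */
}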
1783 | |||
1784 | /** | ||
1785 | * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining | ||
1786 | * @bio: bio | ||
1787 | * @error: error, if any | ||
1788 | * | ||
1789 | * For code that has saved and restored bi_end_io. Think hard before using | ||
1790 | * this function; you probably should have cloned the entire bio. | ||
1791 | **/ | ||
1792 | void bio_endio_nodec(struct bio *bio, int error) | ||
1793 | { | ||
1794 | atomic_inc(&bio->bi_remaining); | ||
1795 | bio_endio(bio, error); | ||
1796 | } | ||
1797 | EXPORT_SYMBOL(bio_endio_nodec); | ||
1798 | |||
1799 | /** | ||
1800 | * bio_split - split a bio | ||
1801 | * @bio: bio to split | ||
1802 | * @sectors: number of sectors to split from the front of @bio | ||
1803 | * @gfp: gfp mask | ||
1804 | * @bs: bio set to allocate from | ||
1805 | * | ||
1806 | * Allocates and returns a new bio which represents @sectors from the start of | ||
1807 | * @bio, and updates @bio to represent the remaining sectors. | ||
1808 | * | ||
1809 | * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's | ||
1810 | * responsibility to ensure that @bio is not freed before the split. | ||
1811 | */ | ||
1812 | struct bio *bio_split(struct bio *bio, int sectors, | ||
1813 | gfp_t gfp, struct bio_set *bs) | ||
1814 | { | ||
1815 | struct bio *split = NULL; | ||
1816 | |||
1817 | BUG_ON(sectors <= 0); | ||
1818 | BUG_ON(sectors >= bio_sectors(bio)); | ||
1819 | |||
1820 | split = bio_clone_fast(bio, gfp, bs); | ||
1821 | if (!split) | ||
1822 | return NULL; | ||
1823 | |||
1824 | split->bi_iter.bi_size = sectors << 9; | ||
1825 | |||
1826 | if (bio_integrity(split)) | ||
1827 | bio_integrity_trim(split, 0, sectors); | ||
1828 | |||
1829 | bio_advance(bio, split->bi_iter.bi_size); | ||
1830 | |||
1831 | return split; | ||
1832 | } | ||
1833 | EXPORT_SYMBOL(bio_split); | ||
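/*
 * Usage sketch, not part of this file: split a bio and submit both
 * halves, chained so the parent completes only after the front part
 * does.  Assumes front_sectors < bio_sectors(bio); with GFP_NOIO the
 * mempool-backed allocation in bio_split() does not fail.
 */
static void example_split_and_submit(struct bio *bio, int front_sectors,
				     struct bio_set *bs)
{
	struct bio *split = bio_split(bio, front_sectors, GFP_NOIO, bs);

	bio_chain(split, bio);		/* bumps bio->bi_remaining */
	generic_make_request(split);
	generic_make_request(bio);	/* the remaining sectors */
}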
1834 | |||
1835 | /** | ||
1836 | * bio_trim - trim a bio | ||
1837 | * @bio: bio to trim | ||
1838 | * @offset: number of sectors to trim from the front of @bio | ||
1839 | * @size: size we want to trim @bio to, in sectors | ||
1840 | */ | ||
1841 | void bio_trim(struct bio *bio, int offset, int size) | ||
1842 | { | ||
1843 | /* 'bio' is a cloned bio which we need to trim to match | ||
1844 | * the given offset and size. | ||
1845 | */ | ||
1846 | |||
1847 | size <<= 9; | ||
1848 | if (offset == 0 && size == bio->bi_iter.bi_size) | ||
1849 | return; | ||
1850 | |||
1851 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
1852 | |||
1853 | bio_advance(bio, offset << 9); | ||
1854 | |||
1855 | bio->bi_iter.bi_size = size; | ||
1856 | } | ||
1857 | EXPORT_SYMBOL_GPL(bio_trim); | ||
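/*
 * Usage sketch, not part of this file: clone a bio and narrow it to a
 * sub-range.  Offset and size are in sectors; bio_trim() does the byte
 * conversion (size <<= 9) itself.  The numbers are illustrative.
 */
static struct bio *example_clone_subrange(struct bio *bio, struct bio_set *bs)
{
	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);

	if (clone)
		bio_trim(clone, 8, 16);	/* skip 8 sectors, keep 16 */
	return clone;
}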
1858 | |||
1859 | /* | ||
1860 | * Create memory pools for biovecs in a bio_set. | ||
1861 | * Use the global biovec slabs created for general use. | ||
1862 | */ | ||
1863 | mempool_t *biovec_create_pool(int pool_entries) | ||
1864 | { | ||
1865 | struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; | ||
1866 | |||
1867 | return mempool_create_slab_pool(pool_entries, bp->slab); | ||
1868 | } | ||
1869 | |||
1870 | void bioset_free(struct bio_set *bs) | ||
1871 | { | ||
1872 | if (bs->rescue_workqueue) | ||
1873 | destroy_workqueue(bs->rescue_workqueue); | ||
1874 | |||
1875 | if (bs->bio_pool) | ||
1876 | mempool_destroy(bs->bio_pool); | ||
1877 | |||
1878 | if (bs->bvec_pool) | ||
1879 | mempool_destroy(bs->bvec_pool); | ||
1880 | |||
1881 | bioset_integrity_free(bs); | ||
1882 | bio_put_slab(bs); | ||
1883 | |||
1884 | kfree(bs); | ||
1885 | } | ||
1886 | EXPORT_SYMBOL(bioset_free); | ||
1887 | |||
1888 | /** | ||
1889 | * bioset_create - Create a bio_set | ||
1890 | * @pool_size: Number of bio and bio_vecs to cache in the mempool | ||
1891 | * @front_pad: Number of bytes to allocate in front of the returned bio | ||
1892 | * | ||
1893 | * Description: | ||
1894 | * Set up a bio_set to be used with bio_alloc_bioset(). Allows the caller | ||
1895 | * to ask for a number of bytes to be allocated in front of the bio. | ||
1896 | * Front pad allocation is useful for embedding the bio inside | ||
1897 | * another structure, to avoid allocating extra data to go with the bio. | ||
1898 | * Note that the bio must always be embedded at the END of that structure, | ||
1899 | * or things will break badly. | ||
1900 | */ | ||
1901 | struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) | ||
1902 | { | ||
1903 | unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); | ||
1904 | struct bio_set *bs; | ||
1905 | |||
1906 | bs = kzalloc(sizeof(*bs), GFP_KERNEL); | ||
1907 | if (!bs) | ||
1908 | return NULL; | ||
1909 | |||
1910 | bs->front_pad = front_pad; | ||
1911 | |||
1912 | spin_lock_init(&bs->rescue_lock); | ||
1913 | bio_list_init(&bs->rescue_list); | ||
1914 | INIT_WORK(&bs->rescue_work, bio_alloc_rescue); | ||
1915 | |||
1916 | bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); | ||
1917 | if (!bs->bio_slab) { | ||
1918 | kfree(bs); | ||
1919 | return NULL; | ||
1920 | } | ||
1921 | |||
1922 | bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); | ||
1923 | if (!bs->bio_pool) | ||
1924 | goto bad; | ||
1925 | |||
1926 | bs->bvec_pool = biovec_create_pool(pool_size); | ||
1927 | if (!bs->bvec_pool) | ||
1928 | goto bad; | ||
1929 | |||
1930 | bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); | ||
1931 | if (!bs->rescue_workqueue) | ||
1932 | goto bad; | ||
1933 | |||
1934 | return bs; | ||
1935 | bad: | ||
1936 | bioset_free(bs); | ||
1937 | return NULL; | ||
1938 | } | ||
1939 | EXPORT_SYMBOL(bioset_create); | ||
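/*
 * Usage sketch, not part of this file: the front_pad pattern described
 * above.  'struct example_io' is an assumed driver structure; the bio
 * must be its LAST member so the pad sits in front of it.
 */
struct example_io {
	void *driver_data;
	struct bio bio;		/* must be last */
};

static struct bio_set *example_bs;

static int __init example_bioset_init(void)
{
	example_bs = bioset_create(BIO_POOL_SIZE,
				   offsetof(struct example_io, bio));
	return example_bs ? 0 : -ENOMEM;
}

static void example_end_io(struct bio *bio, int error)
{
	/* front_pad precedes the bio, so container_of() recovers it */
	struct example_io *io = container_of(bio, struct example_io, bio);

	io->driver_data = NULL;
	bio_put(bio);
}

static struct bio *example_alloc(void)
{
	/* a __GFP_WAIT allocation from the mempool cannot fail */
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, example_bs);

	bio->bi_end_io = example_end_io;
	return bio;
}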
1940 | |||
1941 | #ifdef CONFIG_BLK_CGROUP | ||
1942 | /** | ||
1943 | * bio_associate_current - associate a bio with %current | ||
1944 | * @bio: target bio | ||
1945 | * | ||
1946 | * Associate @bio with %current if it hasn't been associated yet. The | ||
1947 | * block layer will treat @bio as if it were issued by %current, no matter | ||
1948 | * which task actually issues it. | ||
1949 | * | ||
1950 | * This function takes an extra reference on %current's io_context and blkcg | ||
1951 | * which will be put when @bio is released. The caller must own @bio, | ||
1952 | * ensure %current->io_context exists, and is responsible for synchronizing | ||
1953 | * calls to this function. | ||
1954 | */ | ||
1955 | int bio_associate_current(struct bio *bio) | ||
1956 | { | ||
1957 | struct io_context *ioc; | ||
1958 | struct cgroup_subsys_state *css; | ||
1959 | |||
1960 | if (bio->bi_ioc) | ||
1961 | return -EBUSY; | ||
1962 | |||
1963 | ioc = current->io_context; | ||
1964 | if (!ioc) | ||
1965 | return -ENOENT; | ||
1966 | |||
1967 | /* acquire active ref on @ioc and associate */ | ||
1968 | get_io_context_active(ioc); | ||
1969 | bio->bi_ioc = ioc; | ||
1970 | |||
1971 | /* associate blkcg if exists */ | ||
1972 | rcu_read_lock(); | ||
1973 | css = task_css(current, blkio_cgrp_id); | ||
1974 | if (css && css_tryget(css)) | ||
1975 | bio->bi_css = css; | ||
1976 | rcu_read_unlock(); | ||
1977 | |||
1978 | return 0; | ||
1979 | } | ||
1980 | |||
1981 | /** | ||
1982 | * bio_disassociate_task - undo bio_associate_current() | ||
1983 | * @bio: target bio | ||
1984 | */ | ||
1985 | void bio_disassociate_task(struct bio *bio) | ||
1986 | { | ||
1987 | if (bio->bi_ioc) { | ||
1988 | put_io_context(bio->bi_ioc); | ||
1989 | bio->bi_ioc = NULL; | ||
1990 | } | ||
1991 | if (bio->bi_css) { | ||
1992 | css_put(bio->bi_css); | ||
1993 | bio->bi_css = NULL; | ||
1994 | } | ||
1995 | } | ||
1996 | |||
1997 | #endif /* CONFIG_BLK_CGROUP */ | ||
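/*
 * Usage sketch, not part of this file: a submitter that punts bios to
 * a helper context can associate them with %current first, so
 * cgroup-aware IO policies still charge the issuing task.  The
 * "example_" names are illustrative.
 */
static void example_associate_and_punt(struct bio *bio,
				       struct bio_list *worker_list)
{
	/* -EBUSY (already associated) and -ENOENT (no io_context)
	 * are both harmless here */
	bio_associate_current(bio);
	bio_list_add(worker_list, bio);	/* a worker submits it later */
}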
1998 | |||
1999 | static void __init biovec_init_slabs(void) | ||
2000 | { | ||
2001 | int i; | ||
2002 | |||
2003 | for (i = 0; i < BIOVEC_NR_POOLS; i++) { | ||
2004 | int size; | ||
2005 | struct biovec_slab *bvs = bvec_slabs + i; | ||
2006 | |||
2007 | if (bvs->nr_vecs <= BIO_INLINE_VECS) { | ||
2008 | bvs->slab = NULL; | ||
2009 | continue; | ||
2010 | } | ||
2011 | |||
2012 | size = bvs->nr_vecs * sizeof(struct bio_vec); | ||
2013 | bvs->slab = kmem_cache_create(bvs->name, size, 0, | ||
2014 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | ||
2015 | } | ||
2016 | } | ||
2017 | |||
2018 | static int __init init_bio(void) | ||
2019 | { | ||
2020 | bio_slab_max = 2; | ||
2021 | bio_slab_nr = 0; | ||
2022 | bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); | ||
2023 | if (!bio_slabs) | ||
2024 | panic("bio: can't allocate bios\n"); | ||
2025 | |||
2026 | bio_integrity_init(); | ||
2027 | biovec_init_slabs(); | ||
2028 | |||
2029 | fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); | ||
2030 | if (!fs_bio_set) | ||
2031 | panic("bio: can't allocate bios\n"); | ||
2032 | |||
2033 | if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) | ||
2034 | panic("bio: can't create integrity pool\n"); | ||
2035 | |||
2036 | return 0; | ||
2037 | } | ||
2038 | subsys_initcall(init_bio); | ||