path: root/fs
author		Linus Torvalds <torvalds@linux-foundation.org>	2014-06-02 12:29:34 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-02 12:29:34 -0400
commit		681a2895486243a82547d8c9f53043eb54b53da0 (patch)
tree		464273280aed6db55a99cc0d8614d4393f94fc48 /fs
parent		6c52486dedbb30a1313da64945dcd686b4579c51 (diff)
parent		ed851860b4552fc8963ecf71eab9f6f7a5c19d74 (diff)
Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next
Pull block core updates from Jens Axboe:
 "It's a big(ish) round this time, lots of development effort has gone
  into blk-mq in the last 3 months.  Generally we're heading to where
  3.16 will be a feature complete and performant blk-mq.  scsi-mq is
  progressing nicely and will hopefully be in 3.17.  An nvme port is in
  progress, and the Micron pci-e flash driver, mtip32xx, is converted
  and will be sent in with the driver pull request for 3.16.

  This pull request contains:

   - Lots of prep and support patches for scsi-mq have been integrated.
     All from Christoph.

   - API and code cleanups for blk-mq from Christoph.

   - Lots of good corner case and error handling cleanup fixes for
     blk-mq from Ming Lei.

   - A slew of blk-mq updates from me:

     * Provide strict mappings so that the driver can rely on the CPU
       to queue mapping.  This enables optimizations in the driver.

     * Provided bitmap tagging instead of percpu_ida, which never
       really worked well for blk-mq.  percpu_ida relies on the fact
       that we have a lot more tags available than we really need; it
       fails miserably for cases where we exhaust (or are close to
       exhausting) the tag space.

     * Provide sane support for shared tag maps, as utilized by
       scsi-mq.

     * Various fixes for IO timeouts.

     * API cleanups, and lots of perf tweaks and optimizations.

   - Remove 'buffer' from struct request.  This is ancient code, from
     when requests were always virtually mapped.  Kill it, to reclaim
     some space in struct request.  From me.

   - Remove 'magic' from blk_plug.  Since we store these on the stack
     and since we've never caught any actual bugs with this, let's just
     get rid of it.  From me.

   - Only call part_in_flight() once for IO completion, as it includes
     two atomic reads.  Hopefully we'll get a better implementation
     soon, as the part IO stats are now one of the more expensive parts
     of doing IO on blk-mq.  From me.

   - File migration of block code from {mm,fs}/ to block/.  This
     includes bio.c, bio-integrity.c, bounce.c, and ioprio.c.  From me,
     from a discussion on lkml.

  That should describe the meat of the pull request.  Also has various
  little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong,
  Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam
  Bradshaw"

* 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits)
  blk-mq: push IPI or local end_io decision to __blk_mq_complete_request()
  blk-mq: remember to start timeout handler for direct queue
  block: ensure that the timer is always added
  blk-mq: blk_mq_unregister_hctx() can be static
  blk-mq: make the sysfs mq/ layout reflect current mappings
  blk-mq: blk_mq_tag_to_rq should handle flush request
  block: remove dead code in scsi_ioctl:blk_verify_command
  blk-mq: request initialization optimizations
  block: add queue flag for disabling SG merging
  block: remove 'magic' from struct blk_plug
  blk-mq: remove alloc_hctx and free_hctx methods
  blk-mq: add file comments and update copyright notices
  blk-mq: remove blk_mq_alloc_request_pinned
  blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request
  blk-mq: remove blk_mq_wait_for_tags
  blk-mq: initialize request in __blk_mq_alloc_request
  blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request
  blk-mq: add helper to insert requests from irq context
  blk-mq: remove stale comment for blk_mq_complete_request()
  blk-mq: allow non-softirq completions
  ...
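To make the bitmap-tagging bullet above concrete, here is a minimal userspace sketch of the idea: a tag is a bit in a shared word, claimed with an atomic compare-and-swap and released by clearing the bit, so the allocator keeps working even when almost every tag is in use. This is only an illustration of the concept, not the blk-mq tag code; tag_alloc(), tag_free() and the single-word map are invented for the example.

/*
 * Illustrative sketch only: one machine word as the whole tag space.
 * The real blk-mq implementation spreads its bitmap across cache lines
 * and adds wait queues; none of that is shown here.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong tag_map;			/* bit i set => tag i in use */

/* Claim a free tag, or return -1 if the tag space is exhausted. */
static int tag_alloc(void)
{
	unsigned long old = atomic_load(&tag_map);

	while (old != ~0UL) {
		int bit = __builtin_ctzl(~old);	/* lowest clear bit */

		if (atomic_compare_exchange_weak(&tag_map, &old,
						 old | (1UL << bit)))
			return bit;
		/* CAS failed and reloaded 'old'; retry */
	}
	return -1;
}

/* Release a previously allocated tag. */
static void tag_free(int tag)
{
	atomic_fetch_and(&tag_map, ~(1UL << tag));
}

int main(void)
{
	int a = tag_alloc(), b = tag_alloc();

	printf("allocated tags %d and %d\n", a, b);
	tag_free(a);
	tag_free(b);
	return 0;
}

percpu_ida, by contrast, only performs well while plenty of tags remain free, which is exactly the weakness the pull request message calls out.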
Diffstat (limited to 'fs')
-rw-r--r--	fs/Makefile		3
-rw-r--r--	fs/bio-integrity.c	657
-rw-r--r--	fs/bio.c		2037
-rw-r--r--	fs/ioprio.c		241
4 files changed, 1 insertion, 2937 deletions
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
 	stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y += buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y += no-block.o
 endif
 
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
 
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-y += notify/
 obj-$(CONFIG_EPOLL) += eventpoll.o
 obj-$(CONFIG_ANON_INODES) += anon_inodes.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
deleted file mode 100644
index 1c2ce0c87711..000000000000
--- a/fs/bio-integrity.c
+++ /dev/null
@@ -1,657 +0,0 @@
1/*
2 * bio-integrity.c - bio data integrity extensions
3 *
4 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */
22
23#include <linux/blkdev.h>
24#include <linux/mempool.h>
25#include <linux/export.h>
26#include <linux/bio.h>
27#include <linux/workqueue.h>
28#include <linux/slab.h>
29
30#define BIP_INLINE_VECS 4
31
32static struct kmem_cache *bip_slab;
33static struct workqueue_struct *kintegrityd_wq;
34
35/**
36 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
37 * @bio: bio to attach integrity metadata to
38 * @gfp_mask: Memory allocation mask
39 * @nr_vecs: Number of integrity metadata scatter-gather elements
40 *
41 * Description: This function prepares a bio for attaching integrity
42 * metadata. nr_vecs specifies the maximum number of pages containing
43 * integrity metadata that can be attached.
44 */
45struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
46 gfp_t gfp_mask,
47 unsigned int nr_vecs)
48{
49 struct bio_integrity_payload *bip;
50 struct bio_set *bs = bio->bi_pool;
51 unsigned long idx = BIO_POOL_NONE;
52 unsigned inline_vecs;
53
54 if (!bs) {
55 bip = kmalloc(sizeof(struct bio_integrity_payload) +
56 sizeof(struct bio_vec) * nr_vecs, gfp_mask);
57 inline_vecs = nr_vecs;
58 } else {
59 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
60 inline_vecs = BIP_INLINE_VECS;
61 }
62
63 if (unlikely(!bip))
64 return NULL;
65
66 memset(bip, 0, sizeof(*bip));
67
68 if (nr_vecs > inline_vecs) {
69 bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
70 bs->bvec_integrity_pool);
71 if (!bip->bip_vec)
72 goto err;
73 } else {
74 bip->bip_vec = bip->bip_inline_vecs;
75 }
76
77 bip->bip_slab = idx;
78 bip->bip_bio = bio;
79 bio->bi_integrity = bip;
80
81 return bip;
82err:
83 mempool_free(bip, bs->bio_integrity_pool);
84 return NULL;
85}
86EXPORT_SYMBOL(bio_integrity_alloc);
87
88/**
89 * bio_integrity_free - Free bio integrity payload
90 * @bio: bio containing bip to be freed
91 *
92 * Description: Used to free the integrity portion of a bio. Usually
93 * called from bio_free().
94 */
95void bio_integrity_free(struct bio *bio)
96{
97 struct bio_integrity_payload *bip = bio->bi_integrity;
98 struct bio_set *bs = bio->bi_pool;
99
100 if (bip->bip_owns_buf)
101 kfree(bip->bip_buf);
102
103 if (bs) {
104 if (bip->bip_slab != BIO_POOL_NONE)
105 bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
106 bip->bip_slab);
107
108 mempool_free(bip, bs->bio_integrity_pool);
109 } else {
110 kfree(bip);
111 }
112
113 bio->bi_integrity = NULL;
114}
115EXPORT_SYMBOL(bio_integrity_free);
116
117static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
118{
119 if (bip->bip_slab == BIO_POOL_NONE)
120 return BIP_INLINE_VECS;
121
122 return bvec_nr_vecs(bip->bip_slab);
123}
124
125/**
126 * bio_integrity_add_page - Attach integrity metadata
127 * @bio: bio to update
128 * @page: page containing integrity metadata
129 * @len: number of bytes of integrity metadata in page
130 * @offset: start offset within page
131 *
132 * Description: Attach a page containing integrity metadata to bio.
133 */
134int bio_integrity_add_page(struct bio *bio, struct page *page,
135 unsigned int len, unsigned int offset)
136{
137 struct bio_integrity_payload *bip = bio->bi_integrity;
138 struct bio_vec *iv;
139
140 if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
141 printk(KERN_ERR "%s: bip_vec full\n", __func__);
142 return 0;
143 }
144
145 iv = bip->bip_vec + bip->bip_vcnt;
146
147 iv->bv_page = page;
148 iv->bv_len = len;
149 iv->bv_offset = offset;
150 bip->bip_vcnt++;
151
152 return len;
153}
154EXPORT_SYMBOL(bio_integrity_add_page);
155
156static int bdev_integrity_enabled(struct block_device *bdev, int rw)
157{
158 struct blk_integrity *bi = bdev_get_integrity(bdev);
159
160 if (bi == NULL)
161 return 0;
162
163 if (rw == READ && bi->verify_fn != NULL &&
164 (bi->flags & INTEGRITY_FLAG_READ))
165 return 1;
166
167 if (rw == WRITE && bi->generate_fn != NULL &&
168 (bi->flags & INTEGRITY_FLAG_WRITE))
169 return 1;
170
171 return 0;
172}
173
174/**
175 * bio_integrity_enabled - Check whether integrity can be passed
176 * @bio: bio to check
177 *
178 * Description: Determines whether bio_integrity_prep() can be called
179 * on this bio or not. bio data direction and target device must be
180 * set prior to calling. The function honors the write_generate and
181 * read_verify flags in sysfs.
182 */
183int bio_integrity_enabled(struct bio *bio)
184{
185 if (!bio_is_rw(bio))
186 return 0;
187
188 /* Already protected? */
189 if (bio_integrity(bio))
190 return 0;
191
192 return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
193}
194EXPORT_SYMBOL(bio_integrity_enabled);
195
196/**
197 * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
198 * @bi: blk_integrity profile for device
199 * @sectors: Number of 512 sectors to convert
200 *
201 * Description: The block layer calculates everything in 512 byte
202 * sectors but integrity metadata is done in terms of the hardware
203 * sector size of the storage device. Convert the block layer sectors
204 * to physical sectors.
205 */
206static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
207 unsigned int sectors)
208{
209 /* At this point there are only 512b or 4096b DIF/EPP devices */
210 if (bi->sector_size == 4096)
211 return sectors >>= 3;
212
213 return sectors;
214}
215
216static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
217 unsigned int sectors)
218{
219 return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
220}
221
222/**
223 * bio_integrity_tag_size - Retrieve integrity tag space
224 * @bio: bio to inspect
225 *
226 * Description: Returns the maximum number of tag bytes that can be
227 * attached to this bio. Filesystems can use this to determine how
228 * much metadata to attach to an I/O.
229 */
230unsigned int bio_integrity_tag_size(struct bio *bio)
231{
232 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
233
234 BUG_ON(bio->bi_iter.bi_size == 0);
235
236 return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
237}
238EXPORT_SYMBOL(bio_integrity_tag_size);
239
240static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
241 int set)
242{
243 struct bio_integrity_payload *bip = bio->bi_integrity;
244 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
245 unsigned int nr_sectors;
246
247 BUG_ON(bip->bip_buf == NULL);
248
249 if (bi->tag_size == 0)
250 return -1;
251
252 nr_sectors = bio_integrity_hw_sectors(bi,
253 DIV_ROUND_UP(len, bi->tag_size));
254
255 if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
256 printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
257 nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
258 return -1;
259 }
260
261 if (set)
262 bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
263 else
264 bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
265
266 return 0;
267}
268
269/**
270 * bio_integrity_set_tag - Attach a tag buffer to a bio
271 * @bio: bio to attach buffer to
272 * @tag_buf: Pointer to a buffer containing tag data
273 * @len: Length of the included buffer
274 *
275 * Description: Use this function to tag a bio by leveraging the extra
276 * space provided by devices formatted with integrity protection. The
277 * size of the integrity buffer must be <= to the size reported by
278 * bio_integrity_tag_size().
279 */
280int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
281{
282 BUG_ON(bio_data_dir(bio) != WRITE);
283
284 return bio_integrity_tag(bio, tag_buf, len, 1);
285}
286EXPORT_SYMBOL(bio_integrity_set_tag);
287
288/**
289 * bio_integrity_get_tag - Retrieve a tag buffer from a bio
290 * @bio: bio to retrieve buffer from
291 * @tag_buf: Pointer to a buffer for the tag data
292 * @len: Length of the target buffer
293 *
294 * Description: Use this function to retrieve the tag buffer from a
295 * completed I/O. The size of the integrity buffer must be <= to the
296 * size reported by bio_integrity_tag_size().
297 */
298int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
299{
300 BUG_ON(bio_data_dir(bio) != READ);
301
302 return bio_integrity_tag(bio, tag_buf, len, 0);
303}
304EXPORT_SYMBOL(bio_integrity_get_tag);
305
306/**
307 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
308 * @bio: bio to generate/verify integrity metadata for
309 * @operate: operation flag, 1 for generate, 0 for verify
310 */
311static int bio_integrity_generate_verify(struct bio *bio, int operate)
312{
313 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
314 struct blk_integrity_exchg bix;
315 struct bio_vec *bv;
316 sector_t sector;
317 unsigned int sectors, ret = 0, i;
318 void *prot_buf = bio->bi_integrity->bip_buf;
319
320 if (operate)
321 sector = bio->bi_iter.bi_sector;
322 else
323 sector = bio->bi_integrity->bip_iter.bi_sector;
324
325 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
326 bix.sector_size = bi->sector_size;
327
328 bio_for_each_segment_all(bv, bio, i) {
329 void *kaddr = kmap_atomic(bv->bv_page);
330 bix.data_buf = kaddr + bv->bv_offset;
331 bix.data_size = bv->bv_len;
332 bix.prot_buf = prot_buf;
333 bix.sector = sector;
334
335 if (operate)
336 bi->generate_fn(&bix);
337 else {
338 ret = bi->verify_fn(&bix);
339 if (ret) {
340 kunmap_atomic(kaddr);
341 return ret;
342 }
343 }
344
345 sectors = bv->bv_len / bi->sector_size;
346 sector += sectors;
347 prot_buf += sectors * bi->tuple_size;
348
349 kunmap_atomic(kaddr);
350 }
351 return ret;
352}
353
354/**
355 * bio_integrity_generate - Generate integrity metadata for a bio
356 * @bio: bio to generate integrity metadata for
357 *
358 * Description: Generates integrity metadata for a bio by calling the
359 * block device's generation callback function. The bio must have a
360 * bip attached with enough room to accommodate the generated
361 * integrity metadata.
362 */
363static void bio_integrity_generate(struct bio *bio)
364{
365 bio_integrity_generate_verify(bio, 1);
366}
367
368static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
369{
370 if (bi)
371 return bi->tuple_size;
372
373 return 0;
374}
375
376/**
377 * bio_integrity_prep - Prepare bio for integrity I/O
378 * @bio: bio to prepare
379 *
380 * Description: Allocates a buffer for integrity metadata, maps the
381 * pages and attaches them to a bio. The bio must have data
382 * direction, target device and start sector set prior to calling. In
383 * the WRITE case, integrity metadata will be generated using the
384 * block device's integrity function. In the READ case, the buffer
385 * will be prepared for DMA and a suitable end_io handler set up.
386 */
387int bio_integrity_prep(struct bio *bio)
388{
389 struct bio_integrity_payload *bip;
390 struct blk_integrity *bi;
391 struct request_queue *q;
392 void *buf;
393 unsigned long start, end;
394 unsigned int len, nr_pages;
395 unsigned int bytes, offset, i;
396 unsigned int sectors;
397
398 bi = bdev_get_integrity(bio->bi_bdev);
399 q = bdev_get_queue(bio->bi_bdev);
400 BUG_ON(bi == NULL);
401 BUG_ON(bio_integrity(bio));
402
403 sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
404
405 /* Allocate kernel buffer for protection data */
406 len = sectors * blk_integrity_tuple_size(bi);
407 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
408 if (unlikely(buf == NULL)) {
409 printk(KERN_ERR "could not allocate integrity buffer\n");
410 return -ENOMEM;
411 }
412
413 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
414 start = ((unsigned long) buf) >> PAGE_SHIFT;
415 nr_pages = end - start;
416
417 /* Allocate bio integrity payload and integrity vectors */
418 bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
419 if (unlikely(bip == NULL)) {
420 printk(KERN_ERR "could not allocate data integrity bioset\n");
421 kfree(buf);
422 return -EIO;
423 }
424
425 bip->bip_owns_buf = 1;
426 bip->bip_buf = buf;
427 bip->bip_iter.bi_size = len;
428 bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
429
430 /* Map it */
431 offset = offset_in_page(buf);
432 for (i = 0 ; i < nr_pages ; i++) {
433 int ret;
434 bytes = PAGE_SIZE - offset;
435
436 if (len <= 0)
437 break;
438
439 if (bytes > len)
440 bytes = len;
441
442 ret = bio_integrity_add_page(bio, virt_to_page(buf),
443 bytes, offset);
444
445 if (ret == 0)
446 return 0;
447
448 if (ret < bytes)
449 break;
450
451 buf += bytes;
452 len -= bytes;
453 offset = 0;
454 }
455
456 /* Install custom I/O completion handler if read verify is enabled */
457 if (bio_data_dir(bio) == READ) {
458 bip->bip_end_io = bio->bi_end_io;
459 bio->bi_end_io = bio_integrity_endio;
460 }
461
462 /* Auto-generate integrity metadata if this is a write */
463 if (bio_data_dir(bio) == WRITE)
464 bio_integrity_generate(bio);
465
466 return 0;
467}
468EXPORT_SYMBOL(bio_integrity_prep);
469
470/**
471 * bio_integrity_verify - Verify integrity metadata for a bio
472 * @bio: bio to verify
473 *
474 * Description: This function is called to verify the integrity of a
475 * bio. The data in the bio io_vec is compared to the integrity
476 * metadata returned by the HBA.
477 */
478static int bio_integrity_verify(struct bio *bio)
479{
480 return bio_integrity_generate_verify(bio, 0);
481}
482
483/**
484 * bio_integrity_verify_fn - Integrity I/O completion worker
485 * @work: Work struct stored in bio to be verified
486 *
487 * Description: This workqueue function is called to complete a READ
488 * request. The function verifies the transferred integrity metadata
489 * and then calls the original bio end_io function.
490 */
491static void bio_integrity_verify_fn(struct work_struct *work)
492{
493 struct bio_integrity_payload *bip =
494 container_of(work, struct bio_integrity_payload, bip_work);
495 struct bio *bio = bip->bip_bio;
496 int error;
497
498 error = bio_integrity_verify(bio);
499
500 /* Restore original bio completion handler */
501 bio->bi_end_io = bip->bip_end_io;
502 bio_endio_nodec(bio, error);
503}
504
505/**
506 * bio_integrity_endio - Integrity I/O completion function
507 * @bio: Protected bio
508 * @error: Pointer to errno
509 *
510 * Description: Completion for integrity I/O
511 *
512 * Normally I/O completion is done in interrupt context. However,
513 * verifying I/O integrity is a time-consuming task which must be run
514 * in process context. This function postpones completion
515 * accordingly.
516 */
517void bio_integrity_endio(struct bio *bio, int error)
518{
519 struct bio_integrity_payload *bip = bio->bi_integrity;
520
521 BUG_ON(bip->bip_bio != bio);
522
523 /* In case of an I/O error there is no point in verifying the
524 * integrity metadata. Restore original bio end_io handler
525 * and run it.
526 */
527 if (error) {
528 bio->bi_end_io = bip->bip_end_io;
529 bio_endio(bio, error);
530
531 return;
532 }
533
534 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
535 queue_work(kintegrityd_wq, &bip->bip_work);
536}
537EXPORT_SYMBOL(bio_integrity_endio);
538
539/**
540 * bio_integrity_advance - Advance integrity vector
541 * @bio: bio whose integrity vector to update
542 * @bytes_done: number of data bytes that have been completed
543 *
544 * Description: This function calculates how many integrity bytes the
545 * number of completed data bytes correspond to and advances the
546 * integrity vector accordingly.
547 */
548void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
549{
550 struct bio_integrity_payload *bip = bio->bi_integrity;
551 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
552 unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
553
554 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
555}
556EXPORT_SYMBOL(bio_integrity_advance);
557
558/**
559 * bio_integrity_trim - Trim integrity vector
560 * @bio: bio whose integrity vector to update
561 * @offset: offset to first data sector
562 * @sectors: number of data sectors
563 *
564 * Description: Used to trim the integrity vector in a cloned bio.
565 * The ivec will be advanced corresponding to 'offset' data sectors
566 * and the length will be truncated corresponding to 'len' data
567 * sectors.
568 */
569void bio_integrity_trim(struct bio *bio, unsigned int offset,
570 unsigned int sectors)
571{
572 struct bio_integrity_payload *bip = bio->bi_integrity;
573 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
574
575 bio_integrity_advance(bio, offset << 9);
576 bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
577}
578EXPORT_SYMBOL(bio_integrity_trim);
579
580/**
581 * bio_integrity_clone - Callback for cloning bios with integrity metadata
582 * @bio: New bio
583 * @bio_src: Original bio
584 * @gfp_mask: Memory allocation mask
585 *
586 * Description: Called to allocate a bip when cloning a bio
587 */
588int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
589 gfp_t gfp_mask)
590{
591 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
592 struct bio_integrity_payload *bip;
593
594 BUG_ON(bip_src == NULL);
595
596 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
597
598 if (bip == NULL)
599 return -EIO;
600
601 memcpy(bip->bip_vec, bip_src->bip_vec,
602 bip_src->bip_vcnt * sizeof(struct bio_vec));
603
604 bip->bip_vcnt = bip_src->bip_vcnt;
605 bip->bip_iter = bip_src->bip_iter;
606
607 return 0;
608}
609EXPORT_SYMBOL(bio_integrity_clone);
610
611int bioset_integrity_create(struct bio_set *bs, int pool_size)
612{
613 if (bs->bio_integrity_pool)
614 return 0;
615
616 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
617 if (!bs->bio_integrity_pool)
618 return -1;
619
620 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
621 if (!bs->bvec_integrity_pool) {
622 mempool_destroy(bs->bio_integrity_pool);
623 return -1;
624 }
625
626 return 0;
627}
628EXPORT_SYMBOL(bioset_integrity_create);
629
630void bioset_integrity_free(struct bio_set *bs)
631{
632 if (bs->bio_integrity_pool)
633 mempool_destroy(bs->bio_integrity_pool);
634
635 if (bs->bvec_integrity_pool)
636 mempool_destroy(bs->bvec_integrity_pool);
637}
638EXPORT_SYMBOL(bioset_integrity_free);
639
640void __init bio_integrity_init(void)
641{
642 /*
643 * kintegrityd won't block much but may burn a lot of CPU cycles.
644 * Make it highpri CPU intensive wq with max concurrency of 1.
645 */
646 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
647 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
648 if (!kintegrityd_wq)
649 panic("Failed to create kintegrityd\n");
650
651 bip_slab = kmem_cache_create("bio_integrity_payload",
652 sizeof(struct bio_integrity_payload) +
653 sizeof(struct bio_vec) * BIP_INLINE_VECS,
654 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
655 if (!bip_slab)
656 panic("Failed to create slab\n");
657}
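As a side note on the arithmetic in bio_integrity_hw_sectors() and bio_integrity_bytes() above: the block layer counts 512-byte sectors, while protection information is laid out per hardware sector, one tuple each. The tiny standalone restatement below uses made-up names (hw_sectors(), integrity_bytes()) and assumes an 8-byte T10 DIF tuple purely for the worked numbers.

#include <stdio.h>

/* 512-byte block layer sectors -> hardware sectors (512B or 4096B devices). */
static unsigned int hw_sectors(unsigned int sectors, unsigned int sector_size)
{
	return sector_size == 4096 ? sectors >> 3 : sectors;
}

/* Bytes of protection information: one tuple per hardware sector. */
static unsigned int integrity_bytes(unsigned int sectors,
				    unsigned int sector_size,
				    unsigned int tuple_size)
{
	return hw_sectors(sectors, sector_size) * tuple_size;
}

int main(void)
{
	/* 64KiB of data = 128 block layer sectors; with 512B hardware
	 * sectors and 8-byte tuples that is 1024 bytes of metadata,
	 * with 4096B hardware sectors it is only 128 bytes. */
	printf("%u %u\n", integrity_bytes(128, 512, 8),
	       integrity_bytes(128, 4096, 8));
	return 0;
}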
diff --git a/fs/bio.c b/fs/bio.c
deleted file mode 100644
index 6f0362b77806..000000000000
--- a/fs/bio.c
+++ /dev/null
@@ -1,2037 +0,0 @@
1/*
2 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
16 *
17 */
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/bio.h>
21#include <linux/blkdev.h>
22#include <linux/uio.h>
23#include <linux/iocontext.h>
24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/kernel.h>
27#include <linux/export.h>
28#include <linux/mempool.h>
29#include <linux/workqueue.h>
30#include <linux/cgroup.h>
31#include <scsi/sg.h> /* for struct sg_iovec */
32
33#include <trace/events/block.h>
34
35/*
36 * Test patch to inline a certain number of bi_io_vec's inside the bio
37 * itself, to shrink a bio data allocation from two mempool calls to one
38 */
39#define BIO_INLINE_VECS 4
40
41/*
42 * if you change this list, also change bvec_alloc or things will
43 * break badly! cannot be bigger than what you can fit into an
44 * unsigned short
45 */
46#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
47static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
48 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
49};
50#undef BV
51
52/*
53 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
54 * IO code that does not need private memory pools.
55 */
56struct bio_set *fs_bio_set;
57EXPORT_SYMBOL(fs_bio_set);
58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab, *new_bio_slabs;
77 unsigned int new_bio_slab_max;
78 unsigned int i, entry = -1;
79
80 mutex_lock(&bio_slab_lock);
81
82 i = 0;
83 while (i < bio_slab_nr) {
84 bslab = &bio_slabs[i];
85
86 if (!bslab->slab && entry == -1)
87 entry = i;
88 else if (bslab->slab_size == sz) {
89 slab = bslab->slab;
90 bslab->slab_ref++;
91 break;
92 }
93 i++;
94 }
95
96 if (slab)
97 goto out_unlock;
98
99 if (bio_slab_nr == bio_slab_max && entry == -1) {
100 new_bio_slab_max = bio_slab_max << 1;
101 new_bio_slabs = krealloc(bio_slabs,
102 new_bio_slab_max * sizeof(struct bio_slab),
103 GFP_KERNEL);
104 if (!new_bio_slabs)
105 goto out_unlock;
106 bio_slab_max = new_bio_slab_max;
107 bio_slabs = new_bio_slabs;
108 }
109 if (entry == -1)
110 entry = bio_slab_nr++;
111
112 bslab = &bio_slabs[entry];
113
114 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
115 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
116 if (!slab)
117 goto out_unlock;
118
119 bslab->slab = slab;
120 bslab->slab_ref = 1;
121 bslab->slab_size = sz;
122out_unlock:
123 mutex_unlock(&bio_slab_lock);
124 return slab;
125}
126
127static void bio_put_slab(struct bio_set *bs)
128{
129 struct bio_slab *bslab = NULL;
130 unsigned int i;
131
132 mutex_lock(&bio_slab_lock);
133
134 for (i = 0; i < bio_slab_nr; i++) {
135 if (bs->bio_slab == bio_slabs[i].slab) {
136 bslab = &bio_slabs[i];
137 break;
138 }
139 }
140
141 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
142 goto out;
143
144 WARN_ON(!bslab->slab_ref);
145
146 if (--bslab->slab_ref)
147 goto out;
148
149 kmem_cache_destroy(bslab->slab);
150 bslab->slab = NULL;
151
152out:
153 mutex_unlock(&bio_slab_lock);
154}
155
156unsigned int bvec_nr_vecs(unsigned short idx)
157{
158 return bvec_slabs[idx].nr_vecs;
159}
160
161void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
162{
163 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
164
165 if (idx == BIOVEC_MAX_IDX)
166 mempool_free(bv, pool);
167 else {
168 struct biovec_slab *bvs = bvec_slabs + idx;
169
170 kmem_cache_free(bvs->slab, bv);
171 }
172}
173
174struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
175 mempool_t *pool)
176{
177 struct bio_vec *bvl;
178
179 /*
180 * see comment near bvec_array define!
181 */
182 switch (nr) {
183 case 1:
184 *idx = 0;
185 break;
186 case 2 ... 4:
187 *idx = 1;
188 break;
189 case 5 ... 16:
190 *idx = 2;
191 break;
192 case 17 ... 64:
193 *idx = 3;
194 break;
195 case 65 ... 128:
196 *idx = 4;
197 break;
198 case 129 ... BIO_MAX_PAGES:
199 *idx = 5;
200 break;
201 default:
202 return NULL;
203 }
204
205 /*
206 * idx now points to the pool we want to allocate from. only the
207 * 1-vec entry pool is mempool backed.
208 */
209 if (*idx == BIOVEC_MAX_IDX) {
210fallback:
211 bvl = mempool_alloc(pool, gfp_mask);
212 } else {
213 struct biovec_slab *bvs = bvec_slabs + *idx;
214 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
215
216 /*
217 * Make this allocation restricted and don't dump info on
218 * allocation failures, since we'll fallback to the mempool
219 * in case of failure.
220 */
221 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
222
223 /*
224 * Try a slab allocation. If this fails and __GFP_WAIT
225 * is set, retry with the 1-entry mempool
226 */
227 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
228 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
229 *idx = BIOVEC_MAX_IDX;
230 goto fallback;
231 }
232 }
233
234 return bvl;
235}
236
237static void __bio_free(struct bio *bio)
238{
239 bio_disassociate_task(bio);
240
241 if (bio_integrity(bio))
242 bio_integrity_free(bio);
243}
244
245static void bio_free(struct bio *bio)
246{
247 struct bio_set *bs = bio->bi_pool;
248 void *p;
249
250 __bio_free(bio);
251
252 if (bs) {
253 if (bio_flagged(bio, BIO_OWNS_VEC))
254 bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
255
256 /*
257 * If we have front padding, adjust the bio pointer before freeing
258 */
259 p = bio;
260 p -= bs->front_pad;
261
262 mempool_free(p, bs->bio_pool);
263 } else {
264 /* Bio was allocated by bio_kmalloc() */
265 kfree(bio);
266 }
267}
268
269void bio_init(struct bio *bio)
270{
271 memset(bio, 0, sizeof(*bio));
272 bio->bi_flags = 1 << BIO_UPTODATE;
273 atomic_set(&bio->bi_remaining, 1);
274 atomic_set(&bio->bi_cnt, 1);
275}
276EXPORT_SYMBOL(bio_init);
277
278/**
279 * bio_reset - reinitialize a bio
280 * @bio: bio to reset
281 *
282 * Description:
283 * After calling bio_reset(), @bio will be in the same state as a freshly
284 * allocated bio returned by bio_alloc_bioset() - the only fields that are
285 * preserved are the ones that are initialized by bio_alloc_bioset(). See
286 * comment in struct bio.
287 */
288void bio_reset(struct bio *bio)
289{
290 unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
291
292 __bio_free(bio);
293
294 memset(bio, 0, BIO_RESET_BYTES);
295 bio->bi_flags = flags|(1 << BIO_UPTODATE);
296 atomic_set(&bio->bi_remaining, 1);
297}
298EXPORT_SYMBOL(bio_reset);
299
300static void bio_chain_endio(struct bio *bio, int error)
301{
302 bio_endio(bio->bi_private, error);
303 bio_put(bio);
304}
305
306/**
307 * bio_chain - chain bio completions
308 *
309 * The caller won't have a bi_end_io called when @bio completes - instead,
310 * @parent's bi_end_io won't be called until both @parent and @bio have
311 * completed; the chained bio will also be freed when it completes.
312 *
313 * The caller must not set bi_private or bi_end_io in @bio.
314 */
315void bio_chain(struct bio *bio, struct bio *parent)
316{
317 BUG_ON(bio->bi_private || bio->bi_end_io);
318
319 bio->bi_private = parent;
320 bio->bi_end_io = bio_chain_endio;
321 atomic_inc(&parent->bi_remaining);
322}
323EXPORT_SYMBOL(bio_chain);
324
325static void bio_alloc_rescue(struct work_struct *work)
326{
327 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
328 struct bio *bio;
329
330 while (1) {
331 spin_lock(&bs->rescue_lock);
332 bio = bio_list_pop(&bs->rescue_list);
333 spin_unlock(&bs->rescue_lock);
334
335 if (!bio)
336 break;
337
338 generic_make_request(bio);
339 }
340}
341
342static void punt_bios_to_rescuer(struct bio_set *bs)
343{
344 struct bio_list punt, nopunt;
345 struct bio *bio;
346
347 /*
348 * In order to guarantee forward progress we must punt only bios that
349 * were allocated from this bio_set; otherwise, if there was a bio on
350 * there for a stacking driver higher up in the stack, processing it
351 * could require allocating bios from this bio_set, and doing that from
352 * our own rescuer would be bad.
353 *
354 * Since bio lists are singly linked, pop them all instead of trying to
355 * remove from the middle of the list:
356 */
357
358 bio_list_init(&punt);
359 bio_list_init(&nopunt);
360
361 while ((bio = bio_list_pop(current->bio_list)))
362 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
363
364 *current->bio_list = nopunt;
365
366 spin_lock(&bs->rescue_lock);
367 bio_list_merge(&bs->rescue_list, &punt);
368 spin_unlock(&bs->rescue_lock);
369
370 queue_work(bs->rescue_workqueue, &bs->rescue_work);
371}
372
373/**
374 * bio_alloc_bioset - allocate a bio for I/O
375 * @gfp_mask: the GFP_ mask given to the slab allocator
376 * @nr_iovecs: number of iovecs to pre-allocate
377 * @bs: the bio_set to allocate from.
378 *
379 * Description:
380 * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
381 * backed by the @bs's mempool.
382 *
383 * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
384 * able to allocate a bio. This is due to the mempool guarantees. To make this
385 * work, callers must never allocate more than 1 bio at a time from this pool.
386 * Callers that need to allocate more than 1 bio must always submit the
387 * previously allocated bio for IO before attempting to allocate a new one.
388 * Failure to do so can cause deadlocks under memory pressure.
389 *
390 * Note that when running under generic_make_request() (i.e. any block
391 * driver), bios are not submitted until after you return - see the code in
392 * generic_make_request() that converts recursion into iteration, to prevent
393 * stack overflows.
394 *
395 * This would normally mean allocating multiple bios under
396 * generic_make_request() would be susceptible to deadlocks, but we have
397 * deadlock avoidance code that resubmits any blocked bios from a rescuer
398 * thread.
399 *
400 * However, we do not guarantee forward progress for allocations from other
401 * mempools. Doing multiple allocations from the same mempool under
402 * generic_make_request() should be avoided - instead, use bio_set's front_pad
403 * for per bio allocations.
404 *
405 * RETURNS:
406 * Pointer to new bio on success, NULL on failure.
407 */
408struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
409{
410 gfp_t saved_gfp = gfp_mask;
411 unsigned front_pad;
412 unsigned inline_vecs;
413 unsigned long idx = BIO_POOL_NONE;
414 struct bio_vec *bvl = NULL;
415 struct bio *bio;
416 void *p;
417
418 if (!bs) {
419 if (nr_iovecs > UIO_MAXIOV)
420 return NULL;
421
422 p = kmalloc(sizeof(struct bio) +
423 nr_iovecs * sizeof(struct bio_vec),
424 gfp_mask);
425 front_pad = 0;
426 inline_vecs = nr_iovecs;
427 } else {
428 /*
429 * generic_make_request() converts recursion to iteration; this
430 * means if we're running beneath it, any bios we allocate and
431 * submit will not be submitted (and thus freed) until after we
432 * return.
433 *
434 * This exposes us to a potential deadlock if we allocate
435 * multiple bios from the same bio_set() while running
436 * underneath generic_make_request(). If we were to allocate
437 * multiple bios (say a stacking block driver that was splitting
438 * bios), we would deadlock if we exhausted the mempool's
439 * reserve.
440 *
441 * We solve this, and guarantee forward progress, with a rescuer
442 * workqueue per bio_set. If we go to allocate and there are
443 * bios on current->bio_list, we first try the allocation
444 * without __GFP_WAIT; if that fails, we punt those bios we
445 * would be blocking to the rescuer workqueue before we retry
446 * with the original gfp_flags.
447 */
448
449 if (current->bio_list && !bio_list_empty(current->bio_list))
450 gfp_mask &= ~__GFP_WAIT;
451
452 p = mempool_alloc(bs->bio_pool, gfp_mask);
453 if (!p && gfp_mask != saved_gfp) {
454 punt_bios_to_rescuer(bs);
455 gfp_mask = saved_gfp;
456 p = mempool_alloc(bs->bio_pool, gfp_mask);
457 }
458
459 front_pad = bs->front_pad;
460 inline_vecs = BIO_INLINE_VECS;
461 }
462
463 if (unlikely(!p))
464 return NULL;
465
466 bio = p + front_pad;
467 bio_init(bio);
468
469 if (nr_iovecs > inline_vecs) {
470 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
471 if (!bvl && gfp_mask != saved_gfp) {
472 punt_bios_to_rescuer(bs);
473 gfp_mask = saved_gfp;
474 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
475 }
476
477 if (unlikely(!bvl))
478 goto err_free;
479
480 bio->bi_flags |= 1 << BIO_OWNS_VEC;
481 } else if (nr_iovecs) {
482 bvl = bio->bi_inline_vecs;
483 }
484
485 bio->bi_pool = bs;
486 bio->bi_flags |= idx << BIO_POOL_OFFSET;
487 bio->bi_max_vecs = nr_iovecs;
488 bio->bi_io_vec = bvl;
489 return bio;
490
491err_free:
492 mempool_free(p, bs->bio_pool);
493 return NULL;
494}
495EXPORT_SYMBOL(bio_alloc_bioset);
496
497void zero_fill_bio(struct bio *bio)
498{
499 unsigned long flags;
500 struct bio_vec bv;
501 struct bvec_iter iter;
502
503 bio_for_each_segment(bv, bio, iter) {
504 char *data = bvec_kmap_irq(&bv, &flags);
505 memset(data, 0, bv.bv_len);
506 flush_dcache_page(bv.bv_page);
507 bvec_kunmap_irq(data, &flags);
508 }
509}
510EXPORT_SYMBOL(zero_fill_bio);
511
512/**
513 * bio_put - release a reference to a bio
514 * @bio: bio to release reference to
515 *
516 * Description:
517 * Put a reference to a &struct bio, either one you have gotten with
518 * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
519 **/
520void bio_put(struct bio *bio)
521{
522 BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
523
524 /*
525 * last put frees it
526 */
527 if (atomic_dec_and_test(&bio->bi_cnt))
528 bio_free(bio);
529}
530EXPORT_SYMBOL(bio_put);
531
532inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
533{
534 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
535 blk_recount_segments(q, bio);
536
537 return bio->bi_phys_segments;
538}
539EXPORT_SYMBOL(bio_phys_segments);
540
541/**
542 * __bio_clone_fast - clone a bio that shares the original bio's biovec
543 * @bio: destination bio
544 * @bio_src: bio to clone
545 *
546 * Clone a &bio. Caller will own the returned bio, but not
547 * the actual data it points to. Reference count of returned
548 * bio will be one.
549 *
550 * Caller must ensure that @bio_src is not freed before @bio.
551 */
552void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
553{
554 BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
555
556 /*
557 * most users will be overriding ->bi_bdev with a new target,
558 * so we don't set nor calculate new physical/hw segment counts here
559 */
560 bio->bi_bdev = bio_src->bi_bdev;
561 bio->bi_flags |= 1 << BIO_CLONED;
562 bio->bi_rw = bio_src->bi_rw;
563 bio->bi_iter = bio_src->bi_iter;
564 bio->bi_io_vec = bio_src->bi_io_vec;
565}
566EXPORT_SYMBOL(__bio_clone_fast);
567
568/**
569 * bio_clone_fast - clone a bio that shares the original bio's biovec
570 * @bio: bio to clone
571 * @gfp_mask: allocation priority
572 * @bs: bio_set to allocate from
573 *
574 * Like __bio_clone_fast, only also allocates the returned bio
575 */
576struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
577{
578 struct bio *b;
579
580 b = bio_alloc_bioset(gfp_mask, 0, bs);
581 if (!b)
582 return NULL;
583
584 __bio_clone_fast(b, bio);
585
586 if (bio_integrity(bio)) {
587 int ret;
588
589 ret = bio_integrity_clone(b, bio, gfp_mask);
590
591 if (ret < 0) {
592 bio_put(b);
593 return NULL;
594 }
595 }
596
597 return b;
598}
599EXPORT_SYMBOL(bio_clone_fast);
600
601/**
602 * bio_clone_bioset - clone a bio
603 * @bio_src: bio to clone
604 * @gfp_mask: allocation priority
605 * @bs: bio_set to allocate from
606 *
607 * Clone bio. Caller will own the returned bio, but not the actual data it
608 * points to. Reference count of returned bio will be one.
609 */
610struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
611 struct bio_set *bs)
612{
613 struct bvec_iter iter;
614 struct bio_vec bv;
615 struct bio *bio;
616
617 /*
618 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
619 * bio_src->bi_io_vec to bio->bi_io_vec.
620 *
621 * We can't do that anymore, because:
622 *
623 * - The point of cloning the biovec is to produce a bio with a biovec
624 * the caller can modify: bi_idx and bi_bvec_done should be 0.
625 *
626 * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
627 * we tried to clone the whole thing bio_alloc_bioset() would fail.
628 * But the clone should succeed as long as the number of biovecs we
629 * actually need to allocate is fewer than BIO_MAX_PAGES.
630 *
631 * - Lastly, bi_vcnt should not be looked at or relied upon by code
632 * that does not own the bio - reason being drivers don't use it for
633 * iterating over the biovec anymore, so expecting it to be kept up
634 * to date (i.e. for clones that share the parent biovec) is just
635 * asking for trouble and would force extra work on
636 * __bio_clone_fast() anyways.
637 */
638
639 bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
640 if (!bio)
641 return NULL;
642
643 bio->bi_bdev = bio_src->bi_bdev;
644 bio->bi_rw = bio_src->bi_rw;
645 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
646 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
647
648 if (bio->bi_rw & REQ_DISCARD)
649 goto integrity_clone;
650
651 if (bio->bi_rw & REQ_WRITE_SAME) {
652 bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
653 goto integrity_clone;
654 }
655
656 bio_for_each_segment(bv, bio_src, iter)
657 bio->bi_io_vec[bio->bi_vcnt++] = bv;
658
659integrity_clone:
660 if (bio_integrity(bio_src)) {
661 int ret;
662
663 ret = bio_integrity_clone(bio, bio_src, gfp_mask);
664 if (ret < 0) {
665 bio_put(bio);
666 return NULL;
667 }
668 }
669
670 return bio;
671}
672EXPORT_SYMBOL(bio_clone_bioset);
673
674/**
675 * bio_get_nr_vecs - return approx number of vecs
676 * @bdev: I/O target
677 *
678 * Return the approximate number of pages we can send to this target.
679 * There's no guarantee that you will be able to fit this number of pages
680 * into a bio, it does not account for dynamic restrictions that vary
681 * on offset.
682 */
683int bio_get_nr_vecs(struct block_device *bdev)
684{
685 struct request_queue *q = bdev_get_queue(bdev);
686 int nr_pages;
687
688 nr_pages = min_t(unsigned,
689 queue_max_segments(q),
690 queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
691
692 return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
693
694}
695EXPORT_SYMBOL(bio_get_nr_vecs);
696
697static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
698 *page, unsigned int len, unsigned int offset,
699 unsigned int max_sectors)
700{
701 int retried_segments = 0;
702 struct bio_vec *bvec;
703
704 /*
705 * cloned bio must not modify vec list
706 */
707 if (unlikely(bio_flagged(bio, BIO_CLONED)))
708 return 0;
709
710 if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
711 return 0;
712
713 /*
714 * For filesystems with a blocksize smaller than the pagesize
715 * we will often be called with the same page as last time and
716 * a consecutive offset. Optimize this special case.
717 */
718 if (bio->bi_vcnt > 0) {
719 struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
720
721 if (page == prev->bv_page &&
722 offset == prev->bv_offset + prev->bv_len) {
723 unsigned int prev_bv_len = prev->bv_len;
724 prev->bv_len += len;
725
726 if (q->merge_bvec_fn) {
727 struct bvec_merge_data bvm = {
728 /* prev_bvec is already charged in
729 bi_size, discharge it in order to
730 simulate merging updated prev_bvec
731 as new bvec. */
732 .bi_bdev = bio->bi_bdev,
733 .bi_sector = bio->bi_iter.bi_sector,
734 .bi_size = bio->bi_iter.bi_size -
735 prev_bv_len,
736 .bi_rw = bio->bi_rw,
737 };
738
739 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
740 prev->bv_len -= len;
741 return 0;
742 }
743 }
744
745 goto done;
746 }
747 }
748
749 if (bio->bi_vcnt >= bio->bi_max_vecs)
750 return 0;
751
752 /*
753 * we might lose a segment or two here, but rather that than
754 * make this too complex.
755 */
756
757 while (bio->bi_phys_segments >= queue_max_segments(q)) {
758
759 if (retried_segments)
760 return 0;
761
762 retried_segments = 1;
763 blk_recount_segments(q, bio);
764 }
765
766 /*
767 * setup the new entry, we might clear it again later if we
768 * cannot add the page
769 */
770 bvec = &bio->bi_io_vec[bio->bi_vcnt];
771 bvec->bv_page = page;
772 bvec->bv_len = len;
773 bvec->bv_offset = offset;
774
775 /*
776 * if queue has other restrictions (eg varying max sector size
777 * depending on offset), it can specify a merge_bvec_fn in the
778 * queue to get further control
779 */
780 if (q->merge_bvec_fn) {
781 struct bvec_merge_data bvm = {
782 .bi_bdev = bio->bi_bdev,
783 .bi_sector = bio->bi_iter.bi_sector,
784 .bi_size = bio->bi_iter.bi_size,
785 .bi_rw = bio->bi_rw,
786 };
787
788 /*
789 * merge_bvec_fn() returns number of bytes it can accept
790 * at this offset
791 */
792 if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
793 bvec->bv_page = NULL;
794 bvec->bv_len = 0;
795 bvec->bv_offset = 0;
796 return 0;
797 }
798 }
799
800 /* If we may be able to merge these biovecs, force a recount */
801 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
802 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
803
804 bio->bi_vcnt++;
805 bio->bi_phys_segments++;
806 done:
807 bio->bi_iter.bi_size += len;
808 return len;
809}
810
811/**
812 * bio_add_pc_page - attempt to add page to bio
813 * @q: the target queue
814 * @bio: destination bio
815 * @page: page to add
816 * @len: vec entry length
817 * @offset: vec entry offset
818 *
819 * Attempt to add a page to the bio_vec maplist. This can fail for a
820 * number of reasons, such as the bio being full or target block device
821 * limitations. The target block device must allow bio's up to PAGE_SIZE,
822 * so it is always possible to add a single page to an empty bio.
823 *
824 * This should only be used by REQ_PC bios.
825 */
826int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
827 unsigned int len, unsigned int offset)
828{
829 return __bio_add_page(q, bio, page, len, offset,
830 queue_max_hw_sectors(q));
831}
832EXPORT_SYMBOL(bio_add_pc_page);
833
834/**
835 * bio_add_page - attempt to add page to bio
836 * @bio: destination bio
837 * @page: page to add
838 * @len: vec entry length
839 * @offset: vec entry offset
840 *
841 * Attempt to add a page to the bio_vec maplist. This can fail for a
842 * number of reasons, such as the bio being full or target block device
843 * limitations. The target block device must allow bio's up to PAGE_SIZE,
844 * so it is always possible to add a single page to an empty bio.
845 */
846int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
847 unsigned int offset)
848{
849 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
850 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
851}
852EXPORT_SYMBOL(bio_add_page);
853
854struct submit_bio_ret {
855 struct completion event;
856 int error;
857};
858
859static void submit_bio_wait_endio(struct bio *bio, int error)
860{
861 struct submit_bio_ret *ret = bio->bi_private;
862
863 ret->error = error;
864 complete(&ret->event);
865}
866
867/**
868 * submit_bio_wait - submit a bio, and wait until it completes
869 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
870 * @bio: The &struct bio which describes the I/O
871 *
872 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
873 * bio_endio() on failure.
874 */
875int submit_bio_wait(int rw, struct bio *bio)
876{
877 struct submit_bio_ret ret;
878
879 rw |= REQ_SYNC;
880 init_completion(&ret.event);
881 bio->bi_private = &ret;
882 bio->bi_end_io = submit_bio_wait_endio;
883 submit_bio(rw, bio);
884 wait_for_completion(&ret.event);
885
886 return ret.error;
887}
888EXPORT_SYMBOL(submit_bio_wait);
889
890/**
891 * bio_advance - increment/complete a bio by some number of bytes
892 * @bio: bio to advance
893 * @bytes: number of bytes to complete
894 *
895 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
896 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
897 * be updated on the last bvec as well.
898 *
899 * @bio will then represent the remaining, uncompleted portion of the io.
900 */
901void bio_advance(struct bio *bio, unsigned bytes)
902{
903 if (bio_integrity(bio))
904 bio_integrity_advance(bio, bytes);
905
906 bio_advance_iter(bio, &bio->bi_iter, bytes);
907}
908EXPORT_SYMBOL(bio_advance);
909
910/**
911 * bio_alloc_pages - allocates a single page for each bvec in a bio
912 * @bio: bio to allocate pages for
913 * @gfp_mask: flags for allocation
914 *
915 * Allocates pages up to @bio->bi_vcnt.
916 *
917 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
918 * freed.
919 */
920int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
921{
922 int i;
923 struct bio_vec *bv;
924
925 bio_for_each_segment_all(bv, bio, i) {
926 bv->bv_page = alloc_page(gfp_mask);
927 if (!bv->bv_page) {
928 while (--bv >= bio->bi_io_vec)
929 __free_page(bv->bv_page);
930 return -ENOMEM;
931 }
932 }
933
934 return 0;
935}
936EXPORT_SYMBOL(bio_alloc_pages);
937
938/**
939 * bio_copy_data - copy contents of data buffers from one chain of bios to
940 * another
941 * @src: source bio list
942 * @dst: destination bio list
943 *
944 * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
945 * @src and @dst as linked lists of bios.
946 *
947 * Stops when it reaches the end of either @src or @dst - that is, copies
948 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
949 */
950void bio_copy_data(struct bio *dst, struct bio *src)
951{
952 struct bvec_iter src_iter, dst_iter;
953 struct bio_vec src_bv, dst_bv;
954 void *src_p, *dst_p;
955 unsigned bytes;
956
957 src_iter = src->bi_iter;
958 dst_iter = dst->bi_iter;
959
960 while (1) {
961 if (!src_iter.bi_size) {
962 src = src->bi_next;
963 if (!src)
964 break;
965
966 src_iter = src->bi_iter;
967 }
968
969 if (!dst_iter.bi_size) {
970 dst = dst->bi_next;
971 if (!dst)
972 break;
973
974 dst_iter = dst->bi_iter;
975 }
976
977 src_bv = bio_iter_iovec(src, src_iter);
978 dst_bv = bio_iter_iovec(dst, dst_iter);
979
980 bytes = min(src_bv.bv_len, dst_bv.bv_len);
981
982 src_p = kmap_atomic(src_bv.bv_page);
983 dst_p = kmap_atomic(dst_bv.bv_page);
984
985 memcpy(dst_p + dst_bv.bv_offset,
986 src_p + src_bv.bv_offset,
987 bytes);
988
989 kunmap_atomic(dst_p);
990 kunmap_atomic(src_p);
991
992 bio_advance_iter(src, &src_iter, bytes);
993 bio_advance_iter(dst, &dst_iter, bytes);
994 }
995}
996EXPORT_SYMBOL(bio_copy_data);
997
998struct bio_map_data {
999 int nr_sgvecs;
1000 int is_our_pages;
1001 struct sg_iovec sgvecs[];
1002};
1003
1004static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1005 const struct sg_iovec *iov, int iov_count,
1006 int is_our_pages)
1007{
1008 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
1009 bmd->nr_sgvecs = iov_count;
1010 bmd->is_our_pages = is_our_pages;
1011 bio->bi_private = bmd;
1012}
1013
1014static struct bio_map_data *bio_alloc_map_data(int nr_segs,
1015 unsigned int iov_count,
1016 gfp_t gfp_mask)
1017{
1018 if (iov_count > UIO_MAXIOV)
1019 return NULL;
1020
1021 return kmalloc(sizeof(struct bio_map_data) +
1022 sizeof(struct sg_iovec) * iov_count, gfp_mask);
1023}
1024
1025static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
1026 int to_user, int from_user, int do_free_page)
1027{
1028 int ret = 0, i;
1029 struct bio_vec *bvec;
1030 int iov_idx = 0;
1031 unsigned int iov_off = 0;
1032
1033 bio_for_each_segment_all(bvec, bio, i) {
1034 char *bv_addr = page_address(bvec->bv_page);
1035 unsigned int bv_len = bvec->bv_len;
1036
1037 while (bv_len && iov_idx < iov_count) {
1038 unsigned int bytes;
1039 char __user *iov_addr;
1040
1041 bytes = min_t(unsigned int,
1042 iov[iov_idx].iov_len - iov_off, bv_len);
1043 iov_addr = iov[iov_idx].iov_base + iov_off;
1044
1045 if (!ret) {
1046 if (to_user)
1047 ret = copy_to_user(iov_addr, bv_addr,
1048 bytes);
1049
1050 if (from_user)
1051 ret = copy_from_user(bv_addr, iov_addr,
1052 bytes);
1053
1054 if (ret)
1055 ret = -EFAULT;
1056 }
1057
1058 bv_len -= bytes;
1059 bv_addr += bytes;
1060 iov_addr += bytes;
1061 iov_off += bytes;
1062
1063 if (iov[iov_idx].iov_len == iov_off) {
1064 iov_idx++;
1065 iov_off = 0;
1066 }
1067 }
1068
1069 if (do_free_page)
1070 __free_page(bvec->bv_page);
1071 }
1072
1073 return ret;
1074}
1075
1076/**
1077 * bio_uncopy_user - finish previously mapped bio
1078 * @bio: bio being terminated
1079 *
1080 * Free pages allocated from bio_copy_user() and write back data
1081 * to user space in case of a read.
1082 */
1083int bio_uncopy_user(struct bio *bio)
1084{
1085 struct bio_map_data *bmd = bio->bi_private;
1086 struct bio_vec *bvec;
1087 int ret = 0, i;
1088
1089 if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
1090 /*
1091 * if we're in a workqueue, the request is orphaned, so
1092 * don't copy into a random user address space, just free.
1093 */
1094 if (current->mm)
1095 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
1096 bio_data_dir(bio) == READ,
1097 0, bmd->is_our_pages);
1098 else if (bmd->is_our_pages)
1099 bio_for_each_segment_all(bvec, bio, i)
1100 __free_page(bvec->bv_page);
1101 }
1102 kfree(bmd);
1103 bio_put(bio);
1104 return ret;
1105}
1106EXPORT_SYMBOL(bio_uncopy_user);
1107
1108/**
1109 * bio_copy_user_iov - copy user data to bio
1110 * @q: destination block queue
1111 * @map_data: pointer to the rq_map_data holding pages (if necessary)
1112 * @iov: the iovec.
1113 * @iov_count: number of elements in the iovec
1114 * @write_to_vm: bool indicating writing to pages or not
1115 * @gfp_mask: memory allocation flags
1116 *
1117 * Prepares and returns a bio for indirect user io, bouncing data
1118 * to/from kernel pages as necessary. Must be paired with a
1119 * call to bio_uncopy_user() on io completion.
1120 */
1121struct bio *bio_copy_user_iov(struct request_queue *q,
1122 struct rq_map_data *map_data,
1123 const struct sg_iovec *iov, int iov_count,
1124 int write_to_vm, gfp_t gfp_mask)
1125{
1126 struct bio_map_data *bmd;
1127 struct bio_vec *bvec;
1128 struct page *page;
1129 struct bio *bio;
1130 int i, ret;
1131 int nr_pages = 0;
1132 unsigned int len = 0;
1133 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
1134
1135 for (i = 0; i < iov_count; i++) {
1136 unsigned long uaddr;
1137 unsigned long end;
1138 unsigned long start;
1139
1140 uaddr = (unsigned long)iov[i].iov_base;
1141 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1142 start = uaddr >> PAGE_SHIFT;
1143
1144 /*
1145 * Overflow, abort
1146 */
1147 if (end < start)
1148 return ERR_PTR(-EINVAL);
1149
1150 nr_pages += end - start;
1151 len += iov[i].iov_len;
1152 }
1153
1154 if (offset)
1155 nr_pages++;
1156
1157 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
1158 if (!bmd)
1159 return ERR_PTR(-ENOMEM);
1160
1161 ret = -ENOMEM;
1162 bio = bio_kmalloc(gfp_mask, nr_pages);
1163 if (!bio)
1164 goto out_bmd;
1165
1166 if (!write_to_vm)
1167 bio->bi_rw |= REQ_WRITE;
1168
1169 ret = 0;
1170
1171 if (map_data) {
1172 nr_pages = 1 << map_data->page_order;
1173 i = map_data->offset / PAGE_SIZE;
1174 }
1175 while (len) {
1176 unsigned int bytes = PAGE_SIZE;
1177
1178 bytes -= offset;
1179
1180 if (bytes > len)
1181 bytes = len;
1182
1183 if (map_data) {
1184 if (i == map_data->nr_entries * nr_pages) {
1185 ret = -ENOMEM;
1186 break;
1187 }
1188
1189 page = map_data->pages[i / nr_pages];
1190 page += (i % nr_pages);
1191
1192 i++;
1193 } else {
1194 page = alloc_page(q->bounce_gfp | gfp_mask);
1195 if (!page) {
1196 ret = -ENOMEM;
1197 break;
1198 }
1199 }
1200
1201 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
1202 break;
1203
1204 len -= bytes;
1205 offset = 0;
1206 }
1207
1208 if (ret)
1209 goto cleanup;
1210
1211 /*
1212 * success
1213 */
1214 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
1215 (map_data && map_data->from_user)) {
1216 ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
1217 if (ret)
1218 goto cleanup;
1219 }
1220
1221 bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
1222 return bio;
1223cleanup:
1224 if (!map_data)
1225 bio_for_each_segment_all(bvec, bio, i)
1226 __free_page(bvec->bv_page);
1227
1228 bio_put(bio);
1229out_bmd:
1230 kfree(bmd);
1231 return ERR_PTR(ret);
1232}
1233
1234/**
1235 * bio_copy_user - copy user data to bio
1236 * @q: destination block queue
1237 * @map_data: pointer to the rq_map_data holding pages (if necessary)
1238 * @uaddr: start of user address
1239 * @len: length in bytes
1240 * @write_to_vm: bool indicating writing to pages or not
1241 * @gfp_mask: memory allocation flags
1242 *
1243 * Prepares and returns a bio for indirect user io, bouncing data
1244 * to/from kernel pages as necessary. Must be paired with a
1245 * call to bio_uncopy_user() on io completion.
1246 */
1247struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
1248 unsigned long uaddr, unsigned int len,
1249 int write_to_vm, gfp_t gfp_mask)
1250{
1251 struct sg_iovec iov;
1252
1253 iov.iov_base = (void __user *)uaddr;
1254 iov.iov_len = len;
1255
1256 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
1257}
1258EXPORT_SYMBOL(bio_copy_user);
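/*
 * Illustrative sketch, for exposition only: bouncing a user buffer with
 * bio_copy_user().  This is the path blk_rq_map_user() typically falls back
 * to when the buffer cannot be mapped directly.  The returned bio carries no
 * completion handler of its own, so real callers attach it to a request;
 * submission is elided here and "example_bounce_user_buffer" is hypothetical.
 */
static struct bio * __maybe_unused
example_bounce_user_buffer(struct request_queue *q, unsigned long uaddr,
			   unsigned int len, int is_read)
{
	/*
	 * write_to_vm == 1 means the device will write into the user buffer
	 * (a read).  For a write, the user data is copied into the bounce
	 * pages right here, before any I/O is issued.
	 */
	struct bio *bio = bio_copy_user(q, NULL, uaddr, len, is_read,
					GFP_KERNEL);

	if (IS_ERR(bio))
		return bio;

	/*
	 * After the I/O completes, bio_uncopy_user() copies read data back to
	 * user space and frees the bounce pages.
	 */
	return bio;
}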
1259
1260static struct bio *__bio_map_user_iov(struct request_queue *q,
1261 struct block_device *bdev,
1262 const struct sg_iovec *iov, int iov_count,
1263 int write_to_vm, gfp_t gfp_mask)
1264{
1265 int i, j;
1266 int nr_pages = 0;
1267 struct page **pages;
1268 struct bio *bio;
1269 int cur_page = 0;
1270 int ret, offset;
1271
1272 for (i = 0; i < iov_count; i++) {
1273 unsigned long uaddr = (unsigned long)iov[i].iov_base;
1274 unsigned long len = iov[i].iov_len;
1275 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1276 unsigned long start = uaddr >> PAGE_SHIFT;
1277
1278 /*
1279 * Overflow, abort
1280 */
1281 if (end < start)
1282 return ERR_PTR(-EINVAL);
1283
1284 nr_pages += end - start;
1285 /*
1286 * buffer must be aligned to at least hardsector size for now
1287 */
1288 if (uaddr & queue_dma_alignment(q))
1289 return ERR_PTR(-EINVAL);
1290 }
1291
1292 if (!nr_pages)
1293 return ERR_PTR(-EINVAL);
1294
1295 bio = bio_kmalloc(gfp_mask, nr_pages);
1296 if (!bio)
1297 return ERR_PTR(-ENOMEM);
1298
1299 ret = -ENOMEM;
1300 pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
1301 if (!pages)
1302 goto out;
1303
1304 for (i = 0; i < iov_count; i++) {
1305 unsigned long uaddr = (unsigned long)iov[i].iov_base;
1306 unsigned long len = iov[i].iov_len;
1307 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1308 unsigned long start = uaddr >> PAGE_SHIFT;
1309 const int local_nr_pages = end - start;
1310 const int page_limit = cur_page + local_nr_pages;
1311
1312 ret = get_user_pages_fast(uaddr, local_nr_pages,
1313 write_to_vm, &pages[cur_page]);
1314 if (ret < local_nr_pages) {
1315 ret = -EFAULT;
1316 goto out_unmap;
1317 }
1318
1319 offset = uaddr & ~PAGE_MASK;
1320 for (j = cur_page; j < page_limit; j++) {
1321 unsigned int bytes = PAGE_SIZE - offset;
1322
1323 if (len <= 0)
1324 break;
1325
1326 if (bytes > len)
1327 bytes = len;
1328
1329 /*
1330 * sorry...
1331 */
1332 if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
1333 bytes)
1334 break;
1335
1336 len -= bytes;
1337 offset = 0;
1338 }
1339
1340 cur_page = j;
1341 /*
1342 * release the pages we didn't map into the bio, if any
1343 */
1344 while (j < page_limit)
1345 page_cache_release(pages[j++]);
1346 }
1347
1348 kfree(pages);
1349
1350 /*
1351 * set data direction, and check if mapped pages need bouncing
1352 */
1353 if (!write_to_vm)
1354 bio->bi_rw |= REQ_WRITE;
1355
1356 bio->bi_bdev = bdev;
1357 bio->bi_flags |= (1 << BIO_USER_MAPPED);
1358 return bio;
1359
1360 out_unmap:
1361 for (i = 0; i < nr_pages; i++) {
1362 if (!pages[i])
1363 break;
1364 page_cache_release(pages[i]);
1365 }
1366 out:
1367 kfree(pages);
1368 bio_put(bio);
1369 return ERR_PTR(ret);
1370}
1371
1372/**
1373 * bio_map_user - map user address into bio
1374 * @q: the struct request_queue for the bio
1375 * @bdev: destination block device
1376 * @uaddr: start of user address
1377 * @len: length in bytes
1378 * @write_to_vm: bool indicating writing to pages or not
1379 * @gfp_mask: memory allocation flags
1380 *
1381 * Map the user space address into a bio suitable for io to a block
1382 * device. Returns an error pointer in case of error.
1383 */
1384struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
1385 unsigned long uaddr, unsigned int len, int write_to_vm,
1386 gfp_t gfp_mask)
1387{
1388 struct sg_iovec iov;
1389
1390 iov.iov_base = (void __user *)uaddr;
1391 iov.iov_len = len;
1392
1393 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
1394}
1395EXPORT_SYMBOL(bio_map_user);
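/*
 * Illustrative sketch, for exposition only: a zero-copy read into a user
 * buffer by pinning its pages with bio_map_user().  The buffer is assumed to
 * satisfy the queue's DMA alignment, and submit_bio_wait() is used purely to
 * keep the example short; "example_map_user_read" is hypothetical.
 */
static int __maybe_unused example_map_user_read(struct block_device *bdev,
						sector_t sector,
						unsigned long uaddr,
						unsigned int len)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio;
	int ret;

	/* write_to_vm == 1: the device writes into the pinned user pages */
	bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_iter.bi_sector = sector;
	ret = submit_bio_wait(READ, bio);

	/* dirties the pages we read into and drops the pinned references */
	bio_unmap_user(bio);
	return ret;
}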
1396
1397/**
1398 * bio_map_user_iov - map user sg_iovec table into bio
1399 * @q: the struct request_queue for the bio
1400 * @bdev: destination block device
1401 * @iov: the iovec.
1402 * @iov_count: number of elements in the iovec
1403 * @write_to_vm: bool indicating writing to pages or not
1404 * @gfp_mask: memory allocation flags
1405 *
1406 * Map the user space address into a bio suitable for io to a block
1407 * device. Returns an error pointer in case of error.
1408 */
1409struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
1410 const struct sg_iovec *iov, int iov_count,
1411 int write_to_vm, gfp_t gfp_mask)
1412{
1413 struct bio *bio;
1414
1415 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
1416 gfp_mask);
1417 if (IS_ERR(bio))
1418 return bio;
1419
1420 /*
1421 * subtle -- if __bio_map_user_iov() ended up bouncing a bio,
1422 * it would normally disappear when its bi_end_io is run.
1423 * however, we need it for the unmap, so grab an extra
1424 * reference to it
1425 */
1426 bio_get(bio);
1427
1428 return bio;
1429}
1430
1431static void __bio_unmap_user(struct bio *bio)
1432{
1433 struct bio_vec *bvec;
1434 int i;
1435
1436 /*
1437 * make sure we dirty pages we wrote to
1438 */
1439 bio_for_each_segment_all(bvec, bio, i) {
1440 if (bio_data_dir(bio) == READ)
1441 set_page_dirty_lock(bvec->bv_page);
1442
1443 page_cache_release(bvec->bv_page);
1444 }
1445
1446 bio_put(bio);
1447}
1448
1449/**
1450 * bio_unmap_user - unmap a bio
1451 * @bio: the bio being unmapped
1452 *
1453 * Unmap a bio previously mapped by bio_map_user(). Must be called from
1454 * process context.
1455 *
1456 * bio_unmap_user() may sleep.
1457 */
1458void bio_unmap_user(struct bio *bio)
1459{
1460 __bio_unmap_user(bio);
1461 bio_put(bio);
1462}
1463EXPORT_SYMBOL(bio_unmap_user);
1464
1465static void bio_map_kern_endio(struct bio *bio, int err)
1466{
1467 bio_put(bio);
1468}
1469
1470static struct bio *__bio_map_kern(struct request_queue *q, void *data,
1471 unsigned int len, gfp_t gfp_mask)
1472{
1473 unsigned long kaddr = (unsigned long)data;
1474 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1475 unsigned long start = kaddr >> PAGE_SHIFT;
1476 const int nr_pages = end - start;
1477 int offset, i;
1478 struct bio *bio;
1479
1480 bio = bio_kmalloc(gfp_mask, nr_pages);
1481 if (!bio)
1482 return ERR_PTR(-ENOMEM);
1483
1484 offset = offset_in_page(kaddr);
1485 for (i = 0; i < nr_pages; i++) {
1486 unsigned int bytes = PAGE_SIZE - offset;
1487
1488 if (len <= 0)
1489 break;
1490
1491 if (bytes > len)
1492 bytes = len;
1493
1494 if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
1495 offset) < bytes)
1496 break;
1497
1498 data += bytes;
1499 len -= bytes;
1500 offset = 0;
1501 }
1502
1503 bio->bi_end_io = bio_map_kern_endio;
1504 return bio;
1505}
1506
1507/**
1508 * bio_map_kern - map kernel address into bio
1509 * @q: the struct request_queue for the bio
1510 * @data: pointer to buffer to map
1511 * @len: length in bytes
1512 * @gfp_mask: allocation flags for bio allocation
1513 *
1514 * Map the kernel address into a bio suitable for io to a block
1515 * device. Returns an error pointer in case of error.
1516 */
1517struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1518 gfp_t gfp_mask)
1519{
1520 struct bio *bio;
1521
1522 bio = __bio_map_kern(q, data, len, gfp_mask);
1523 if (IS_ERR(bio))
1524 return bio;
1525
1526 if (bio->bi_iter.bi_size == len)
1527 return bio;
1528
1529 /*
1530 * Don't support partial mappings.
1531 */
1532 bio_put(bio);
1533 return ERR_PTR(-EINVAL);
1534}
1535EXPORT_SYMBOL(bio_map_kern);
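/*
 * Illustrative sketch, for exposition only: reading into a kmalloc'ed,
 * virtually contiguous kernel buffer via bio_map_kern().  Note that
 * submit_bio_wait() installs its own bi_end_io in place of
 * bio_map_kern_endio(), so the bio reference is dropped explicitly
 * afterwards; "example_map_kern_read" is hypothetical.
 */
static int __maybe_unused example_map_kern_read(struct block_device *bdev,
						sector_t sector,
						void *buf, unsigned int len)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio;
	int ret;

	bio = bio_map_kern(q, buf, len, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	ret = submit_bio_wait(READ, bio);

	bio_put(bio);
	return ret;
}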
1536
1537static void bio_copy_kern_endio(struct bio *bio, int err)
1538{
1539 struct bio_vec *bvec;
1540 const int read = bio_data_dir(bio) == READ;
1541 struct bio_map_data *bmd = bio->bi_private;
1542 int i;
1543 char *p = bmd->sgvecs[0].iov_base;
1544
1545 bio_for_each_segment_all(bvec, bio, i) {
1546 char *addr = page_address(bvec->bv_page);
1547
1548 if (read)
1549 memcpy(p, addr, bvec->bv_len);
1550
1551 __free_page(bvec->bv_page);
1552 p += bvec->bv_len;
1553 }
1554
1555 kfree(bmd);
1556 bio_put(bio);
1557}
1558
1559/**
1560 * bio_copy_kern - copy kernel address into bio
1561 * @q: the struct request_queue for the bio
1562 * @data: pointer to buffer to copy
1563 * @len: length in bytes
1564 * @gfp_mask: allocation flags for bio and page allocation
1565 * @reading: data direction is READ
1566 *
1567 * Copy the kernel address into a bio suitable for io to a block
1568 * device. Returns an error pointer in case of error.
1569 */
1570struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1571 gfp_t gfp_mask, int reading)
1572{
1573 struct bio *bio;
1574 struct bio_vec *bvec;
1575 int i;
1576
1577 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1578 if (IS_ERR(bio))
1579 return bio;
1580
1581 if (!reading) {
1582 void *p = data;
1583
1584 bio_for_each_segment_all(bvec, bio, i) {
1585 char *addr = page_address(bvec->bv_page);
1586
1587 memcpy(addr, p, bvec->bv_len);
1588 p += bvec->bv_len;
1589 }
1590 }
1591
1592 bio->bi_end_io = bio_copy_kern_endio;
1593
1594 return bio;
1595}
1596EXPORT_SYMBOL(bio_copy_kern);
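/*
 * Illustrative sketch, for exposition only: bouncing a kernel buffer that
 * bio_map_kern() cannot map directly (vmalloc memory, for example).
 * Completion must run bio_copy_kern_endio(), which copies read data back
 * into @buf and frees the bounce pages, so these bios are normally driven
 * through the request layer (blk_rq_map_kern()) rather than given a private
 * end_io; "example_bounce_kernel_buffer" is hypothetical.
 */
static struct bio * __maybe_unused
example_bounce_kernel_buffer(struct request_queue *q, void *buf,
			     unsigned int len, int reading)
{
	struct bio *bio = bio_copy_kern(q, buf, len, GFP_KERNEL, reading);

	if (IS_ERR(bio))
		return bio;

	/* for a write (!reading), @buf has already been copied into the bio */
	return bio;
}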
1597
1598/*
1599 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
1600 * for performing direct-IO in BIOs.
1601 *
1602 * The problem is that we cannot run set_page_dirty() from interrupt context
1603 * because the required locks are not interrupt-safe. So what we can do is to
1604 * mark the pages dirty _before_ performing IO. And in interrupt context,
1605 * check that the pages are still dirty. If so, fine. If not, redirty them
1606 * in process context.
1607 *
1608 * We special-case compound pages here: normally this means reads into hugetlb
1609 * pages. The logic in here doesn't really work right for compound pages
1610 * because the VM does not uniformly chase down the head page in all cases.
1611 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
1612 * handle them at all. So we skip compound pages here at an early stage.
1613 *
1614 * Note that this code is very hard to test under normal circumstances because
1615 * direct-io pins the pages with get_user_pages(). This makes
1616 * is_page_cache_freeable return false, and the VM will not clean the pages.
1617 * But other code (eg, flusher threads) could clean the pages if they are mapped
1618 * pagecache.
1619 *
1620 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
1621 * deferred bio dirtying paths.
1622 */
1623
1624/*
1625 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
1626 */
1627void bio_set_pages_dirty(struct bio *bio)
1628{
1629 struct bio_vec *bvec;
1630 int i;
1631
1632 bio_for_each_segment_all(bvec, bio, i) {
1633 struct page *page = bvec->bv_page;
1634
1635 if (page && !PageCompound(page))
1636 set_page_dirty_lock(page);
1637 }
1638}
1639
1640static void bio_release_pages(struct bio *bio)
1641{
1642 struct bio_vec *bvec;
1643 int i;
1644
1645 bio_for_each_segment_all(bvec, bio, i) {
1646 struct page *page = bvec->bv_page;
1647
1648 if (page)
1649 put_page(page);
1650 }
1651}
1652
1653/*
1654 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1655 * If they are, then fine. If, however, some pages are clean then they must
1656 * have been written out during the direct-IO read. So we take another ref on
1657 * the BIO and the offending pages and re-dirty the pages in process context.
1658 *
1659 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
1660 * here on. It will run one page_cache_release() against each page and will
1661 * run one bio_put() against the BIO.
1662 */
1663
1664static void bio_dirty_fn(struct work_struct *work);
1665
1666static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
1667static DEFINE_SPINLOCK(bio_dirty_lock);
1668static struct bio *bio_dirty_list;
1669
1670/*
1671 * This runs in process context
1672 */
1673static void bio_dirty_fn(struct work_struct *work)
1674{
1675 unsigned long flags;
1676 struct bio *bio;
1677
1678 spin_lock_irqsave(&bio_dirty_lock, flags);
1679 bio = bio_dirty_list;
1680 bio_dirty_list = NULL;
1681 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1682
1683 while (bio) {
1684 struct bio *next = bio->bi_private;
1685
1686 bio_set_pages_dirty(bio);
1687 bio_release_pages(bio);
1688 bio_put(bio);
1689 bio = next;
1690 }
1691}
1692
1693void bio_check_pages_dirty(struct bio *bio)
1694{
1695 struct bio_vec *bvec;
1696 int nr_clean_pages = 0;
1697 int i;
1698
1699 bio_for_each_segment_all(bvec, bio, i) {
1700 struct page *page = bvec->bv_page;
1701
1702 if (PageDirty(page) || PageCompound(page)) {
1703 page_cache_release(page);
1704 bvec->bv_page = NULL;
1705 } else {
1706 nr_clean_pages++;
1707 }
1708 }
1709
1710 if (nr_clean_pages) {
1711 unsigned long flags;
1712
1713 spin_lock_irqsave(&bio_dirty_lock, flags);
1714 bio->bi_private = bio_dirty_list;
1715 bio_dirty_list = bio;
1716 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1717 schedule_work(&bio_dirty_work);
1718 } else {
1719 bio_put(bio);
1720 }
1721}
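/*
 * Illustrative sketch, for exposition only: the pattern fs/direct-io.c uses
 * with the two helpers above for direct-IO reads.  Pages are dirtied in
 * process context before submission; the completion handler, which may run
 * in interrupt context, then only needs to re-dirty pages that were cleaned
 * while the I/O was in flight.  The bio is assumed to be fully set up
 * (bi_bdev, bi_sector, pinned pages); the example_* names are hypothetical.
 */
static void __maybe_unused example_dio_read_endio(struct bio *bio, int error)
{
	/* takes ownership of the bio: releases its pages and drops the ref */
	bio_check_pages_dirty(bio);
}

static void __maybe_unused example_submit_dio_read(struct bio *bio)
{
	bio->bi_end_io = example_dio_read_endio;
	bio_set_pages_dirty(bio);	/* safe here: process context */
	submit_bio(READ, bio);
}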
1722
1723#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1724void bio_flush_dcache_pages(struct bio *bi)
1725{
1726 struct bio_vec bvec;
1727 struct bvec_iter iter;
1728
1729 bio_for_each_segment(bvec, bi, iter)
1730 flush_dcache_page(bvec.bv_page);
1731}
1732EXPORT_SYMBOL(bio_flush_dcache_pages);
1733#endif
1734
1735/**
1736 * bio_endio - end I/O on a bio
1737 * @bio: bio
1738 * @error: error, if any
1739 *
1740 * Description:
1741 * bio_endio() will end I/O on the whole bio. bio_endio() is the
1742 * preferred way to end I/O on a bio; it takes care of clearing
1743 * BIO_UPTODATE on error. @error is 0 on success, and one of the
1744 * established -Exxxx (-EIO, for instance) error values in case
1745 * something went wrong. No one should call bi_end_io() directly on a
1746 * bio unless they own it and thus know that it has an end_io
1747 * function.
1748 **/
1749void bio_endio(struct bio *bio, int error)
1750{
1751 while (bio) {
1752 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1753
1754 if (error)
1755 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1756 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1757 error = -EIO;
1758
1759 if (!atomic_dec_and_test(&bio->bi_remaining))
1760 return;
1761
1762 /*
1763 * Need to have a real endio function for chained bios,
1764 * otherwise various corner cases will break (like stacking
1765 * block devices that save/restore bi_end_io) - however, we want
1766 * to avoid unbounded recursion and blowing the stack. Tail call
1767 * optimization would handle this, but compiling with frame
1768 * pointers also disables gcc's sibling call optimization.
1769 */
1770 if (bio->bi_end_io == bio_chain_endio) {
1771 struct bio *parent = bio->bi_private;
1772 bio_put(bio);
1773 bio = parent;
1774 } else {
1775 if (bio->bi_end_io)
1776 bio->bi_end_io(bio, error);
1777 bio = NULL;
1778 }
1779 }
1780}
1781EXPORT_SYMBOL(bio_endio);
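/*
 * Illustrative sketch, for exposition only: a driver completing the bio it
 * was handed, translating its own status into the -Exxx convention that
 * bio_endio() expects.  "struct my_request" is hypothetical.
 */
struct my_request {
	struct bio *bio;
	int error;		/* 0, or a negative errno from the hardware */
};

static void __maybe_unused example_complete_request(struct my_request *rq)
{
	/* bio_endio() clears BIO_UPTODATE for us when error is non-zero */
	bio_endio(rq->bio, rq->error);
}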
1782
1783/**
1784 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1785 * @bio: bio
1786 * @error: error, if any
1787 *
1788 * For code that has saved and restored bi_end_io; think hard before using this
1789 * function, as you probably should have cloned the entire bio instead.
1790 **/
1791void bio_endio_nodec(struct bio *bio, int error)
1792{
1793 atomic_inc(&bio->bi_remaining);
1794 bio_endio(bio, error);
1795}
1796EXPORT_SYMBOL(bio_endio_nodec);
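/*
 * Illustrative sketch, for exposition only: the save/restore pattern the
 * comment above refers to.  A layer that hijacked bi_end_io restores the
 * original handler and completes with bio_endio_nodec(), because the
 * bio_endio() call that invoked this hook already consumed the bi_remaining
 * reference.  Installation of the hook is elided; "struct my_hook" is
 * hypothetical.
 */
struct my_hook {
	bio_end_io_t	*saved_end_io;
	void		*saved_private;
};

static void __maybe_unused example_hooked_endio(struct bio *bio, int error)
{
	struct my_hook *hook = bio->bi_private;

	bio->bi_end_io = hook->saved_end_io;
	bio->bi_private = hook->saved_private;
	kfree(hook);

	bio_endio_nodec(bio, error);
}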
1797
1798/**
1799 * bio_split - split a bio
1800 * @bio: bio to split
1801 * @sectors: number of sectors to split from the front of @bio
1802 * @gfp: gfp mask
1803 * @bs: bio set to allocate from
1804 *
1805 * Allocates and returns a new bio which represents @sectors from the start of
1806 * @bio, and updates @bio to represent the remaining sectors.
1807 *
1808 * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
1809 * responsibility to ensure that @bio is not freed before the split.
1810 */
1811struct bio *bio_split(struct bio *bio, int sectors,
1812 gfp_t gfp, struct bio_set *bs)
1813{
1814 struct bio *split = NULL;
1815
1816 BUG_ON(sectors <= 0);
1817 BUG_ON(sectors >= bio_sectors(bio));
1818
1819 split = bio_clone_fast(bio, gfp, bs);
1820 if (!split)
1821 return NULL;
1822
1823 split->bi_iter.bi_size = sectors << 9;
1824
1825 if (bio_integrity(split))
1826 bio_integrity_trim(split, 0, sectors);
1827
1828 bio_advance(bio, split->bi_iter.bi_size);
1829
1830 return split;
1831}
1832EXPORT_SYMBOL(bio_split);
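/*
 * Illustrative sketch, for exposition only: splitting a bio so the first
 * @max_sectors are issued immediately while the remainder stays in @bio.
 * bio_chain() ties the split's completion to its parent; fs_bio_set is used
 * here only as a convenient bio_set, and "example_split_and_submit" is
 * hypothetical.
 */
static void __maybe_unused example_split_and_submit(struct bio *bio,
						    int max_sectors)
{
	if (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO,
					      fs_bio_set);

		bio_chain(split, bio);
		generic_make_request(split);
	}

	generic_make_request(bio);
}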
1833
1834/**
1835 * bio_trim - trim a bio
1836 * @bio: bio to trim
1837 * @offset: number of sectors to trim from the front of @bio
1838 * @size: size we want to trim @bio to, in sectors
1839 */
1840void bio_trim(struct bio *bio, int offset, int size)
1841{
1842 /* 'bio' is a cloned bio which we need to trim to match
1843 * the given offset and size.
1844 */
1845
1846 size <<= 9;
1847 if (offset == 0 && size == bio->bi_iter.bi_size)
1848 return;
1849
1850 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1851
1852 bio_advance(bio, offset << 9);
1853
1854 bio->bi_iter.bi_size = size;
1855}
1856EXPORT_SYMBOL_GPL(bio_trim);
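/*
 * Illustrative sketch, for exposition only: cloning a bio and trimming the
 * clone to a sub-range, as a stacking driver might when only part of the
 * original maps onto one underlying device.  @offset and @size are in
 * sectors; "example_clone_subrange" is hypothetical.
 */
static struct bio * __maybe_unused
example_clone_subrange(struct bio *bio, int offset, int size)
{
	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, fs_bio_set);

	if (clone)
		bio_trim(clone, offset, size);

	return clone;
}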
1857
1858/*
1859 * create memory pools for biovec's in a bio_set.
1860 * use the global biovec slabs created for general use.
1861 */
1862mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
1863{
1864 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1865
1866 return mempool_create_slab_pool(pool_entries, bp->slab);
1867}
1868
1869void bioset_free(struct bio_set *bs)
1870{
1871 if (bs->rescue_workqueue)
1872 destroy_workqueue(bs->rescue_workqueue);
1873
1874 if (bs->bio_pool)
1875 mempool_destroy(bs->bio_pool);
1876
1877 if (bs->bvec_pool)
1878 mempool_destroy(bs->bvec_pool);
1879
1880 bioset_integrity_free(bs);
1881 bio_put_slab(bs);
1882
1883 kfree(bs);
1884}
1885EXPORT_SYMBOL(bioset_free);
1886
1887/**
1888 * bioset_create - Create a bio_set
1889 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1890 * @front_pad: Number of bytes to allocate in front of the returned bio
1891 *
1892 * Description:
1893 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1894 * to ask for a number of bytes to be allocated in front of the bio.
1895 * Front pad allocation is useful for embedding the bio inside
1896 * another structure, to avoid allocating extra data to go with the bio.
1897 * Note that the bio must always be embedded at the END of that structure,
1898 * or things will break badly.
1899 */
1900struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1901{
1902 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1903 struct bio_set *bs;
1904
1905 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1906 if (!bs)
1907 return NULL;
1908
1909 bs->front_pad = front_pad;
1910
1911 spin_lock_init(&bs->rescue_lock);
1912 bio_list_init(&bs->rescue_list);
1913 INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
1914
1915 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1916 if (!bs->bio_slab) {
1917 kfree(bs);
1918 return NULL;
1919 }
1920
1921 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1922 if (!bs->bio_pool)
1923 goto bad;
1924
1925 bs->bvec_pool = biovec_create_pool(bs, pool_size);
1926 if (!bs->bvec_pool)
1927 goto bad;
1928
1929 bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
1930 if (!bs->rescue_workqueue)
1931 goto bad;
1932
1933 return bs;
1934bad:
1935 bioset_free(bs);
1936 return NULL;
1937}
1938EXPORT_SYMBOL(bioset_create);
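/*
 * Illustrative sketch, for exposition only: using @front_pad to embed the bio
 * at the end of a per-I/O driver structure, so a single mempool-backed
 * allocation covers both.  "struct my_io", "my_bioset" and the example_*
 * functions are hypothetical.
 */
struct my_io {
	void		*private_data;
	struct bio	bio;		/* must be the last member */
};

static struct bio_set *my_bioset;

static int __maybe_unused example_init(void)
{
	my_bioset = bioset_create(64, offsetof(struct my_io, bio));
	return my_bioset ? 0 : -ENOMEM;
}

static struct my_io * __maybe_unused example_alloc_io(unsigned int nr_vecs)
{
	/* assumes example_init() succeeded */
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bioset);

	return bio ? container_of(bio, struct my_io, bio) : NULL;
}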
1939
1940#ifdef CONFIG_BLK_CGROUP
1941/**
1942 * bio_associate_current - associate a bio with %current
1943 * @bio: target bio
1944 *
1945 * Associate @bio with %current if it hasn't been associated yet. Block
1946 * layer will treat @bio as if it were issued by %current no matter which
1947 * task actually issues it.
1948 *
1949 * This function takes an extra reference of %current's io_context and blkcg
1950 * which will be put when @bio is released. The caller must own @bio,
1951 * ensure %current->io_context exists, and is responsible for synchronizing
1952 * calls to this function.
1953 */
1954int bio_associate_current(struct bio *bio)
1955{
1956 struct io_context *ioc;
1957 struct cgroup_subsys_state *css;
1958
1959 if (bio->bi_ioc)
1960 return -EBUSY;
1961
1962 ioc = current->io_context;
1963 if (!ioc)
1964 return -ENOENT;
1965
1966 /* acquire active ref on @ioc and associate */
1967 get_io_context_active(ioc);
1968 bio->bi_ioc = ioc;
1969
1970 /* associate blkcg if exists */
1971 rcu_read_lock();
1972 css = task_css(current, blkio_cgrp_id);
1973 if (css && css_tryget(css))
1974 bio->bi_css = css;
1975 rcu_read_unlock();
1976
1977 return 0;
1978}
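/*
 * Illustrative sketch, for exposition only: a layer that punts submission to
 * a worker thread tags the bio first, so the block layer still charges the
 * I/O to the submitting task's io_context and blkcg.  -EBUSY and -ENOENT
 * simply mean no association is made; "example_tag_and_defer" is
 * hypothetical and the hand-off to the worker is elided.
 */
static void __maybe_unused example_tag_and_defer(struct bio *bio)
{
	bio_associate_current(bio);	/* may fail harmlessly */

	/* ... queue @bio to a worker that will submit it later ... */
}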
1979
1980/**
1981 * bio_disassociate_task - undo bio_associate_current()
1982 * @bio: target bio
1983 */
1984void bio_disassociate_task(struct bio *bio)
1985{
1986 if (bio->bi_ioc) {
1987 put_io_context(bio->bi_ioc);
1988 bio->bi_ioc = NULL;
1989 }
1990 if (bio->bi_css) {
1991 css_put(bio->bi_css);
1992 bio->bi_css = NULL;
1993 }
1994}
1995
1996#endif /* CONFIG_BLK_CGROUP */
1997
1998static void __init biovec_init_slabs(void)
1999{
2000 int i;
2001
2002 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
2003 int size;
2004 struct biovec_slab *bvs = bvec_slabs + i;
2005
2006 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
2007 bvs->slab = NULL;
2008 continue;
2009 }
2010
2011 size = bvs->nr_vecs * sizeof(struct bio_vec);
2012 bvs->slab = kmem_cache_create(bvs->name, size, 0,
2013 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2014 }
2015}
2016
2017static int __init init_bio(void)
2018{
2019 bio_slab_max = 2;
2020 bio_slab_nr = 0;
2021 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
2022 if (!bio_slabs)
2023 panic("bio: can't allocate bios\n");
2024
2025 bio_integrity_init();
2026 biovec_init_slabs();
2027
2028 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
2029 if (!fs_bio_set)
2030 panic("bio: can't allocate bios\n");
2031
2032 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
2033 panic("bio: can't create integrity pool\n");
2034
2035 return 0;
2036}
2037subsys_initcall(init_bio);
diff --git a/fs/ioprio.c b/fs/ioprio.c
deleted file mode 100644
index e50170ca7c33..000000000000
--- a/fs/ioprio.c
+++ /dev/null
@@ -1,241 +0,0 @@
1/*
2 * fs/ioprio.c
3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
5 *
6 * Helper functions for setting/querying io priorities of processes. The
7 * system calls closely mimic getpriority/setpriority; see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest.
12 *
13 * IOW, setting the BE scheduling class with prio 2 is done like so:
14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 *
19 * See also Documentation/block/ioprio.txt
20 *
21 */
22#include <linux/gfp.h>
23#include <linux/kernel.h>
24#include <linux/export.h>
25#include <linux/ioprio.h>
26#include <linux/blkdev.h>
27#include <linux/capability.h>
28#include <linux/syscalls.h>
29#include <linux/security.h>
30#include <linux/pid_namespace.h>
31
32int set_task_ioprio(struct task_struct *task, int ioprio)
33{
34 int err;
35 struct io_context *ioc;
36 const struct cred *cred = current_cred(), *tcred;
37
38 rcu_read_lock();
39 tcred = __task_cred(task);
40 if (!uid_eq(tcred->uid, cred->euid) &&
41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
42 rcu_read_unlock();
43 return -EPERM;
44 }
45 rcu_read_unlock();
46
47 err = security_task_setioprio(task, ioprio);
48 if (err)
49 return err;
50
51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 if (ioc) {
53 ioc->ioprio = ioprio;
54 put_io_context(ioc);
55 }
56
57 return err;
58}
59EXPORT_SYMBOL_GPL(set_task_ioprio);
60
61SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
62{
63 int class = IOPRIO_PRIO_CLASS(ioprio);
64 int data = IOPRIO_PRIO_DATA(ioprio);
65 struct task_struct *p, *g;
66 struct user_struct *user;
67 struct pid *pgrp;
68 kuid_t uid;
69 int ret;
70
71 switch (class) {
72 case IOPRIO_CLASS_RT:
73 if (!capable(CAP_SYS_ADMIN))
74 return -EPERM;
75 /* fall through, rt has prio field too */
76 case IOPRIO_CLASS_BE:
77 if (data >= IOPRIO_BE_NR || data < 0)
78 return -EINVAL;
79
80 break;
81 case IOPRIO_CLASS_IDLE:
82 break;
83 case IOPRIO_CLASS_NONE:
84 if (data)
85 return -EINVAL;
86 break;
87 default:
88 return -EINVAL;
89 }
90
91 ret = -ESRCH;
92 rcu_read_lock();
93 switch (which) {
94 case IOPRIO_WHO_PROCESS:
95 if (!who)
96 p = current;
97 else
98 p = find_task_by_vpid(who);
99 if (p)
100 ret = set_task_ioprio(p, ioprio);
101 break;
102 case IOPRIO_WHO_PGRP:
103 if (!who)
104 pgrp = task_pgrp(current);
105 else
106 pgrp = find_vpid(who);
107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
108 ret = set_task_ioprio(p, ioprio);
109 if (ret)
110 break;
111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
112 break;
113 case IOPRIO_WHO_USER:
114 uid = make_kuid(current_user_ns(), who);
115 if (!uid_valid(uid))
116 break;
117 if (!who)
118 user = current_user();
119 else
120 user = find_user(uid);
121
122 if (!user)
123 break;
124
125 do_each_thread(g, p) {
126 if (!uid_eq(task_uid(p), uid))
127 continue;
128 ret = set_task_ioprio(p, ioprio);
129 if (ret)
130 goto free_uid;
131 } while_each_thread(g, p);
132free_uid:
133 if (who)
134 free_uid(user);
135 break;
136 default:
137 ret = -EINVAL;
138 }
139
140 rcu_read_unlock();
141 return ret;
142}
143
144static int get_task_ioprio(struct task_struct *p)
145{
146 int ret;
147
148 ret = security_task_getioprio(p);
149 if (ret)
150 goto out;
151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
152 if (p->io_context)
153 ret = p->io_context->ioprio;
154out:
155 return ret;
156}
157
158int ioprio_best(unsigned short aprio, unsigned short bprio)
159{
160 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
161 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
162
163 if (aclass == IOPRIO_CLASS_NONE)
164 aclass = IOPRIO_CLASS_BE;
165 if (bclass == IOPRIO_CLASS_NONE)
166 bclass = IOPRIO_CLASS_BE;
167
168 if (aclass == bclass)
169 return min(aprio, bprio);
170 if (aclass > bclass)
171 return bprio;
172 else
173 return aprio;
174}
175
176SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
177{
178 struct task_struct *g, *p;
179 struct user_struct *user;
180 struct pid *pgrp;
181 kuid_t uid;
182 int ret = -ESRCH;
183 int tmpio;
184
185 rcu_read_lock();
186 switch (which) {
187 case IOPRIO_WHO_PROCESS:
188 if (!who)
189 p = current;
190 else
191 p = find_task_by_vpid(who);
192 if (p)
193 ret = get_task_ioprio(p);
194 break;
195 case IOPRIO_WHO_PGRP:
196 if (!who)
197 pgrp = task_pgrp(current);
198 else
199 pgrp = find_vpid(who);
200 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
201 tmpio = get_task_ioprio(p);
202 if (tmpio < 0)
203 continue;
204 if (ret == -ESRCH)
205 ret = tmpio;
206 else
207 ret = ioprio_best(ret, tmpio);
208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
209 break;
210 case IOPRIO_WHO_USER:
211 uid = make_kuid(current_user_ns(), who);
212 if (!who)
213 user = current_user();
214 else
215 user = find_user(uid);
216
217 if (!user)
218 break;
219
220 do_each_thread(g, p) {
221 if (!uid_eq(task_uid(p), user->uid))
222 continue;
223 tmpio = get_task_ioprio(p);
224 if (tmpio < 0)
225 continue;
226 if (ret == -ESRCH)
227 ret = tmpio;
228 else
229 ret = ioprio_best(ret, tmpio);
230 } while_each_thread(g, p);
231
232 if (who)
233 free_uid(user);
234 break;
235 default:
236 ret = -EINVAL;
237 }
238
239 rcu_read_unlock();
240 return ret;
241}