aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 13:13:35 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-08 13:13:35 -0400
commit4de13d7aa8f4d02f4dc99d4609575659f92b3c5a (patch)
tree3bc9729eabe79c6164cd29a5d605000bc82bf837
parent5af43c24ca59a448c9312dd4a4a51d27ec3b9a73 (diff)
parentb8d4a5bf6a049303a29a3275f463f09a490b50ea (diff)
Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
Pull block core updates from Jens Axboe: - Major bit is Kents prep work for immutable bio vecs. - Stable candidate fix for a scheduling-while-atomic in the queue bypass operation. - Fix for the hang on exceeded rq->datalen 32-bit unsigned when merging discard bios. - Tejuns changes to convert the writeback thread pool to the generic workqueue mechanism. - Runtime PM framework, SCSI patches exists on top of these in James' tree. - A few random fixes. * 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits) relay: move remove_buf_file inside relay_close_buf partitions/efi.c: replace useless kzalloc's by kmalloc's fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read() block: fix max discard sectors limit blkcg: fix "scheduling while atomic" in blk_queue_bypass_start Documentation: cfq-iosched: update documentation help for cfq tunables writeback: expose the bdi_wq workqueue writeback: replace custom worker pool implementation with unbound workqueue writeback: remove unused bdi_pending_list aoe: Fix unitialized var usage bio-integrity: Add explicit field for owner of bip_buf block: Add an explicit bio flag for bios that own their bvec block: Add bio_alloc_pages() block: Convert some code to bio_for_each_segment_all() block: Add bio_for_each_segment_all() bounce: Refactor __blk_queue_bounce to not use bi_io_vec raid1: use bio_copy_data() pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage pktcdvd: use bio_copy_data() block: Add bio_copy_data() ...
-rw-r--r--Documentation/block/cfq-iosched.txt47
-rw-r--r--block/blk-cgroup.c4
-rw-r--r--block/blk-core.c265
-rw-r--r--block/cfq-iosched.c7
-rw-r--r--block/deadline-iosched.c2
-rw-r--r--block/elevator.c26
-rw-r--r--block/partitions/efi.c4
-rw-r--r--drivers/block/aoe/aoecmd.c2
-rw-r--r--drivers/block/brd.c3
-rw-r--r--drivers/block/floppy.c1
-rw-r--r--drivers/block/pktcdvd.c102
-rw-r--r--drivers/block/rbd.c2
-rw-r--r--drivers/md/dm-crypt.c3
-rw-r--r--drivers/md/dm-raid1.c2
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm-verity.c4
-rw-r--r--drivers/md/faulty.c6
-rw-r--r--drivers/md/linear.c3
-rw-r--r--drivers/md/md.c17
-rw-r--r--drivers/md/raid0.c9
-rw-r--r--drivers/md/raid1.c133
-rw-r--r--drivers/md/raid10.c78
-rw-r--r--drivers/md/raid5.c49
-rw-r--r--drivers/message/fusion/mptsas.c6
-rw-r--r--drivers/s390/block/dcssblk.c3
-rw-r--r--drivers/scsi/libsas/sas_expander.c6
-rw-r--r--drivers/scsi/mpt2sas/mpt2sas_transport.c10
-rw-r--r--fs/bio-integrity.c144
-rw-r--r--fs/bio.c366
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/extent_io.c3
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/exofs/ore.c2
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/fs-writeback.c102
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/jfs/jfs_logmgr.c2
-rw-r--r--fs/logfs/dev_bdev.c5
-rw-r--r--include/linux/backing-dev.h16
-rw-r--r--include/linux/bio.h115
-rw-r--r--include/linux/blk_types.h5
-rw-r--r--include/linux/blkdev.h29
-rw-r--r--include/trace/events/block.h12
-rw-r--r--include/trace/events/writeback.h5
-rw-r--r--kernel/relay.c2
-rw-r--r--mm/backing-dev.c259
-rw-r--r--mm/bounce.c75
-rw-r--r--mm/page_io.c1
50 files changed, 1000 insertions, 956 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index a5eb7d19a65d..9887f0414c16 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -5,7 +5,7 @@ The main aim of CFQ scheduler is to provide a fair allocation of the disk
5I/O bandwidth for all the processes which requests an I/O operation. 5I/O bandwidth for all the processes which requests an I/O operation.
6 6
7CFQ maintains the per process queue for the processes which request I/O 7CFQ maintains the per process queue for the processes which request I/O
8operation(syncronous requests). In case of asynchronous requests, all the 8operation(synchronous requests). In case of asynchronous requests, all the
9requests from all the processes are batched together according to their 9requests from all the processes are batched together according to their
10process's I/O priority. 10process's I/O priority.
11 11
@@ -66,6 +66,47 @@ This parameter is used to set the timeout of synchronous requests. Default
66value of this is 124ms. In case to favor synchronous requests over asynchronous 66value of this is 124ms. In case to favor synchronous requests over asynchronous
67one, this value should be decreased relative to fifo_expire_async. 67one, this value should be decreased relative to fifo_expire_async.
68 68
69group_idle
70-----------
71This parameter forces idling at the CFQ group level instead of CFQ
72queue level. This was introduced after after a bottleneck was observed
73in higher end storage due to idle on sequential queue and allow dispatch
74from a single queue. The idea with this parameter is that it can be run with
75slice_idle=0 and group_idle=8, so that idling does not happen on individual
76queues in the group but happens overall on the group and thus still keeps the
77IO controller working.
78Not idling on individual queues in the group will dispatch requests from
79multiple queues in the group at the same time and achieve higher throughput
80on higher end storage.
81
82Default value for this parameter is 8ms.
83
84latency
85-------
86This parameter is used to enable/disable the latency mode of the CFQ
87scheduler. If latency mode (called low_latency) is enabled, CFQ tries
88to recompute the slice time for each process based on the target_latency set
89for the system. This favors fairness over throughput. Disabling low
90latency (setting it to 0) ignores target latency, allowing each process in the
91system to get a full time slice.
92
93By default low latency mode is enabled.
94
95target_latency
96--------------
97This parameter is used to calculate the time slice for a process if cfq's
98latency mode is enabled. It will ensure that sync requests have an estimated
99latency. But if sequential workload is higher(e.g. sequential read),
100then to meet the latency constraints, throughput may decrease because of less
101time for each process to issue I/O request before the cfq queue is switched.
102
103Though this can be overcome by disabling the latency_mode, it may increase
104the read latency for some applications. This parameter allows for changing
105target_latency through the sysfs interface which can provide the balanced
106throughput and read latency.
107
108Default value for target_latency is 300ms.
109
69slice_async 110slice_async
70----------- 111-----------
71This parameter is same as of slice_sync but for asynchronous queue. The 112This parameter is same as of slice_sync but for asynchronous queue. The
@@ -98,8 +139,8 @@ in the device exceeds this parameter. This parameter is used for synchronous
98request. 139request.
99 140
100In case of storage with several disk, this setting can limit the parallel 141In case of storage with several disk, this setting can limit the parallel
101processing of request. Therefore, increasing the value can imporve the 142processing of request. Therefore, increasing the value can improve the
102performace although this can cause the latency of some I/O to increase due 143performance although this can cause the latency of some I/O to increase due
103to more number of requests. 144to more number of requests.
104 145
105CFQ Group scheduling 146CFQ Group scheduling
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b2b9837f9dd3..e8918ffaf96d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -972,10 +972,10 @@ int blkcg_activate_policy(struct request_queue *q,
972 if (!new_blkg) 972 if (!new_blkg)
973 return -ENOMEM; 973 return -ENOMEM;
974 974
975 preloaded = !radix_tree_preload(GFP_KERNEL);
976
977 blk_queue_bypass_start(q); 975 blk_queue_bypass_start(q);
978 976
977 preloaded = !radix_tree_preload(GFP_KERNEL);
978
979 /* 979 /*
980 * Make sure the root blkg exists and count the existing blkgs. As 980 * Make sure the root blkg exists and count the existing blkgs. As
981 * @q is bypassing at this point, blkg_lookup_create() can't be 981 * @q is bypassing at this point, blkg_lookup_create() can't be
diff --git a/block/blk-core.c b/block/blk-core.c
index 7c288358a745..33c33bc99ddd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -30,6 +30,7 @@
30#include <linux/list_sort.h> 30#include <linux/list_sort.h>
31#include <linux/delay.h> 31#include <linux/delay.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <linux/pm_runtime.h>
33 34
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/block.h> 36#include <trace/events/block.h>
@@ -159,20 +160,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
159 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 160 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
160 error = -EIO; 161 error = -EIO;
161 162
162 if (unlikely(nbytes > bio->bi_size)) {
163 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
164 __func__, nbytes, bio->bi_size);
165 nbytes = bio->bi_size;
166 }
167
168 if (unlikely(rq->cmd_flags & REQ_QUIET)) 163 if (unlikely(rq->cmd_flags & REQ_QUIET))
169 set_bit(BIO_QUIET, &bio->bi_flags); 164 set_bit(BIO_QUIET, &bio->bi_flags);
170 165
171 bio->bi_size -= nbytes; 166 bio_advance(bio, nbytes);
172 bio->bi_sector += (nbytes >> 9);
173
174 if (bio_integrity(bio))
175 bio_integrity_advance(bio, nbytes);
176 167
177 /* don't actually finish bio if it's part of flush sequence */ 168 /* don't actually finish bio if it's part of flush sequence */
178 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) 169 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
@@ -1264,6 +1255,16 @@ void part_round_stats(int cpu, struct hd_struct *part)
1264} 1255}
1265EXPORT_SYMBOL_GPL(part_round_stats); 1256EXPORT_SYMBOL_GPL(part_round_stats);
1266 1257
1258#ifdef CONFIG_PM_RUNTIME
1259static void blk_pm_put_request(struct request *rq)
1260{
1261 if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
1262 pm_runtime_mark_last_busy(rq->q->dev);
1263}
1264#else
1265static inline void blk_pm_put_request(struct request *rq) {}
1266#endif
1267
1267/* 1268/*
1268 * queue lock must be held 1269 * queue lock must be held
1269 */ 1270 */
@@ -1274,6 +1275,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1274 if (unlikely(--req->ref_count)) 1275 if (unlikely(--req->ref_count))
1275 return; 1276 return;
1276 1277
1278 blk_pm_put_request(req);
1279
1277 elv_completed_request(q, req); 1280 elv_completed_request(q, req);
1278 1281
1279 /* this is a bio leak */ 1282 /* this is a bio leak */
@@ -1597,7 +1600,7 @@ static void handle_bad_sector(struct bio *bio)
1597 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 1600 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1598 bdevname(bio->bi_bdev, b), 1601 bdevname(bio->bi_bdev, b),
1599 bio->bi_rw, 1602 bio->bi_rw,
1600 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1603 (unsigned long long)bio_end_sector(bio),
1601 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); 1604 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1602 1605
1603 set_bit(BIO_EOF, &bio->bi_flags); 1606 set_bit(BIO_EOF, &bio->bi_flags);
@@ -2053,6 +2056,28 @@ static void blk_account_io_done(struct request *req)
2053 } 2056 }
2054} 2057}
2055 2058
2059#ifdef CONFIG_PM_RUNTIME
2060/*
2061 * Don't process normal requests when queue is suspended
2062 * or in the process of suspending/resuming
2063 */
2064static struct request *blk_pm_peek_request(struct request_queue *q,
2065 struct request *rq)
2066{
2067 if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
2068 (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
2069 return NULL;
2070 else
2071 return rq;
2072}
2073#else
2074static inline struct request *blk_pm_peek_request(struct request_queue *q,
2075 struct request *rq)
2076{
2077 return rq;
2078}
2079#endif
2080
2056/** 2081/**
2057 * blk_peek_request - peek at the top of a request queue 2082 * blk_peek_request - peek at the top of a request queue
2058 * @q: request queue to peek at 2083 * @q: request queue to peek at
@@ -2075,6 +2100,11 @@ struct request *blk_peek_request(struct request_queue *q)
2075 int ret; 2100 int ret;
2076 2101
2077 while ((rq = __elv_next_request(q)) != NULL) { 2102 while ((rq = __elv_next_request(q)) != NULL) {
2103
2104 rq = blk_pm_peek_request(q, rq);
2105 if (!rq)
2106 break;
2107
2078 if (!(rq->cmd_flags & REQ_STARTED)) { 2108 if (!(rq->cmd_flags & REQ_STARTED)) {
2079 /* 2109 /*
2080 * This is the first time the device driver 2110 * This is the first time the device driver
@@ -2253,8 +2283,7 @@ EXPORT_SYMBOL(blk_fetch_request);
2253 **/ 2283 **/
2254bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) 2284bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2255{ 2285{
2256 int total_bytes, bio_nbytes, next_idx = 0; 2286 int total_bytes;
2257 struct bio *bio;
2258 2287
2259 if (!req->bio) 2288 if (!req->bio)
2260 return false; 2289 return false;
@@ -2300,56 +2329,21 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2300 2329
2301 blk_account_io_completion(req, nr_bytes); 2330 blk_account_io_completion(req, nr_bytes);
2302 2331
2303 total_bytes = bio_nbytes = 0; 2332 total_bytes = 0;
2304 while ((bio = req->bio) != NULL) { 2333 while (req->bio) {
2305 int nbytes; 2334 struct bio *bio = req->bio;
2335 unsigned bio_bytes = min(bio->bi_size, nr_bytes);
2306 2336
2307 if (nr_bytes >= bio->bi_size) { 2337 if (bio_bytes == bio->bi_size)
2308 req->bio = bio->bi_next; 2338 req->bio = bio->bi_next;
2309 nbytes = bio->bi_size;
2310 req_bio_endio(req, bio, nbytes, error);
2311 next_idx = 0;
2312 bio_nbytes = 0;
2313 } else {
2314 int idx = bio->bi_idx + next_idx;
2315 2339
2316 if (unlikely(idx >= bio->bi_vcnt)) { 2340 req_bio_endio(req, bio, bio_bytes, error);
2317 blk_dump_rq_flags(req, "__end_that");
2318 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
2319 __func__, idx, bio->bi_vcnt);
2320 break;
2321 }
2322 2341
2323 nbytes = bio_iovec_idx(bio, idx)->bv_len; 2342 total_bytes += bio_bytes;
2324 BIO_BUG_ON(nbytes > bio->bi_size); 2343 nr_bytes -= bio_bytes;
2325 2344
2326 /* 2345 if (!nr_bytes)
2327 * not a complete bvec done 2346 break;
2328 */
2329 if (unlikely(nbytes > nr_bytes)) {
2330 bio_nbytes += nr_bytes;
2331 total_bytes += nr_bytes;
2332 break;
2333 }
2334
2335 /*
2336 * advance to the next vector
2337 */
2338 next_idx++;
2339 bio_nbytes += nbytes;
2340 }
2341
2342 total_bytes += nbytes;
2343 nr_bytes -= nbytes;
2344
2345 bio = req->bio;
2346 if (bio) {
2347 /*
2348 * end more in this run, or just return 'not-done'
2349 */
2350 if (unlikely(nr_bytes <= 0))
2351 break;
2352 }
2353 } 2347 }
2354 2348
2355 /* 2349 /*
@@ -2365,16 +2359,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2365 return false; 2359 return false;
2366 } 2360 }
2367 2361
2368 /*
2369 * if the request wasn't completed, update state
2370 */
2371 if (bio_nbytes) {
2372 req_bio_endio(req, bio, bio_nbytes, error);
2373 bio->bi_idx += next_idx;
2374 bio_iovec(bio)->bv_offset += nr_bytes;
2375 bio_iovec(bio)->bv_len -= nr_bytes;
2376 }
2377
2378 req->__data_len -= total_bytes; 2362 req->__data_len -= total_bytes;
2379 req->buffer = bio_data(req->bio); 2363 req->buffer = bio_data(req->bio);
2380 2364
@@ -3046,6 +3030,149 @@ void blk_finish_plug(struct blk_plug *plug)
3046} 3030}
3047EXPORT_SYMBOL(blk_finish_plug); 3031EXPORT_SYMBOL(blk_finish_plug);
3048 3032
3033#ifdef CONFIG_PM_RUNTIME
3034/**
3035 * blk_pm_runtime_init - Block layer runtime PM initialization routine
3036 * @q: the queue of the device
3037 * @dev: the device the queue belongs to
3038 *
3039 * Description:
3040 * Initialize runtime-PM-related fields for @q and start auto suspend for
3041 * @dev. Drivers that want to take advantage of request-based runtime PM
3042 * should call this function after @dev has been initialized, and its
3043 * request queue @q has been allocated, and runtime PM for it can not happen
3044 * yet(either due to disabled/forbidden or its usage_count > 0). In most
3045 * cases, driver should call this function before any I/O has taken place.
3046 *
3047 * This function takes care of setting up using auto suspend for the device,
3048 * the autosuspend delay is set to -1 to make runtime suspend impossible
3049 * until an updated value is either set by user or by driver. Drivers do
3050 * not need to touch other autosuspend settings.
3051 *
3052 * The block layer runtime PM is request based, so only works for drivers
3053 * that use request as their IO unit instead of those directly use bio's.
3054 */
3055void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
3056{
3057 q->dev = dev;
3058 q->rpm_status = RPM_ACTIVE;
3059 pm_runtime_set_autosuspend_delay(q->dev, -1);
3060 pm_runtime_use_autosuspend(q->dev);
3061}
3062EXPORT_SYMBOL(blk_pm_runtime_init);
3063
3064/**
3065 * blk_pre_runtime_suspend - Pre runtime suspend check
3066 * @q: the queue of the device
3067 *
3068 * Description:
3069 * This function will check if runtime suspend is allowed for the device
3070 * by examining if there are any requests pending in the queue. If there
3071 * are requests pending, the device can not be runtime suspended; otherwise,
3072 * the queue's status will be updated to SUSPENDING and the driver can
3073 * proceed to suspend the device.
3074 *
3075 * For the not allowed case, we mark last busy for the device so that
3076 * runtime PM core will try to autosuspend it some time later.
3077 *
3078 * This function should be called near the start of the device's
3079 * runtime_suspend callback.
3080 *
3081 * Return:
3082 * 0 - OK to runtime suspend the device
3083 * -EBUSY - Device should not be runtime suspended
3084 */
3085int blk_pre_runtime_suspend(struct request_queue *q)
3086{
3087 int ret = 0;
3088
3089 spin_lock_irq(q->queue_lock);
3090 if (q->nr_pending) {
3091 ret = -EBUSY;
3092 pm_runtime_mark_last_busy(q->dev);
3093 } else {
3094 q->rpm_status = RPM_SUSPENDING;
3095 }
3096 spin_unlock_irq(q->queue_lock);
3097 return ret;
3098}
3099EXPORT_SYMBOL(blk_pre_runtime_suspend);
3100
3101/**
3102 * blk_post_runtime_suspend - Post runtime suspend processing
3103 * @q: the queue of the device
3104 * @err: return value of the device's runtime_suspend function
3105 *
3106 * Description:
3107 * Update the queue's runtime status according to the return value of the
3108 * device's runtime suspend function and mark last busy for the device so
3109 * that PM core will try to auto suspend the device at a later time.
3110 *
3111 * This function should be called near the end of the device's
3112 * runtime_suspend callback.
3113 */
3114void blk_post_runtime_suspend(struct request_queue *q, int err)
3115{
3116 spin_lock_irq(q->queue_lock);
3117 if (!err) {
3118 q->rpm_status = RPM_SUSPENDED;
3119 } else {
3120 q->rpm_status = RPM_ACTIVE;
3121 pm_runtime_mark_last_busy(q->dev);
3122 }
3123 spin_unlock_irq(q->queue_lock);
3124}
3125EXPORT_SYMBOL(blk_post_runtime_suspend);
3126
3127/**
3128 * blk_pre_runtime_resume - Pre runtime resume processing
3129 * @q: the queue of the device
3130 *
3131 * Description:
3132 * Update the queue's runtime status to RESUMING in preparation for the
3133 * runtime resume of the device.
3134 *
3135 * This function should be called near the start of the device's
3136 * runtime_resume callback.
3137 */
3138void blk_pre_runtime_resume(struct request_queue *q)
3139{
3140 spin_lock_irq(q->queue_lock);
3141 q->rpm_status = RPM_RESUMING;
3142 spin_unlock_irq(q->queue_lock);
3143}
3144EXPORT_SYMBOL(blk_pre_runtime_resume);
3145
3146/**
3147 * blk_post_runtime_resume - Post runtime resume processing
3148 * @q: the queue of the device
3149 * @err: return value of the device's runtime_resume function
3150 *
3151 * Description:
3152 * Update the queue's runtime status according to the return value of the
3153 * device's runtime_resume function. If it is successfully resumed, process
3154 * the requests that are queued into the device's queue when it is resuming
3155 * and then mark last busy and initiate autosuspend for it.
3156 *
3157 * This function should be called near the end of the device's
3158 * runtime_resume callback.
3159 */
3160void blk_post_runtime_resume(struct request_queue *q, int err)
3161{
3162 spin_lock_irq(q->queue_lock);
3163 if (!err) {
3164 q->rpm_status = RPM_ACTIVE;
3165 __blk_run_queue(q);
3166 pm_runtime_mark_last_busy(q->dev);
3167 pm_runtime_autosuspend(q->dev);
3168 } else {
3169 q->rpm_status = RPM_SUSPENDED;
3170 }
3171 spin_unlock_irq(q->queue_lock);
3172}
3173EXPORT_SYMBOL(blk_post_runtime_resume);
3174#endif
3175
3049int __init blk_dev_init(void) 3176int __init blk_dev_init(void)
3050{ 3177{
3051 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 3178 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4f0ade74cfd0..d5cd3131c57a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2270,11 +2270,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2270 return NULL; 2270 return NULL;
2271 2271
2272 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 2272 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2273 if (cfqq) { 2273 if (cfqq)
2274 sector_t sector = bio->bi_sector + bio_sectors(bio); 2274 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2275
2276 return elv_rb_find(&cfqq->sort_list, sector);
2277 }
2278 2275
2279 return NULL; 2276 return NULL;
2280} 2277}
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 90037b5eb17f..ba19a3afab79 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -132,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
132 * check for front merge 132 * check for front merge
133 */ 133 */
134 if (dd->front_merges) { 134 if (dd->front_merges) {
135 sector_t sector = bio->bi_sector + bio_sectors(bio); 135 sector_t sector = bio_end_sector(bio);
136 136
137 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); 137 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
138 if (__rq) { 138 if (__rq) {
diff --git a/block/elevator.c b/block/elevator.c
index a0ffdd943c98..eba5b04c29b1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -34,6 +34,7 @@
34#include <linux/blktrace_api.h> 34#include <linux/blktrace_api.h>
35#include <linux/hash.h> 35#include <linux/hash.h>
36#include <linux/uaccess.h> 36#include <linux/uaccess.h>
37#include <linux/pm_runtime.h>
37 38
38#include <trace/events/block.h> 39#include <trace/events/block.h>
39 40
@@ -536,6 +537,27 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
536 e->type->ops.elevator_bio_merged_fn(q, rq, bio); 537 e->type->ops.elevator_bio_merged_fn(q, rq, bio);
537} 538}
538 539
540#ifdef CONFIG_PM_RUNTIME
541static void blk_pm_requeue_request(struct request *rq)
542{
543 if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
544 rq->q->nr_pending--;
545}
546
547static void blk_pm_add_request(struct request_queue *q, struct request *rq)
548{
549 if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
550 (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
551 pm_request_resume(q->dev);
552}
553#else
554static inline void blk_pm_requeue_request(struct request *rq) {}
555static inline void blk_pm_add_request(struct request_queue *q,
556 struct request *rq)
557{
558}
559#endif
560
539void elv_requeue_request(struct request_queue *q, struct request *rq) 561void elv_requeue_request(struct request_queue *q, struct request *rq)
540{ 562{
541 /* 563 /*
@@ -550,6 +572,8 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
550 572
551 rq->cmd_flags &= ~REQ_STARTED; 573 rq->cmd_flags &= ~REQ_STARTED;
552 574
575 blk_pm_requeue_request(rq);
576
553 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); 577 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
554} 578}
555 579
@@ -572,6 +596,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
572{ 596{
573 trace_block_rq_insert(q, rq); 597 trace_block_rq_insert(q, rq);
574 598
599 blk_pm_add_request(q, rq);
600
575 rq->q = q; 601 rq->q = q;
576 602
577 if (rq->cmd_flags & REQ_SOFTBARRIER) { 603 if (rq->cmd_flags & REQ_SOFTBARRIER) {
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index ff5804e2f1d2..c85fc895ecdb 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -238,7 +238,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
238 le32_to_cpu(gpt->sizeof_partition_entry); 238 le32_to_cpu(gpt->sizeof_partition_entry);
239 if (!count) 239 if (!count)
240 return NULL; 240 return NULL;
241 pte = kzalloc(count, GFP_KERNEL); 241 pte = kmalloc(count, GFP_KERNEL);
242 if (!pte) 242 if (!pte)
243 return NULL; 243 return NULL;
244 244
@@ -267,7 +267,7 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
267 gpt_header *gpt; 267 gpt_header *gpt;
268 unsigned ssz = bdev_logical_block_size(state->bdev); 268 unsigned ssz = bdev_logical_block_size(state->bdev);
269 269
270 gpt = kzalloc(ssz, GFP_KERNEL); 270 gpt = kmalloc(ssz, GFP_KERNEL);
271 if (!gpt) 271 if (!gpt)
272 return NULL; 272 return NULL;
273 273
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 92b6d7c51e39..5efed089a702 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -928,7 +928,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio)
928 buf->resid = bio->bi_size; 928 buf->resid = bio->bi_size;
929 buf->sector = bio->bi_sector; 929 buf->sector = bio->bi_sector;
930 bio_pageinc(bio); 930 bio_pageinc(bio);
931 buf->bv = bv = &bio->bi_io_vec[bio->bi_idx]; 931 buf->bv = bv = bio_iovec(bio);
932 buf->bv_resid = bv->bv_len; 932 buf->bv_resid = bv->bv_len;
933 WARN_ON(buf->bv_resid == 0); 933 WARN_ON(buf->bv_resid == 0);
934} 934}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 531ceb31d0ff..f1a29f8e9d33 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -334,8 +334,7 @@ static void brd_make_request(struct request_queue *q, struct bio *bio)
334 int err = -EIO; 334 int err = -EIO;
335 335
336 sector = bio->bi_sector; 336 sector = bio->bi_sector;
337 if (sector + (bio->bi_size >> SECTOR_SHIFT) > 337 if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
338 get_capacity(bdev->bd_disk))
339 goto out; 338 goto out;
340 339
341 if (unlikely(bio->bi_rw & REQ_DISCARD)) { 340 if (unlikely(bio->bi_rw & REQ_DISCARD)) {
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index c49e85608101..04ceb7e2fadd 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3775,7 +3775,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
3775 bio_vec.bv_len = size; 3775 bio_vec.bv_len = size;
3776 bio_vec.bv_offset = 0; 3776 bio_vec.bv_offset = 0;
3777 bio.bi_vcnt = 1; 3777 bio.bi_vcnt = 1;
3778 bio.bi_idx = 0;
3779 bio.bi_size = size; 3778 bio.bi_size = size;
3780 bio.bi_bdev = bdev; 3779 bio.bi_bdev = bdev;
3781 bio.bi_sector = 0; 3780 bio.bi_sector = 0;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9f2d348f7115..3c08983e600a 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -901,7 +901,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
901 pd->iosched.successive_reads += bio->bi_size >> 10; 901 pd->iosched.successive_reads += bio->bi_size >> 10;
902 else { 902 else {
903 pd->iosched.successive_reads = 0; 903 pd->iosched.successive_reads = 0;
904 pd->iosched.last_write = bio->bi_sector + bio_sectors(bio); 904 pd->iosched.last_write = bio_end_sector(bio);
905 } 905 }
906 if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { 906 if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
907 if (pd->read_speed == pd->write_speed) { 907 if (pd->read_speed == pd->write_speed) {
@@ -948,31 +948,6 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
948} 948}
949 949
950/* 950/*
951 * Copy CD_FRAMESIZE bytes from src_bio into a destination page
952 */
953static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
954{
955 unsigned int copy_size = CD_FRAMESIZE;
956
957 while (copy_size > 0) {
958 struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
959 void *vfrom = kmap_atomic(src_bvl->bv_page) +
960 src_bvl->bv_offset + offs;
961 void *vto = page_address(dst_page) + dst_offs;
962 int len = min_t(int, copy_size, src_bvl->bv_len - offs);
963
964 BUG_ON(len < 0);
965 memcpy(vto, vfrom, len);
966 kunmap_atomic(vfrom);
967
968 seg++;
969 offs = 0;
970 dst_offs += len;
971 copy_size -= len;
972 }
973}
974
975/*
976 * Copy all data for this packet to pkt->pages[], so that 951 * Copy all data for this packet to pkt->pages[], so that
977 * a) The number of required segments for the write bio is minimized, which 952 * a) The number of required segments for the write bio is minimized, which
978 * is necessary for some scsi controllers. 953 * is necessary for some scsi controllers.
@@ -1181,16 +1156,15 @@ static int pkt_start_recovery(struct packet_data *pkt)
1181 new_sector = new_block * (CD_FRAMESIZE >> 9); 1156 new_sector = new_block * (CD_FRAMESIZE >> 9);
1182 pkt->sector = new_sector; 1157 pkt->sector = new_sector;
1183 1158
1159 bio_reset(pkt->bio);
1160 pkt->bio->bi_bdev = pd->bdev;
1161 pkt->bio->bi_rw = REQ_WRITE;
1184 pkt->bio->bi_sector = new_sector; 1162 pkt->bio->bi_sector = new_sector;
1185 pkt->bio->bi_next = NULL; 1163 pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
1186 pkt->bio->bi_flags = 1 << BIO_UPTODATE; 1164 pkt->bio->bi_vcnt = pkt->frames;
1187 pkt->bio->bi_idx = 0;
1188 1165
1189 BUG_ON(pkt->bio->bi_rw != REQ_WRITE); 1166 pkt->bio->bi_end_io = pkt_end_io_packet_write;
1190 BUG_ON(pkt->bio->bi_vcnt != pkt->frames); 1167 pkt->bio->bi_private = pkt;
1191 BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
1192 BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
1193 BUG_ON(pkt->bio->bi_private != pkt);
1194 1168
1195 drop_super(sb); 1169 drop_super(sb);
1196 return 1; 1170 return 1;
@@ -1325,55 +1299,35 @@ try_next_bio:
1325 */ 1299 */
1326static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) 1300static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1327{ 1301{
1328 struct bio *bio;
1329 int f; 1302 int f;
1330 int frames_write;
1331 struct bio_vec *bvec = pkt->w_bio->bi_io_vec; 1303 struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
1332 1304
1305 bio_reset(pkt->w_bio);
1306 pkt->w_bio->bi_sector = pkt->sector;
1307 pkt->w_bio->bi_bdev = pd->bdev;
1308 pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
1309 pkt->w_bio->bi_private = pkt;
1310
1311 /* XXX: locking? */
1333 for (f = 0; f < pkt->frames; f++) { 1312 for (f = 0; f < pkt->frames; f++) {
1334 bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; 1313 bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
1335 bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE; 1314 bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
1315 if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
1316 BUG();
1336 } 1317 }
1318 VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
1337 1319
1338 /* 1320 /*
1339 * Fill-in bvec with data from orig_bios. 1321 * Fill-in bvec with data from orig_bios.
1340 */ 1322 */
1341 frames_write = 0;
1342 spin_lock(&pkt->lock); 1323 spin_lock(&pkt->lock);
1343 bio_list_for_each(bio, &pkt->orig_bios) { 1324 bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
1344 int segment = bio->bi_idx;
1345 int src_offs = 0;
1346 int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
1347 int num_frames = bio->bi_size / CD_FRAMESIZE;
1348 BUG_ON(first_frame < 0);
1349 BUG_ON(first_frame + num_frames > pkt->frames);
1350 for (f = first_frame; f < first_frame + num_frames; f++) {
1351 struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
1352
1353 while (src_offs >= src_bvl->bv_len) {
1354 src_offs -= src_bvl->bv_len;
1355 segment++;
1356 BUG_ON(segment >= bio->bi_vcnt);
1357 src_bvl = bio_iovec_idx(bio, segment);
1358 }
1359 1325
1360 if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
1361 bvec[f].bv_page = src_bvl->bv_page;
1362 bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
1363 } else {
1364 pkt_copy_bio_data(bio, segment, src_offs,
1365 bvec[f].bv_page, bvec[f].bv_offset);
1366 }
1367 src_offs += CD_FRAMESIZE;
1368 frames_write++;
1369 }
1370 }
1371 pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); 1326 pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
1372 spin_unlock(&pkt->lock); 1327 spin_unlock(&pkt->lock);
1373 1328
1374 VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", 1329 VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
1375 frames_write, (unsigned long long)pkt->sector); 1330 pkt->write_size, (unsigned long long)pkt->sector);
1376 BUG_ON(frames_write != pkt->write_size);
1377 1331
1378 if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { 1332 if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
1379 pkt_make_local_copy(pkt, bvec); 1333 pkt_make_local_copy(pkt, bvec);
@@ -1383,16 +1337,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
1383 } 1337 }
1384 1338
1385 /* Start the write request */ 1339 /* Start the write request */
1386 bio_reset(pkt->w_bio);
1387 pkt->w_bio->bi_sector = pkt->sector;
1388 pkt->w_bio->bi_bdev = pd->bdev;
1389 pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
1390 pkt->w_bio->bi_private = pkt;
1391 for (f = 0; f < pkt->frames; f++)
1392 if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
1393 BUG();
1394 VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
1395
1396 atomic_set(&pkt->io_wait, 1); 1340 atomic_set(&pkt->io_wait, 1);
1397 pkt->w_bio->bi_rw = WRITE; 1341 pkt->w_bio->bi_rw = WRITE;
1398 pkt_queue_bio(pd, pkt->w_bio); 1342 pkt_queue_bio(pd, pkt->w_bio);
@@ -2431,7 +2375,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2431 cloned_bio->bi_bdev = pd->bdev; 2375 cloned_bio->bi_bdev = pd->bdev;
2432 cloned_bio->bi_private = psd; 2376 cloned_bio->bi_private = psd;
2433 cloned_bio->bi_end_io = pkt_end_io_read_cloned; 2377 cloned_bio->bi_end_io = pkt_end_io_read_cloned;
2434 pd->stats.secs_r += bio->bi_size >> 9; 2378 pd->stats.secs_r += bio_sectors(bio);
2435 pkt_queue_bio(pd, cloned_bio); 2379 pkt_queue_bio(pd, cloned_bio);
2436 return; 2380 return;
2437 } 2381 }
@@ -2452,7 +2396,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2452 zone = ZONE(bio->bi_sector, pd); 2396 zone = ZONE(bio->bi_sector, pd);
2453 VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", 2397 VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
2454 (unsigned long long)bio->bi_sector, 2398 (unsigned long long)bio->bi_sector,
2455 (unsigned long long)(bio->bi_sector + bio_sectors(bio))); 2399 (unsigned long long)bio_end_sector(bio));
2456 2400
2457 /* Check if we have to split the bio */ 2401 /* Check if we have to split the bio */
2458 { 2402 {
@@ -2460,7 +2404,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
2460 sector_t last_zone; 2404 sector_t last_zone;
2461 int first_sectors; 2405 int first_sectors;
2462 2406
2463 last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd); 2407 last_zone = ZONE(bio_end_sector(bio) - 1, pd);
2464 if (last_zone != zone) { 2408 if (last_zone != zone) {
2465 BUG_ON(last_zone != zone + pd->settings.size); 2409 BUG_ON(last_zone != zone + pd->settings.size);
2466 first_sectors = last_zone - bio->bi_sector; 2410 first_sectors = last_zone - bio->bi_sector;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 22ffd5dcb168..ca63104136e0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1143,7 +1143,7 @@ static struct bio *bio_clone_range(struct bio *bio_src,
1143 /* Find first affected segment... */ 1143 /* Find first affected segment... */
1144 1144
1145 resid = offset; 1145 resid = offset;
1146 __bio_for_each_segment(bv, bio_src, idx, 0) { 1146 bio_for_each_segment(bv, bio_src, idx) {
1147 if (resid < bv->bv_len) 1147 if (resid < bv->bv_len)
1148 break; 1148 break;
1149 resid -= bv->bv_len; 1149 resid -= bv->bv_len;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 13c15480d940..6d2d41ae9e32 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
858 unsigned int i; 858 unsigned int i;
859 struct bio_vec *bv; 859 struct bio_vec *bv;
860 860
861 for (i = 0; i < clone->bi_vcnt; i++) { 861 bio_for_each_segment_all(bv, clone, i) {
862 bv = bio_iovec_idx(clone, i);
863 BUG_ON(!bv->bv_page); 862 BUG_ON(!bv->bv_page);
864 mempool_free(bv->bv_page, cc->page_pool); 863 mempool_free(bv->bv_page, cc->page_pool);
865 bv->bv_page = NULL; 864 bv->bv_page = NULL;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d053098c6a91..699b5be68d31 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
458{ 458{
459 io->bdev = m->dev->bdev; 459 io->bdev = m->dev->bdev;
460 io->sector = map_sector(m, bio); 460 io->sector = map_sector(m, bio);
461 io->count = bio->bi_size >> 9; 461 io->count = bio_sectors(bio);
462} 462}
463 463
464static void hold_bio(struct mirror_set *ms, struct bio *bio) 464static void hold_bio(struct mirror_set *ms, struct bio *bio)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d8837d313f54..ea5e878a30b9 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -258,7 +258,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
258 sector_t begin, end; 258 sector_t begin, end;
259 259
260 stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); 260 stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
261 stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), 261 stripe_map_range_sector(sc, bio_end_sector(bio),
262 target_stripe, &end); 262 target_stripe, &end);
263 if (begin < end) { 263 if (begin < end) {
264 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; 264 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index a746f1d21c66..b948fd864d45 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -501,7 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
501 return -EIO; 501 return -EIO;
502 } 502 }
503 503
504 if ((bio->bi_sector + bio_sectors(bio)) >> 504 if (bio_end_sector(bio) >>
505 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { 505 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
506 DMERR_LIMIT("io out of range"); 506 DMERR_LIMIT("io out of range");
507 return -EIO; 507 return -EIO;
@@ -519,7 +519,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
519 519
520 bio->bi_end_io = verity_end_io; 520 bio->bi_end_io = verity_end_io;
521 bio->bi_private = io; 521 bio->bi_private = io;
522 io->io_vec_size = bio->bi_vcnt - bio->bi_idx; 522 io->io_vec_size = bio_segments(bio);
523 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) 523 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
524 io->io_vec = io->io_vec_inline; 524 io->io_vec = io->io_vec_inline;
525 else 525 else
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 5e7dc772f5de..3193aefe982b 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
185 return; 185 return;
186 } 186 }
187 187
188 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), 188 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE))
189 WRITE))
190 failit = 1; 189 failit = 1;
191 if (check_mode(conf, WritePersistent)) { 190 if (check_mode(conf, WritePersistent)) {
192 add_sector(conf, bio->bi_sector, WritePersistent); 191 add_sector(conf, bio->bi_sector, WritePersistent);
@@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
196 failit = 1; 195 failit = 1;
197 } else { 196 } else {
198 /* read request */ 197 /* read request */
199 if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), 198 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ))
200 READ))
201 failit = 1; 199 failit = 1;
202 if (check_mode(conf, ReadTransient)) 200 if (check_mode(conf, ReadTransient))
203 failit = 1; 201 failit = 1;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 21014836bdbf..f03fabd2b37b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
317 bio_io_error(bio); 317 bio_io_error(bio);
318 return; 318 return;
319 } 319 }
320 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > 320 if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
321 tmp_dev->end_sector)) {
322 /* This bio crosses a device boundary, so we have to 321 /* This bio crosses a device boundary, so we have to
323 * split it. 322 * split it.
324 */ 323 */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6330c727396c..681d1099a2d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -197,21 +197,12 @@ void md_trim_bio(struct bio *bio, int offset, int size)
197 if (offset == 0 && size == bio->bi_size) 197 if (offset == 0 && size == bio->bi_size)
198 return; 198 return;
199 199
200 bio->bi_sector += offset;
201 bio->bi_size = size;
202 offset <<= 9;
203 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 200 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
204 201
205 while (bio->bi_idx < bio->bi_vcnt && 202 bio_advance(bio, offset << 9);
206 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { 203
207 /* remove this whole bio_vec */ 204 bio->bi_size = size;
208 offset -= bio->bi_io_vec[bio->bi_idx].bv_len; 205
209 bio->bi_idx++;
210 }
211 if (bio->bi_idx < bio->bi_vcnt) {
212 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
213 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
214 }
215 /* avoid any complications with bi_idx being non-zero*/ 206 /* avoid any complications with bi_idx being non-zero*/
216 if (bio->bi_idx) { 207 if (bio->bi_idx) {
217 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, 208 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 0505452de8d6..fcf65e512cf5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
502{ 502{
503 if (likely(is_power_of_2(chunk_sects))) { 503 if (likely(is_power_of_2(chunk_sects))) {
504 return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) 504 return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
505 + (bio->bi_size >> 9)); 505 + bio_sectors(bio));
506 } else{ 506 } else{
507 sector_t sector = bio->bi_sector; 507 sector_t sector = bio->bi_sector;
508 return chunk_sects >= (sector_div(sector, chunk_sects) 508 return chunk_sects >= (sector_div(sector, chunk_sects)
509 + (bio->bi_size >> 9)); 509 + bio_sectors(bio));
510 } 510 }
511} 511}
512 512
@@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
527 sector_t sector = bio->bi_sector; 527 sector_t sector = bio->bi_sector;
528 struct bio_pair *bp; 528 struct bio_pair *bp;
529 /* Sanity check -- queue functions should prevent this happening */ 529 /* Sanity check -- queue functions should prevent this happening */
530 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 530 if (bio_segments(bio) > 1)
531 bio->bi_idx != 0)
532 goto bad_map; 531 goto bad_map;
533 /* This is a one page bio that upper layers 532 /* This is a one page bio that upper layers
534 * refuse to split for us, so we need to split it. 533 * refuse to split for us, so we need to split it.
@@ -567,7 +566,7 @@ bad_map:
567 printk("md/raid0:%s: make_request bug: can't convert block across chunks" 566 printk("md/raid0:%s: make_request bug: can't convert block across chunks"
568 " or bigger than %dk %llu %d\n", 567 " or bigger than %dk %llu %d\n",
569 mdname(mddev), chunk_sects / 2, 568 mdname(mddev), chunk_sects / 2,
570 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 569 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
571 570
572 bio_io_error(bio); 571 bio_io_error(bio);
573 return; 572 return;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 851023e2ba5d..55951182af73 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
92static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 92static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
93{ 93{
94 struct pool_info *pi = data; 94 struct pool_info *pi = data;
95 struct page *page;
96 struct r1bio *r1_bio; 95 struct r1bio *r1_bio;
97 struct bio *bio; 96 struct bio *bio;
98 int i, j; 97 int i, j;
@@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
122 j = 1; 121 j = 1;
123 while(j--) { 122 while(j--) {
124 bio = r1_bio->bios[j]; 123 bio = r1_bio->bios[j];
125 for (i = 0; i < RESYNC_PAGES; i++) { 124 bio->bi_vcnt = RESYNC_PAGES;
126 page = alloc_page(gfp_flags);
127 if (unlikely(!page))
128 goto out_free_pages;
129 125
130 bio->bi_io_vec[i].bv_page = page; 126 if (bio_alloc_pages(bio, gfp_flags))
131 bio->bi_vcnt = i+1; 127 goto out_free_bio;
132 }
133 } 128 }
134 /* If not user-requests, copy the page pointers to all bios */ 129 /* If not user-requests, copy the page pointers to all bios */
135 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { 130 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
143 138
144 return r1_bio; 139 return r1_bio;
145 140
146out_free_pages:
147 for (j=0 ; j < pi->raid_disks; j++)
148 for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
149 put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
150 j = -1;
151out_free_bio: 141out_free_bio:
152 while (++j < pi->raid_disks) 142 while (++j < pi->raid_disks)
153 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
@@ -267,7 +257,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
267 (bio_data_dir(bio) == WRITE) ? "write" : "read", 257 (bio_data_dir(bio) == WRITE) ? "write" : "read",
268 (unsigned long long) bio->bi_sector, 258 (unsigned long long) bio->bi_sector,
269 (unsigned long long) bio->bi_sector + 259 (unsigned long long) bio->bi_sector +
270 (bio->bi_size >> 9) - 1); 260 bio_sectors(bio) - 1);
271 261
272 call_bio_endio(r1_bio); 262 call_bio_endio(r1_bio);
273 } 263 }
@@ -458,7 +448,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
458 " %llu-%llu\n", 448 " %llu-%llu\n",
459 (unsigned long long) mbio->bi_sector, 449 (unsigned long long) mbio->bi_sector,
460 (unsigned long long) mbio->bi_sector + 450 (unsigned long long) mbio->bi_sector +
461 (mbio->bi_size >> 9) - 1); 451 bio_sectors(mbio) - 1);
462 call_bio_endio(r1_bio); 452 call_bio_endio(r1_bio);
463 } 453 }
464 } 454 }
@@ -925,7 +915,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
925 if (unlikely(!bvecs)) 915 if (unlikely(!bvecs))
926 return; 916 return;
927 917
928 bio_for_each_segment(bvec, bio, i) { 918 bio_for_each_segment_all(bvec, bio, i) {
929 bvecs[i] = *bvec; 919 bvecs[i] = *bvec;
930 bvecs[i].bv_page = alloc_page(GFP_NOIO); 920 bvecs[i].bv_page = alloc_page(GFP_NOIO);
931 if (unlikely(!bvecs[i].bv_page)) 921 if (unlikely(!bvecs[i].bv_page))
@@ -1023,7 +1013,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1023 md_write_start(mddev, bio); /* wait on superblock update early */ 1013 md_write_start(mddev, bio); /* wait on superblock update early */
1024 1014
1025 if (bio_data_dir(bio) == WRITE && 1015 if (bio_data_dir(bio) == WRITE &&
1026 bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && 1016 bio_end_sector(bio) > mddev->suspend_lo &&
1027 bio->bi_sector < mddev->suspend_hi) { 1017 bio->bi_sector < mddev->suspend_hi) {
1028 /* As the suspend_* range is controlled by 1018 /* As the suspend_* range is controlled by
1029 * userspace, we want an interruptible 1019 * userspace, we want an interruptible
@@ -1034,7 +1024,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1034 flush_signals(current); 1024 flush_signals(current);
1035 prepare_to_wait(&conf->wait_barrier, 1025 prepare_to_wait(&conf->wait_barrier,
1036 &w, TASK_INTERRUPTIBLE); 1026 &w, TASK_INTERRUPTIBLE);
1037 if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || 1027 if (bio_end_sector(bio) <= mddev->suspend_lo ||
1038 bio->bi_sector >= mddev->suspend_hi) 1028 bio->bi_sector >= mddev->suspend_hi)
1039 break; 1029 break;
1040 schedule(); 1030 schedule();
@@ -1054,7 +1044,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1054 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1044 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1055 1045
1056 r1_bio->master_bio = bio; 1046 r1_bio->master_bio = bio;
1057 r1_bio->sectors = bio->bi_size >> 9; 1047 r1_bio->sectors = bio_sectors(bio);
1058 r1_bio->state = 0; 1048 r1_bio->state = 0;
1059 r1_bio->mddev = mddev; 1049 r1_bio->mddev = mddev;
1060 r1_bio->sector = bio->bi_sector; 1050 r1_bio->sector = bio->bi_sector;
@@ -1132,7 +1122,7 @@ read_again:
1132 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1122 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1133 1123
1134 r1_bio->master_bio = bio; 1124 r1_bio->master_bio = bio;
1135 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1125 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1136 r1_bio->state = 0; 1126 r1_bio->state = 0;
1137 r1_bio->mddev = mddev; 1127 r1_bio->mddev = mddev;
1138 r1_bio->sector = bio->bi_sector + sectors_handled; 1128 r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1289,14 +1279,10 @@ read_again:
1289 struct bio_vec *bvec; 1279 struct bio_vec *bvec;
1290 int j; 1280 int j;
1291 1281
1292 /* Yes, I really want the '__' version so that 1282 /*
1293 * we clear any unused pointer in the io_vec, rather 1283 * We trimmed the bio, so _all is legit
1294 * than leave them unchanged. This is important
1295 * because when we come to free the pages, we won't
1296 * know the original bi_idx, so we just free
1297 * them all
1298 */ 1284 */
1299 __bio_for_each_segment(bvec, mbio, j, 0) 1285 bio_for_each_segment_all(bvec, mbio, j)
1300 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; 1286 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
1301 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 1287 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
1302 atomic_inc(&r1_bio->behind_remaining); 1288 atomic_inc(&r1_bio->behind_remaining);
@@ -1334,14 +1320,14 @@ read_again:
1334 /* Mustn't call r1_bio_write_done before this next test, 1320 /* Mustn't call r1_bio_write_done before this next test,
1335 * as it could result in the bio being freed. 1321 * as it could result in the bio being freed.
1336 */ 1322 */
1337 if (sectors_handled < (bio->bi_size >> 9)) { 1323 if (sectors_handled < bio_sectors(bio)) {
1338 r1_bio_write_done(r1_bio); 1324 r1_bio_write_done(r1_bio);
1339 /* We need another r1_bio. It has already been counted 1325 /* We need another r1_bio. It has already been counted
1340 * in bio->bi_phys_segments 1326 * in bio->bi_phys_segments
1341 */ 1327 */
1342 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1328 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1343 r1_bio->master_bio = bio; 1329 r1_bio->master_bio = bio;
1344 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1330 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1345 r1_bio->state = 0; 1331 r1_bio->state = 0;
1346 r1_bio->mddev = mddev; 1332 r1_bio->mddev = mddev;
1347 r1_bio->sector = bio->bi_sector + sectors_handled; 1333 r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1867,7 +1853,7 @@ static int process_checks(struct r1bio *r1_bio)
1867 struct bio *sbio = r1_bio->bios[i]; 1853 struct bio *sbio = r1_bio->bios[i];
1868 int size; 1854 int size;
1869 1855
1870 if (r1_bio->bios[i]->bi_end_io != end_sync_read) 1856 if (sbio->bi_end_io != end_sync_read)
1871 continue; 1857 continue;
1872 1858
1873 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { 1859 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
@@ -1892,16 +1878,15 @@ static int process_checks(struct r1bio *r1_bio)
1892 continue; 1878 continue;
1893 } 1879 }
1894 /* fixup the bio for reuse */ 1880 /* fixup the bio for reuse */
1881 bio_reset(sbio);
1895 sbio->bi_vcnt = vcnt; 1882 sbio->bi_vcnt = vcnt;
1896 sbio->bi_size = r1_bio->sectors << 9; 1883 sbio->bi_size = r1_bio->sectors << 9;
1897 sbio->bi_idx = 0;
1898 sbio->bi_phys_segments = 0;
1899 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1900 sbio->bi_flags |= 1 << BIO_UPTODATE;
1901 sbio->bi_next = NULL;
1902 sbio->bi_sector = r1_bio->sector + 1884 sbio->bi_sector = r1_bio->sector +
1903 conf->mirrors[i].rdev->data_offset; 1885 conf->mirrors[i].rdev->data_offset;
1904 sbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1886 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1887 sbio->bi_end_io = end_sync_read;
1888 sbio->bi_private = r1_bio;
1889
1905 size = sbio->bi_size; 1890 size = sbio->bi_size;
1906 for (j = 0; j < vcnt ; j++) { 1891 for (j = 0; j < vcnt ; j++) {
1907 struct bio_vec *bi; 1892 struct bio_vec *bi;
@@ -1912,10 +1897,9 @@ static int process_checks(struct r1bio *r1_bio)
1912 else 1897 else
1913 bi->bv_len = size; 1898 bi->bv_len = size;
1914 size -= PAGE_SIZE; 1899 size -= PAGE_SIZE;
1915 memcpy(page_address(bi->bv_page),
1916 page_address(pbio->bi_io_vec[j].bv_page),
1917 PAGE_SIZE);
1918 } 1900 }
1901
1902 bio_copy_data(sbio, pbio);
1919 } 1903 }
1920 return 0; 1904 return 0;
1921} 1905}
@@ -1952,7 +1936,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
1952 wbio->bi_rw = WRITE; 1936 wbio->bi_rw = WRITE;
1953 wbio->bi_end_io = end_sync_write; 1937 wbio->bi_end_io = end_sync_write;
1954 atomic_inc(&r1_bio->remaining); 1938 atomic_inc(&r1_bio->remaining);
1955 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1939 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
1956 1940
1957 generic_make_request(wbio); 1941 generic_make_request(wbio);
1958 } 1942 }
@@ -2064,32 +2048,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
2064 } 2048 }
2065} 2049}
2066 2050
2067static void bi_complete(struct bio *bio, int error)
2068{
2069 complete((struct completion *)bio->bi_private);
2070}
2071
2072static int submit_bio_wait(int rw, struct bio *bio)
2073{
2074 struct completion event;
2075 rw |= REQ_SYNC;
2076
2077 init_completion(&event);
2078 bio->bi_private = &event;
2079 bio->bi_end_io = bi_complete;
2080 submit_bio(rw, bio);
2081 wait_for_completion(&event);
2082
2083 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2084}
2085
2086static int narrow_write_error(struct r1bio *r1_bio, int i) 2051static int narrow_write_error(struct r1bio *r1_bio, int i)
2087{ 2052{
2088 struct mddev *mddev = r1_bio->mddev; 2053 struct mddev *mddev = r1_bio->mddev;
2089 struct r1conf *conf = mddev->private; 2054 struct r1conf *conf = mddev->private;
2090 struct md_rdev *rdev = conf->mirrors[i].rdev; 2055 struct md_rdev *rdev = conf->mirrors[i].rdev;
2091 int vcnt, idx;
2092 struct bio_vec *vec;
2093 2056
2094 /* bio has the data to be written to device 'i' where 2057 /* bio has the data to be written to device 'i' where
2095 * we just recently had a write error. 2058 * we just recently had a write error.
@@ -2117,30 +2080,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2117 & ~(sector_t)(block_sectors - 1)) 2080 & ~(sector_t)(block_sectors - 1))
2118 - sector; 2081 - sector;
2119 2082
2120 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
2121 vcnt = r1_bio->behind_page_count;
2122 vec = r1_bio->behind_bvecs;
2123 idx = 0;
2124 while (vec[idx].bv_page == NULL)
2125 idx++;
2126 } else {
2127 vcnt = r1_bio->master_bio->bi_vcnt;
2128 vec = r1_bio->master_bio->bi_io_vec;
2129 idx = r1_bio->master_bio->bi_idx;
2130 }
2131 while (sect_to_write) { 2083 while (sect_to_write) {
2132 struct bio *wbio; 2084 struct bio *wbio;
2133 if (sectors > sect_to_write) 2085 if (sectors > sect_to_write)
2134 sectors = sect_to_write; 2086 sectors = sect_to_write;
2135 /* Write at 'sector' for 'sectors'*/ 2087 /* Write at 'sector' for 'sectors'*/
2136 2088
2137 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); 2089 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
2138 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); 2090 unsigned vcnt = r1_bio->behind_page_count;
2139 wbio->bi_sector = r1_bio->sector; 2091 struct bio_vec *vec = r1_bio->behind_bvecs;
2092
2093 while (!vec->bv_page) {
2094 vec++;
2095 vcnt--;
2096 }
2097
2098 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
2099 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
2100
2101 wbio->bi_vcnt = vcnt;
2102 } else {
2103 wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
2104 }
2105
2140 wbio->bi_rw = WRITE; 2106 wbio->bi_rw = WRITE;
2141 wbio->bi_vcnt = vcnt; 2107 wbio->bi_sector = r1_bio->sector;
2142 wbio->bi_size = r1_bio->sectors << 9; 2108 wbio->bi_size = r1_bio->sectors << 9;
2143 wbio->bi_idx = idx;
2144 2109
2145 md_trim_bio(wbio, sector - r1_bio->sector, sectors); 2110 md_trim_bio(wbio, sector - r1_bio->sector, sectors);
2146 wbio->bi_sector += rdev->data_offset; 2111 wbio->bi_sector += rdev->data_offset;
@@ -2289,8 +2254,7 @@ read_more:
2289 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 2254 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
2290 2255
2291 r1_bio->master_bio = mbio; 2256 r1_bio->master_bio = mbio;
2292 r1_bio->sectors = (mbio->bi_size >> 9) 2257 r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
2293 - sectors_handled;
2294 r1_bio->state = 0; 2258 r1_bio->state = 0;
2295 set_bit(R1BIO_ReadError, &r1_bio->state); 2259 set_bit(R1BIO_ReadError, &r1_bio->state);
2296 r1_bio->mddev = mddev; 2260 r1_bio->mddev = mddev;
@@ -2464,18 +2428,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2464 for (i = 0; i < conf->raid_disks * 2; i++) { 2428 for (i = 0; i < conf->raid_disks * 2; i++) {
2465 struct md_rdev *rdev; 2429 struct md_rdev *rdev;
2466 bio = r1_bio->bios[i]; 2430 bio = r1_bio->bios[i];
2467 2431 bio_reset(bio);
2468 /* take from bio_init */
2469 bio->bi_next = NULL;
2470 bio->bi_flags &= ~(BIO_POOL_MASK-1);
2471 bio->bi_flags |= 1 << BIO_UPTODATE;
2472 bio->bi_rw = READ;
2473 bio->bi_vcnt = 0;
2474 bio->bi_idx = 0;
2475 bio->bi_phys_segments = 0;
2476 bio->bi_size = 0;
2477 bio->bi_end_io = NULL;
2478 bio->bi_private = NULL;
2479 2432
2480 rdev = rcu_dereference(conf->mirrors[i].rdev); 2433 rdev = rcu_dereference(conf->mirrors[i].rdev);
2481 if (rdev == NULL || 2434 if (rdev == NULL ||
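For orientation, a minimal sketch of the pattern the raid1 hunks above converge on: bio_reset() returns the bio to a freshly initialised state, the caller re-sets only the fields it actually needs, and bio_copy_data() replaces the old page-by-page memcpy loop in process_checks(). Illustrative only, not code from the patch; prepare_resync_bio() is a made-up name and error handling is elided.

static void prepare_resync_bio(struct bio *sbio, struct bio *pbio,
			       struct r1bio *r1_bio, struct md_rdev *rdev)
{
	bio_reset(sbio);		/* replaces the open-coded bi_idx/bi_flags/... resets */

	sbio->bi_sector  = r1_bio->sector + rdev->data_offset;
	sbio->bi_bdev    = rdev->bdev;
	sbio->bi_end_io  = end_sync_read;
	sbio->bi_private = r1_bio;

	/* copies min(sbio->bi_size, pbio->bi_size) bytes of page data */
	bio_copy_data(sbio, pbio);
}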
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 018741ba9310..59d4daa5f4c7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1174,14 +1174,13 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1174 /* If this request crosses a chunk boundary, we need to 1174 /* If this request crosses a chunk boundary, we need to
1175 * split it. This will only happen for 1 PAGE (or less) requests. 1175 * split it. This will only happen for 1 PAGE (or less) requests.
1176 */ 1176 */
1177 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 1177 if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1178 > chunk_sects 1178 > chunk_sects
1179 && (conf->geo.near_copies < conf->geo.raid_disks 1179 && (conf->geo.near_copies < conf->geo.raid_disks
1180 || conf->prev.near_copies < conf->prev.raid_disks))) { 1180 || conf->prev.near_copies < conf->prev.raid_disks))) {
1181 struct bio_pair *bp; 1181 struct bio_pair *bp;
1182 /* Sanity check -- queue functions should prevent this happening */ 1182 /* Sanity check -- queue functions should prevent this happening */
1183 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 1183 if (bio_segments(bio) > 1)
1184 bio->bi_idx != 0)
1185 goto bad_map; 1184 goto bad_map;
1186 /* This is a one page bio that upper layers 1185 /* This is a one page bio that upper layers
1187 * refuse to split for us, so we need to split it. 1186 * refuse to split for us, so we need to split it.
@@ -1214,7 +1213,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1214 bad_map: 1213 bad_map:
1215 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 1214 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1216 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 1215 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1217 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 1216 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1218 1217
1219 bio_io_error(bio); 1218 bio_io_error(bio);
1220 return; 1219 return;
@@ -1229,7 +1228,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1229 */ 1228 */
1230 wait_barrier(conf); 1229 wait_barrier(conf);
1231 1230
1232 sectors = bio->bi_size >> 9; 1231 sectors = bio_sectors(bio);
1233 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1232 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1234 bio->bi_sector < conf->reshape_progress && 1233 bio->bi_sector < conf->reshape_progress &&
1235 bio->bi_sector + sectors > conf->reshape_progress) { 1234 bio->bi_sector + sectors > conf->reshape_progress) {
@@ -1331,8 +1330,7 @@ read_again:
1331 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1330 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1332 1331
1333 r10_bio->master_bio = bio; 1332 r10_bio->master_bio = bio;
1334 r10_bio->sectors = ((bio->bi_size >> 9) 1333 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1335 - sectors_handled);
1336 r10_bio->state = 0; 1334 r10_bio->state = 0;
1337 r10_bio->mddev = mddev; 1335 r10_bio->mddev = mddev;
1338 r10_bio->sector = bio->bi_sector + sectors_handled; 1336 r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -1574,7 +1572,7 @@ retry_write:
1574 * after checking if we need to go around again. 1572 * after checking if we need to go around again.
1575 */ 1573 */
1576 1574
1577 if (sectors_handled < (bio->bi_size >> 9)) { 1575 if (sectors_handled < bio_sectors(bio)) {
1578 one_write_done(r10_bio); 1576 one_write_done(r10_bio);
1579 /* We need another r10_bio. It has already been counted 1577 /* We need another r10_bio. It has already been counted
1580 * in bio->bi_phys_segments. 1578 * in bio->bi_phys_segments.
@@ -1582,7 +1580,7 @@ retry_write:
1582 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1580 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1583 1581
1584 r10_bio->master_bio = bio; 1582 r10_bio->master_bio = bio;
1585 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1583 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1586 1584
1587 r10_bio->mddev = mddev; 1585 r10_bio->mddev = mddev;
1588 r10_bio->sector = bio->bi_sector + sectors_handled; 1586 r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -2084,13 +2082,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2084 * First we need to fixup bv_offset, bv_len and 2082 * First we need to fixup bv_offset, bv_len and
2085 * bi_vecs, as the read request might have corrupted these 2083 * bi_vecs, as the read request might have corrupted these
2086 */ 2084 */
2085 bio_reset(tbio);
2086
2087 tbio->bi_vcnt = vcnt; 2087 tbio->bi_vcnt = vcnt;
2088 tbio->bi_size = r10_bio->sectors << 9; 2088 tbio->bi_size = r10_bio->sectors << 9;
2089 tbio->bi_idx = 0;
2090 tbio->bi_phys_segments = 0;
2091 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
2092 tbio->bi_flags |= 1 << BIO_UPTODATE;
2093 tbio->bi_next = NULL;
2094 tbio->bi_rw = WRITE; 2089 tbio->bi_rw = WRITE;
2095 tbio->bi_private = r10_bio; 2090 tbio->bi_private = r10_bio;
2096 tbio->bi_sector = r10_bio->devs[i].addr; 2091 tbio->bi_sector = r10_bio->devs[i].addr;
@@ -2108,7 +2103,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2108 d = r10_bio->devs[i].devnum; 2103 d = r10_bio->devs[i].devnum;
2109 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2104 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2110 atomic_inc(&r10_bio->remaining); 2105 atomic_inc(&r10_bio->remaining);
2111 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); 2106 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2112 2107
2113 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2108 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2114 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2109 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
@@ -2133,7 +2128,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2133 d = r10_bio->devs[i].devnum; 2128 d = r10_bio->devs[i].devnum;
2134 atomic_inc(&r10_bio->remaining); 2129 atomic_inc(&r10_bio->remaining);
2135 md_sync_acct(conf->mirrors[d].replacement->bdev, 2130 md_sync_acct(conf->mirrors[d].replacement->bdev,
2136 tbio->bi_size >> 9); 2131 bio_sectors(tbio));
2137 generic_make_request(tbio); 2132 generic_make_request(tbio);
2138 } 2133 }
2139 2134
@@ -2259,13 +2254,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2259 wbio2 = r10_bio->devs[1].repl_bio; 2254 wbio2 = r10_bio->devs[1].repl_bio;
2260 if (wbio->bi_end_io) { 2255 if (wbio->bi_end_io) {
2261 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2256 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2262 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 2257 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2263 generic_make_request(wbio); 2258 generic_make_request(wbio);
2264 } 2259 }
2265 if (wbio2 && wbio2->bi_end_io) { 2260 if (wbio2 && wbio2->bi_end_io) {
2266 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2261 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2267 md_sync_acct(conf->mirrors[d].replacement->bdev, 2262 md_sync_acct(conf->mirrors[d].replacement->bdev,
2268 wbio2->bi_size >> 9); 2263 bio_sectors(wbio2));
2269 generic_make_request(wbio2); 2264 generic_make_request(wbio2);
2270 } 2265 }
2271} 2266}
@@ -2536,25 +2531,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2536 } 2531 }
2537} 2532}
2538 2533
2539static void bi_complete(struct bio *bio, int error)
2540{
2541 complete((struct completion *)bio->bi_private);
2542}
2543
2544static int submit_bio_wait(int rw, struct bio *bio)
2545{
2546 struct completion event;
2547 rw |= REQ_SYNC;
2548
2549 init_completion(&event);
2550 bio->bi_private = &event;
2551 bio->bi_end_io = bi_complete;
2552 submit_bio(rw, bio);
2553 wait_for_completion(&event);
2554
2555 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2556}
2557
2558static int narrow_write_error(struct r10bio *r10_bio, int i) 2534static int narrow_write_error(struct r10bio *r10_bio, int i)
2559{ 2535{
2560 struct bio *bio = r10_bio->master_bio; 2536 struct bio *bio = r10_bio->master_bio;
@@ -2695,8 +2671,7 @@ read_more:
2695 r10_bio = mempool_alloc(conf->r10bio_pool, 2671 r10_bio = mempool_alloc(conf->r10bio_pool,
2696 GFP_NOIO); 2672 GFP_NOIO);
2697 r10_bio->master_bio = mbio; 2673 r10_bio->master_bio = mbio;
2698 r10_bio->sectors = (mbio->bi_size >> 9) 2674 r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
2699 - sectors_handled;
2700 r10_bio->state = 0; 2675 r10_bio->state = 0;
2701 set_bit(R10BIO_ReadError, 2676 set_bit(R10BIO_ReadError,
2702 &r10_bio->state); 2677 &r10_bio->state);
@@ -3133,6 +3108,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3133 } 3108 }
3134 } 3109 }
3135 bio = r10_bio->devs[0].bio; 3110 bio = r10_bio->devs[0].bio;
3111 bio_reset(bio);
3136 bio->bi_next = biolist; 3112 bio->bi_next = biolist;
3137 biolist = bio; 3113 biolist = bio;
3138 bio->bi_private = r10_bio; 3114 bio->bi_private = r10_bio;
@@ -3157,6 +3133,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3157 rdev = mirror->rdev; 3133 rdev = mirror->rdev;
3158 if (!test_bit(In_sync, &rdev->flags)) { 3134 if (!test_bit(In_sync, &rdev->flags)) {
3159 bio = r10_bio->devs[1].bio; 3135 bio = r10_bio->devs[1].bio;
3136 bio_reset(bio);
3160 bio->bi_next = biolist; 3137 bio->bi_next = biolist;
3161 biolist = bio; 3138 biolist = bio;
3162 bio->bi_private = r10_bio; 3139 bio->bi_private = r10_bio;
@@ -3185,6 +3162,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3185 if (rdev == NULL || bio == NULL || 3162 if (rdev == NULL || bio == NULL ||
3186 test_bit(Faulty, &rdev->flags)) 3163 test_bit(Faulty, &rdev->flags))
3187 break; 3164 break;
3165 bio_reset(bio);
3188 bio->bi_next = biolist; 3166 bio->bi_next = biolist;
3189 biolist = bio; 3167 biolist = bio;
3190 bio->bi_private = r10_bio; 3168 bio->bi_private = r10_bio;
@@ -3283,7 +3261,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3283 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3261 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3284 3262
3285 bio = r10_bio->devs[i].bio; 3263 bio = r10_bio->devs[i].bio;
3286 bio->bi_end_io = NULL; 3264 bio_reset(bio);
3287 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3265 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3288 if (conf->mirrors[d].rdev == NULL || 3266 if (conf->mirrors[d].rdev == NULL ||
3289 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 3267 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
@@ -3320,6 +3298,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3320 3298
3321 /* Need to set up for writing to the replacement */ 3299 /* Need to set up for writing to the replacement */
3322 bio = r10_bio->devs[i].repl_bio; 3300 bio = r10_bio->devs[i].repl_bio;
3301 bio_reset(bio);
3323 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3302 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3324 3303
3325 sector = r10_bio->devs[i].addr; 3304 sector = r10_bio->devs[i].addr;
@@ -3353,17 +3332,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3353 } 3332 }
3354 } 3333 }
3355 3334
3356 for (bio = biolist; bio ; bio=bio->bi_next) {
3357
3358 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3359 if (bio->bi_end_io)
3360 bio->bi_flags |= 1 << BIO_UPTODATE;
3361 bio->bi_vcnt = 0;
3362 bio->bi_idx = 0;
3363 bio->bi_phys_segments = 0;
3364 bio->bi_size = 0;
3365 }
3366
3367 nr_sectors = 0; 3335 nr_sectors = 0;
3368 if (sector_nr + max_sync < max_sector) 3336 if (sector_nr + max_sync < max_sector)
3369 max_sector = sector_nr + max_sync; 3337 max_sector = sector_nr + max_sync;
@@ -4411,7 +4379,6 @@ read_more:
4411 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4379 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4412 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4380 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4413 read_bio->bi_vcnt = 0; 4381 read_bio->bi_vcnt = 0;
4414 read_bio->bi_idx = 0;
4415 read_bio->bi_size = 0; 4382 read_bio->bi_size = 0;
4416 r10_bio->master_bio = read_bio; 4383 r10_bio->master_bio = read_bio;
4417 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4384 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
@@ -4435,17 +4402,14 @@ read_more:
4435 } 4402 }
4436 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4403 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4437 continue; 4404 continue;
4405
4406 bio_reset(b);
4438 b->bi_bdev = rdev2->bdev; 4407 b->bi_bdev = rdev2->bdev;
4439 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4408 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4440 b->bi_private = r10_bio; 4409 b->bi_private = r10_bio;
4441 b->bi_end_io = end_reshape_write; 4410 b->bi_end_io = end_reshape_write;
4442 b->bi_rw = WRITE; 4411 b->bi_rw = WRITE;
4443 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4444 b->bi_flags |= 1 << BIO_UPTODATE;
4445 b->bi_next = blist; 4412 b->bi_next = blist;
4446 b->bi_vcnt = 0;
4447 b->bi_idx = 0;
4448 b->bi_size = 0;
4449 blist = b; 4413 blist = b;
4450 } 4414 }
4451 4415
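Most of the raid10 churn above (and much of raid1 and raid5) is a mechanical switch from open-coded arithmetic on bi_size, bi_sector, bi_vcnt and bi_idx to accessor helpers. Under the 3.10-era struct bio layout they expand roughly as noted below; the authoritative definitions live in include/linux/bio.h, which is not part of this excerpt, so treat this as a reading aid. bio_fits_capacity() is a made-up example.

#include <linux/bio.h>

/*
 * Approximate expansions (reading aid only):
 *   bio_sectors(bio)    ~ bio->bi_size >> 9
 *   bio_end_sector(bio) ~ bio->bi_sector + bio_sectors(bio)
 *   bio_segments(bio)   ~ bio->bi_vcnt - bio->bi_idx
 */
static bool bio_fits_capacity(struct bio *bio, sector_t capacity)
{
	/* previously: bio->bi_sector + (bio->bi_size >> 9) <= capacity */
	return bio_end_sector(bio) <= capacity;
}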
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4a7be455d6d8..9359828ffe26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
90 */ 90 */
91static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 91static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
92{ 92{
93 int sectors = bio->bi_size >> 9; 93 int sectors = bio_sectors(bio);
94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 94 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
95 return bio->bi_next; 95 return bio->bi_next;
96 else 96 else
@@ -569,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
569 bi = &sh->dev[i].req; 569 bi = &sh->dev[i].req;
570 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 570 rbi = &sh->dev[i].rreq; /* For writing to replacement */
571 571
572 bi->bi_rw = rw;
573 rbi->bi_rw = rw;
574 if (rw & WRITE) {
575 bi->bi_end_io = raid5_end_write_request;
576 rbi->bi_end_io = raid5_end_write_request;
577 } else
578 bi->bi_end_io = raid5_end_read_request;
579
580 rcu_read_lock(); 572 rcu_read_lock();
581 rrdev = rcu_dereference(conf->disks[i].replacement); 573 rrdev = rcu_dereference(conf->disks[i].replacement);
582 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 574 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -651,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
651 643
652 set_bit(STRIPE_IO_STARTED, &sh->state); 644 set_bit(STRIPE_IO_STARTED, &sh->state);
653 645
646 bio_reset(bi);
654 bi->bi_bdev = rdev->bdev; 647 bi->bi_bdev = rdev->bdev;
648 bi->bi_rw = rw;
649 bi->bi_end_io = (rw & WRITE)
650 ? raid5_end_write_request
651 : raid5_end_read_request;
652 bi->bi_private = sh;
653
655 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 654 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
656 __func__, (unsigned long long)sh->sector, 655 __func__, (unsigned long long)sh->sector,
657 bi->bi_rw, i); 656 bi->bi_rw, i);
@@ -665,12 +664,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
665 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 664 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
666 bi->bi_rw |= REQ_FLUSH; 665 bi->bi_rw |= REQ_FLUSH;
667 666
668 bi->bi_flags = 1 << BIO_UPTODATE;
669 bi->bi_idx = 0;
670 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 667 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
671 bi->bi_io_vec[0].bv_offset = 0; 668 bi->bi_io_vec[0].bv_offset = 0;
672 bi->bi_size = STRIPE_SIZE; 669 bi->bi_size = STRIPE_SIZE;
673 bi->bi_next = NULL;
674 if (rrdev) 670 if (rrdev)
675 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 671 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
676 672
@@ -687,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
687 683
688 set_bit(STRIPE_IO_STARTED, &sh->state); 684 set_bit(STRIPE_IO_STARTED, &sh->state);
689 685
686 bio_reset(rbi);
690 rbi->bi_bdev = rrdev->bdev; 687 rbi->bi_bdev = rrdev->bdev;
688 rbi->bi_rw = rw;
689 BUG_ON(!(rw & WRITE));
690 rbi->bi_end_io = raid5_end_write_request;
691 rbi->bi_private = sh;
692
691 pr_debug("%s: for %llu schedule op %ld on " 693 pr_debug("%s: for %llu schedule op %ld on "
692 "replacement disc %d\n", 694 "replacement disc %d\n",
693 __func__, (unsigned long long)sh->sector, 695 __func__, (unsigned long long)sh->sector,
@@ -699,12 +701,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
699 else 701 else
700 rbi->bi_sector = (sh->sector 702 rbi->bi_sector = (sh->sector
701 + rrdev->data_offset); 703 + rrdev->data_offset);
702 rbi->bi_flags = 1 << BIO_UPTODATE;
703 rbi->bi_idx = 0;
704 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 704 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
705 rbi->bi_io_vec[0].bv_offset = 0; 705 rbi->bi_io_vec[0].bv_offset = 0;
706 rbi->bi_size = STRIPE_SIZE; 706 rbi->bi_size = STRIPE_SIZE;
707 rbi->bi_next = NULL;
708 if (conf->mddev->gendisk) 707 if (conf->mddev->gendisk)
709 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 708 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
710 rbi, disk_devt(conf->mddev->gendisk), 709 rbi, disk_devt(conf->mddev->gendisk),
@@ -2402,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2402 } else 2401 } else
2403 bip = &sh->dev[dd_idx].toread; 2402 bip = &sh->dev[dd_idx].toread;
2404 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2403 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2405 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2404 if (bio_end_sector(*bip) > bi->bi_sector)
2406 goto overlap; 2405 goto overlap;
2407 bip = & (*bip)->bi_next; 2406 bip = & (*bip)->bi_next;
2408 } 2407 }
2409 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2408 if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
2410 goto overlap; 2409 goto overlap;
2411 2410
2412 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2411 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2422,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2422 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2421 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2423 bi && bi->bi_sector <= sector; 2422 bi && bi->bi_sector <= sector;
2424 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2423 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2425 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2424 if (bio_end_sector(bi) >= sector)
2426 sector = bi->bi_sector + (bi->bi_size>>9); 2425 sector = bio_end_sector(bi);
2427 } 2426 }
2428 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2427 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2429 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2428 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@ -3849,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3849{ 3848{
3850 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3849 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3851 unsigned int chunk_sectors = mddev->chunk_sectors; 3850 unsigned int chunk_sectors = mddev->chunk_sectors;
3852 unsigned int bio_sectors = bio->bi_size >> 9; 3851 unsigned int bio_sectors = bio_sectors(bio);
3853 3852
3854 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3853 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3855 chunk_sectors = mddev->new_chunk_sectors; 3854 chunk_sectors = mddev->new_chunk_sectors;
@@ -3941,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
3941{ 3940{
3942 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3941 struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3943 3942
3944 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3943 if (bio_sectors(bi) > queue_max_sectors(q))
3945 return 0; 3944 return 0;
3946 blk_recount_segments(q, bi); 3945 blk_recount_segments(q, bi);
3947 if (bi->bi_phys_segments > queue_max_segments(q)) 3946 if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3988,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3988 0, 3987 0,
3989 &dd_idx, NULL); 3988 &dd_idx, NULL);
3990 3989
3991 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3990 end_sector = bio_end_sector(align_bi);
3992 rcu_read_lock(); 3991 rcu_read_lock();
3993 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3992 rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3994 if (!rdev || test_bit(Faulty, &rdev->flags) || 3993 if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -4011,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4011 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4010 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
4012 4011
4013 if (!bio_fits_rdev(align_bi) || 4012 if (!bio_fits_rdev(align_bi) ||
4014 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 4013 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
4015 &first_bad, &bad_sectors)) { 4014 &first_bad, &bad_sectors)) {
4016 /* too big in some way, or has a known bad block */ 4015 /* too big in some way, or has a known bad block */
4017 bio_put(align_bi); 4016 bio_put(align_bi);
@@ -4273,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4273 } 4272 }
4274 4273
4275 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4274 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4276 last_sector = bi->bi_sector + (bi->bi_size>>9); 4275 last_sector = bio_end_sector(bi);
4277 bi->bi_next = NULL; 4276 bi->bi_next = NULL;
4278 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4277 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4279 4278
@@ -4739,7 +4738,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4739 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4738 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4740 sector = raid5_compute_sector(conf, logical_sector, 4739 sector = raid5_compute_sector(conf, logical_sector,
4741 0, &dd_idx, NULL); 4740 0, &dd_idx, NULL);
4742 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4741 last_sector = bio_end_sector(raid_bio);
4743 4742
4744 for (; logical_sector < last_sector; 4743 for (; logical_sector < last_sector;
4745 logical_sector += STRIPE_SECTORS, 4744 logical_sector += STRIPE_SECTORS,
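Two details worth noting in the raid5 hunks. First, the bi_rw and bi_end_io assignments in ops_run_io() move to after bio_reset(), because bio_reset() wipes those fields; assigning them up front, as the deleted lines did, would lose the values. Second, the checks in add_stripe_bio() reduce to the standard interval-overlap test once bio_end_sector() is available, roughly as in the sketch below (bios_overlap() is a made-up helper, not in the tree).

#include <linux/bio.h>

/* two requests overlap iff each starts before the other ends */
static bool bios_overlap(struct bio *a, struct bio *b)
{
	return a->bi_sector < bio_end_sector(b) &&
	       b->bi_sector < bio_end_sector(a);
}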
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index ffee6f781e30..dd239bdbfcb4 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -2235,10 +2235,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
2235 } 2235 }
2236 2236
2237 /* do we need to support multiple segments? */ 2237 /* do we need to support multiple segments? */
2238 if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { 2238 if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
2239 printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n", 2239 printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
2240 ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req), 2240 ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req),
2241 rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); 2241 bio_segments(rsp->bio), blk_rq_bytes(rsp));
2242 return -EINVAL; 2242 return -EINVAL;
2243 } 2243 }
2244 2244
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 07ba32b07fb0..6eca019bcf30 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -822,8 +822,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
822 if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) 822 if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
823 /* Request is not page-aligned. */ 823 /* Request is not page-aligned. */
824 goto fail; 824 goto fail;
825 if (((bio->bi_size >> 9) + bio->bi_sector) 825 if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) {
826 > get_capacity(bio->bi_bdev->bd_disk)) {
827 /* Request beyond end of DCSS segment. */ 826 /* Request beyond end of DCSS segment. */
828 goto fail; 827 goto fail;
829 } 828 }
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 55cbd0180159..f42b0e15410f 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -2163,10 +2163,10 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
2163 } 2163 }
2164 2164
2165 /* do we need to support multiple segments? */ 2165 /* do we need to support multiple segments? */
2166 if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { 2166 if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
2167 printk("%s: multiple segments req %u %u, rsp %u %u\n", 2167 printk("%s: multiple segments req %u %u, rsp %u %u\n",
2168 __func__, req->bio->bi_vcnt, blk_rq_bytes(req), 2168 __func__, bio_segments(req->bio), blk_rq_bytes(req),
2169 rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); 2169 bio_segments(rsp->bio), blk_rq_bytes(rsp));
2170 return -EINVAL; 2170 return -EINVAL;
2171 } 2171 }
2172 2172
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index 8c2ffbe6af0f..193e7ae90c3b 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1939,7 +1939,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
1939 ioc->transport_cmds.status = MPT2_CMD_PENDING; 1939 ioc->transport_cmds.status = MPT2_CMD_PENDING;
1940 1940
1941 /* Check if the request is split across multiple segments */ 1941 /* Check if the request is split across multiple segments */
1942 if (req->bio->bi_vcnt > 1) { 1942 if (bio_segments(req->bio) > 1) {
1943 u32 offset = 0; 1943 u32 offset = 0;
1944 1944
1945 /* Allocate memory and copy the request */ 1945 /* Allocate memory and copy the request */
@@ -1971,7 +1971,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
1971 1971
1972 /* Check if the response needs to be populated across 1972 /* Check if the response needs to be populated across
1973 * multiple segments */ 1973 * multiple segments */
1974 if (rsp->bio->bi_vcnt > 1) { 1974 if (bio_segments(rsp->bio) > 1) {
1975 pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), 1975 pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp),
1976 &pci_dma_in); 1976 &pci_dma_in);
1977 if (!pci_addr_in) { 1977 if (!pci_addr_in) {
@@ -2038,7 +2038,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
2038 sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT | 2038 sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
2039 MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC); 2039 MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC);
2040 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; 2040 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
2041 if (req->bio->bi_vcnt > 1) { 2041 if (bio_segments(req->bio) > 1) {
2042 ioc->base_add_sg_single(psge, sgl_flags | 2042 ioc->base_add_sg_single(psge, sgl_flags |
2043 (blk_rq_bytes(req) - 4), pci_dma_out); 2043 (blk_rq_bytes(req) - 4), pci_dma_out);
2044 } else { 2044 } else {
@@ -2054,7 +2054,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
2054 MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | 2054 MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER |
2055 MPI2_SGE_FLAGS_END_OF_LIST); 2055 MPI2_SGE_FLAGS_END_OF_LIST);
2056 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; 2056 sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
2057 if (rsp->bio->bi_vcnt > 1) { 2057 if (bio_segments(rsp->bio) > 1) {
2058 ioc->base_add_sg_single(psge, sgl_flags | 2058 ioc->base_add_sg_single(psge, sgl_flags |
2059 (blk_rq_bytes(rsp) + 4), pci_dma_in); 2059 (blk_rq_bytes(rsp) + 4), pci_dma_in);
2060 } else { 2060 } else {
@@ -2099,7 +2099,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
2099 le16_to_cpu(mpi_reply->ResponseDataLength); 2099 le16_to_cpu(mpi_reply->ResponseDataLength);
2100 /* check if the resp needs to be copied from the allocated 2100 /* check if the resp needs to be copied from the allocated
2101 * pci mem */ 2101 * pci mem */
2102 if (rsp->bio->bi_vcnt > 1) { 2102 if (bio_segments(rsp->bio) > 1) {
2103 u32 offset = 0; 2103 u32 offset = 0;
2104 u32 bytes_to_copy = 2104 u32 bytes_to_copy =
2105 le16_to_cpu(mpi_reply->ResponseDataLength); 2105 le16_to_cpu(mpi_reply->ResponseDataLength);
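The three SMP passthrough handlers above (mptsas, libsas and mpt2sas) all switch from testing bi_vcnt to bio_segments(), which counts only the unprocessed segments (bi_vcnt minus bi_idx in this era) rather than the raw vector count. A condensed sketch of the shared guard, assuming req and rsp carry single-bio payloads as in these drivers; smp_single_segment() is a made-up name.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* multi-segment SMP frames are still rejected by these handlers */
static bool smp_single_segment(struct request *req, struct request *rsp)
{
	return bio_segments(req->bio) <= 1 && bio_segments(rsp->bio) <= 1;
}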
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a3f28f331b2b..8fb42916d8a2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -27,48 +27,11 @@
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29 29
30struct integrity_slab { 30#define BIP_INLINE_VECS 4
31 struct kmem_cache *slab;
32 unsigned short nr_vecs;
33 char name[8];
34};
35
36#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
37struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
38 IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
39};
40#undef IS
41 31
32static struct kmem_cache *bip_slab;
42static struct workqueue_struct *kintegrityd_wq; 33static struct workqueue_struct *kintegrityd_wq;
43 34
44static inline unsigned int vecs_to_idx(unsigned int nr)
45{
46 switch (nr) {
47 case 1:
48 return 0;
49 case 2 ... 4:
50 return 1;
51 case 5 ... 16:
52 return 2;
53 case 17 ... 64:
54 return 3;
55 case 65 ... 128:
56 return 4;
57 case 129 ... BIO_MAX_PAGES:
58 return 5;
59 default:
60 BUG();
61 }
62}
63
64static inline int use_bip_pool(unsigned int idx)
65{
66 if (idx == BIOVEC_MAX_IDX)
67 return 1;
68
69 return 0;
70}
71
72/** 35/**
73 * bio_integrity_alloc - Allocate integrity payload and attach it to bio 36 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
74 * @bio: bio to attach integrity metadata to 37 * @bio: bio to attach integrity metadata to
@@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
84 unsigned int nr_vecs) 47 unsigned int nr_vecs)
85{ 48{
86 struct bio_integrity_payload *bip; 49 struct bio_integrity_payload *bip;
87 unsigned int idx = vecs_to_idx(nr_vecs);
88 struct bio_set *bs = bio->bi_pool; 50 struct bio_set *bs = bio->bi_pool;
89 51 unsigned long idx = BIO_POOL_NONE;
90 if (!bs) 52 unsigned inline_vecs;
91 bs = fs_bio_set; 53
92 54 if (!bs) {
93 BUG_ON(bio == NULL); 55 bip = kmalloc(sizeof(struct bio_integrity_payload) +
94 bip = NULL; 56 sizeof(struct bio_vec) * nr_vecs, gfp_mask);
95 57 inline_vecs = nr_vecs;
96 /* Lower order allocations come straight from slab */ 58 } else {
97 if (!use_bip_pool(idx))
98 bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
99
100 /* Use mempool if lower order alloc failed or max vecs were requested */
101 if (bip == NULL) {
102 idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
103 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); 59 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
104 60 inline_vecs = BIP_INLINE_VECS;
105 if (unlikely(bip == NULL)) {
106 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
107 return NULL;
108 }
109 } 61 }
110 62
63 if (unlikely(!bip))
64 return NULL;
65
111 memset(bip, 0, sizeof(*bip)); 66 memset(bip, 0, sizeof(*bip));
112 67
68 if (nr_vecs > inline_vecs) {
69 bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
70 bs->bvec_integrity_pool);
71 if (!bip->bip_vec)
72 goto err;
73 } else {
74 bip->bip_vec = bip->bip_inline_vecs;
75 }
76
113 bip->bip_slab = idx; 77 bip->bip_slab = idx;
114 bip->bip_bio = bio; 78 bip->bip_bio = bio;
115 bio->bi_integrity = bip; 79 bio->bi_integrity = bip;
116 80
117 return bip; 81 return bip;
82err:
83 mempool_free(bip, bs->bio_integrity_pool);
84 return NULL;
118} 85}
119EXPORT_SYMBOL(bio_integrity_alloc); 86EXPORT_SYMBOL(bio_integrity_alloc);
120 87
@@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio)
130 struct bio_integrity_payload *bip = bio->bi_integrity; 97 struct bio_integrity_payload *bip = bio->bi_integrity;
131 struct bio_set *bs = bio->bi_pool; 98 struct bio_set *bs = bio->bi_pool;
132 99
133 if (!bs) 100 if (bip->bip_owns_buf)
134 bs = fs_bio_set;
135
136 BUG_ON(bip == NULL);
137
138 /* A cloned bio doesn't own the integrity metadata */
139 if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
140 && bip->bip_buf != NULL)
141 kfree(bip->bip_buf); 101 kfree(bip->bip_buf);
142 102
143 if (use_bip_pool(bip->bip_slab)) 103 if (bs) {
104 if (bip->bip_slab != BIO_POOL_NONE)
105 bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
106 bip->bip_slab);
107
144 mempool_free(bip, bs->bio_integrity_pool); 108 mempool_free(bip, bs->bio_integrity_pool);
145 else 109 } else {
146 kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); 110 kfree(bip);
111 }
147 112
148 bio->bi_integrity = NULL; 113 bio->bi_integrity = NULL;
149} 114}
@@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio)
419 return -EIO; 384 return -EIO;
420 } 385 }
421 386
387 bip->bip_owns_buf = 1;
422 bip->bip_buf = buf; 388 bip->bip_buf = buf;
423 bip->bip_size = len; 389 bip->bip_size = len;
424 bip->bip_sector = bio->bi_sector; 390 bip->bip_sector = bio->bi_sector;
@@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
694 bp->bio1.bi_integrity = &bp->bip1; 660 bp->bio1.bi_integrity = &bp->bip1;
695 bp->bio2.bi_integrity = &bp->bip2; 661 bp->bio2.bi_integrity = &bp->bip2;
696 662
697 bp->iv1 = bip->bip_vec[0]; 663 bp->iv1 = bip->bip_vec[bip->bip_idx];
698 bp->iv2 = bip->bip_vec[0]; 664 bp->iv2 = bip->bip_vec[bip->bip_idx];
699 665
700 bp->bip1.bip_vec[0] = bp->iv1; 666 bp->bip1.bip_vec = &bp->iv1;
701 bp->bip2.bip_vec[0] = bp->iv2; 667 bp->bip2.bip_vec = &bp->iv2;
702 668
703 bp->iv1.bv_len = sectors * bi->tuple_size; 669 bp->iv1.bv_len = sectors * bi->tuple_size;
704 bp->iv2.bv_offset += sectors * bi->tuple_size; 670 bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone);
746 712
747int bioset_integrity_create(struct bio_set *bs, int pool_size) 713int bioset_integrity_create(struct bio_set *bs, int pool_size)
748{ 714{
749 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
750
751 if (bs->bio_integrity_pool) 715 if (bs->bio_integrity_pool)
752 return 0; 716 return 0;
753 717
754 bs->bio_integrity_pool = 718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
755 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 719
720 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
721 if (!bs->bvec_integrity_pool)
722 return -1;
756 723
757 if (!bs->bio_integrity_pool) 724 if (!bs->bio_integrity_pool)
758 return -1; 725 return -1;
@@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs)
765{ 732{
766 if (bs->bio_integrity_pool) 733 if (bs->bio_integrity_pool)
767 mempool_destroy(bs->bio_integrity_pool); 734 mempool_destroy(bs->bio_integrity_pool);
735
736 if (bs->bvec_integrity_pool)
737 mempool_destroy(bs->bvec_integrity_pool);
768} 738}
769EXPORT_SYMBOL(bioset_integrity_free); 739EXPORT_SYMBOL(bioset_integrity_free);
770 740
771void __init bio_integrity_init(void) 741void __init bio_integrity_init(void)
772{ 742{
773 unsigned int i;
774
775 /* 743 /*
776 * kintegrityd won't block much but may burn a lot of CPU cycles. 744 * kintegrityd won't block much but may burn a lot of CPU cycles.
777 * Make it highpri CPU intensive wq with max concurrency of 1. 745 * Make it highpri CPU intensive wq with max concurrency of 1.
@@ -781,14 +749,10 @@ void __init bio_integrity_init(void)
781 if (!kintegrityd_wq) 749 if (!kintegrityd_wq)
782 panic("Failed to create kintegrityd\n"); 750 panic("Failed to create kintegrityd\n");
783 751
784 for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { 752 bip_slab = kmem_cache_create("bio_integrity_payload",
785 unsigned int size; 753 sizeof(struct bio_integrity_payload) +
786 754 sizeof(struct bio_vec) * BIP_INLINE_VECS,
787 size = sizeof(struct bio_integrity_payload) 755 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
788 + bip_slab[i].nr_vecs * sizeof(struct bio_vec); 756 if (!bip_slab)
789 757 panic("Failed to create slab\n");
790 bip_slab[i].slab =
791 kmem_cache_create(bip_slab[i].name, size, 0,
792 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
793 }
794} 758}
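The integrity payload allocator now mirrors bio_alloc_bioset(): a single bip_slab with BIP_INLINE_VECS inline bio_vecs covers the common case, larger requests pull a vec array from the bio_set's bvec_integrity_pool via bvec_alloc(), and the free path keys off the new bip_owns_buf flag plus bip_slab, which is left at BIO_POOL_NONE when the inline vecs were used. The external interface is unchanged; a hypothetical caller still looks like the sketch below (attach_pi() is a made-up name, error handling abbreviated).

#include <linux/bio.h>

static int attach_pi(struct bio *bio, struct page *page, unsigned int len)
{
	struct bio_integrity_payload *bip;

	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
	if (!bip)
		return -ENOMEM;

	/* bio_integrity_add_page() returns the length added, 0 on failure */
	return bio_integrity_add_page(bio, page, len, 0) ? 0 : -EIO;
}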
diff --git a/fs/bio.c b/fs/bio.c
index 954d73124b41..94bbc04dba77 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -161,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx)
161 return bvec_slabs[idx].nr_vecs; 161 return bvec_slabs[idx].nr_vecs;
162} 162}
163 163
164void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) 164void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
165{ 165{
166 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); 166 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
167 167
168 if (idx == BIOVEC_MAX_IDX) 168 if (idx == BIOVEC_MAX_IDX)
169 mempool_free(bv, bs->bvec_pool); 169 mempool_free(bv, pool);
170 else { 170 else {
171 struct biovec_slab *bvs = bvec_slabs + idx; 171 struct biovec_slab *bvs = bvec_slabs + idx;
172 172
@@ -174,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
174 } 174 }
175} 175}
176 176
177struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, 177struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
178 struct bio_set *bs) 178 mempool_t *pool)
179{ 179{
180 struct bio_vec *bvl; 180 struct bio_vec *bvl;
181 181
@@ -211,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
211 */ 211 */
212 if (*idx == BIOVEC_MAX_IDX) { 212 if (*idx == BIOVEC_MAX_IDX) {
213fallback: 213fallback:
214 bvl = mempool_alloc(bs->bvec_pool, gfp_mask); 214 bvl = mempool_alloc(pool, gfp_mask);
215 } else { 215 } else {
216 struct biovec_slab *bvs = bvec_slabs + *idx; 216 struct biovec_slab *bvs = bvec_slabs + *idx;
217 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); 217 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@ -253,8 +253,8 @@ static void bio_free(struct bio *bio)
253 __bio_free(bio); 253 __bio_free(bio);
254 254
255 if (bs) { 255 if (bs) {
256 if (bio_has_allocated_vec(bio)) 256 if (bio_flagged(bio, BIO_OWNS_VEC))
257 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 257 bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
258 258
259 /* 259 /*
260 * If we have front padding, adjust the bio pointer before freeing 260 * If we have front padding, adjust the bio pointer before freeing
@@ -298,6 +298,54 @@ void bio_reset(struct bio *bio)
298} 298}
299EXPORT_SYMBOL(bio_reset); 299EXPORT_SYMBOL(bio_reset);
300 300
301static void bio_alloc_rescue(struct work_struct *work)
302{
303 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
304 struct bio *bio;
305
306 while (1) {
307 spin_lock(&bs->rescue_lock);
308 bio = bio_list_pop(&bs->rescue_list);
309 spin_unlock(&bs->rescue_lock);
310
311 if (!bio)
312 break;
313
314 generic_make_request(bio);
315 }
316}
317
318static void punt_bios_to_rescuer(struct bio_set *bs)
319{
320 struct bio_list punt, nopunt;
321 struct bio *bio;
322
323 /*
324 * In order to guarantee forward progress we must punt only bios that
325 * were allocated from this bio_set; otherwise, if there was a bio on
326 * there for a stacking driver higher up in the stack, processing it
327 * could require allocating bios from this bio_set, and doing that from
328 * our own rescuer would be bad.
329 *
330 * Since bio lists are singly linked, pop them all instead of trying to
331 * remove from the middle of the list:
332 */
333
334 bio_list_init(&punt);
335 bio_list_init(&nopunt);
336
337 while ((bio = bio_list_pop(current->bio_list)))
338 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
339
340 *current->bio_list = nopunt;
341
342 spin_lock(&bs->rescue_lock);
343 bio_list_merge(&bs->rescue_list, &punt);
344 spin_unlock(&bs->rescue_lock);
345
346 queue_work(bs->rescue_workqueue, &bs->rescue_work);
347}
348
301/** 349/**
302 * bio_alloc_bioset - allocate a bio for I/O 350 * bio_alloc_bioset - allocate a bio for I/O
303 * @gfp_mask: the GFP_ mask given to the slab allocator 351 * @gfp_mask: the GFP_ mask given to the slab allocator
@@ -315,11 +363,27 @@ EXPORT_SYMBOL(bio_reset);
315 * previously allocated bio for IO before attempting to allocate a new one. 363 * previously allocated bio for IO before attempting to allocate a new one.
316 * Failure to do so can cause deadlocks under memory pressure. 364 * Failure to do so can cause deadlocks under memory pressure.
317 * 365 *
366 * Note that when running under generic_make_request() (i.e. any block
367 * driver), bios are not submitted until after you return - see the code in
368 * generic_make_request() that converts recursion into iteration, to prevent
369 * stack overflows.
370 *
371 * This would normally mean allocating multiple bios under
372 * generic_make_request() would be susceptible to deadlocks, but we have
373 * deadlock avoidance code that resubmits any blocked bios from a rescuer
374 * thread.
375 *
376 * However, we do not guarantee forward progress for allocations from other
377 * mempools. Doing multiple allocations from the same mempool under
378 * generic_make_request() should be avoided - instead, use bio_set's front_pad
379 * for per bio allocations.
380 *
318 * RETURNS: 381 * RETURNS:
319 * Pointer to new bio on success, NULL on failure. 382 * Pointer to new bio on success, NULL on failure.
320 */ 383 */
321struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 384struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
322{ 385{
386 gfp_t saved_gfp = gfp_mask;
323 unsigned front_pad; 387 unsigned front_pad;
324 unsigned inline_vecs; 388 unsigned inline_vecs;
325 unsigned long idx = BIO_POOL_NONE; 389 unsigned long idx = BIO_POOL_NONE;
@@ -337,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
337 front_pad = 0; 401 front_pad = 0;
338 inline_vecs = nr_iovecs; 402 inline_vecs = nr_iovecs;
339 } else { 403 } else {
404 /*
405 * generic_make_request() converts recursion to iteration; this
406 * means if we're running beneath it, any bios we allocate and
407 * submit will not be submitted (and thus freed) until after we
408 * return.
409 *
410 * This exposes us to a potential deadlock if we allocate
411 * multiple bios from the same bio_set() while running
412 * underneath generic_make_request(). If we were to allocate
413 * multiple bios (say a stacking block driver that was splitting
414 * bios), we would deadlock if we exhausted the mempool's
415 * reserve.
416 *
417 * We solve this, and guarantee forward progress, with a rescuer
418 * workqueue per bio_set. If we go to allocate and there are
419 * bios on current->bio_list, we first try the allocation
420 * without __GFP_WAIT; if that fails, we punt those bios we
421 * would be blocking to the rescuer workqueue before we retry
422 * with the original gfp_flags.
423 */
424
425 if (current->bio_list && !bio_list_empty(current->bio_list))
426 gfp_mask &= ~__GFP_WAIT;
427
340 p = mempool_alloc(bs->bio_pool, gfp_mask); 428 p = mempool_alloc(bs->bio_pool, gfp_mask);
429 if (!p && gfp_mask != saved_gfp) {
430 punt_bios_to_rescuer(bs);
431 gfp_mask = saved_gfp;
432 p = mempool_alloc(bs->bio_pool, gfp_mask);
433 }
434
341 front_pad = bs->front_pad; 435 front_pad = bs->front_pad;
342 inline_vecs = BIO_INLINE_VECS; 436 inline_vecs = BIO_INLINE_VECS;
343 } 437 }
@@ -349,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
349 bio_init(bio); 443 bio_init(bio);
350 444
351 if (nr_iovecs > inline_vecs) { 445 if (nr_iovecs > inline_vecs) {
352 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 446 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
447 if (!bvl && gfp_mask != saved_gfp) {
448 punt_bios_to_rescuer(bs);
449 gfp_mask = saved_gfp;
450 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
451 }
452
353 if (unlikely(!bvl)) 453 if (unlikely(!bvl))
354 goto err_free; 454 goto err_free;
455
456 bio->bi_flags |= 1 << BIO_OWNS_VEC;
355 } else if (nr_iovecs) { 457 } else if (nr_iovecs) {
356 bvl = bio->bi_inline_vecs; 458 bvl = bio->bi_inline_vecs;
357 } 459 }
@@ -653,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
653} 755}
654EXPORT_SYMBOL(bio_add_page); 756EXPORT_SYMBOL(bio_add_page);
655 757
758struct submit_bio_ret {
759 struct completion event;
760 int error;
761};
762
763static void submit_bio_wait_endio(struct bio *bio, int error)
764{
765 struct submit_bio_ret *ret = bio->bi_private;
766
767 ret->error = error;
768 complete(&ret->event);
769}
770
771/**
772 * submit_bio_wait - submit a bio, and wait until it completes
773 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
774 * @bio: The &struct bio which describes the I/O
775 *
776 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
777 * bio_endio() on failure.
778 */
779int submit_bio_wait(int rw, struct bio *bio)
780{
781 struct submit_bio_ret ret;
782
783 rw |= REQ_SYNC;
784 init_completion(&ret.event);
785 bio->bi_private = &ret;
786 bio->bi_end_io = submit_bio_wait_endio;
787 submit_bio(rw, bio);
788 wait_for_completion(&ret.event);
789
790 return ret.error;
791}
792EXPORT_SYMBOL(submit_bio_wait);
793
794/**
795 * bio_advance - increment/complete a bio by some number of bytes
796 * @bio: bio to advance
797 * @bytes: number of bytes to complete
798 *
799 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
800 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
801 * be updated on the last bvec as well.
802 *
803 * @bio will then represent the remaining, uncompleted portion of the io.
804 */
805void bio_advance(struct bio *bio, unsigned bytes)
806{
807 if (bio_integrity(bio))
808 bio_integrity_advance(bio, bytes);
809
810 bio->bi_sector += bytes >> 9;
811 bio->bi_size -= bytes;
812
813 if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
814 return;
815
816 while (bytes) {
817 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
818 WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
819 bio->bi_idx, bio->bi_vcnt);
820 break;
821 }
822
823 if (bytes >= bio_iovec(bio)->bv_len) {
824 bytes -= bio_iovec(bio)->bv_len;
825 bio->bi_idx++;
826 } else {
827 bio_iovec(bio)->bv_len -= bytes;
828 bio_iovec(bio)->bv_offset += bytes;
829 bytes = 0;
830 }
831 }
832}
833EXPORT_SYMBOL(bio_advance);
834
835/**
836 * bio_alloc_pages - allocates a single page for each bvec in a bio
837 * @bio: bio to allocate pages for
838 * @gfp_mask: flags for allocation
839 *
840 * Allocates pages up to @bio->bi_vcnt.
841 *
842 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
843 * freed.
844 */
845int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
846{
847 int i;
848 struct bio_vec *bv;
849
850 bio_for_each_segment_all(bv, bio, i) {
851 bv->bv_page = alloc_page(gfp_mask);
852 if (!bv->bv_page) {
853 while (--bv >= bio->bi_io_vec)
854 __free_page(bv->bv_page);
855 return -ENOMEM;
856 }
857 }
858
859 return 0;
860}
861EXPORT_SYMBOL(bio_alloc_pages);
862
863/**
864 * bio_copy_data - copy contents of data buffers from one chain of bios to
865 * another
866 * @src: source bio list
867 * @dst: destination bio list
868 *
869 * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
870 * @src and @dst as linked lists of bios.
871 *
872 * Stops when it reaches the end of either @src or @dst - that is, copies
873 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
874 */
875void bio_copy_data(struct bio *dst, struct bio *src)
876{
877 struct bio_vec *src_bv, *dst_bv;
878 unsigned src_offset, dst_offset, bytes;
879 void *src_p, *dst_p;
880
881 src_bv = bio_iovec(src);
882 dst_bv = bio_iovec(dst);
883
884 src_offset = src_bv->bv_offset;
885 dst_offset = dst_bv->bv_offset;
886
887 while (1) {
888 if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
889 src_bv++;
890 if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
891 src = src->bi_next;
892 if (!src)
893 break;
894
895 src_bv = bio_iovec(src);
896 }
897
898 src_offset = src_bv->bv_offset;
899 }
900
901 if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
902 dst_bv++;
903 if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
904 dst = dst->bi_next;
905 if (!dst)
906 break;
907
908 dst_bv = bio_iovec(dst);
909 }
910
911 dst_offset = dst_bv->bv_offset;
912 }
913
914 bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
915 src_bv->bv_offset + src_bv->bv_len - src_offset);
916
917 src_p = kmap_atomic(src_bv->bv_page);
918 dst_p = kmap_atomic(dst_bv->bv_page);
919
 920 memcpy(dst_p + dst_offset,
 921 src_p + src_offset,
922 bytes);
923
924 kunmap_atomic(dst_p);
925 kunmap_atomic(src_p);
926
927 src_offset += bytes;
928 dst_offset += bytes;
929 }
930}
931EXPORT_SYMBOL(bio_copy_data);
932
656struct bio_map_data { 933struct bio_map_data {
657 struct bio_vec *iovecs; 934 struct bio_vec *iovecs;
658 struct sg_iovec *sgvecs; 935 struct sg_iovec *sgvecs;
@@ -715,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
715 int iov_idx = 0; 992 int iov_idx = 0;
716 unsigned int iov_off = 0; 993 unsigned int iov_off = 0;
717 994
718 __bio_for_each_segment(bvec, bio, i, 0) { 995 bio_for_each_segment_all(bvec, bio, i) {
719 char *bv_addr = page_address(bvec->bv_page); 996 char *bv_addr = page_address(bvec->bv_page);
720 unsigned int bv_len = iovecs[i].bv_len; 997 unsigned int bv_len = iovecs[i].bv_len;
721 998
@@ -897,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
897 return bio; 1174 return bio;
898cleanup: 1175cleanup:
899 if (!map_data) 1176 if (!map_data)
900 bio_for_each_segment(bvec, bio, i) 1177 bio_for_each_segment_all(bvec, bio, i)
901 __free_page(bvec->bv_page); 1178 __free_page(bvec->bv_page);
902 1179
903 bio_put(bio); 1180 bio_put(bio);
@@ -1111,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio)
1111 /* 1388 /*
1112 * make sure we dirty pages we wrote to 1389 * make sure we dirty pages we wrote to
1113 */ 1390 */
1114 __bio_for_each_segment(bvec, bio, i, 0) { 1391 bio_for_each_segment_all(bvec, bio, i) {
1115 if (bio_data_dir(bio) == READ) 1392 if (bio_data_dir(bio) == READ)
1116 set_page_dirty_lock(bvec->bv_page); 1393 set_page_dirty_lock(bvec->bv_page);
1117 1394
@@ -1217,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1217 int i; 1494 int i;
1218 char *p = bmd->sgvecs[0].iov_base; 1495 char *p = bmd->sgvecs[0].iov_base;
1219 1496
1220 __bio_for_each_segment(bvec, bio, i, 0) { 1497 bio_for_each_segment_all(bvec, bio, i) {
1221 char *addr = page_address(bvec->bv_page); 1498 char *addr = page_address(bvec->bv_page);
1222 int len = bmd->iovecs[i].bv_len; 1499 int len = bmd->iovecs[i].bv_len;
1223 1500
@@ -1257,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1257 if (!reading) { 1534 if (!reading) {
1258 void *p = data; 1535 void *p = data;
1259 1536
1260 bio_for_each_segment(bvec, bio, i) { 1537 bio_for_each_segment_all(bvec, bio, i) {
1261 char *addr = page_address(bvec->bv_page); 1538 char *addr = page_address(bvec->bv_page);
1262 1539
1263 memcpy(addr, p, bvec->bv_len); 1540 memcpy(addr, p, bvec->bv_len);
@@ -1302,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern);
1302 */ 1579 */
1303void bio_set_pages_dirty(struct bio *bio) 1580void bio_set_pages_dirty(struct bio *bio)
1304{ 1581{
1305 struct bio_vec *bvec = bio->bi_io_vec; 1582 struct bio_vec *bvec;
1306 int i; 1583 int i;
1307 1584
1308 for (i = 0; i < bio->bi_vcnt; i++) { 1585 bio_for_each_segment_all(bvec, bio, i) {
1309 struct page *page = bvec[i].bv_page; 1586 struct page *page = bvec->bv_page;
1310 1587
1311 if (page && !PageCompound(page)) 1588 if (page && !PageCompound(page))
1312 set_page_dirty_lock(page); 1589 set_page_dirty_lock(page);
@@ -1315,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio)
1315 1592
1316static void bio_release_pages(struct bio *bio) 1593static void bio_release_pages(struct bio *bio)
1317{ 1594{
1318 struct bio_vec *bvec = bio->bi_io_vec; 1595 struct bio_vec *bvec;
1319 int i; 1596 int i;
1320 1597
1321 for (i = 0; i < bio->bi_vcnt; i++) { 1598 bio_for_each_segment_all(bvec, bio, i) {
1322 struct page *page = bvec[i].bv_page; 1599 struct page *page = bvec->bv_page;
1323 1600
1324 if (page) 1601 if (page)
1325 put_page(page); 1602 put_page(page);
@@ -1368,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work)
1368 1645
1369void bio_check_pages_dirty(struct bio *bio) 1646void bio_check_pages_dirty(struct bio *bio)
1370{ 1647{
1371 struct bio_vec *bvec = bio->bi_io_vec; 1648 struct bio_vec *bvec;
1372 int nr_clean_pages = 0; 1649 int nr_clean_pages = 0;
1373 int i; 1650 int i;
1374 1651
1375 for (i = 0; i < bio->bi_vcnt; i++) { 1652 bio_for_each_segment_all(bvec, bio, i) {
1376 struct page *page = bvec[i].bv_page; 1653 struct page *page = bvec->bv_page;
1377 1654
1378 if (PageDirty(page) || PageCompound(page)) { 1655 if (PageDirty(page) || PageCompound(page)) {
1379 page_cache_release(page); 1656 page_cache_release(page);
1380 bvec[i].bv_page = NULL; 1657 bvec->bv_page = NULL;
1381 } else { 1658 } else {
1382 nr_clean_pages++; 1659 nr_clean_pages++;
1383 } 1660 }
@@ -1478,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1478 trace_block_split(bdev_get_queue(bi->bi_bdev), bi, 1755 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1479 bi->bi_sector + first_sectors); 1756 bi->bi_sector + first_sectors);
1480 1757
1481 BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); 1758 BUG_ON(bio_segments(bi) > 1);
1482 BUG_ON(bi->bi_idx != 0);
1483 atomic_set(&bp->cnt, 3); 1759 atomic_set(&bp->cnt, 3);
1484 bp->error = 0; 1760 bp->error = 0;
1485 bp->bio1 = *bi; 1761 bp->bio1 = *bi;
@@ -1489,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1489 bp->bio1.bi_size = first_sectors << 9; 1765 bp->bio1.bi_size = first_sectors << 9;
1490 1766
1491 if (bi->bi_vcnt != 0) { 1767 if (bi->bi_vcnt != 0) {
1492 bp->bv1 = bi->bi_io_vec[0]; 1768 bp->bv1 = *bio_iovec(bi);
1493 bp->bv2 = bi->bi_io_vec[0]; 1769 bp->bv2 = *bio_iovec(bi);
1494 1770
1495 if (bio_is_rw(bi)) { 1771 if (bio_is_rw(bi)) {
1496 bp->bv2.bv_offset += first_sectors << 9; 1772 bp->bv2.bv_offset += first_sectors << 9;
@@ -1542,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1542 if (index >= bio->bi_idx) 1818 if (index >= bio->bi_idx)
1543 index = bio->bi_vcnt - 1; 1819 index = bio->bi_vcnt - 1;
1544 1820
1545 __bio_for_each_segment(bv, bio, i, 0) { 1821 bio_for_each_segment_all(bv, bio, i) {
1546 if (i == index) { 1822 if (i == index) {
1547 if (offset > bv->bv_offset) 1823 if (offset > bv->bv_offset)
1548 sectors += (offset - bv->bv_offset) / sector_sz; 1824 sectors += (offset - bv->bv_offset) / sector_sz;
@@ -1560,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset);
1560 * create memory pools for biovec's in a bio_set. 1836 * create memory pools for biovec's in a bio_set.
1561 * use the global biovec slabs created for general use. 1837 * use the global biovec slabs created for general use.
1562 */ 1838 */
1563static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1839mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
1564{ 1840{
1565 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1841 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1566 1842
1567 bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); 1843 return mempool_create_slab_pool(pool_entries, bp->slab);
1568 if (!bs->bvec_pool)
1569 return -ENOMEM;
1570
1571 return 0;
1572}
1573
1574static void biovec_free_pools(struct bio_set *bs)
1575{
1576 mempool_destroy(bs->bvec_pool);
1577} 1844}
1578 1845
1579void bioset_free(struct bio_set *bs) 1846void bioset_free(struct bio_set *bs)
1580{ 1847{
1848 if (bs->rescue_workqueue)
1849 destroy_workqueue(bs->rescue_workqueue);
1850
1581 if (bs->bio_pool) 1851 if (bs->bio_pool)
1582 mempool_destroy(bs->bio_pool); 1852 mempool_destroy(bs->bio_pool);
1583 1853
1854 if (bs->bvec_pool)
1855 mempool_destroy(bs->bvec_pool);
1856
1584 bioset_integrity_free(bs); 1857 bioset_integrity_free(bs);
1585 biovec_free_pools(bs);
1586 bio_put_slab(bs); 1858 bio_put_slab(bs);
1587 1859
1588 kfree(bs); 1860 kfree(bs);
@@ -1613,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1613 1885
1614 bs->front_pad = front_pad; 1886 bs->front_pad = front_pad;
1615 1887
1888 spin_lock_init(&bs->rescue_lock);
1889 bio_list_init(&bs->rescue_list);
1890 INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
1891
1616 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); 1892 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1617 if (!bs->bio_slab) { 1893 if (!bs->bio_slab) {
1618 kfree(bs); 1894 kfree(bs);
@@ -1623,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1623 if (!bs->bio_pool) 1899 if (!bs->bio_pool)
1624 goto bad; 1900 goto bad;
1625 1901
1626 if (!biovec_create_pools(bs, pool_size)) 1902 bs->bvec_pool = biovec_create_pool(bs, pool_size);
1627 return bs; 1903 if (!bs->bvec_pool)
1904 goto bad;
1905
1906 bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
1907 if (!bs->rescue_workqueue)
1908 goto bad;
1628 1909
1910 return bs;
1629bad: 1911bad:
1630 bioset_free(bs); 1912 bioset_free(bs);
1631 return NULL; 1913 return NULL;
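
The fs/bio.c changes above give every bio_set its own bvec mempool (biovec_create_pool()) and a per-bioset rescuer workqueue created inside bioset_create(). A minimal sketch of a stacking driver using that API; the pool size and the my_* names are illustrative, not taken from the patch.

#include <linux/bio.h>

static struct bio_set *my_bs;

static int my_driver_init(void)
{
	/* 4 reserved bios/bvecs is an illustrative value; the rescuer
	 * workqueue set up inside bioset_create() guards stacked
	 * drivers against allocation deadlocks */
	my_bs = bioset_create(4, 0);
	if (!my_bs)
		return -ENOMEM;
	return 0;
}

static struct bio *my_alloc_bio(unsigned int nr_vecs)
{
	/* allocating from a private bio_set avoids starving fs_bio_set */
	return bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bs);
}

static void my_driver_exit(void)
{
	/* bioset_free() now also destroys the rescue workqueue */
	bioset_free(my_bs);
}
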
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d9871c1f0894..2091db8cdd78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1556,7 +1556,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1556 return 0; 1556 return 0;
1557 1557
1558 size -= pos; 1558 size -= pos;
1559 if (size < INT_MAX) 1559 if (size < iocb->ki_left)
1560 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); 1560 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1561 return generic_file_aio_read(iocb, iov, nr_segs, pos); 1561 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1562} 1562}
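
The fix above only shortens the iovec when the bytes remaining on the device are smaller than what the caller actually asked for (iocb->ki_left), instead of shortening whenever the remaining size happens to fit in an int. A hedged sketch of that decision, assuming the 3.10-era struct kiocb with a ki_left field; clamp_read_to_device() is a made-up helper.

#include <linux/aio.h>
#include <linux/fs.h>
#include <linux/uio.h>

/* illustrative: trim the iovec only if the device cannot satisfy
 * the full request the caller submitted */
static unsigned long clamp_read_to_device(struct kiocb *iocb,
					  struct iovec *iov,
					  unsigned long nr_segs,
					  loff_t bytes_left)
{
	if (bytes_left < iocb->ki_left)
		nr_segs = iov_shorten(iov, nr_segs, bytes_left);
	return nr_segs;
}
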
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cdee391fc7bf..73f2bfe3ac93 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2560,8 +2560,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2560 if (old_compressed) 2560 if (old_compressed)
2561 contig = bio->bi_sector == sector; 2561 contig = bio->bi_sector == sector;
2562 else 2562 else
2563 contig = bio->bi_sector + (bio->bi_size >> 9) == 2563 contig = bio_end_sector(bio) == sector;
2564 sector;
2565 2564
2566 if (prev_bio_flags != bio_flags || !contig || 2565 if (prev_bio_flags != bio_flags || !contig ||
2567 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2566 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
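
bio_end_sector(), added to include/linux/bio.h further down, replaces open-coded bi_sector + (bi_size >> 9) sums like the one removed here. A small sketch of the contiguity test it enables; can_append_at() is a hypothetical helper, not a btrfs function.

#include <linux/bio.h>

/* true if @sector starts exactly where @bio currently ends */
static bool can_append_at(struct bio *bio, sector_t sector)
{
	/* bio_end_sector(bio) == bio->bi_sector + bio_sectors(bio) */
	return bio && bio_end_sector(bio) == sector;
}
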
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2854c824ab64..678977226570 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5177,7 +5177,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5177 } 5177 }
5178 5178
5179 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5179 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5180 if ((bio->bi_size >> 9) > max_sectors) 5180 if (bio_sectors(bio) > max_sectors)
5181 return 0; 5181 return 0;
5182 5182
5183 if (!q->merge_bvec_fn) 5183 if (!q->merge_bvec_fn)
diff --git a/fs/buffer.c b/fs/buffer.c
index bc1fe14aaa3e..d2a4d1bb2d57 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2977,7 +2977,6 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
2977 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2977 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2978 2978
2979 bio->bi_vcnt = 1; 2979 bio->bi_vcnt = 1;
2980 bio->bi_idx = 0;
2981 bio->bi_size = bh->b_size; 2980 bio->bi_size = bh->b_size;
2982 2981
2983 bio->bi_end_io = end_bio_bh_io_sync; 2982 bio->bi_end_io = end_bio_bh_io_sync;
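
This hunk, like the jfs, logfs and page_io.c hunks later in the series, drops the explicit bio->bi_idx = 0 assignment: a freshly allocated bio already has bi_idx zeroed, and submitters are no longer supposed to touch it. A minimal single-page submission sketch under that assumption; end_my_io() and submit_one_page() are placeholders.

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/mm.h>

static void end_my_io(struct bio *bio, int err)
{
	bio_put(bio);
}

static void submit_one_page(struct block_device *bdev, struct page *page,
			    sector_t sector, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return;		/* error handling elided in this sketch */

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = PAGE_SIZE;
	bio->bi_io_vec[0].bv_offset = 0;
	bio->bi_vcnt = 1;
	bio->bi_size = PAGE_SIZE;
	/* no bi_idx write: bio_alloc() already returned it zeroed */
	bio->bi_end_io = end_my_io;
	submit_bio(rw, bio);
}
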
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 51d16e067d68..7ab90f5081ee 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -442,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio)
442static int dio_bio_complete(struct dio *dio, struct bio *bio) 442static int dio_bio_complete(struct dio *dio, struct bio *bio)
443{ 443{
444 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 444 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
445 struct bio_vec *bvec = bio->bi_io_vec; 445 struct bio_vec *bvec;
446 int page_no; 446 unsigned i;
447 447
448 if (!uptodate) 448 if (!uptodate)
449 dio->io_error = -EIO; 449 dio->io_error = -EIO;
@@ -451,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
451 if (dio->is_async && dio->rw == READ) { 451 if (dio->is_async && dio->rw == READ) {
452 bio_check_pages_dirty(bio); /* transfers ownership */ 452 bio_check_pages_dirty(bio); /* transfers ownership */
453 } else { 453 } else {
454 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { 454 bio_for_each_segment_all(bvec, bio, i) {
455 struct page *page = bvec[page_no].bv_page; 455 struct page *page = bvec->bv_page;
456 456
457 if (dio->rw == READ && !PageCompound(page)) 457 if (dio->rw == READ && !PageCompound(page))
458 set_page_dirty_lock(page); 458 set_page_dirty_lock(page);
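
dio_bio_complete() now walks the bvec array with the new bio_for_each_segment_all() instead of indexing bi_io_vec by hand. The same pattern, sketched for a completion path that owns all of the bio's pages; release_owned_pages() is illustrative.

#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* illustrative: dirty (for reads) and release every page in the bio */
static void release_owned_pages(struct bio *bio, bool is_read)
{
	struct bio_vec *bvec;
	int i;

	/* the _all variant ignores bi_idx and visits every segment;
	 * it is only valid for code that owns the whole bio */
	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;

		if (is_read && !PageCompound(page))
			set_page_dirty_lock(page);
		page_cache_release(page);
	}
}
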
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index f936cb50dc0d..b74422888604 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio)
401 struct bio_vec *bv; 401 struct bio_vec *bv;
402 unsigned i; 402 unsigned i;
403 403
404 __bio_for_each_segment(bv, bio, i, 0) { 404 bio_for_each_segment_all(bv, bio, i) {
405 unsigned this_count = bv->bv_len; 405 unsigned this_count = bv->bv_len;
406 406
407 if (likely(PAGE_SIZE == this_count)) 407 if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index b963f38ac298..7682b970d0f1 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
432 if (!bio) 432 if (!bio)
433 continue; 433 continue;
434 434
435 __bio_for_each_segment(bv, bio, i, 0) { 435 bio_for_each_segment_all(bv, bio, i) {
436 struct page *page = bv->bv_page; 436 struct page *page = bv->bv_page;
437 437
438 SetPageUptodate(page); 438 SetPageUptodate(page);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 798d4458a4d3..3be57189efd5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/freezer.h>
26#include <linux/writeback.h> 25#include <linux/writeback.h>
27#include <linux/blkdev.h> 26#include <linux/blkdev.h>
28#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
88#define CREATE_TRACE_POINTS 87#define CREATE_TRACE_POINTS
89#include <trace/events/writeback.h> 88#include <trace/events/writeback.h>
90 89
91/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
92static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
93{
94 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task);
96 } else {
97 /*
98 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it.
100 */
101 wake_up_process(default_backing_dev_info.wb.task);
102 }
103}
104
105static void bdi_queue_work(struct backing_dev_info *bdi, 90static void bdi_queue_work(struct backing_dev_info *bdi,
106 struct wb_writeback_work *work) 91 struct wb_writeback_work *work)
107{ 92{
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
109 94
110 spin_lock_bh(&bdi->wb_lock); 95 spin_lock_bh(&bdi->wb_lock);
111 list_add_tail(&work->list, &bdi->work_list); 96 list_add_tail(&work->list, &bdi->work_list);
112 if (!bdi->wb.task)
113 trace_writeback_nothread(bdi, work);
114 bdi_wakeup_flusher(bdi);
115 spin_unlock_bh(&bdi->wb_lock); 97 spin_unlock_bh(&bdi->wb_lock);
98
99 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
116} 100}
117 101
118static void 102static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
127 */ 111 */
128 work = kzalloc(sizeof(*work), GFP_ATOMIC); 112 work = kzalloc(sizeof(*work), GFP_ATOMIC);
129 if (!work) { 113 if (!work) {
130 if (bdi->wb.task) { 114 trace_writeback_nowork(bdi);
131 trace_writeback_nowork(bdi); 115 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
132 wake_up_process(bdi->wb.task);
133 }
134 return; 116 return;
135 } 117 }
136 118
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
177 * writeback as soon as there is no other work to do. 159 * writeback as soon as there is no other work to do.
178 */ 160 */
179 trace_writeback_wake_background(bdi); 161 trace_writeback_wake_background(bdi);
180 spin_lock_bh(&bdi->wb_lock); 162 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
181 bdi_wakeup_flusher(bdi);
182 spin_unlock_bh(&bdi->wb_lock);
183} 163}
184 164
185/* 165/*
@@ -1020,67 +1000,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
1020 1000
1021/* 1001/*
1022 * Handle writeback of dirty data for the device backed by this bdi. Also 1002 * Handle writeback of dirty data for the device backed by this bdi. Also
1023 * wakes up periodically and does kupdated style flushing. 1003 * reschedules periodically and does kupdated style flushing.
1024 */ 1004 */
1025int bdi_writeback_thread(void *data) 1005void bdi_writeback_workfn(struct work_struct *work)
1026{ 1006{
1027 struct bdi_writeback *wb = data; 1007 struct bdi_writeback *wb = container_of(to_delayed_work(work),
1008 struct bdi_writeback, dwork);
1028 struct backing_dev_info *bdi = wb->bdi; 1009 struct backing_dev_info *bdi = wb->bdi;
1029 long pages_written; 1010 long pages_written;
1030 1011
1031 set_worker_desc("flush-%s", dev_name(bdi->dev)); 1012 set_worker_desc("flush-%s", dev_name(bdi->dev));
1032 current->flags |= PF_SWAPWRITE; 1013 current->flags |= PF_SWAPWRITE;
1033 set_freezable();
1034 wb->last_active = jiffies;
1035
1036 /*
1037 * Our parent may run at a different priority, just set us to normal
1038 */
1039 set_user_nice(current, 0);
1040
1041 trace_writeback_thread_start(bdi);
1042 1014
1043 while (!kthread_freezable_should_stop(NULL)) { 1015 if (likely(!current_is_workqueue_rescuer() ||
1016 list_empty(&bdi->bdi_list))) {
1044 /* 1017 /*
1045 * Remove own delayed wake-up timer, since we are already awake 1018 * The normal path. Keep writing back @bdi until its
1046 * and we'll take care of the periodic write-back. 1019 * work_list is empty. Note that this path is also taken
1020 * if @bdi is shutting down even when we're running off the
1021 * rescuer as work_list needs to be drained.
1047 */ 1022 */
1048 del_timer(&wb->wakeup_timer); 1023 do {
1049 1024 pages_written = wb_do_writeback(wb, 0);
1050 pages_written = wb_do_writeback(wb, 0); 1025 trace_writeback_pages_written(pages_written);
1051 1026 } while (!list_empty(&bdi->work_list));
1027 } else {
1028 /*
1029 * bdi_wq can't get enough workers and we're running off
1030 * the emergency worker. Don't hog it. Hopefully, 1024 is
1031 * enough for efficient IO.
1032 */
1033 pages_written = writeback_inodes_wb(&bdi->wb, 1024,
1034 WB_REASON_FORKER_THREAD);
1052 trace_writeback_pages_written(pages_written); 1035 trace_writeback_pages_written(pages_written);
1053
1054 if (pages_written)
1055 wb->last_active = jiffies;
1056
1057 set_current_state(TASK_INTERRUPTIBLE);
1058 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
1059 __set_current_state(TASK_RUNNING);
1060 continue;
1061 }
1062
1063 if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1064 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
1065 else {
1066 /*
1067 * We have nothing to do, so can go sleep without any
1068 * timeout and save power. When a work is queued or
1069 * something is made dirty - we will be woken up.
1070 */
1071 schedule();
1072 }
1073 } 1036 }
1074 1037
1075 /* Flush any work that raced with us exiting */ 1038 if (!list_empty(&bdi->work_list) ||
1076 if (!list_empty(&bdi->work_list)) 1039 (wb_has_dirty_io(wb) && dirty_writeback_interval))
1077 wb_do_writeback(wb, 1); 1040 queue_delayed_work(bdi_wq, &wb->dwork,
1041 msecs_to_jiffies(dirty_writeback_interval * 10));
1078 1042
1079 trace_writeback_thread_stop(bdi); 1043 current->flags &= ~PF_SWAPWRITE;
1080 return 0;
1081} 1044}
1082 1045
1083
1084/* 1046/*
1085 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 1047 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
1086 * the whole world. 1048 * the whole world.
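
The rewrite above turns the per-bdi flusher kthread into a delayed_work serviced by the shared bdi_wq, recovering the per-device state with container_of(to_delayed_work(work), ...) and requeueing itself for the periodic kupdate pass. A generic sketch of that shape under assumed names (my_flusher, my_flush_wq); the callbacks stand in for real writeback work.

#include <linux/workqueue.h>
#include <linux/jiffies.h>

/* illustrative per-device flusher state */
struct my_flusher {
	struct delayed_work dwork;
	bool (*has_work)(struct my_flusher *f);
	void (*flush_some)(struct my_flusher *f);
};

static struct workqueue_struct *my_flush_wq;

static void my_flush_workfn(struct work_struct *work)
{
	struct my_flusher *f = container_of(to_delayed_work(work),
					    struct my_flusher, dwork);

	/* drain everything currently queued for this device */
	do {
		f->flush_some(f);
	} while (f->has_work(f));

	/* kupdate-style periodic pass: requeue instead of sleeping */
	queue_delayed_work(my_flush_wq, &f->dwork, msecs_to_jiffies(5000));
}

static int my_flusher_start(struct my_flusher *f)
{
	my_flush_wq = alloc_workqueue("my_flush", WQ_MEM_RECLAIM |
				      WQ_UNBOUND | WQ_FREEZABLE, 0);
	if (!my_flush_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&f->dwork, my_flush_workfn);
	mod_delayed_work(my_flush_wq, &f->dwork, 0);	/* run right away */
	return 0;
}
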
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 7318abf9d0fb..c5fa758fd844 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
300 u64 nblk; 300 u64 nblk;
301 301
302 if (bio) { 302 if (bio) {
303 nblk = bio->bi_sector + bio_sectors(bio); 303 nblk = bio_end_sector(bio);
304 nblk >>= sdp->sd_fsb2bb_shift; 304 nblk >>= sdp->sd_fsb2bb_shift;
305 if (blkno == nblk) 305 if (blkno == nblk)
306 return bio; 306 return bio;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cbe48ea9318e..c57499dca89c 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2005,7 +2005,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
2005 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2005 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2006 2006
2007 bio->bi_vcnt = 1; 2007 bio->bi_vcnt = 1;
2008 bio->bi_idx = 0;
2009 bio->bi_size = LOGPSIZE; 2008 bio->bi_size = LOGPSIZE;
2010 2009
2011 bio->bi_end_io = lbmIODone; 2010 bio->bi_end_io = lbmIODone;
@@ -2146,7 +2145,6 @@ static void lbmStartIO(struct lbuf * bp)
2146 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2145 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2147 2146
2148 bio->bi_vcnt = 1; 2147 bio->bi_vcnt = 1;
2149 bio->bi_idx = 0;
2150 bio->bi_size = LOGPSIZE; 2148 bio->bi_size = LOGPSIZE;
2151 2149
2152 bio->bi_end_io = lbmIODone; 2150 bio->bi_end_io = lbmIODone;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index e784a217b500..550475ca6a0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
32 bio_vec.bv_len = PAGE_SIZE; 32 bio_vec.bv_len = PAGE_SIZE;
33 bio_vec.bv_offset = 0; 33 bio_vec.bv_offset = 0;
34 bio.bi_vcnt = 1; 34 bio.bi_vcnt = 1;
35 bio.bi_idx = 0;
36 bio.bi_size = PAGE_SIZE; 35 bio.bi_size = PAGE_SIZE;
37 bio.bi_bdev = bdev; 36 bio.bi_bdev = bdev;
38 bio.bi_sector = page->index * (PAGE_SIZE >> 9); 37 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
@@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
108 if (i >= max_pages) { 107 if (i >= max_pages) {
109 /* Block layer cannot split bios :( */ 108 /* Block layer cannot split bios :( */
110 bio->bi_vcnt = i; 109 bio->bi_vcnt = i;
111 bio->bi_idx = 0;
112 bio->bi_size = i * PAGE_SIZE; 110 bio->bi_size = i * PAGE_SIZE;
113 bio->bi_bdev = super->s_bdev; 111 bio->bi_bdev = super->s_bdev;
114 bio->bi_sector = ofs >> 9; 112 bio->bi_sector = ofs >> 9;
@@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
136 unlock_page(page); 134 unlock_page(page);
137 } 135 }
138 bio->bi_vcnt = nr_pages; 136 bio->bi_vcnt = nr_pages;
139 bio->bi_idx = 0;
140 bio->bi_size = nr_pages * PAGE_SIZE; 137 bio->bi_size = nr_pages * PAGE_SIZE;
141 bio->bi_bdev = super->s_bdev; 138 bio->bi_bdev = super->s_bdev;
142 bio->bi_sector = ofs >> 9; 139 bio->bi_sector = ofs >> 9;
@@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
202 if (i >= max_pages) { 199 if (i >= max_pages) {
203 /* Block layer cannot split bios :( */ 200 /* Block layer cannot split bios :( */
204 bio->bi_vcnt = i; 201 bio->bi_vcnt = i;
205 bio->bi_idx = 0;
206 bio->bi_size = i * PAGE_SIZE; 202 bio->bi_size = i * PAGE_SIZE;
207 bio->bi_bdev = super->s_bdev; 203 bio->bi_bdev = super->s_bdev;
208 bio->bi_sector = ofs >> 9; 204 bio->bi_sector = ofs >> 9;
@@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
224 bio->bi_io_vec[i].bv_offset = 0; 220 bio->bi_io_vec[i].bv_offset = 0;
225 } 221 }
226 bio->bi_vcnt = nr_pages; 222 bio->bi_vcnt = nr_pages;
227 bio->bi_idx = 0;
228 bio->bi_size = nr_pages * PAGE_SIZE; 223 bio->bi_size = nr_pages * PAGE_SIZE;
229 bio->bi_bdev = super->s_bdev; 224 bio->bi_bdev = super->s_bdev;
230 bio->bi_sector = ofs >> 9; 225 bio->bi_sector = ofs >> 9;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 350459910fe1..c3881553f7d1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,7 @@
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/atomic.h> 19#include <linux/atomic.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/workqueue.h>
21 22
22struct page; 23struct page;
23struct device; 24struct device;
@@ -27,7 +28,6 @@ struct dentry;
27 * Bits in backing_dev_info.state 28 * Bits in backing_dev_info.state
28 */ 29 */
29enum bdi_state { 30enum bdi_state {
30 BDI_pending, /* On its way to being activated */
31 BDI_wb_alloc, /* Default embedded wb allocated */ 31 BDI_wb_alloc, /* Default embedded wb allocated */
32 BDI_async_congested, /* The async (write) queue is getting full */ 32 BDI_async_congested, /* The async (write) queue is getting full */
33 BDI_sync_congested, /* The sync queue is getting full */ 33 BDI_sync_congested, /* The sync queue is getting full */
@@ -53,10 +53,8 @@ struct bdi_writeback {
53 unsigned int nr; 53 unsigned int nr;
54 54
55 unsigned long last_old_flush; /* last old data flush */ 55 unsigned long last_old_flush; /* last old data flush */
56 unsigned long last_active; /* last time bdi thread was active */
57 56
58 struct task_struct *task; /* writeback thread */ 57 struct delayed_work dwork; /* work item used for writeback */
59 struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
60 struct list_head b_dirty; /* dirty inodes */ 58 struct list_head b_dirty; /* dirty inodes */
61 struct list_head b_io; /* parked for writeback */ 59 struct list_head b_io; /* parked for writeback */
62 struct list_head b_more_io; /* parked for more writeback */ 60 struct list_head b_more_io; /* parked for more writeback */
@@ -123,14 +121,15 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
123void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 121void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
124 enum wb_reason reason); 122 enum wb_reason reason);
125void bdi_start_background_writeback(struct backing_dev_info *bdi); 123void bdi_start_background_writeback(struct backing_dev_info *bdi);
126int bdi_writeback_thread(void *data); 124void bdi_writeback_workfn(struct work_struct *work);
127int bdi_has_dirty_io(struct backing_dev_info *bdi); 125int bdi_has_dirty_io(struct backing_dev_info *bdi);
128void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); 126void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
129void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); 127void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
130 128
131extern spinlock_t bdi_lock; 129extern spinlock_t bdi_lock;
132extern struct list_head bdi_list; 130extern struct list_head bdi_list;
133extern struct list_head bdi_pending_list; 131
132extern struct workqueue_struct *bdi_wq;
134 133
135static inline int wb_has_dirty_io(struct bdi_writeback *wb) 134static inline int wb_has_dirty_io(struct bdi_writeback *wb)
136{ 135{
@@ -336,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
336 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 335 return bdi->capabilities & BDI_CAP_SWAP_BACKED;
337} 336}
338 337
339static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
340{
341 return bdi == &default_backing_dev_info;
342}
343
344static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 338static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
345{ 339{
346 return bdi_cap_writeback_dirty(mapping->backing_dev_info); 340 return bdi_cap_writeback_dirty(mapping->backing_dev_info);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 820e7aaad4fd..ef24466d8f82 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -67,6 +67,7 @@
67#define bio_offset(bio) bio_iovec((bio))->bv_offset 67#define bio_offset(bio) bio_iovec((bio))->bv_offset
68#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 68#define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
69#define bio_sectors(bio) ((bio)->bi_size >> 9) 69#define bio_sectors(bio) ((bio)->bi_size >> 9)
70#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio)))
70 71
71static inline unsigned int bio_cur_bytes(struct bio *bio) 72static inline unsigned int bio_cur_bytes(struct bio *bio)
72{ 73{
@@ -84,11 +85,6 @@ static inline void *bio_data(struct bio *bio)
84 return NULL; 85 return NULL;
85} 86}
86 87
87static inline int bio_has_allocated_vec(struct bio *bio)
88{
89 return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs;
90}
91
92/* 88/*
93 * will die 89 * will die
94 */ 90 */
@@ -136,16 +132,27 @@ static inline int bio_has_allocated_vec(struct bio *bio)
136#define bio_io_error(bio) bio_endio((bio), -EIO) 132#define bio_io_error(bio) bio_endio((bio), -EIO)
137 133
138/* 134/*
139 * drivers should not use the __ version unless they _really_ want to 135 * drivers should not use the __ version unless they _really_ know what
140 * run through the entire bio and not just pending pieces 136 * they're doing
141 */ 137 */
142#define __bio_for_each_segment(bvl, bio, i, start_idx) \ 138#define __bio_for_each_segment(bvl, bio, i, start_idx) \
143 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 139 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \
144 i < (bio)->bi_vcnt; \ 140 i < (bio)->bi_vcnt; \
145 bvl++, i++) 141 bvl++, i++)
146 142
143/*
144 * drivers should _never_ use the all version - the bio may have been split
145 * before it got to the driver and the driver won't own all of it
146 */
147#define bio_for_each_segment_all(bvl, bio, i) \
148 for (i = 0; \
149 bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \
150 i++)
151
147#define bio_for_each_segment(bvl, bio, i) \ 152#define bio_for_each_segment(bvl, bio, i) \
148 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 153 for (i = (bio)->bi_idx; \
154 bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \
155 i++)
149 156
150/* 157/*
151 * get a reference to a bio, so it won't disappear. the intended use is 158 * get a reference to a bio, so it won't disappear. the intended use is
@@ -180,9 +187,12 @@ struct bio_integrity_payload {
180 unsigned short bip_slab; /* slab the bip came from */ 187 unsigned short bip_slab; /* slab the bip came from */
181 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 188 unsigned short bip_vcnt; /* # of integrity bio_vecs */
182 unsigned short bip_idx; /* current bip_vec index */ 189 unsigned short bip_idx; /* current bip_vec index */
190 unsigned bip_owns_buf:1; /* should free bip_buf */
183 191
184 struct work_struct bip_work; /* I/O completion */ 192 struct work_struct bip_work; /* I/O completion */
185 struct bio_vec bip_vec[0]; /* embedded bvec array */ 193
194 struct bio_vec *bip_vec;
195 struct bio_vec bip_inline_vecs[0];/* embedded bvec array */
186}; 196};
187#endif /* CONFIG_BLK_DEV_INTEGRITY */ 197#endif /* CONFIG_BLK_DEV_INTEGRITY */
188 198
@@ -211,6 +221,7 @@ extern void bio_pair_release(struct bio_pair *dbio);
211 221
212extern struct bio_set *bioset_create(unsigned int, unsigned int); 222extern struct bio_set *bioset_create(unsigned int, unsigned int);
213extern void bioset_free(struct bio_set *); 223extern void bioset_free(struct bio_set *);
224extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries);
214 225
215extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 226extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
216extern void bio_put(struct bio *); 227extern void bio_put(struct bio *);
@@ -245,6 +256,9 @@ extern void bio_endio(struct bio *, int);
245struct request_queue; 256struct request_queue;
246extern int bio_phys_segments(struct request_queue *, struct bio *); 257extern int bio_phys_segments(struct request_queue *, struct bio *);
247 258
259extern int submit_bio_wait(int rw, struct bio *bio);
260extern void bio_advance(struct bio *, unsigned);
261
248extern void bio_init(struct bio *); 262extern void bio_init(struct bio *);
249extern void bio_reset(struct bio *); 263extern void bio_reset(struct bio *);
250 264
@@ -279,6 +293,9 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
279} 293}
280#endif 294#endif
281 295
296extern void bio_copy_data(struct bio *dst, struct bio *src);
297extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
298
282extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 299extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
283 unsigned long, unsigned int, int, gfp_t); 300 unsigned long, unsigned int, int, gfp_t);
284extern struct bio *bio_copy_user_iov(struct request_queue *, 301extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -286,8 +303,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
286 int, int, gfp_t); 303 int, int, gfp_t);
287extern int bio_uncopy_user(struct bio *); 304extern int bio_uncopy_user(struct bio *);
288void zero_fill_bio(struct bio *bio); 305void zero_fill_bio(struct bio *bio);
289extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); 306extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
290extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); 307extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
291extern unsigned int bvec_nr_vecs(unsigned short idx); 308extern unsigned int bvec_nr_vecs(unsigned short idx);
292 309
293#ifdef CONFIG_BLK_CGROUP 310#ifdef CONFIG_BLK_CGROUP
@@ -298,39 +315,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
298static inline void bio_disassociate_task(struct bio *bio) { } 315static inline void bio_disassociate_task(struct bio *bio) { }
299#endif /* CONFIG_BLK_CGROUP */ 316#endif /* CONFIG_BLK_CGROUP */
300 317
301/*
302 * bio_set is used to allow other portions of the IO system to
303 * allocate their own private memory pools for bio and iovec structures.
304 * These memory pools in turn all allocate from the bio_slab
305 * and the bvec_slabs[].
306 */
307#define BIO_POOL_SIZE 2
308#define BIOVEC_NR_POOLS 6
309#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
310
311struct bio_set {
312 struct kmem_cache *bio_slab;
313 unsigned int front_pad;
314
315 mempool_t *bio_pool;
316#if defined(CONFIG_BLK_DEV_INTEGRITY)
317 mempool_t *bio_integrity_pool;
318#endif
319 mempool_t *bvec_pool;
320};
321
322struct biovec_slab {
323 int nr_vecs;
324 char *name;
325 struct kmem_cache *slab;
326};
327
328/*
329 * a small number of entries is fine, not going to be performance critical.
330 * basically we just need to survive
331 */
332#define BIO_SPLIT_ENTRIES 2
333
334#ifdef CONFIG_HIGHMEM 318#ifdef CONFIG_HIGHMEM
335/* 319/*
336 * remember never ever reenable interrupts between a bvec_kmap_irq and 320 * remember never ever reenable interrupts between a bvec_kmap_irq and
@@ -527,6 +511,49 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
527 return bio; 511 return bio;
528} 512}
529 513
514/*
515 * bio_set is used to allow other portions of the IO system to
516 * allocate their own private memory pools for bio and iovec structures.
517 * These memory pools in turn all allocate from the bio_slab
518 * and the bvec_slabs[].
519 */
520#define BIO_POOL_SIZE 2
521#define BIOVEC_NR_POOLS 6
522#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
523
524struct bio_set {
525 struct kmem_cache *bio_slab;
526 unsigned int front_pad;
527
528 mempool_t *bio_pool;
529 mempool_t *bvec_pool;
530#if defined(CONFIG_BLK_DEV_INTEGRITY)
531 mempool_t *bio_integrity_pool;
532 mempool_t *bvec_integrity_pool;
533#endif
534
535 /*
536 * Deadlock avoidance for stacking block drivers: see comments in
537 * bio_alloc_bioset() for details
538 */
539 spinlock_t rescue_lock;
540 struct bio_list rescue_list;
541 struct work_struct rescue_work;
542 struct workqueue_struct *rescue_workqueue;
543};
544
545struct biovec_slab {
546 int nr_vecs;
547 char *name;
548 struct kmem_cache *slab;
549};
550
551/*
552 * a small number of entries is fine, not going to be performance critical.
553 * basically we just need to survive
554 */
555#define BIO_SPLIT_ENTRIES 2
556
530#if defined(CONFIG_BLK_DEV_INTEGRITY) 557#if defined(CONFIG_BLK_DEV_INTEGRITY)
531 558
532#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) 559#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
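
The header now separates the two iterators explicitly: bio_for_each_segment() starts at bi_idx and only covers the not-yet-completed part of the bio, while bio_for_each_segment_all() always starts at index 0 and may only be used by the bio's owner (a driver may have received a split bio and does not own the whole bvec array). A small side-by-side sketch; both helpers are made up and assume lowmem pages so page_address() is safe.

#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/string.h>

/* driver view: only the segments still pending, starting at bi_idx */
static void zero_pending_segments(struct bio *bio)
{
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i)
		memset(page_address(bv->bv_page) + bv->bv_offset,
		       0, bv->bv_len);
}

/* owner view: every segment that was ever added to the bio */
static unsigned int total_bytes_in_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned int bytes = 0;
	int i;

	bio_for_each_segment_all(bv, bio, i)
		bytes += bv->bv_len;

	return bytes;
}
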
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 22990cf4439d..fa1abeb45b76 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,6 +118,7 @@ struct bio {
118 * BIO_POOL_IDX() 118 * BIO_POOL_IDX()
119 */ 119 */
120#define BIO_RESET_BITS 13 120#define BIO_RESET_BITS 13
121#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */
121 122
122#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 123#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
123 124
@@ -176,6 +177,7 @@ enum rq_flag_bits {
176 __REQ_IO_STAT, /* account I/O stat */ 177 __REQ_IO_STAT, /* account I/O stat */
177 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */
178 __REQ_KERNEL, /* direct IO to kernel pages */ 179 __REQ_KERNEL, /* direct IO to kernel pages */
180 __REQ_PM, /* runtime pm request */
179 __REQ_NR_BITS, /* stops here */ 181 __REQ_NR_BITS, /* stops here */
180}; 182};
181 183
@@ -198,6 +200,8 @@ enum rq_flag_bits {
198 REQ_SECURE) 200 REQ_SECURE)
199#define REQ_CLONE_MASK REQ_COMMON_MASK 201#define REQ_CLONE_MASK REQ_COMMON_MASK
200 202
203#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME)
204
201/* This mask is used for both bio and request merge checking */ 205/* This mask is used for both bio and request merge checking */
202#define REQ_NOMERGE_FLAGS \ 206#define REQ_NOMERGE_FLAGS \
203 (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) 207 (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
@@ -224,5 +228,6 @@ enum rq_flag_bits {
224#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 228#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
225#define REQ_SECURE (1 << __REQ_SECURE) 229#define REQ_SECURE (1 << __REQ_SECURE)
226#define REQ_KERNEL (1 << __REQ_KERNEL) 230#define REQ_KERNEL (1 << __REQ_KERNEL)
231#define REQ_PM (1 << __REQ_PM)
227 232
228#endif /* __LINUX_BLK_TYPES_H */ 233#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e38cfe77f7f0..2fdb4a451b49 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -361,6 +361,12 @@ struct request_queue {
361 */ 361 */
362 struct kobject kobj; 362 struct kobject kobj;
363 363
364#ifdef CONFIG_PM_RUNTIME
365 struct device *dev;
366 int rpm_status;
367 unsigned int nr_pending;
368#endif
369
364 /* 370 /*
365 * queue settings 371 * queue settings
366 */ 372 */
@@ -838,7 +844,7 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
838 unsigned int cmd_flags) 844 unsigned int cmd_flags)
839{ 845{
840 if (unlikely(cmd_flags & REQ_DISCARD)) 846 if (unlikely(cmd_flags & REQ_DISCARD))
841 return q->limits.max_discard_sectors; 847 return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
842 848
843 if (unlikely(cmd_flags & REQ_WRITE_SAME)) 849 if (unlikely(cmd_flags & REQ_WRITE_SAME))
844 return q->limits.max_write_same_sectors; 850 return q->limits.max_write_same_sectors;
@@ -961,6 +967,27 @@ struct request_queue *blk_alloc_queue_node(gfp_t, int);
961extern void blk_put_queue(struct request_queue *); 967extern void blk_put_queue(struct request_queue *);
962 968
963/* 969/*
970 * block layer runtime pm functions
971 */
972#ifdef CONFIG_PM_RUNTIME
973extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
974extern int blk_pre_runtime_suspend(struct request_queue *q);
975extern void blk_post_runtime_suspend(struct request_queue *q, int err);
976extern void blk_pre_runtime_resume(struct request_queue *q);
977extern void blk_post_runtime_resume(struct request_queue *q, int err);
978#else
979static inline void blk_pm_runtime_init(struct request_queue *q,
980 struct device *dev) {}
981static inline int blk_pre_runtime_suspend(struct request_queue *q)
982{
983 return -ENOSYS;
984}
985static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
986static inline void blk_pre_runtime_resume(struct request_queue *q) {}
987static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
988#endif
989
990/*
964 * blk_plug permits building a queue of related requests by holding the I/O 991 * blk_plug permits building a queue of related requests by holding the I/O
965 * fragments for a short period. This allows merging of sequential requests 992 * fragments for a short period. This allows merging of sequential requests
966 * into single larger request. As the requests are moved from a per-task list to 993 * into single larger request. As the requests are moved from a per-task list to
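
Besides the discard clamp, this header grows the block-layer runtime PM hooks that the SCSI patches in James' tree build on. A hedged sketch of how a request-based driver's dev_pm_ops callbacks might wrap them, assuming blk_pm_runtime_init(q, dev) was called at probe time; dev_get_drvdata() holding the queue and the my_hw_* helpers are assumptions of this sketch, not part of the patch.

#include <linux/blkdev.h>
#include <linux/pm_runtime.h>

/* illustrative hardware hooks */
static int my_hw_sleep(struct device *dev);
static int my_hw_wake(struct device *dev);

static int my_runtime_suspend(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);	/* assumed */
	int err;

	err = blk_pre_runtime_suspend(q);
	if (err)
		return err;	/* e.g. requests still in flight */

	err = my_hw_sleep(dev);
	blk_post_runtime_suspend(q, err);	/* records success or failure */
	return err;
}

static int my_runtime_resume(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);	/* assumed */
	int err;

	blk_pre_runtime_resume(q);
	err = my_hw_wake(dev);
	blk_post_runtime_resume(q, err);	/* queue resumes on success */
	return err;
}
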
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 9c1467357b03..60ae7c3db912 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -244,7 +244,7 @@ TRACE_EVENT(block_bio_bounce,
244 __entry->dev = bio->bi_bdev ? 244 __entry->dev = bio->bi_bdev ?
245 bio->bi_bdev->bd_dev : 0; 245 bio->bi_bdev->bd_dev : 0;
246 __entry->sector = bio->bi_sector; 246 __entry->sector = bio->bi_sector;
247 __entry->nr_sector = bio->bi_size >> 9; 247 __entry->nr_sector = bio_sectors(bio);
248 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 248 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
249 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 249 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
250 ), 250 ),
@@ -281,7 +281,7 @@ TRACE_EVENT(block_bio_complete,
281 TP_fast_assign( 281 TP_fast_assign(
282 __entry->dev = bio->bi_bdev->bd_dev; 282 __entry->dev = bio->bi_bdev->bd_dev;
283 __entry->sector = bio->bi_sector; 283 __entry->sector = bio->bi_sector;
284 __entry->nr_sector = bio->bi_size >> 9; 284 __entry->nr_sector = bio_sectors(bio);
285 __entry->error = error; 285 __entry->error = error;
286 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 286 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
287 ), 287 ),
@@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(block_bio_merge,
309 TP_fast_assign( 309 TP_fast_assign(
310 __entry->dev = bio->bi_bdev->bd_dev; 310 __entry->dev = bio->bi_bdev->bd_dev;
311 __entry->sector = bio->bi_sector; 311 __entry->sector = bio->bi_sector;
312 __entry->nr_sector = bio->bi_size >> 9; 312 __entry->nr_sector = bio_sectors(bio);
313 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 313 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
314 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 314 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
315 ), 315 ),
@@ -376,7 +376,7 @@ TRACE_EVENT(block_bio_queue,
376 TP_fast_assign( 376 TP_fast_assign(
377 __entry->dev = bio->bi_bdev->bd_dev; 377 __entry->dev = bio->bi_bdev->bd_dev;
378 __entry->sector = bio->bi_sector; 378 __entry->sector = bio->bi_sector;
379 __entry->nr_sector = bio->bi_size >> 9; 379 __entry->nr_sector = bio_sectors(bio);
380 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 380 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
381 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 381 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
382 ), 382 ),
@@ -404,7 +404,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
404 TP_fast_assign( 404 TP_fast_assign(
405 __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; 405 __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
406 __entry->sector = bio ? bio->bi_sector : 0; 406 __entry->sector = bio ? bio->bi_sector : 0;
407 __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; 407 __entry->nr_sector = bio ? bio_sectors(bio) : 0;
408 blk_fill_rwbs(__entry->rwbs, 408 blk_fill_rwbs(__entry->rwbs,
409 bio ? bio->bi_rw : 0, __entry->nr_sector); 409 bio ? bio->bi_rw : 0, __entry->nr_sector);
410 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 410 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
@@ -580,7 +580,7 @@ TRACE_EVENT(block_bio_remap,
580 TP_fast_assign( 580 TP_fast_assign(
581 __entry->dev = bio->bi_bdev->bd_dev; 581 __entry->dev = bio->bi_bdev->bd_dev;
582 __entry->sector = bio->bi_sector; 582 __entry->sector = bio->bi_sector;
583 __entry->nr_sector = bio->bi_size >> 9; 583 __entry->nr_sector = bio_sectors(bio);
584 __entry->old_dev = dev; 584 __entry->old_dev = dev;
585 __entry->old_sector = from; 585 __entry->old_sector = from;
586 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 586 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 6a16fd2e70ed..464ea82e10db 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
183DEFINE_EVENT(writeback_work_class, name, \ 183DEFINE_EVENT(writeback_work_class, name, \
184 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ 184 TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
185 TP_ARGS(bdi, work)) 185 TP_ARGS(bdi, work))
186DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
187DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); 186DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
188DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); 187DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
189DEFINE_WRITEBACK_WORK_EVENT(writeback_start); 188DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \
222 221
223DEFINE_WRITEBACK_EVENT(writeback_nowork); 222DEFINE_WRITEBACK_EVENT(writeback_nowork);
224DEFINE_WRITEBACK_EVENT(writeback_wake_background); 223DEFINE_WRITEBACK_EVENT(writeback_wake_background);
225DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
226DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
227DEFINE_WRITEBACK_EVENT(writeback_bdi_register); 224DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
228DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); 225DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
229DEFINE_WRITEBACK_EVENT(writeback_thread_start);
230DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
231 226
232DECLARE_EVENT_CLASS(wbc_class, 227DECLARE_EVENT_CLASS(wbc_class,
233 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), 228 TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/kernel/relay.c b/kernel/relay.c
index eef0d113b79e..b91488ba2e5a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
234static void relay_remove_buf(struct kref *kref) 234static void relay_remove_buf(struct kref *kref)
235{ 235{
236 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 236 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
237 buf->chan->cb->remove_buf_file(buf->dentry);
238 relay_destroy_buf(buf); 237 relay_destroy_buf(buf);
239} 238}
240 239
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
484{ 483{
485 buf->finalized = 1; 484 buf->finalized = 1;
486 del_timer_sync(&buf->timer); 485 del_timer_sync(&buf->timer);
486 buf->chan->cb->remove_buf_file(buf->dentry);
487 kref_put(&buf->kref, relay_remove_buf); 487 kref_put(&buf->kref, relay_remove_buf);
488} 488}
489 489
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41733c5dc820..502517492258 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
31static struct class *bdi_class; 31static struct class *bdi_class;
32 32
33/* 33/*
34 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as 34 * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
35 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
36 * locking. 35 * locking.
37 */ 36 */
38DEFINE_SPINLOCK(bdi_lock); 37DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 38LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 39
40/* bdi_wq serves all asynchronous writeback tasks */
41struct workqueue_struct *bdi_wq;
41 42
42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 43void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
43{ 44{
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
257{ 258{
258 int err; 259 int err;
259 260
261 bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
262 WQ_UNBOUND | WQ_SYSFS, 0);
263 if (!bdi_wq)
264 return -ENOMEM;
265
260 err = bdi_init(&default_backing_dev_info); 266 err = bdi_init(&default_backing_dev_info);
261 if (!err) 267 if (!err)
262 bdi_register(&default_backing_dev_info, NULL, "default"); 268 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
271 return wb_has_dirty_io(&bdi->wb); 277 return wb_has_dirty_io(&bdi->wb);
272} 278}
273 279
274static void wakeup_timer_fn(unsigned long data)
275{
276 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
277
278 spin_lock_bh(&bdi->wb_lock);
279 if (bdi->wb.task) {
280 trace_writeback_wake_thread(bdi);
281 wake_up_process(bdi->wb.task);
282 } else if (bdi->dev) {
283 /*
284 * When bdi tasks are inactive for long time, they are killed.
285 * In this case we have to wake-up the forker thread which
286 * should create and run the bdi thread.
287 */
288 trace_writeback_wake_forker_thread(bdi);
289 wake_up_process(default_backing_dev_info.wb.task);
290 }
291 spin_unlock_bh(&bdi->wb_lock);
292}
293
294/* 280/*
295 * This function is used when the first inode for this bdi is marked dirty. It 281 * This function is used when the first inode for this bdi is marked dirty. It
296 * wakes-up the corresponding bdi thread which should then take care of the 282 * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
307 unsigned long timeout; 293 unsigned long timeout;
308 294
309 timeout = msecs_to_jiffies(dirty_writeback_interval * 10); 295 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
310 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); 296 mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
311}
312
313/*
314 * Calculate the longest interval (jiffies) bdi threads are allowed to be
315 * inactive.
316 */
317static unsigned long bdi_longest_inactive(void)
318{
319 unsigned long interval;
320
321 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
322 return max(5UL * 60 * HZ, interval);
323}
324
325/*
326 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
327 * shutdown
328 */
329static void bdi_clear_pending(struct backing_dev_info *bdi)
330{
331 clear_bit(BDI_pending, &bdi->state);
332 smp_mb__after_clear_bit();
333 wake_up_bit(&bdi->state, BDI_pending);
334}
335
336static int bdi_forker_thread(void *ptr)
337{
338 struct bdi_writeback *me = ptr;
339
340 current->flags |= PF_SWAPWRITE;
341 set_freezable();
342
343 /*
344 * Our parent may run at a different priority, just set us to normal
345 */
346 set_user_nice(current, 0);
347
348 for (;;) {
349 struct task_struct *task = NULL;
350 struct backing_dev_info *bdi;
351 enum {
352 NO_ACTION, /* Nothing to do */
353 FORK_THREAD, /* Fork bdi thread */
354 KILL_THREAD, /* Kill inactive bdi thread */
355 } action = NO_ACTION;
356
357 /*
358 * Temporary measure, we want to make sure we don't see
359 * dirty data on the default backing_dev_info
360 */
361 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
362 del_timer(&me->wakeup_timer);
363 wb_do_writeback(me, 0);
364 }
365
366 spin_lock_bh(&bdi_lock);
367 /*
368 * In the following loop we are going to check whether we have
369 * some work to do without any synchronization with tasks
370 * waking us up to do work for them. Set the task state here
371 * so that we don't miss wakeups after verifying conditions.
372 */
373 set_current_state(TASK_INTERRUPTIBLE);
374
375 list_for_each_entry(bdi, &bdi_list, bdi_list) {
376 bool have_dirty_io;
377
378 if (!bdi_cap_writeback_dirty(bdi) ||
379 bdi_cap_flush_forker(bdi))
380 continue;
381
382 WARN(!test_bit(BDI_registered, &bdi->state),
383 "bdi %p/%s is not registered!\n", bdi, bdi->name);
384
385 have_dirty_io = !list_empty(&bdi->work_list) ||
386 wb_has_dirty_io(&bdi->wb);
387
388 /*
389 * If the bdi has work to do, but the thread does not
390 * exist - create it.
391 */
392 if (!bdi->wb.task && have_dirty_io) {
393 /*
394 * Set the pending bit - if someone will try to
395 * unregister this bdi - it'll wait on this bit.
396 */
397 set_bit(BDI_pending, &bdi->state);
398 action = FORK_THREAD;
399 break;
400 }
401
402 spin_lock(&bdi->wb_lock);
403
404 /*
405 * If there is no work to do and the bdi thread was
406 * inactive long enough - kill it. The wb_lock is taken
407 * to make sure no-one adds more work to this bdi and
408 * wakes the bdi thread up.
409 */
410 if (bdi->wb.task && !have_dirty_io &&
411 time_after(jiffies, bdi->wb.last_active +
412 bdi_longest_inactive())) {
413 task = bdi->wb.task;
414 bdi->wb.task = NULL;
415 spin_unlock(&bdi->wb_lock);
416 set_bit(BDI_pending, &bdi->state);
417 action = KILL_THREAD;
418 break;
419 }
420 spin_unlock(&bdi->wb_lock);
421 }
422 spin_unlock_bh(&bdi_lock);
423
424 /* Keep working if default bdi still has things to do */
425 if (!list_empty(&me->bdi->work_list))
426 __set_current_state(TASK_RUNNING);
427
428 switch (action) {
429 case FORK_THREAD:
430 __set_current_state(TASK_RUNNING);
431 task = kthread_create(bdi_writeback_thread, &bdi->wb,
432 "flush-%s", dev_name(bdi->dev));
433 if (IS_ERR(task)) {
434 /*
435 * If thread creation fails, force writeout of
436 * the bdi from the thread. Hopefully 1024 is
437 * large enough for efficient IO.
438 */
439 writeback_inodes_wb(&bdi->wb, 1024,
440 WB_REASON_FORKER_THREAD);
441 } else {
442 /*
443 * The spinlock makes sure we do not lose
444 * wake-ups when racing with 'bdi_queue_work()'.
445 * And as soon as the bdi thread is visible, we
446 * can start it.
447 */
448 spin_lock_bh(&bdi->wb_lock);
449 bdi->wb.task = task;
450 spin_unlock_bh(&bdi->wb_lock);
451 wake_up_process(task);
452 }
453 bdi_clear_pending(bdi);
454 break;
455
456 case KILL_THREAD:
457 __set_current_state(TASK_RUNNING);
458 kthread_stop(task);
459 bdi_clear_pending(bdi);
460 break;
461
462 case NO_ACTION:
463 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
464 /*
465 * There are no dirty data. The only thing we
466 * should now care about is checking for
467 * inactive bdi threads and killing them. Thus,
468 * let's sleep for longer time, save energy and
469 * be friendly for battery-driven devices.
470 */
471 schedule_timeout(bdi_longest_inactive());
472 else
473 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
474 try_to_freeze();
475 break;
476 }
477 }
478
479 return 0;
480} 297}
481 298
482/* 299/*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
489 spin_unlock_bh(&bdi_lock); 306 spin_unlock_bh(&bdi_lock);
490 307
491 synchronize_rcu_expedited(); 308 synchronize_rcu_expedited();
309
310 /* bdi_list is now unused, clear it to mark @bdi dying */
311 INIT_LIST_HEAD(&bdi->bdi_list);
492} 312}
493 313
494int bdi_register(struct backing_dev_info *bdi, struct device *parent, 314int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
508 328
509 bdi->dev = dev; 329 bdi->dev = dev;
510 330
511 /*
512 * Just start the forker thread for our default backing_dev_info,
513 * and add other bdi's to the list. They will get a thread created
514 * on-demand when they need it.
515 */
516 if (bdi_cap_flush_forker(bdi)) {
517 struct bdi_writeback *wb = &bdi->wb;
518
519 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
520 dev_name(dev));
521 if (IS_ERR(wb->task))
522 return PTR_ERR(wb->task);
523 }
524
525 bdi_debug_register(bdi, dev_name(dev)); 331 bdi_debug_register(bdi, dev_name(dev));
526 set_bit(BDI_registered, &bdi->state); 332 set_bit(BDI_registered, &bdi->state);
527 333
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
545 */ 351 */
546static void bdi_wb_shutdown(struct backing_dev_info *bdi) 352static void bdi_wb_shutdown(struct backing_dev_info *bdi)
547{ 353{
548 struct task_struct *task;
549
550 if (!bdi_cap_writeback_dirty(bdi)) 354 if (!bdi_cap_writeback_dirty(bdi))
551 return; 355 return;
552 356
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
556 bdi_remove_from_list(bdi); 360 bdi_remove_from_list(bdi);
557 361
558 /* 362 /*
559 * If setup is pending, wait for that to complete first 363 * Drain work list and shutdown the delayed_work. At this point,
 364 * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
365 * is dying and its work_list needs to be drained no matter what.
560 */ 366 */
561 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 367 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
562 TASK_UNINTERRUPTIBLE); 368 flush_delayed_work(&bdi->wb.dwork);
369 WARN_ON(!list_empty(&bdi->work_list));
563 370
564 /* 371 /*
565 * Finally, kill the kernel thread. We don't need to be RCU 372 * This shouldn't be necessary unless @bdi for some reason has
566 * safe anymore, since the bdi is gone from visibility. 373 * unflushed dirty IO after work_list is drained. Do it anyway
374 * just in case.
567 */ 375 */
568 spin_lock_bh(&bdi->wb_lock); 376 cancel_delayed_work_sync(&bdi->wb.dwork);
569 task = bdi->wb.task;
570 bdi->wb.task = NULL;
571 spin_unlock_bh(&bdi->wb_lock);
572
573 if (task)
574 kthread_stop(task);
575} 377}
576 378
577/* 379/*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
597 bdi_set_min_ratio(bdi, 0); 399 bdi_set_min_ratio(bdi, 0);
598 trace_writeback_bdi_unregister(bdi); 400 trace_writeback_bdi_unregister(bdi);
599 bdi_prune_sb(bdi); 401 bdi_prune_sb(bdi);
600 del_timer_sync(&bdi->wb.wakeup_timer);
601 402
602 if (!bdi_cap_flush_forker(bdi)) 403 bdi_wb_shutdown(bdi);
603 bdi_wb_shutdown(bdi);
604 bdi_debug_unregister(bdi); 404 bdi_debug_unregister(bdi);
605 405
606 spin_lock_bh(&bdi->wb_lock); 406 spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
622 INIT_LIST_HEAD(&wb->b_io); 422 INIT_LIST_HEAD(&wb->b_io);
623 INIT_LIST_HEAD(&wb->b_more_io); 423 INIT_LIST_HEAD(&wb->b_more_io);
624 spin_lock_init(&wb->list_lock); 424 spin_lock_init(&wb->list_lock);
625 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 425 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
626} 426}
627 427
628/* 428/*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
695 bdi_unregister(bdi); 495 bdi_unregister(bdi);
696 496
697 /* 497 /*
698 * If bdi_unregister() had already been called earlier, the 498 * If bdi_unregister() had already been called earlier, the dwork
699 * wakeup_timer could still be armed because bdi_prune_sb() 499 * could still be pending because bdi_prune_sb() can race with the
700 * can race with the bdi_wakeup_thread_delayed() calls from 500 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
701 * __mark_inode_dirty().
702 */ 501 */
703 del_timer_sync(&bdi->wb.wakeup_timer); 502 cancel_delayed_work_sync(&bdi->wb.dwork);
704 503
705 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 504 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
706 percpu_counter_destroy(&bdi->bdi_stat[i]); 505 percpu_counter_destroy(&bdi->bdi_stat[i]);
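
bdi_wb_shutdown() and bdi_destroy() now tear the flusher down by draining and cancelling the delayed_work rather than stopping a kthread. The same generic pattern for any self-requeueing delayed work, as a short sketch with illustrative names.

#include <linux/workqueue.h>

/* illustrative teardown for a delayed work item that requeues itself */
static void my_flusher_shutdown(struct workqueue_struct *wq,
				struct delayed_work *dwork)
{
	/* pull the work in immediately so anything queued gets drained */
	mod_delayed_work(wq, dwork, 0);
	flush_delayed_work(dwork);

	/* the work function may have rearmed itself; make sure it is gone */
	cancel_delayed_work_sync(dwork);
}
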
diff --git a/mm/bounce.c b/mm/bounce.c
index a5c2ec3589cb..c9f0a4339a7d 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
101 struct bio_vec *tovec, *fromvec; 101 struct bio_vec *tovec, *fromvec;
102 int i; 102 int i;
103 103
104 __bio_for_each_segment(tovec, to, i, 0) { 104 bio_for_each_segment(tovec, to, i) {
105 fromvec = from->bi_io_vec + i; 105 fromvec = from->bi_io_vec + i;
106 106
107 /* 107 /*
@@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
134 /* 134 /*
135 * free up bounce indirect pages used 135 * free up bounce indirect pages used
136 */ 136 */
137 __bio_for_each_segment(bvec, bio, i, 0) { 137 bio_for_each_segment_all(bvec, bio, i) {
138 org_vec = bio_orig->bi_io_vec + i; 138 org_vec = bio_orig->bi_io_vec + i;
139 if (bvec->bv_page == org_vec->bv_page) 139 if (bvec->bv_page == org_vec->bv_page)
140 continue; 140 continue;
@@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
199static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 199static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
200 mempool_t *pool, int force) 200 mempool_t *pool, int force)
201{ 201{
202 struct page *page; 202 struct bio *bio;
203 struct bio *bio = NULL; 203 int rw = bio_data_dir(*bio_orig);
204 int i, rw = bio_data_dir(*bio_orig);
205 struct bio_vec *to, *from; 204 struct bio_vec *to, *from;
205 unsigned i;
206 206
207 bio_for_each_segment(from, *bio_orig, i) { 207 bio_for_each_segment(from, *bio_orig, i)
208 page = from->bv_page; 208 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
209 goto bounce;
209 210
210 /* 211 return;
211 * is destination page below bounce pfn? 212bounce:
212 */ 213 bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
213 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
214 continue;
215
216 /*
217 * irk, bounce it
218 */
219 if (!bio) {
220 unsigned int cnt = (*bio_orig)->bi_vcnt;
221 214
222 bio = bio_alloc(GFP_NOIO, cnt); 215 bio_for_each_segment_all(to, bio, i) {
223 memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); 216 struct page *page = to->bv_page;
224 }
225
226 217
227 to = bio->bi_io_vec + i; 218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
219 continue;
228 220
229 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
230 to->bv_len = from->bv_len;
231 to->bv_offset = from->bv_offset;
232 inc_zone_page_state(to->bv_page, NR_BOUNCE); 221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
233 223
234 if (rw == WRITE) { 224 if (rw == WRITE) {
235 char *vto, *vfrom; 225 char *vto, *vfrom;
236 226
237 flush_dcache_page(from->bv_page); 227 flush_dcache_page(page);
228
238 vto = page_address(to->bv_page) + to->bv_offset; 229 vto = page_address(to->bv_page) + to->bv_offset;
239 vfrom = kmap(from->bv_page) + from->bv_offset; 230 vfrom = kmap_atomic(page) + to->bv_offset;
240 memcpy(vto, vfrom, to->bv_len); 231 memcpy(vto, vfrom, to->bv_len);
241 kunmap(from->bv_page); 232 kunmap_atomic(vfrom);
242 } 233 }
243 } 234 }
244 235
245 /*
246 * no pages bounced
247 */
248 if (!bio)
249 return;
250
251 trace_block_bio_bounce(q, *bio_orig); 236 trace_block_bio_bounce(q, *bio_orig);
252 237
253 /*
254 * at least one page was bounced, fill in possible non-highmem
255 * pages
256 */
257 __bio_for_each_segment(from, *bio_orig, i, 0) {
258 to = bio_iovec_idx(bio, i);
259 if (!to->bv_page) {
260 to->bv_page = from->bv_page;
261 to->bv_len = from->bv_len;
262 to->bv_offset = from->bv_offset;
263 }
264 }
265
266 bio->bi_bdev = (*bio_orig)->bi_bdev;
267 bio->bi_flags |= (1 << BIO_BOUNCED); 238 bio->bi_flags |= (1 << BIO_BOUNCED);
268 bio->bi_sector = (*bio_orig)->bi_sector;
269 bio->bi_rw = (*bio_orig)->bi_rw;
270
271 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
272 bio->bi_idx = (*bio_orig)->bi_idx;
273 bio->bi_size = (*bio_orig)->bi_size;
274 239
275 if (pool == page_pool) { 240 if (pool == page_pool) {
276 bio->bi_end_io = bounce_end_io_write; 241 bio->bi_end_io = bounce_end_io_write;
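
The rewritten __blk_queue_bounce() clones the whole bio with bio_clone_bioset() and then swaps any highmem page for a bounce page, copying the payload down with kmap_atomic() when the request is a write. A sketch of just that per-segment step, assuming a lowmem bounce mempool and a waiting gfp mask so the allocation cannot fail; bounce_one_segment() is a made-up name.

#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/string.h>

/* illustrative: replace one segment's page with a bounce page and,
 * for writes, copy the data into the new lowmem page */
static void bounce_one_segment(struct bio_vec *to, mempool_t *bounce_pool,
			       gfp_t gfp, bool is_write)
{
	struct page *orig = to->bv_page;
	char *vto, *vfrom;

	to->bv_page = mempool_alloc(bounce_pool, gfp);

	if (is_write) {
		flush_dcache_page(orig);
		vto = page_address(to->bv_page) + to->bv_offset;
		vfrom = kmap_atomic(orig) + to->bv_offset;
		memcpy(vto, vfrom, to->bv_len);
		kunmap_atomic(vfrom);
	}
}
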
diff --git a/mm/page_io.c b/mm/page_io.c
index 06a8842a6ec6..a8a3ef45fed7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -36,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
36 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 36 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
37 bio->bi_io_vec[0].bv_offset = 0; 37 bio->bi_io_vec[0].bv_offset = 0;
38 bio->bi_vcnt = 1; 38 bio->bi_vcnt = 1;
39 bio->bi_idx = 0;
40 bio->bi_size = PAGE_SIZE; 39 bio->bi_size = PAGE_SIZE;
41 bio->bi_end_io = end_io; 40 bio->bi_end_io = end_io;
42 } 41 }