diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-25 14:36:43 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-25 14:36:43 -0400 |
| commit | a7e546f175f07630453c44b5afe14dd667dcfec9 (patch) | |
| tree | 352c2577161f0cbe8c3b49bb6f053cfd49ed32b4 | |
| parent | da31ce727e8cc6920de5840e35b4e770c08e86e3 (diff) | |
| parent | 676ce6d5ca3098339c028d44fe0427d1566a4d2d (diff) | |
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block-related fixes from Jens Axboe:
- Improvements to the buffered and direct write IO plugging from
Fengguang.
- Abstract out the mapping of a bio in a request, and use that to
provide a blk_bio_map_sg() helper. Useful for mapping just a bio
instead of a full request.
- Regression fix from Hugh, fixing up a patch that went into the
previous release cycle (and marked stable, too) attempting to prevent
a loop in __getblk_slow().
- Updates to discard requests, fixing up the sizing and how we align
them. Also a change to disallow merging of discard requests, since
that doesn't really work properly yet.
- A few drbd fixes.
- Documentation updates.
* 'for-linus' of git://git.kernel.dk/linux-block:
block: replace __getblk_slow misfix by grow_dev_page fix
drbd: Write all pages of the bitmap after an online resize
drbd: Finish requests that completed while IO was frozen
drbd: fix drbd wire compatibility for empty flushes
Documentation: update tunable options in block/cfq-iosched.txt
Documentation: update tunable options in block/cfq-iosched.txt
Documentation: update missing index files in block/00-INDEX
block: move down direct IO plugging
block: remove plugging at buffered write time
block: disable discard request merge temporarily
bio: Fix potential memory leak in bio_find_or_create_slab()
block: Don't use static to define "void *p" in show_partition_start()
block: Add blk_bio_map_sg() helper
block: Introduce __blk_segment_map_sg() helper
fs/block-dev.c:fix performance regression in O_DIRECT writes to md block devices
block: split discard into aligned requests
block: reorganize rounding of max_discard_sectors
| -rw-r--r-- | Documentation/block/00-INDEX | 10 | ||||
| -rw-r--r-- | Documentation/block/cfq-iosched.txt | 77 | ||||
| -rw-r--r-- | Documentation/block/queue-sysfs.txt | 64 | ||||
| -rw-r--r-- | block/blk-lib.c | 41 | ||||
| -rw-r--r-- | block/blk-merge.c | 117 | ||||
| -rw-r--r-- | block/genhd.c | 2 | ||||
| -rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 15 | ||||
| -rw-r--r-- | drivers/block/drbd/drbd_int.h | 1 | ||||
| -rw-r--r-- | drivers/block/drbd/drbd_main.c | 28 | ||||
| -rw-r--r-- | drivers/block/drbd/drbd_nl.c | 4 | ||||
| -rw-r--r-- | drivers/block/drbd/drbd_req.c | 36 | ||||
| -rw-r--r-- | fs/bio.c | 11 | ||||
| -rw-r--r-- | fs/block_dev.c | 3 | ||||
| -rw-r--r-- | fs/buffer.c | 66 | ||||
| -rw-r--r-- | fs/direct-io.c | 5 | ||||
| -rw-r--r-- | include/linux/blkdev.h | 14 | ||||
| -rw-r--r-- | mm/filemap.c | 7 |
17 files changed, 378 insertions, 123 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index d111e3b23db0..d18ecd827c40 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX | |||
| @@ -3,15 +3,21 @@ | |||
| 3 | biodoc.txt | 3 | biodoc.txt |
| 4 | - Notes on the Generic Block Layer Rewrite in Linux 2.5 | 4 | - Notes on the Generic Block Layer Rewrite in Linux 2.5 |
| 5 | capability.txt | 5 | capability.txt |
| 6 | - Generic Block Device Capability (/sys/block/<disk>/capability) | 6 | - Generic Block Device Capability (/sys/block/<device>/capability) |
| 7 | cfq-iosched.txt | ||
| 8 | - CFQ IO scheduler tunables | ||
| 9 | data-integrity.txt | ||
| 10 | - Block data integrity | ||
| 7 | deadline-iosched.txt | 11 | deadline-iosched.txt |
| 8 | - Deadline IO scheduler tunables | 12 | - Deadline IO scheduler tunables |
| 9 | ioprio.txt | 13 | ioprio.txt |
| 10 | - Block io priorities (in CFQ scheduler) | 14 | - Block io priorities (in CFQ scheduler) |
| 15 | queue-sysfs.txt | ||
| 16 | - Queue's sysfs entries | ||
| 11 | request.txt | 17 | request.txt |
| 12 | - The members of struct request (in include/linux/blkdev.h) | 18 | - The members of struct request (in include/linux/blkdev.h) |
| 13 | stat.txt | 19 | stat.txt |
| 14 | - Block layer statistics in /sys/block/<dev>/stat | 20 | - Block layer statistics in /sys/block/<device>/stat |
| 15 | switching-sched.txt | 21 | switching-sched.txt |
| 16 | - Switching I/O schedulers at runtime | 22 | - Switching I/O schedulers at runtime |
| 17 | writeback_cache_control.txt | 23 | writeback_cache_control.txt |
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index 6d670f570451..d89b4fe724d7 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt | |||
| @@ -1,3 +1,14 @@ | |||
| 1 | CFQ (Complete Fairness Queueing) | ||
| 2 | =============================== | ||
| 3 | |||
| 4 | The main aim of CFQ scheduler is to provide a fair allocation of the disk | ||
| 5 | I/O bandwidth for all the processes which request an I/O operation. | ||
| 6 | |||
| 7 | CFQ maintains the per process queue for the processes which request I/O | ||
| 8 | operation (synchronous requests). In case of asynchronous requests, all the | ||
| 9 | requests from all the processes are batched together according to their | ||
| 10 | process's I/O priority. | ||
| 11 | |||
| 1 | CFQ ioscheduler tunables | 12 | CFQ ioscheduler tunables |
| 2 | ======================== | 13 | ======================== |
| 3 | 14 | ||
| @@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID | |||
| 25 | controller or for storage arrays), setting slice_idle=0 might end up in better | 36 | controller or for storage arrays), setting slice_idle=0 might end up in better |
| 26 | throughput and acceptable latencies. | 37 | throughput and acceptable latencies. |
| 27 | 38 | ||
| 39 | back_seek_max | ||
| 40 | ------------- | ||
| 41 | This specifies, given in Kbytes, the maximum "distance" for backward seeking. | ||
| 42 | The distance is the amount of space from the current head location to the | ||
| 43 | sectors that are backward in terms of distance. | ||
| 44 | |||
| 45 | This parameter allows the scheduler to anticipate requests in the "backward" | ||
| 46 | direction and consider them as being the "next" if they are within this | ||
| 47 | distance from the current head location. | ||
| 48 | |||
| 49 | back_seek_penalty | ||
| 50 | ----------------- | ||
| 51 | This parameter is used to compute the cost of backward seeking. If the | ||
| 52 | backward distance of request is just 1/back_seek_penalty from a "front" | ||
| 53 | request, then the seeking cost of two requests is considered equivalent. | ||
| 54 | |||
| 55 | So scheduler will not bias toward one or the other request (otherwise scheduler | ||
| 56 | will bias toward front request). Default value of back_seek_penalty is 2. | ||
| 57 | |||
| 58 | fifo_expire_async | ||
| 59 | ----------------- | ||
| 60 | This parameter is used to set the timeout of asynchronous requests. Default | ||
| 61 | value of this is 248ms. | ||
| 62 | |||
| 63 | fifo_expire_sync | ||
| 64 | ---------------- | ||
| 65 | This parameter is used to set the timeout of synchronous requests. Default | ||
| 66 | value of this is 124ms. To favor synchronous requests over asynchronous | ||
| 67 | ones, this value should be decreased relative to fifo_expire_async. | ||
| 68 | |||
| 69 | slice_async | ||
| 70 | ----------- | ||
| 71 | This parameter is same as of slice_sync but for asynchronous queue. The | ||
| 72 | default value is 40ms. | ||
| 73 | |||
| 74 | slice_async_rq | ||
| 75 | -------------- | ||
| 76 | This parameter is used to limit the dispatching of asynchronous request to | ||
| 77 | device request queue in queue's slice time. The maximum number of request that | ||
| 78 | are allowed to be dispatched also depends upon the io priority. Default value | ||
| 79 | for this is 2. | ||
| 80 | |||
| 81 | slice_sync | ||
| 82 | ---------- | ||
| 83 | When a queue is selected for execution, the queue's IO requests are only | ||
| 84 | executed for a certain amount of time(time_slice) before switching to another | ||
| 85 | queue. This parameter is used to calculate the time slice of synchronous | ||
| 86 | queue. | ||
| 87 | |||
| 88 | time_slice is computed using the below equation:- | ||
| 89 | time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the | ||
| 90 | time_slice of synchronous queue, increase the value of slice_sync. Default | ||
| 91 | value is 100ms. | ||
| 92 | |||
| 93 | quantum | ||
| 94 | ------- | ||
| 95 | This specifies the number of request dispatched to the device queue. In a | ||
| 96 | queue's time slice, a request will not be dispatched if the number of request | ||
| 97 | in the device exceeds this parameter. This parameter is used for synchronous | ||
| 98 | request. | ||
| 99 | |||
| 100 | In case of storage with several disk, this setting can limit the parallel | ||
| 101 | processing of request. Therefore, increasing the value can improve the | ||
| 102 | performance although this can cause the latency of some I/O to increase due | ||
| 103 | to more number of requests. | ||
| 104 | |||
| 28 | CFQ IOPS Mode for group scheduling | 105 | CFQ IOPS Mode for group scheduling |
| 29 | =================================== | 106 | =================================== |
| 30 | Basic CFQ design is to provide priority based time slices. Higher priority | 107 | Basic CFQ design is to provide priority based time slices. Higher priority |
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 6518a55273e7..e54ac1d53403 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt | |||
| @@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory. | |||
| 9 | Files denoted with a RO postfix are readonly and the RW postfix means | 9 | Files denoted with a RO postfix are readonly and the RW postfix means |
| 10 | read-write. | 10 | read-write. |
| 11 | 11 | ||
| 12 | add_random (RW) | ||
| 13 | ---------------- | ||
| 14 | This file allows to turn off the disk entropy contribution. Default | ||
| 15 | value of this file is '1'(on). | ||
| 16 | |||
| 17 | discard_granularity (RO) | ||
| 18 | ----------------------- | ||
| 19 | This shows the size of internal allocation of the device in bytes, if | ||
| 20 | reported by the device. A value of '0' means device does not support | ||
| 21 | the discard functionality. | ||
| 22 | |||
| 23 | discard_max_bytes (RO) | ||
| 24 | ---------------------- | ||
| 25 | Devices that support discard functionality may have internal limits on | ||
| 26 | the number of bytes that can be trimmed or unmapped in a single operation. | ||
| 27 | The discard_max_bytes parameter is set by the device driver to the maximum | ||
| 28 | number of bytes that can be discarded in a single operation. Discard | ||
| 29 | requests issued to the device must not exceed this limit. A discard_max_bytes | ||
| 30 | value of 0 means that the device does not support discard functionality. | ||
| 31 | |||
| 32 | discard_zeroes_data (RO) | ||
| 33 | ------------------------ | ||
| 34 | When read, this file will show if the discarded block are zeroed by the | ||
| 35 | device or not. If its value is '1' the blocks are zeroed otherwise not. | ||
| 36 | |||
| 12 | hw_sector_size (RO) | 37 | hw_sector_size (RO) |
| 13 | ------------------- | 38 | ------------------- |
| 14 | This is the hardware sector size of the device, in bytes. | 39 | This is the hardware sector size of the device, in bytes. |
| 15 | 40 | ||
| 41 | iostats (RW) | ||
| 42 | ------------- | ||
| 43 | This file is used to control (on/off) the iostats accounting of the | ||
| 44 | disk. | ||
| 45 | |||
| 46 | logical_block_size (RO) | ||
| 47 | ----------------------- | ||
| 48 | This is the logical block size of the device, in bytes. | ||
| 49 | |||
| 16 | max_hw_sectors_kb (RO) | 50 | max_hw_sectors_kb (RO) |
| 17 | ---------------------- | 51 | ---------------------- |
| 18 | This is the maximum number of kilobytes supported in a single data transfer. | 52 | This is the maximum number of kilobytes supported in a single data transfer. |
| 19 | 53 | ||
| 54 | max_integrity_segments (RO) | ||
| 55 | --------------------------- | ||
| 56 | When read, this file shows the max limit of integrity segments as | ||
| 57 | set by block layer which a hardware controller can handle. | ||
| 58 | |||
| 20 | max_sectors_kb (RW) | 59 | max_sectors_kb (RW) |
| 21 | ------------------- | 60 | ------------------- |
| 22 | This is the maximum number of kilobytes that the block layer will allow | 61 | This is the maximum number of kilobytes that the block layer will allow |
| 23 | for a filesystem request. Must be smaller than or equal to the maximum | 62 | for a filesystem request. Must be smaller than or equal to the maximum |
| 24 | size allowed by the hardware. | 63 | size allowed by the hardware. |
| 25 | 64 | ||
| 65 | max_segments (RO) | ||
| 66 | ----------------- | ||
| 67 | Maximum number of segments of the device. | ||
| 68 | |||
| 69 | max_segment_size (RO) | ||
| 70 | --------------------- | ||
| 71 | Maximum segment size of the device. | ||
| 72 | |||
| 73 | minimum_io_size (RO) | ||
| 74 | -------------------- | ||
| 75 | This is the smallest preferred io size reported by the device. | ||
| 76 | |||
| 26 | nomerges (RW) | 77 | nomerges (RW) |
| 27 | ------------- | 78 | ------------- |
| 28 | This enables the user to disable the lookup logic involved with IO | 79 | This enables the user to disable the lookup logic involved with IO |
| @@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups, | |||
| 45 | each request queue may have upto N request pools, each independently | 96 | each request queue may have upto N request pools, each independently |
| 46 | regulated by nr_requests. | 97 | regulated by nr_requests. |
| 47 | 98 | ||
| 99 | optimal_io_size (RO) | ||
| 100 | -------------------- | ||
| 101 | This is the optimal io size reported by the device. | ||
| 102 | |||
| 103 | physical_block_size (RO) | ||
| 104 | ------------------------ | ||
| 105 | This is the physical block size of device, in bytes. | ||
| 106 | |||
| 48 | read_ahead_kb (RW) | 107 | read_ahead_kb (RW) |
| 49 | ------------------ | 108 | ------------------ |
| 50 | Maximum number of kilobytes to read-ahead for filesystems on this block | 109 | Maximum number of kilobytes to read-ahead for filesystems on this block |
| 51 | device. | 110 | device. |
| 52 | 111 | ||
| 112 | rotational (RW) | ||
| 113 | --------------- | ||
| 114 | This file is used to state if the device is of rotational type or | ||
| 115 | non-rotational type. | ||
| 116 | |||
| 53 | rq_affinity (RW) | 117 | rq_affinity (RW) |
| 54 | ---------------- | 118 | ---------------- |
| 55 | If this option is '1', the block layer will migrate request completions to the | 119 | If this option is '1', the block layer will migrate request completions to the |
diff --git a/block/blk-lib.c b/block/blk-lib.c index 2b461b496a78..19cc761cacb2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
| @@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
| 44 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
| 45 | int type = REQ_WRITE | REQ_DISCARD; | 45 | int type = REQ_WRITE | REQ_DISCARD; |
| 46 | unsigned int max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
| 47 | unsigned int granularity, alignment, mask; | ||
| 47 | struct bio_batch bb; | 48 | struct bio_batch bb; |
| 48 | struct bio *bio; | 49 | struct bio *bio; |
| 49 | int ret = 0; | 50 | int ret = 0; |
| @@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
| 54 | if (!blk_queue_discard(q)) | 55 | if (!blk_queue_discard(q)) |
| 55 | return -EOPNOTSUPP; | 56 | return -EOPNOTSUPP; |
| 56 | 57 | ||
| 58 | /* Zero-sector (unknown) and one-sector granularities are the same. */ | ||
| 59 | granularity = max(q->limits.discard_granularity >> 9, 1U); | ||
| 60 | mask = granularity - 1; | ||
| 61 | alignment = (bdev_discard_alignment(bdev) >> 9) & mask; | ||
| 62 | |||
| 57 | /* | 63 | /* |
| 58 | * Ensure that max_discard_sectors is of the proper | 64 | * Ensure that max_discard_sectors is of the proper |
| 59 | * granularity | 65 | * granularity, so that requests stay aligned after a split. |
| 60 | */ | 66 | */ |
| 61 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); | 67 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); |
| 68 | max_discard_sectors = round_down(max_discard_sectors, granularity); | ||
| 62 | if (unlikely(!max_discard_sectors)) { | 69 | if (unlikely(!max_discard_sectors)) { |
| 63 | /* Avoid infinite loop below. Being cautious never hurts. */ | 70 | /* Avoid infinite loop below. Being cautious never hurts. */ |
| 64 | return -EOPNOTSUPP; | 71 | return -EOPNOTSUPP; |
| 65 | } else if (q->limits.discard_granularity) { | ||
| 66 | unsigned int disc_sects = q->limits.discard_granularity >> 9; | ||
| 67 | |||
| 68 | max_discard_sectors &= ~(disc_sects - 1); | ||
| 69 | } | 72 | } |
| 70 | 73 | ||
| 71 | if (flags & BLKDEV_DISCARD_SECURE) { | 74 | if (flags & BLKDEV_DISCARD_SECURE) { |
| @@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
| 79 | bb.wait = &wait; | 82 | bb.wait = &wait; |
| 80 | 83 | ||
| 81 | while (nr_sects) { | 84 | while (nr_sects) { |
| 85 | unsigned int req_sects; | ||
| 86 | sector_t end_sect; | ||
| 87 | |||
| 82 | bio = bio_alloc(gfp_mask, 1); | 88 | bio = bio_alloc(gfp_mask, 1); |
| 83 | if (!bio) { | 89 | if (!bio) { |
| 84 | ret = -ENOMEM; | 90 | ret = -ENOMEM; |
| 85 | break; | 91 | break; |
| 86 | } | 92 | } |
| 87 | 93 | ||
| 94 | req_sects = min_t(sector_t, nr_sects, max_discard_sectors); | ||
| 95 | |||
| 96 | /* | ||
| 97 | * If splitting a request, and the next starting sector would be | ||
| 98 | * misaligned, stop the discard at the previous aligned sector. | ||
| 99 | */ | ||
| 100 | end_sect = sector + req_sects; | ||
| 101 | if (req_sects < nr_sects && (end_sect & mask) != alignment) { | ||
| 102 | end_sect = | ||
| 103 | round_down(end_sect - alignment, granularity) | ||
| 104 | + alignment; | ||
| 105 | req_sects = end_sect - sector; | ||
| 106 | } | ||
| 107 | |||
| 88 | bio->bi_sector = sector; | 108 | bio->bi_sector = sector; |
| 89 | bio->bi_end_io = bio_batch_end_io; | 109 | bio->bi_end_io = bio_batch_end_io; |
| 90 | bio->bi_bdev = bdev; | 110 | bio->bi_bdev = bdev; |
| 91 | bio->bi_private = &bb; | 111 | bio->bi_private = &bb; |
| 92 | 112 | ||
| 93 | if (nr_sects > max_discard_sectors) { | 113 | bio->bi_size = req_sects << 9; |
| 94 | bio->bi_size = max_discard_sectors << 9; | 114 | nr_sects -= req_sects; |
| 95 | nr_sects -= max_discard_sectors; | 115 | sector = end_sect; |
| 96 | sector += max_discard_sectors; | ||
| 97 | } else { | ||
| 98 | bio->bi_size = nr_sects << 9; | ||
| 99 | nr_sects = 0; | ||
| 100 | } | ||
| 101 | 116 | ||
| 102 | atomic_inc(&bb.done); | 117 | atomic_inc(&bb.done); |
| 103 | submit_bio(type, bio); | 118 | submit_bio(type, bio); |
diff --git a/block/blk-merge.c b/block/blk-merge.c index 160035f54882..e76279e41162 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
| @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | |||
| 110 | return 0; | 110 | return 0; |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static void | ||
| 114 | __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, | ||
| 115 | struct scatterlist *sglist, struct bio_vec **bvprv, | ||
| 116 | struct scatterlist **sg, int *nsegs, int *cluster) | ||
| 117 | { | ||
| 118 | |||
| 119 | int nbytes = bvec->bv_len; | ||
| 120 | |||
| 121 | if (*bvprv && *cluster) { | ||
| 122 | if ((*sg)->length + nbytes > queue_max_segment_size(q)) | ||
| 123 | goto new_segment; | ||
| 124 | |||
| 125 | if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) | ||
| 126 | goto new_segment; | ||
| 127 | if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) | ||
| 128 | goto new_segment; | ||
| 129 | |||
| 130 | (*sg)->length += nbytes; | ||
| 131 | } else { | ||
| 132 | new_segment: | ||
| 133 | if (!*sg) | ||
| 134 | *sg = sglist; | ||
| 135 | else { | ||
| 136 | /* | ||
| 137 | * If the driver previously mapped a shorter | ||
| 138 | * list, we could see a termination bit | ||
| 139 | * prematurely unless it fully inits the sg | ||
| 140 | * table on each mapping. We KNOW that there | ||
| 141 | * must be more entries here or the driver | ||
| 142 | * would be buggy, so force clear the | ||
| 143 | * termination bit to avoid doing a full | ||
| 144 | * sg_init_table() in drivers for each command. | ||
| 145 | */ | ||
| 146 | (*sg)->page_link &= ~0x02; | ||
| 147 | *sg = sg_next(*sg); | ||
| 148 | } | ||
| 149 | |||
| 150 | sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
| 151 | (*nsegs)++; | ||
| 152 | } | ||
| 153 | *bvprv = bvec; | ||
| 154 | } | ||
| 155 | |||
| 113 | /* | 156 | /* |
| 114 | * map a request to scatterlist, return number of sg entries setup. Caller | 157 | * map a request to scatterlist, return number of sg entries setup. Caller |
| 115 | * must make sure sg can hold rq->nr_phys_segments entries | 158 | * must make sure sg can hold rq->nr_phys_segments entries |
| @@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
| 131 | bvprv = NULL; | 174 | bvprv = NULL; |
| 132 | sg = NULL; | 175 | sg = NULL; |
| 133 | rq_for_each_segment(bvec, rq, iter) { | 176 | rq_for_each_segment(bvec, rq, iter) { |
| 134 | int nbytes = bvec->bv_len; | 177 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, |
| 135 | 178 | &nsegs, &cluster); | |
| 136 | if (bvprv && cluster) { | ||
| 137 | if (sg->length + nbytes > queue_max_segment_size(q)) | ||
| 138 | goto new_segment; | ||
| 139 | |||
| 140 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) | ||
| 141 | goto new_segment; | ||
| 142 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) | ||
| 143 | goto new_segment; | ||
| 144 | |||
| 145 | sg->length += nbytes; | ||
| 146 | } else { | ||
| 147 | new_segment: | ||
| 148 | if (!sg) | ||
| 149 | sg = sglist; | ||
| 150 | else { | ||
| 151 | /* | ||
| 152 | * If the driver previously mapped a shorter | ||
| 153 | * list, we could see a termination bit | ||
| 154 | * prematurely unless it fully inits the sg | ||
| 155 | * table on each mapping. We KNOW that there | ||
| 156 | * must be more entries here or the driver | ||
| 157 | * would be buggy, so force clear the | ||
| 158 | * termination bit to avoid doing a full | ||
| 159 | * sg_init_table() in drivers for each command. | ||
| 160 | */ | ||
| 161 | sg->page_link &= ~0x02; | ||
| 162 | sg = sg_next(sg); | ||
| 163 | } | ||
| 164 | |||
| 165 | sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
| 166 | nsegs++; | ||
| 167 | } | ||
| 168 | bvprv = bvec; | ||
| 169 | } /* segments in rq */ | 179 | } /* segments in rq */ |
| 170 | 180 | ||
| 171 | 181 | ||
| @@ -199,6 +209,43 @@ new_segment: | |||
| 199 | } | 209 | } |
| 200 | EXPORT_SYMBOL(blk_rq_map_sg); | 210 | EXPORT_SYMBOL(blk_rq_map_sg); |
| 201 | 211 | ||
| 212 | /** | ||
| 213 | * blk_bio_map_sg - map a bio to a scatterlist | ||
| 214 | * @q: request_queue in question | ||
| 215 | * @bio: bio being mapped | ||
| 216 | * @sglist: scatterlist being mapped | ||
| 217 | * | ||
| 218 | * Note: | ||
| 219 | * Caller must make sure sg can hold bio->bi_phys_segments entries | ||
| 220 | * | ||
| 221 | * Will return the number of sg entries setup | ||
| 222 | */ | ||
| 223 | int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | ||
| 224 | struct scatterlist *sglist) | ||
| 225 | { | ||
| 226 | struct bio_vec *bvec, *bvprv; | ||
| 227 | struct scatterlist *sg; | ||
| 228 | int nsegs, cluster; | ||
| 229 | unsigned long i; | ||
| 230 | |||
| 231 | nsegs = 0; | ||
| 232 | cluster = blk_queue_cluster(q); | ||
| 233 | |||
| 234 | bvprv = NULL; | ||
| 235 | sg = NULL; | ||
| 236 | bio_for_each_segment(bvec, bio, i) { | ||
| 237 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, | ||
| 238 | &nsegs, &cluster); | ||
| 239 | } /* segments in bio */ | ||
| 240 | |||
| 241 | if (sg) | ||
| 242 | sg_mark_end(sg); | ||
| 243 | |||
| 244 | BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); | ||
| 245 | return nsegs; | ||
| 246 | } | ||
| 247 | EXPORT_SYMBOL(blk_bio_map_sg); | ||
| 248 | |||
| 202 | static inline int ll_new_hw_segment(struct request_queue *q, | 249 | static inline int ll_new_hw_segment(struct request_queue *q, |
| 203 | struct request *req, | 250 | struct request *req, |
| 204 | struct bio *bio) | 251 | struct bio *bio) |
diff --git a/block/genhd.c b/block/genhd.c index cac7366957c3..d839723303c8 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) | |||
| 835 | 835 | ||
| 836 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | 836 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) |
| 837 | { | 837 | { |
| 838 | static void *p; | 838 | void *p; |
| 839 | 839 | ||
| 840 | p = disk_seqf_start(seqf, pos); | 840 | p = disk_seqf_start(seqf, pos); |
| 841 | if (!IS_ERR_OR_NULL(p) && !*pos) | 841 | if (!IS_ERR_OR_NULL(p) && !*pos) |
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index ba91b408abad..d84566496746 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
| @@ -889,6 +889,7 @@ struct bm_aio_ctx { | |||
| 889 | unsigned int done; | 889 | unsigned int done; |
| 890 | unsigned flags; | 890 | unsigned flags; |
| 891 | #define BM_AIO_COPY_PAGES 1 | 891 | #define BM_AIO_COPY_PAGES 1 |
| 892 | #define BM_WRITE_ALL_PAGES 2 | ||
| 892 | int error; | 893 | int error; |
| 893 | struct kref kref; | 894 | struct kref kref; |
| 894 | }; | 895 | }; |
| @@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w | |||
| 1059 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) | 1060 | if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) |
| 1060 | break; | 1061 | break; |
| 1061 | if (rw & WRITE) { | 1062 | if (rw & WRITE) { |
| 1062 | if (bm_test_page_unchanged(b->bm_pages[i])) { | 1063 | if (!(flags & BM_WRITE_ALL_PAGES) && |
| 1064 | bm_test_page_unchanged(b->bm_pages[i])) { | ||
| 1063 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); | 1065 | dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); |
| 1064 | continue; | 1066 | continue; |
| 1065 | } | 1067 | } |
| @@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) | |||
| 1141 | } | 1143 | } |
| 1142 | 1144 | ||
| 1143 | /** | 1145 | /** |
| 1146 | * drbd_bm_write_all() - Write the whole bitmap to its on disk location. | ||
| 1147 | * @mdev: DRBD device. | ||
| 1148 | * | ||
| 1149 | * Will write all pages. | ||
| 1150 | */ | ||
| 1151 | int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local) | ||
| 1152 | { | ||
| 1153 | return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0); | ||
| 1154 | } | ||
| 1155 | |||
| 1156 | /** | ||
| 1144 | * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. | 1157 | * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. |
| 1145 | * @mdev: DRBD device. | 1158 | * @mdev: DRBD device. |
| 1146 | * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages | 1159 | * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2ca143d0053..b953cc7c9c00 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
| @@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); | |||
| 1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); | 1469 | extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); |
| 1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); | 1470 | extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); |
| 1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); | 1471 | extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); |
| 1472 | extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); | ||
| 1472 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); | 1473 | extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); |
| 1473 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, | 1474 | extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, |
| 1474 | unsigned long al_enr); | 1475 | unsigned long al_enr); |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index dbe6135a2abe..f93a0320e952 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
| @@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); | |||
| 79 | static void md_sync_timer_fn(unsigned long data); | 79 | static void md_sync_timer_fn(unsigned long data); |
| 80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 80 | static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); |
| 81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); | 81 | static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); |
| 82 | static void _tl_clear(struct drbd_conf *mdev); | ||
| 82 | 83 | ||
| 83 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " | 84 | MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " |
| 84 | "Lars Ellenberg <lars@linbit.com>"); | 85 | "Lars Ellenberg <lars@linbit.com>"); |
| @@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
| 432 | 433 | ||
| 433 | /* Actions operating on the disk state, also want to work on | 434 | /* Actions operating on the disk state, also want to work on |
| 434 | requests that got barrier acked. */ | 435 | requests that got barrier acked. */ |
| 435 | switch (what) { | ||
| 436 | case fail_frozen_disk_io: | ||
| 437 | case restart_frozen_disk_io: | ||
| 438 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { | ||
| 439 | req = list_entry(le, struct drbd_request, tl_requests); | ||
| 440 | _req_mod(req, what); | ||
| 441 | } | ||
| 442 | 436 | ||
| 443 | case connection_lost_while_pending: | 437 | list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { |
| 444 | case resend: | 438 | req = list_entry(le, struct drbd_request, tl_requests); |
| 445 | break; | 439 | _req_mod(req, what); |
| 446 | default: | ||
| 447 | dev_err(DEV, "what = %d in _tl_restart()\n", what); | ||
| 448 | } | 440 | } |
| 449 | } | 441 | } |
| 450 | 442 | ||
| @@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | |||
| 459 | */ | 451 | */ |
| 460 | void tl_clear(struct drbd_conf *mdev) | 452 | void tl_clear(struct drbd_conf *mdev) |
| 461 | { | 453 | { |
| 454 | spin_lock_irq(&mdev->req_lock); | ||
| 455 | _tl_clear(mdev); | ||
| 456 | spin_unlock_irq(&mdev->req_lock); | ||
| 457 | } | ||
| 458 | |||
| 459 | static void _tl_clear(struct drbd_conf *mdev) | ||
| 460 | { | ||
| 462 | struct list_head *le, *tle; | 461 | struct list_head *le, *tle; |
| 463 | struct drbd_request *r; | 462 | struct drbd_request *r; |
| 464 | 463 | ||
| 465 | spin_lock_irq(&mdev->req_lock); | ||
| 466 | |||
| 467 | _tl_restart(mdev, connection_lost_while_pending); | 464 | _tl_restart(mdev, connection_lost_while_pending); |
| 468 | 465 | ||
| 469 | /* we expect this list to be empty. */ | 466 | /* we expect this list to be empty. */ |
| @@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev) | |||
| 482 | 479 | ||
| 483 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); | 480 | memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); |
| 484 | 481 | ||
| 485 | spin_unlock_irq(&mdev->req_lock); | ||
| 486 | } | 482 | } |
| 487 | 483 | ||
| 488 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) | 484 | void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) |
| @@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
| 1476 | if (ns.susp_fen) { | 1472 | if (ns.susp_fen) { |
| 1477 | /* case1: The outdate peer handler is successful: */ | 1473 | /* case1: The outdate peer handler is successful: */ |
| 1478 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { | 1474 | if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { |
| 1479 | tl_clear(mdev); | ||
| 1480 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { | 1475 | if (test_bit(NEW_CUR_UUID, &mdev->flags)) { |
| 1481 | drbd_uuid_new_current(mdev); | 1476 | drbd_uuid_new_current(mdev); |
| 1482 | clear_bit(NEW_CUR_UUID, &mdev->flags); | 1477 | clear_bit(NEW_CUR_UUID, &mdev->flags); |
| 1483 | } | 1478 | } |
| 1484 | spin_lock_irq(&mdev->req_lock); | 1479 | spin_lock_irq(&mdev->req_lock); |
| 1480 | _tl_clear(mdev); | ||
| 1485 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); | 1481 | _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); |
| 1486 | spin_unlock_irq(&mdev->req_lock); | 1482 | spin_unlock_irq(&mdev->req_lock); |
| 1487 | } | 1483 | } |
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index fb9dce8daa24..edb490aad8b4 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
| @@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
| 674 | la_size_changed && md_moved ? "size changed and md moved" : | 674 | la_size_changed && md_moved ? "size changed and md moved" : |
| 675 | la_size_changed ? "size changed" : "md moved"); | 675 | la_size_changed ? "size changed" : "md moved"); |
| 676 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ | 676 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ |
| 677 | err = drbd_bitmap_io(mdev, &drbd_bm_write, | 677 | err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, |
| 678 | "size changed", BM_LOCKED_MASK); | 678 | "size changed", BM_LOCKED_MASK); |
| 679 | if (err) { | 679 | if (err) { |
| 680 | rv = dev_size_error; | 680 | rv = dev_size_error; |
| 681 | goto out; | 681 | goto out; |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 910335c30927..01b2ac641c7b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
| @@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
| 695 | break; | 695 | break; |
| 696 | 696 | ||
| 697 | case resend: | 697 | case resend: |
| 698 | /* Simply complete (local only) READs. */ | ||
| 699 | if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { | ||
| 700 | _req_may_be_done(req, m); | ||
| 701 | break; | ||
| 702 | } | ||
| 703 | |||
| 698 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK | 704 | /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK |
| 699 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. | 705 | before the connection loss (B&C only); only P_BARRIER_ACK was missing. |
| 700 | Trowing them out of the TL here by pretending we got a BARRIER_ACK | 706 | Trowing them out of the TL here by pretending we got a BARRIER_ACK |
| @@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
| 834 | req->private_bio = NULL; | 840 | req->private_bio = NULL; |
| 835 | } | 841 | } |
| 836 | if (rw == WRITE) { | 842 | if (rw == WRITE) { |
| 837 | remote = 1; | 843 | /* Need to replicate writes. Unless it is an empty flush, |
| 844 | * which is better mapped to a DRBD P_BARRIER packet, | ||
| 845 | * also for drbd wire protocol compatibility reasons. */ | ||
| 846 | if (unlikely(size == 0)) { | ||
| 847 | /* The only size==0 bios we expect are empty flushes. */ | ||
| 848 | D_ASSERT(bio->bi_rw & REQ_FLUSH); | ||
| 849 | remote = 0; | ||
| 850 | } else | ||
| 851 | remote = 1; | ||
| 838 | } else { | 852 | } else { |
| 839 | /* READ || READA */ | 853 | /* READ || READA */ |
| 840 | if (local) { | 854 | if (local) { |
| @@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns | |||
| 870 | * extent. This waits for any resync activity in the corresponding | 884 | * extent. This waits for any resync activity in the corresponding |
| 871 | * resync extent to finish, and, if necessary, pulls in the target | 885 | * resync extent to finish, and, if necessary, pulls in the target |
| 872 | * extent into the activity log, which involves further disk io because | 886 | * extent into the activity log, which involves further disk io because |
| 873 | * of transactional on-disk meta data updates. */ | 887 | * of transactional on-disk meta data updates. |
| 874 | if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 888 | * Empty flushes don't need to go into the activity log, they can only |
| 889 | * flush data for pending writes which are already in there. */ | ||
| 890 | if (rw == WRITE && local && size | ||
| 891 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | ||
| 875 | req->rq_state |= RQ_IN_ACT_LOG; | 892 | req->rq_state |= RQ_IN_ACT_LOG; |
| 876 | drbd_al_begin_io(mdev, sector); | 893 | drbd_al_begin_io(mdev, sector); |
| 877 | } | 894 | } |
| @@ -994,7 +1011,10 @@ allocate_barrier: | |||
| 994 | if (rw == WRITE && _req_conflicts(req)) | 1011 | if (rw == WRITE && _req_conflicts(req)) |
| 995 | goto fail_conflicting; | 1012 | goto fail_conflicting; |
| 996 | 1013 | ||
| 997 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | 1014 | /* no point in adding empty flushes to the transfer log, |
| 1015 | * they are mapped to drbd barriers already. */ | ||
| 1016 | if (likely(size!=0)) | ||
| 1017 | list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); | ||
| 998 | 1018 | ||
| 999 | /* NOTE remote first: to get the concurrent write detection right, | 1019 | /* NOTE remote first: to get the concurrent write detection right, |
| 1000 | * we must register the request before start of local IO. */ | 1020 | * we must register the request before start of local IO. */ |
| @@ -1014,6 +1034,14 @@ allocate_barrier: | |||
| 1014 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) | 1034 | mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) |
| 1015 | maybe_pull_ahead(mdev); | 1035 | maybe_pull_ahead(mdev); |
| 1016 | 1036 | ||
| 1037 | /* If this was a flush, queue a drbd barrier/start a new epoch. | ||
| 1038 | * Unless the current epoch was empty anyways, or we are not currently | ||
| 1039 | * replicating, in which case there is no point. */ | ||
| 1040 | if (unlikely(bio->bi_rw & REQ_FLUSH) | ||
| 1041 | && mdev->newest_tle->n_writes | ||
| 1042 | && drbd_should_do_remote(mdev->state)) | ||
| 1043 | queue_barrier(mdev); | ||
| 1044 | |||
| 1017 | spin_unlock_irq(&mdev->req_lock); | 1045 | spin_unlock_irq(&mdev->req_lock); |
| 1018 | kfree(b); /* if someone else has beaten us to it... */ | 1046 | kfree(b); /* if someone else has beaten us to it... */ |
| 1019 | 1047 | ||
| @@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) | |||
| 73 | { | 73 | { |
| 74 | unsigned int sz = sizeof(struct bio) + extra_size; | 74 | unsigned int sz = sizeof(struct bio) + extra_size; |
| 75 | struct kmem_cache *slab = NULL; | 75 | struct kmem_cache *slab = NULL; |
| 76 | struct bio_slab *bslab; | 76 | struct bio_slab *bslab, *new_bio_slabs; |
| 77 | unsigned int i, entry = -1; | 77 | unsigned int i, entry = -1; |
| 78 | 78 | ||
| 79 | mutex_lock(&bio_slab_lock); | 79 | mutex_lock(&bio_slab_lock); |
| @@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) | |||
| 97 | 97 | ||
| 98 | if (bio_slab_nr == bio_slab_max && entry == -1) { | 98 | if (bio_slab_nr == bio_slab_max && entry == -1) { |
| 99 | bio_slab_max <<= 1; | 99 | bio_slab_max <<= 1; |
| 100 | bio_slabs = krealloc(bio_slabs, | 100 | new_bio_slabs = krealloc(bio_slabs, |
| 101 | bio_slab_max * sizeof(struct bio_slab), | 101 | bio_slab_max * sizeof(struct bio_slab), |
| 102 | GFP_KERNEL); | 102 | GFP_KERNEL); |
| 103 | if (!bio_slabs) | 103 | if (!new_bio_slabs) |
| 104 | goto out_unlock; | 104 | goto out_unlock; |
| 105 | bio_slabs = new_bio_slabs; | ||
| 105 | } | 106 | } |
| 106 | if (entry == -1) | 107 | if (entry == -1) |
| 107 | entry = bio_slab_nr++; | 108 | entry = bio_slab_nr++; |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e519195d45b..38e721b35d45 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1578 | unsigned long nr_segs, loff_t pos) | 1578 | unsigned long nr_segs, loff_t pos) |
| 1579 | { | 1579 | { |
| 1580 | struct file *file = iocb->ki_filp; | 1580 | struct file *file = iocb->ki_filp; |
| 1581 | struct blk_plug plug; | ||
| 1581 | ssize_t ret; | 1582 | ssize_t ret; |
| 1582 | 1583 | ||
| 1583 | BUG_ON(iocb->ki_pos != pos); | 1584 | BUG_ON(iocb->ki_pos != pos); |
| 1584 | 1585 | ||
| 1586 | blk_start_plug(&plug); | ||
| 1585 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 1587 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
| 1586 | if (ret > 0 || ret == -EIOCBQUEUED) { | 1588 | if (ret > 0 || ret == -EIOCBQUEUED) { |
| 1587 | ssize_t err; | 1589 | ssize_t err; |
| @@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1590 | if (err < 0 && ret > 0) | 1592 | if (err < 0 && ret > 0) |
| 1591 | ret = err; | 1593 | ret = err; |
| 1592 | } | 1594 | } |
| 1595 | blk_finish_plug(&plug); | ||
| 1593 | return ret; | 1596 | return ret; |
| 1594 | } | 1597 | } |
| 1595 | EXPORT_SYMBOL_GPL(blkdev_aio_write); | 1598 | EXPORT_SYMBOL_GPL(blkdev_aio_write); |
diff --git a/fs/buffer.c b/fs/buffer.c index 9f6d2e41281d..58e2e7b77372 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) | |||
| 914 | /* | 914 | /* |
| 915 | * Initialise the state of a blockdev page's buffers. | 915 | * Initialise the state of a blockdev page's buffers. |
| 916 | */ | 916 | */ |
| 917 | static void | 917 | static sector_t |
| 918 | init_page_buffers(struct page *page, struct block_device *bdev, | 918 | init_page_buffers(struct page *page, struct block_device *bdev, |
| 919 | sector_t block, int size) | 919 | sector_t block, int size) |
| 920 | { | 920 | { |
| @@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev, | |||
| 936 | block++; | 936 | block++; |
| 937 | bh = bh->b_this_page; | 937 | bh = bh->b_this_page; |
| 938 | } while (bh != head); | 938 | } while (bh != head); |
| 939 | |||
| 940 | /* | ||
| 941 | * Caller needs to validate requested block against end of device. | ||
| 942 | */ | ||
| 943 | return end_block; | ||
| 939 | } | 944 | } |
| 940 | 945 | ||
| 941 | /* | 946 | /* |
| 942 | * Create the page-cache page that contains the requested block. | 947 | * Create the page-cache page that contains the requested block. |
| 943 | * | 948 | * |
| 944 | * This is user purely for blockdev mappings. | 949 | * This is used purely for blockdev mappings. |
| 945 | */ | 950 | */ |
| 946 | static struct page * | 951 | static int |
| 947 | grow_dev_page(struct block_device *bdev, sector_t block, | 952 | grow_dev_page(struct block_device *bdev, sector_t block, |
| 948 | pgoff_t index, int size) | 953 | pgoff_t index, int size, int sizebits) |
| 949 | { | 954 | { |
| 950 | struct inode *inode = bdev->bd_inode; | 955 | struct inode *inode = bdev->bd_inode; |
| 951 | struct page *page; | 956 | struct page *page; |
| 952 | struct buffer_head *bh; | 957 | struct buffer_head *bh; |
| 958 | sector_t end_block; | ||
| 959 | int ret = 0; /* Will call free_more_memory() */ | ||
| 953 | 960 | ||
| 954 | page = find_or_create_page(inode->i_mapping, index, | 961 | page = find_or_create_page(inode->i_mapping, index, |
| 955 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 962 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); |
| 956 | if (!page) | 963 | if (!page) |
| 957 | return NULL; | 964 | return ret; |
| 958 | 965 | ||
| 959 | BUG_ON(!PageLocked(page)); | 966 | BUG_ON(!PageLocked(page)); |
| 960 | 967 | ||
| 961 | if (page_has_buffers(page)) { | 968 | if (page_has_buffers(page)) { |
| 962 | bh = page_buffers(page); | 969 | bh = page_buffers(page); |
| 963 | if (bh->b_size == size) { | 970 | if (bh->b_size == size) { |
| 964 | init_page_buffers(page, bdev, block, size); | 971 | end_block = init_page_buffers(page, bdev, |
| 965 | return page; | 972 | index << sizebits, size); |
| 973 | goto done; | ||
| 966 | } | 974 | } |
| 967 | if (!try_to_free_buffers(page)) | 975 | if (!try_to_free_buffers(page)) |
| 968 | goto failed; | 976 | goto failed; |
| @@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
| 982 | */ | 990 | */ |
| 983 | spin_lock(&inode->i_mapping->private_lock); | 991 | spin_lock(&inode->i_mapping->private_lock); |
| 984 | link_dev_buffers(page, bh); | 992 | link_dev_buffers(page, bh); |
| 985 | init_page_buffers(page, bdev, block, size); | 993 | end_block = init_page_buffers(page, bdev, index << sizebits, size); |
| 986 | spin_unlock(&inode->i_mapping->private_lock); | 994 | spin_unlock(&inode->i_mapping->private_lock); |
| 987 | return page; | 995 | done: |
| 988 | 996 | ret = (block < end_block) ? 1 : -ENXIO; | |
| 989 | failed: | 997 | failed: |
| 990 | unlock_page(page); | 998 | unlock_page(page); |
| 991 | page_cache_release(page); | 999 | page_cache_release(page); |
| 992 | return NULL; | 1000 | return ret; |
| 993 | } | 1001 | } |
| 994 | 1002 | ||
| 995 | /* | 1003 | /* |
| @@ -999,7 +1007,6 @@ failed: | |||
| 999 | static int | 1007 | static int |
| 1000 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1008 | grow_buffers(struct block_device *bdev, sector_t block, int size) |
| 1001 | { | 1009 | { |
| 1002 | struct page *page; | ||
| 1003 | pgoff_t index; | 1010 | pgoff_t index; |
| 1004 | int sizebits; | 1011 | int sizebits; |
| 1005 | 1012 | ||
| @@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) | |||
| 1023 | bdevname(bdev, b)); | 1030 | bdevname(bdev, b)); |
| 1024 | return -EIO; | 1031 | return -EIO; |
| 1025 | } | 1032 | } |
| 1026 | block = index << sizebits; | 1033 | |
| 1027 | /* Create a page with the proper size buffers.. */ | 1034 | /* Create a page with the proper size buffers.. */ |
| 1028 | page = grow_dev_page(bdev, block, index, size); | 1035 | return grow_dev_page(bdev, block, index, size, sizebits); |
| 1029 | if (!page) | ||
| 1030 | return 0; | ||
| 1031 | unlock_page(page); | ||
| 1032 | page_cache_release(page); | ||
| 1033 | return 1; | ||
| 1034 | } | 1036 | } |
| 1035 | 1037 | ||
| 1036 | static struct buffer_head * | 1038 | static struct buffer_head * |
| 1037 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1039 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
| 1038 | { | 1040 | { |
| 1039 | int ret; | ||
| 1040 | struct buffer_head *bh; | ||
| 1041 | |||
| 1042 | /* Size must be multiple of hard sectorsize */ | 1041 | /* Size must be multiple of hard sectorsize */ |
| 1043 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || | 1042 | if (unlikely(size & (bdev_logical_block_size(bdev)-1) || |
| 1044 | (size < 512 || size > PAGE_SIZE))) { | 1043 | (size < 512 || size > PAGE_SIZE))) { |
| @@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) | |||
| 1051 | return NULL; | 1050 | return NULL; |
| 1052 | } | 1051 | } |
| 1053 | 1052 | ||
| 1054 | retry: | 1053 | for (;;) { |
| 1055 | bh = __find_get_block(bdev, block, size); | 1054 | struct buffer_head *bh; |
| 1056 | if (bh) | 1055 | int ret; |
| 1057 | return bh; | ||
| 1058 | 1056 | ||
| 1059 | ret = grow_buffers(bdev, block, size); | ||
| 1060 | if (ret == 0) { | ||
| 1061 | free_more_memory(); | ||
| 1062 | goto retry; | ||
| 1063 | } else if (ret > 0) { | ||
| 1064 | bh = __find_get_block(bdev, block, size); | 1057 | bh = __find_get_block(bdev, block, size); |
| 1065 | if (bh) | 1058 | if (bh) |
| 1066 | return bh; | 1059 | return bh; |
| 1060 | |||
| 1061 | ret = grow_buffers(bdev, block, size); | ||
| 1062 | if (ret < 0) | ||
| 1063 | return NULL; | ||
| 1064 | if (ret == 0) | ||
| 1065 | free_more_memory(); | ||
| 1067 | } | 1066 | } |
| 1068 | return NULL; | ||
| 1069 | } | 1067 | } |
| 1070 | 1068 | ||
| 1071 | /* | 1069 | /* |
| @@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block); | |||
| 1321 | * which corresponds to the passed block_device, block and size. The | 1319 | * which corresponds to the passed block_device, block and size. The |
| 1322 | * returned buffer has its reference count incremented. | 1320 | * returned buffer has its reference count incremented. |
| 1323 | * | 1321 | * |
| 1324 | * __getblk() cannot fail - it just keeps trying. If you pass it an | ||
| 1325 | * illegal block number, __getblk() will happily return a buffer_head | ||
| 1326 | * which represents the non-existent block. Very weird. | ||
| 1327 | * | ||
| 1328 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1322 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() |
| 1329 | * attempt is failing. FIXME, perhaps? | 1323 | * attempt is failing. FIXME, perhaps? |
| 1330 | */ | 1324 | */ |
diff --git a/fs/direct-io.c b/fs/direct-io.c index 1faf4cb56f39..f86c720dba0e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
| @@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
| 1062 | unsigned long user_addr; | 1062 | unsigned long user_addr; |
| 1063 | size_t bytes; | 1063 | size_t bytes; |
| 1064 | struct buffer_head map_bh = { 0, }; | 1064 | struct buffer_head map_bh = { 0, }; |
| 1065 | struct blk_plug plug; | ||
| 1065 | 1066 | ||
| 1066 | if (rw & WRITE) | 1067 | if (rw & WRITE) |
| 1067 | rw = WRITE_ODIRECT; | 1068 | rw = WRITE_ODIRECT; |
| @@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
| 1177 | PAGE_SIZE - user_addr / PAGE_SIZE); | 1178 | PAGE_SIZE - user_addr / PAGE_SIZE); |
| 1178 | } | 1179 | } |
| 1179 | 1180 | ||
| 1181 | blk_start_plug(&plug); | ||
| 1182 | |||
| 1180 | for (seg = 0; seg < nr_segs; seg++) { | 1183 | for (seg = 0; seg < nr_segs; seg++) { |
| 1181 | user_addr = (unsigned long)iov[seg].iov_base; | 1184 | user_addr = (unsigned long)iov[seg].iov_base; |
| 1182 | sdio.size += bytes = iov[seg].iov_len; | 1185 | sdio.size += bytes = iov[seg].iov_len; |
| @@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
| 1235 | if (sdio.bio) | 1238 | if (sdio.bio) |
| 1236 | dio_bio_submit(dio, &sdio); | 1239 | dio_bio_submit(dio, &sdio); |
| 1237 | 1240 | ||
| 1241 | blk_finish_plug(&plug); | ||
| 1242 | |||
| 1238 | /* | 1243 | /* |
| 1239 | * It is possible that, we return short IO due to end of file. | 1244 | * It is possible that, we return short IO due to end of file. |
| 1240 | * In that case, we need to release all the pages we got hold on. | 1245 | * In that case, we need to release all the pages we got hold on. |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e72a9d48232..4a2ab7c85393 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
| @@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) | |||
| 601 | * it already be started by driver. | 601 | * it already be started by driver. |
| 602 | */ | 602 | */ |
| 603 | #define RQ_NOMERGE_FLAGS \ | 603 | #define RQ_NOMERGE_FLAGS \ |
| 604 | (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) | 604 | (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) |
| 605 | #define rq_mergeable(rq) \ | 605 | #define rq_mergeable(rq) \ |
| 606 | (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ | 606 | (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ |
| 607 | (((rq)->cmd_flags & REQ_DISCARD) || \ | 607 | (((rq)->cmd_flags & REQ_DISCARD) || \ |
| @@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); | |||
| 894 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); | 894 | extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); |
| 895 | 895 | ||
| 896 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); | 896 | extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); |
| 897 | extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | ||
| 898 | struct scatterlist *sglist); | ||
| 897 | extern void blk_dump_rq_flags(struct request *, char *); | 899 | extern void blk_dump_rq_flags(struct request *, char *); |
| 898 | extern long nr_blockdev_pages(void); | 900 | extern long nr_blockdev_pages(void); |
| 899 | 901 | ||
| @@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector | |||
| 1139 | & (lim->discard_granularity - 1); | 1141 | & (lim->discard_granularity - 1); |
| 1140 | } | 1142 | } |
| 1141 | 1143 | ||
| 1144 | static inline int bdev_discard_alignment(struct block_device *bdev) | ||
| 1145 | { | ||
| 1146 | struct request_queue *q = bdev_get_queue(bdev); | ||
| 1147 | |||
| 1148 | if (bdev != bdev->bd_contains) | ||
| 1149 | return bdev->bd_part->discard_alignment; | ||
| 1150 | |||
| 1151 | return q->limits.discard_alignment; | ||
| 1152 | } | ||
| 1153 | |||
| 1142 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) | 1154 | static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) |
| 1143 | { | 1155 | { |
| 1144 | if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) | 1156 | if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) |
diff --git a/mm/filemap.c b/mm/filemap.c index fa5ca304148e..384344575c37 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 1412 | retval = filemap_write_and_wait_range(mapping, pos, | 1412 | retval = filemap_write_and_wait_range(mapping, pos, |
| 1413 | pos + iov_length(iov, nr_segs) - 1); | 1413 | pos + iov_length(iov, nr_segs) - 1); |
| 1414 | if (!retval) { | 1414 | if (!retval) { |
| 1415 | struct blk_plug plug; | ||
| 1416 | |||
| 1417 | blk_start_plug(&plug); | ||
| 1418 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1415 | retval = mapping->a_ops->direct_IO(READ, iocb, |
| 1419 | iov, pos, nr_segs); | 1416 | iov, pos, nr_segs); |
| 1420 | blk_finish_plug(&plug); | ||
| 1421 | } | 1417 | } |
| 1422 | if (retval > 0) { | 1418 | if (retval > 0) { |
| 1423 | *ppos = pos + retval; | 1419 | *ppos = pos + retval; |
| @@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2527 | { | 2523 | { |
| 2528 | struct file *file = iocb->ki_filp; | 2524 | struct file *file = iocb->ki_filp; |
| 2529 | struct inode *inode = file->f_mapping->host; | 2525 | struct inode *inode = file->f_mapping->host; |
| 2530 | struct blk_plug plug; | ||
| 2531 | ssize_t ret; | 2526 | ssize_t ret; |
| 2532 | 2527 | ||
| 2533 | BUG_ON(iocb->ki_pos != pos); | 2528 | BUG_ON(iocb->ki_pos != pos); |
| 2534 | 2529 | ||
| 2535 | sb_start_write(inode->i_sb); | 2530 | sb_start_write(inode->i_sb); |
| 2536 | mutex_lock(&inode->i_mutex); | 2531 | mutex_lock(&inode->i_mutex); |
| 2537 | blk_start_plug(&plug); | ||
| 2538 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 2532 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
| 2539 | mutex_unlock(&inode->i_mutex); | 2533 | mutex_unlock(&inode->i_mutex); |
| 2540 | 2534 | ||
| @@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2545 | if (err < 0 && ret > 0) | 2539 | if (err < 0 && ret > 0) |
| 2546 | ret = err; | 2540 | ret = err; |
| 2547 | } | 2541 | } |
| 2548 | blk_finish_plug(&plug); | ||
| 2549 | sb_end_write(inode->i_sb); | 2542 | sb_end_write(inode->i_sb); |
| 2550 | return ret; | 2543 | return ret; |
| 2551 | } | 2544 | } |
