author    Linus Torvalds <torvalds@linux-foundation.org>    2010-10-22 20:07:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2010-10-22 20:07:18 -0400
commit    a2887097f25cd38cadfc11d10769e2b349fb5eca (patch)
tree      cd4adcb305365d6ba9acd2c02d4eb9d0125c6f8d
parent    8abfc6e7a45eb74e51904bbae676fae008b11366 (diff)
parent    005a1d15f5a6b2bb4ada80349513effbf22b4588 (diff)
Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block: (46 commits)
  xen-blkfront: disable barrier/flush write support
  Added blk-lib.c and blk-barrier.c was renamed to blk-flush.c
  block: remove BLKDEV_IFL_WAIT
  aic7xxx_old: removed unused 'req' variable
  block: remove the BH_Eopnotsupp flag
  block: remove the BLKDEV_IFL_BARRIER flag
  block: remove the WRITE_BARRIER flag
  swap: do not send discards as barriers
  fat: do not send discards as barriers
  ext4: do not send discards as barriers
  jbd2: replace barriers with explicit flush / FUA usage
  jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on barrier
  jbd: replace barriers with explicit flush / FUA usage
  nilfs2: replace barriers with explicit flush / FUA usage
  reiserfs: replace barriers with explicit flush / FUA usage
  gfs2: replace barriers with explicit flush / FUA usage
  btrfs: replace barriers with explicit flush / FUA usage
  xfs: replace barriers with explicit flush / FUA usage
  block: pass gfp_mask and flags to sb_issue_discard
  dm: convey that all flushes are processed as empty
  ...
-rw-r--r-- Documentation/DocBook/kernel-api.tmpl | 3
-rw-r--r-- Documentation/block/00-INDEX | 4
-rw-r--r-- Documentation/block/barrier.txt | 261
-rw-r--r-- Documentation/block/writeback_cache_control.txt | 86
-rw-r--r-- Documentation/lguest/lguest.c | 29
-rw-r--r-- block/Makefile | 2
-rw-r--r-- block/blk-barrier.c | 350
-rw-r--r-- block/blk-core.c | 72
-rw-r--r-- block/blk-flush.c | 262
-rw-r--r-- block/blk-lib.c | 39
-rw-r--r-- block/blk-settings.c | 20
-rw-r--r-- block/blk.h | 8
-rw-r--r-- block/elevator.c | 79
-rw-r--r-- block/ioctl.c | 4
-rw-r--r-- drivers/block/brd.c | 1
-rw-r--r-- drivers/block/drbd/drbd_int.h | 3
-rw-r--r-- drivers/block/drbd/drbd_receiver.c | 2
-rw-r--r-- drivers/block/loop.c | 20
-rw-r--r-- drivers/block/osdblk.c | 5
-rw-r--r-- drivers/block/pktcdvd.c | 1
-rw-r--r-- drivers/block/ps3disk.c | 2
-rw-r--r-- drivers/block/virtio_blk.c | 37
-rw-r--r-- drivers/block/xen-blkfront.c | 54
-rw-r--r-- drivers/ide/ide-disk.c | 13
-rw-r--r-- drivers/ide/ide-io.c | 13
-rw-r--r-- drivers/md/dm-crypt.c | 2
-rw-r--r-- drivers/md/dm-io.c | 20
-rw-r--r-- drivers/md/dm-log.c | 2
-rw-r--r-- drivers/md/dm-raid1.c | 8
-rw-r--r-- drivers/md/dm-region-hash.c | 16
-rw-r--r-- drivers/md/dm-snap-persistent.c | 2
-rw-r--r-- drivers/md/dm-snap.c | 6
-rw-r--r-- drivers/md/dm-stripe.c | 2
-rw-r--r-- drivers/md/dm.c | 398
-rw-r--r-- drivers/md/linear.c | 4
-rw-r--r-- drivers/md/md.c | 117
-rw-r--r-- drivers/md/md.h | 23
-rw-r--r-- drivers/md/multipath.c | 4
-rw-r--r-- drivers/md/raid0.c | 4
-rw-r--r-- drivers/md/raid1.c | 176
-rw-r--r-- drivers/md/raid1.h | 2
-rw-r--r-- drivers/md/raid10.c | 7
-rw-r--r-- drivers/md/raid5.c | 43
-rw-r--r-- drivers/md/raid5.h | 1
-rw-r--r-- drivers/mmc/card/queue.c | 1
-rw-r--r-- drivers/s390/block/dasd.c | 1
-rw-r--r-- drivers/scsi/aic7xxx_old.c | 22
-rw-r--r-- drivers/scsi/libsas/sas_scsi_host.c | 13
-rw-r--r-- drivers/scsi/sd.c | 18
-rw-r--r-- fs/block_dev.c | 2
-rw-r--r-- fs/btrfs/disk-io.c | 19
-rw-r--r-- fs/btrfs/extent-tree.c | 3
-rw-r--r-- fs/btrfs/volumes.c | 4
-rw-r--r-- fs/btrfs/volumes.h | 1
-rw-r--r-- fs/buffer.c | 7
-rw-r--r-- fs/ext3/fsync.c | 3
-rw-r--r-- fs/ext4/fsync.c | 5
-rw-r--r-- fs/ext4/mballoc.c | 2
-rw-r--r-- fs/fat/fatent.c | 3
-rw-r--r-- fs/fat/misc.c | 5
-rw-r--r-- fs/gfs2/log.c | 19
-rw-r--r-- fs/gfs2/rgrp.c | 6
-rw-r--r-- fs/jbd/commit.c | 30
-rw-r--r-- fs/jbd2/checkpoint.c | 3
-rw-r--r-- fs/jbd2/commit.c | 74
-rw-r--r-- fs/nilfs2/super.c | 10
-rw-r--r-- fs/nilfs2/the_nilfs.c | 7
-rw-r--r-- fs/reiserfs/file.c | 3
-rw-r--r-- fs/reiserfs/journal.c | 106
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 16
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h | 11
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_trace.h | 1
-rw-r--r-- fs/xfs/xfs_log.c | 13
-rw-r--r-- include/linux/blk_types.h | 5
-rw-r--r-- include/linux/blkdev.h | 101
-rw-r--r-- include/linux/buffer_head.h | 2
-rw-r--r-- include/linux/fs.h | 28
-rw-r--r-- include/scsi/scsi_tcq.h | 6
-rw-r--r-- mm/swapfile.c | 6
80 files changed, 845 insertions, 1921 deletions
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 6899f471fb1..6b4e07f28b6 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -257,7 +257,8 @@ X!Earch/x86/kernel/mca_32.c
257!Iblock/blk-sysfs.c 257!Iblock/blk-sysfs.c
258!Eblock/blk-settings.c 258!Eblock/blk-settings.c
259!Eblock/blk-exec.c 259!Eblock/blk-exec.c
260!Eblock/blk-barrier.c 260!Eblock/blk-flush.c
261!Eblock/blk-lib.c
261!Eblock/blk-tag.c 262!Eblock/blk-tag.c
262!Iblock/blk-tag.c 263!Iblock/blk-tag.c
263!Eblock/blk-integrity.c 264!Eblock/blk-integrity.c
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index a406286f6f3..d111e3b23db 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
100-INDEX 100-INDEX
2 - This file 2 - This file
3barrier.txt
4 - I/O Barriers
5biodoc.txt 3biodoc.txt
6 - Notes on the Generic Block Layer Rewrite in Linux 2.5 4 - Notes on the Generic Block Layer Rewrite in Linux 2.5
7capability.txt 5capability.txt
@@ -16,3 +14,5 @@ stat.txt
16 - Block layer statistics in /sys/block/<dev>/stat 14 - Block layer statistics in /sys/block/<dev>/stat
17switching-sched.txt 15switching-sched.txt
18 - Switching I/O schedulers at runtime 16 - Switching I/O schedulers at runtime
17writeback_cache_control.txt
18 - Control of volatile write back caches
diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt
deleted file mode 100644
index 2c2f24f634e..00000000000
--- a/Documentation/block/barrier.txt
+++ /dev/null
@@ -1,261 +0,0 @@
1I/O Barriers
2============
3Tejun Heo <htejun@gmail.com>, July 22 2005
4
5I/O barrier requests are used to guarantee ordering around the barrier
6requests. Unless you're crazy enough to use disk drives for
7implementing synchronization constructs (wow, sounds interesting...),
8the ordering is meaningful only for write requests for things like
9journal checkpoints. All requests queued before a barrier request
10must be finished (made it to the physical medium) before the barrier
11request is started, and all requests queued after the barrier request
12must be started only after the barrier request is finished (again,
13made it to the physical medium).
14
15In other words, I/O barrier requests have the following two properties.
16
171. Request ordering
18
19Requests cannot pass the barrier request. Preceding requests are
20processed before the barrier and following requests after.
21
22Depending on what features a drive supports, this can be done in one
23of the following three ways.
24
25i. For devices which have queue depth greater than 1 (TCQ devices) and
26support ordered tags, block layer can just issue the barrier as an
27ordered request and the lower level driver, controller and drive
28itself are responsible for making sure that the ordering constraint is
29met. Most modern SCSI controllers/drives should support this.
30
31NOTE: SCSI ordered tag isn't currently used due to limitation in the
32 SCSI midlayer, see the following random notes section.
33
34ii. For devices which have queue depth greater than 1 but don't
35support ordered tags, block layer ensures that the requests preceding
36a barrier request finishes before issuing the barrier request. Also,
37it defers requests following the barrier until the barrier request is
38finished. Older SCSI controllers/drives and SATA drives fall in this
39category.
40
41iii. Devices which have queue depth of 1. This is a degenerate case
42of ii. Just keeping issue order suffices. Ancient SCSI
43controllers/drives and IDE drives are in this category.
44
452. Forced flushing to physical medium
46
47Again, if you're not gonna do synchronization with disk drives (dang,
48it sounds even more appealing now!), the reason you use I/O barriers
49is mainly to protect filesystem integrity when power failure or some
50other events abruptly stop the drive from operating and possibly make
51the drive lose data in its cache. So, I/O barriers need to guarantee
52that requests actually get written to non-volatile medium in order.
53
54There are four cases,
55
56i. No write-back cache. Keeping requests ordered is enough.
57
58ii. Write-back cache but no flush operation. There's no way to
59guarantee physical-medium commit order. This kind of device can't do
60I/O barriers.
61
62iii. Write-back cache and flush operation but no FUA (forced unit
63access). We need two cache flushes - before and after the barrier
64request.
65
66iv. Write-back cache, flush operation and FUA. We still need one
67flush to make sure requests preceding a barrier are written to medium,
68but post-barrier flush can be avoided by using FUA write on the
69barrier itself.
70
71
72How to support barrier requests in drivers
73------------------------------------------
74
75All barrier handling is done inside block layer proper. All a low level
76driver has to do is implement its prepare_flush_fn and use one of
77the following two functions to indicate what barrier type it supports
78and how to prepare flush requests. Note that the term 'ordered' is
79used to indicate the whole sequence of performing barrier requests
80including draining and flushing.
81
82typedef void (prepare_flush_fn)(struct request_queue *q, struct request *rq);
83
84int blk_queue_ordered(struct request_queue *q, unsigned ordered,
85 prepare_flush_fn *prepare_flush_fn);
86
87@q : the queue in question
88@ordered : the ordered mode the driver/device supports
89@prepare_flush_fn : this function should prepare @rq such that it
90 flushes cache to physical medium when executed
91
92For example, SCSI disk driver's prepare_flush_fn looks like the
93following.
94
95static void sd_prepare_flush(struct request_queue *q, struct request *rq)
96{
97 memset(rq->cmd, 0, sizeof(rq->cmd));
98 rq->cmd_type = REQ_TYPE_BLOCK_PC;
99 rq->timeout = SD_TIMEOUT;
100 rq->cmd[0] = SYNCHRONIZE_CACHE;
101 rq->cmd_len = 10;
102}
103
104The following seven ordered modes are supported. The following table
105shows which mode should be used depending on what features a
106device/driver supports. In the leftmost column of the table, the
107QUEUE_ORDERED_ prefix is omitted from the mode names to save space.
108
109The table is followed by description of each mode. Note that in the
110descriptions of QUEUE_ORDERED_DRAIN*, '=>' is used whereas '->' is
111used for QUEUE_ORDERED_TAG* descriptions. '=>' indicates that the
112preceding step must be complete before proceeding to the next step.
113'->' indicates that the next step can start as soon as the previous
114step is issued.
115
116 write-back cache ordered tag flush FUA
117-----------------------------------------------------------------------
118NONE yes/no N/A no N/A
119DRAIN no no N/A N/A
120DRAIN_FLUSH yes no yes no
121DRAIN_FUA yes no yes yes
122TAG no yes N/A N/A
123TAG_FLUSH yes yes yes no
124TAG_FUA yes yes yes yes
125
126
127QUEUE_ORDERED_NONE
128 I/O barriers are not needed and/or supported.
129
130 Sequence: N/A
131
132QUEUE_ORDERED_DRAIN
133 Requests are ordered by draining the request queue and cache
134 flushing isn't needed.
135
136 Sequence: drain => barrier
137
138QUEUE_ORDERED_DRAIN_FLUSH
139 Requests are ordered by draining the request queue and both
140 pre-barrier and post-barrier cache flushings are needed.
141
142 Sequence: drain => preflush => barrier => postflush
143
144QUEUE_ORDERED_DRAIN_FUA
145 Requests are ordered by draining the request queue and
146 pre-barrier cache flushing is needed. By using FUA on barrier
147 request, post-barrier flushing can be skipped.
148
149 Sequence: drain => preflush => barrier
150
151QUEUE_ORDERED_TAG
152 Requests are ordered by ordered tag and cache flushing isn't
153 needed.
154
155 Sequence: barrier
156
157QUEUE_ORDERED_TAG_FLUSH
158 Requests are ordered by ordered tag and both pre-barrier and
159 post-barrier cache flushings are needed.
160
161 Sequence: preflush -> barrier -> postflush
162
163QUEUE_ORDERED_TAG_FUA
164 Requests are ordered by ordered tag and pre-barrier cache
165 flushing is needed. By using FUA on barrier request,
166 post-barrier flushing can be skipped.
167
168 Sequence: preflush -> barrier
169
170
171Random notes/caveats
172--------------------
173
174* SCSI layer currently can't use TAG ordering even if the drive,
175controller and driver support it. The problem is that the SCSI midlayer
176request dispatch function is not atomic. It releases the queue lock and
177switches to the SCSI host lock during issue, and it is possible and even
178likely that requests change their relative positions over time. Once
179this problem is solved, TAG ordering can be enabled.
180
181* Currently, no matter which ordered mode is used, there can be only
182one barrier request in progress. All I/O barriers are held off by
183the block layer until the previous I/O barrier is complete. This doesn't
184make any difference for DRAIN ordered devices, but, for TAG ordered
185devices with very high command latency, passing multiple I/O barriers
186to the low level driver *might* be helpful if they are very frequent. Well,
187this certainly is a non-issue. I'm writing this just to make clear that no
188two I/O barriers are ever passed to the low-level driver.
189
190* Completion order. Requests in ordered sequence are issued in order
191but not required to finish in order. Barrier implementation can
192handle out-of-order completion of ordered sequence. IOW, the requests
193MUST be processed in order but the hardware/software completion paths
194are allowed to reorder completion notifications - eg. current SCSI
195midlayer doesn't preserve completion order during error handling.
196
197* Requeueing order. Low-level drivers are free to requeue any request
198after they have removed it from the request queue with
199blkdev_dequeue_request(). As barrier sequence should be kept in order
200when requeued, generic elevator code takes care of putting requests in
201order around barrier. See blk_ordered_req_seq() and
202ELEVATOR_INSERT_REQUEUE handling in __elv_add_request() for details.
203
204Note that block drivers must not requeue preceding requests while
205completing later requests in an ordered sequence. Currently, no
206error checking is done against this.
207
208* Error handling. Currently, block layer will report error to upper
209layer if any of requests in an ordered sequence fails. Unfortunately,
210this doesn't seem to be enough. Look at the following request flow.
211QUEUE_ORDERED_TAG_FLUSH is in use.
212
213 [0] [1] [2] [3] [pre] [barrier] [post] < [4] [5] [6] ... >
214 still in elevator
215
216Let's say request [2], [3] are write requests to update file system
217metadata (journal or whatever) and [barrier] is used to mark that
218those updates are valid. Consider the following sequence.
219
220	 i. Requests [0] ~ [post] leave the request queue and enter the
221	    low-level driver.
222 ii. After a while, unfortunately, something goes wrong and the
223 drive fails [2]. Note that any of [0], [1] and [3] could have
224 completed by this time, but [pre] couldn't have been finished
225 as the drive must process it in order and it failed before
226 processing that command.
227 iii. Error handling kicks in and determines that the error is
228 unrecoverable and fails [2], and resumes operation.
229 iv. [pre] [barrier] [post] gets processed.
230 v. *BOOM* power fails
231
232The problem here is that the barrier request is *supposed* to indicate
233that filesystem update requests [2] and [3] made it safely to the
234physical medium and, if the machine crashes after the barrier is
235written, filesystem recovery code can depend on that. Sadly, that
236isn't true in this case anymore. IOW, the success of an I/O barrier
237should also depend on the success of some of the preceding requests,
238where only the upper layer (filesystem) knows what 'some' is.
239
240This can be solved by implementing a way to tell the block layer which
241requests affect the success of the following barrier request and
242making lower level drivers resume operation on error only after the
243block layer tells them to do so.
244
245As the probability of this happening is very low and the drive would
246already have to be faulty, implementing the fix is probably overkill. But, still,
247it's there.
248
249* In previous drafts of the barrier implementation, there was a fallback
250mechanism such that, if FUA or ordered TAG failed, a less fancy ordered
251mode could be selected and the failed barrier request retried
252automatically. The rationale for this feature was that, as FUA is
253pretty new in the ATA world and ordered tag was never widely used, there
254could be devices which claim to support those features but choke when
255actually given such requests.
256
257	This was removed for two reasons: 1. it's overkill; 2. it's
258impossible to implement properly when TAG ordering is used, as low
259level drivers resume after an error automatically. If it's ever
260needed, adding it back and modifying low level drivers accordingly
261shouldn't be difficult.
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
new file mode 100644
index 00000000000..83407d36630
--- /dev/null
+++ b/Documentation/block/writeback_cache_control.txt
@@ -0,0 +1,86 @@
1
2Explicit volatile write back cache control
3=====================================
4
5Introduction
6------------
7
8Many storage devices, especially in the consumer market, come with volatile
9write back caches. That means the devices signal I/O completion to the
10operating system before data actually has hit the non-volatile storage. This
11behavior obviously speeds up various workloads, but it means the operating
12system needs to force data out to the non-volatile storage when it performs
13a data integrity operation like fsync, sync or an unmount.
14
15The Linux block layer provides two simple mechanisms that let filesystems
16control the caching behavior of the storage device. These mechanisms are
17a forced cache flush, and the Force Unit Access (FUA) flag for requests.
18
19
20Explicit cache flushes
21----------------------
22
23The REQ_FLUSH flag can be OR ed into the r/w flags of a bio submitted from
24the filesystem and will make sure the volatile cache of the storage device
25has been flushed before the actual I/O operation is started. This explicitly
26guarantees that previously completed write requests are on non-volatile
27storage before the flagged bio starts. In addition the REQ_FLUSH flag can be
28set on an otherwise empty bio structure, which causes only an explicit cache
29flush without any dependent I/O. It is recommended to use
30the blkdev_issue_flush() helper for a pure cache flush.
31
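For illustration only (an editor's sketch, not part of this patch), a
kernel-side caller could issue such a pure cache flush through the
blkdev_issue_flush() helper, using the signature this series settles on in
block/blk-flush.c; the error-sector pointer may be NULL when unused:

	#include <linux/blkdev.h>

	static int example_flush_cache(struct block_device *bdev)
	{
		/* no dependent data; just drain the device's volatile cache */
		return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	}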
32
33Forced Unit Access
34-----------------
35
36The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
37filesystem and will make sure that I/O completion for this request is only
38signaled after the data has been committed to non-volatile storage.
39
40
41Implementation details for filesystems
42--------------------------------------
43
44Filesystems can simply set the REQ_FLUSH and REQ_FUA bits and do not have to
45worry if the underlying devices need any explicit cache flushing and how
46the Forced Unit Access is implemented. The REQ_FLUSH and REQ_FUA flags
47may both be set on a single bio.
48
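As a hedged sketch (an editor's addition, not part of the original document),
a filesystem's commit-block submission using both flags could look roughly
like the following; it assumes the WRITE_FLUSH_FUA helper that this series
adds to include/linux/fs.h, which combines an ordinary write with REQ_FLUSH
and REQ_FUA:

	#include <linux/fs.h>
	#include <linux/bio.h>

	static void example_submit_commit_block(struct bio *bio)
	{
		/*
		 * Flush the volatile cache before this write and require the
		 * write itself to reach non-volatile storage before completion
		 * is signaled.
		 */
		submit_bio(WRITE_FLUSH_FUA, bio);
	}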
49
50Implementation details for make_request_fn based block drivers
51--------------------------------------------------------------
52
53These drivers will always see the REQ_FLUSH and REQ_FUA bits as they sit
54directly below the submit_bio interface. For remapping drivers the REQ_FUA
55bits need to be propagated to underlying devices, and a global flush needs
56to be implemented for bios with the REQ_FLUSH bit set. For real device
57drivers that do not have a volatile cache the REQ_FLUSH and REQ_FUA bits
58on non-empty bios can simply be ignored, and REQ_FLUSH requests without
59data can be completed successfully without doing any work. Drivers for
60devices with volatile caches need to implement the support for these
61flags themselves without any help from the block layer.
62
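A minimal sketch of those rules for a bio-based driver whose backing store
has no volatile cache (an editor's illustration; example_do_rw() is a
hypothetical helper, and the int-returning make_request_fn prototype of this
tree is assumed):

	static int example_make_request(struct request_queue *q, struct bio *bio)
	{
		if ((bio->bi_rw & REQ_FLUSH) && !bio_has_data(bio)) {
			/* nothing is cached, so an empty flush is already done */
			bio_endio(bio, 0);
			return 0;
		}

		/*
		 * Bios with a payload need no special REQ_FLUSH/REQ_FUA
		 * handling here; process them as ordinary reads and writes.
		 */
		example_do_rw(bio);
		return 0;
	}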
63
64Implementation details for request_fn based block drivers
65--------------------------------------------------------------
66
67For devices that do not support volatile write caches there is no driver
68support required; the block layer completes empty REQ_FLUSH requests before
69entering the driver and strips off the REQ_FLUSH and REQ_FUA bits from
70requests that have a payload. For devices with volatile write caches the
71driver needs to tell the block layer that it supports flushing caches by
72doing:
73
74 blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
75
76and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that
77REQ_FLUSH requests with a payload are automatically turned into a sequence
78of an empty REQ_FLUSH request followed by the actual write by the block
79layer. For devices that also support the FUA bit the block layer needs
80to be told to pass through the REQ_FUA bit using:
81
82 blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
83
84and the driver must handle write requests that have the REQ_FUA bit set
85in prep_fn/request_fn. If the FUA bit is not natively supported the block
86layer turns it into an empty REQ_FLUSH request after the actual write.
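Rounding this off with an editor's sketch (not from the patch): a request_fn
based driver with a volatile write cache might pick the empty REQ_FLUSH
requests out of its queue roughly as follows, where mydev_flush_cache() and
mydev_do_rw() stand in for hypothetical hardware-specific helpers:

	static void example_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = blk_fetch_request(q)) != NULL) {
			if (rq->cmd_flags & REQ_FLUSH) {
				/* empty flush request: no payload, just flush */
				__blk_end_request_all(rq, mydev_flush_cache());
				continue;
			}
			/* ordinary I/O; a real driver would honor REQ_FUA here too */
			__blk_end_request_all(rq, mydev_do_rw(rq));
		}
	}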
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 8a6a8c6d498..dc73bc54cc4 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1640,15 +1640,6 @@ static void blk_request(struct virtqueue *vq)
1640 off = out->sector * 512; 1640 off = out->sector * 512;
1641 1641
1642 /* 1642 /*
1643 * The block device implements "barriers", where the Guest indicates
1644 * that it wants all previous writes to occur before this write. We
1645 * don't have a way of asking our kernel to do a barrier, so we just
1646 * synchronize all the data in the file. Pretty poor, no?
1647 */
1648 if (out->type & VIRTIO_BLK_T_BARRIER)
1649 fdatasync(vblk->fd);
1650
1651 /*
1652 * In general the virtio block driver is allowed to try SCSI commands. 1643 * In general the virtio block driver is allowed to try SCSI commands.
1653 * It'd be nice if we supported eject, for example, but we don't. 1644 * It'd be nice if we supported eject, for example, but we don't.
1654 */ 1645 */
@@ -1680,6 +1671,13 @@ static void blk_request(struct virtqueue *vq)
1680 /* Die, bad Guest, die. */ 1671 /* Die, bad Guest, die. */
1681 errx(1, "Write past end %llu+%u", off, ret); 1672 errx(1, "Write past end %llu+%u", off, ret);
1682 } 1673 }
1674
1675 wlen = sizeof(*in);
1676 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1677 } else if (out->type & VIRTIO_BLK_T_FLUSH) {
1678 /* Flush */
1679 ret = fdatasync(vblk->fd);
1680 verbose("FLUSH fdatasync: %i\n", ret);
1683 wlen = sizeof(*in); 1681 wlen = sizeof(*in);
1684 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1682 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1685 } else { 1683 } else {
@@ -1703,15 +1701,6 @@ static void blk_request(struct virtqueue *vq)
1703 } 1701 }
1704 } 1702 }
1705 1703
1706 /*
1707 * OK, so we noted that it was pretty poor to use an fdatasync as a
1708 * barrier. But Christoph Hellwig points out that we need a sync
1709 * *afterwards* as well: "Barriers specify no reordering to the front
1710 * or the back." And Jens Axboe confirmed it, so here we are:
1711 */
1712 if (out->type & VIRTIO_BLK_T_BARRIER)
1713 fdatasync(vblk->fd);
1714
1715 /* Finished that request. */ 1704 /* Finished that request. */
1716 add_used(vq, head, wlen); 1705 add_used(vq, head, wlen);
1717} 1706}
@@ -1736,8 +1725,8 @@ static void setup_block_file(const char *filename)
1736 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 1725 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
1737 vblk->len = lseek64(vblk->fd, 0, SEEK_END); 1726 vblk->len = lseek64(vblk->fd, 0, SEEK_END);
1738 1727
1739 /* We support barriers. */ 1728 /* We support FLUSH. */
1740 add_feature(dev, VIRTIO_BLK_F_BARRIER); 1729 add_feature(dev, VIRTIO_BLK_F_FLUSH);
1741 1730
1742 /* Tell Guest how many sectors this device has. */ 1731 /* Tell Guest how many sectors this device has. */
1743 conf.capacity = cpu_to_le64(vblk->len / 512); 1732 conf.capacity = cpu_to_le64(vblk->len / 512);
diff --git a/block/Makefile b/block/Makefile
index c850d5ef80a..0fec4b3fab5 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
3# 3#
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
9 9
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032..00000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * Functions related to barrier IO handling
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/**
13 * blk_queue_ordered - does this queue support ordered writes
14 * @q: the request queue
15 * @ordered: one of QUEUE_ORDERED_*
16 *
17 * Description:
18 * For journalled file systems, doing ordered writes on a commit
19 * block instead of explicitly doing wait_on_buffer (which is bad
20 * for performance) can be a big win. Block drivers supporting this
21 * feature should call this function and indicate so.
22 *
23 **/
24int blk_queue_ordered(struct request_queue *q, unsigned ordered)
25{
26 if (ordered != QUEUE_ORDERED_NONE &&
27 ordered != QUEUE_ORDERED_DRAIN &&
28 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
29 ordered != QUEUE_ORDERED_DRAIN_FUA &&
30 ordered != QUEUE_ORDERED_TAG &&
31 ordered != QUEUE_ORDERED_TAG_FLUSH &&
32 ordered != QUEUE_ORDERED_TAG_FUA) {
33 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
34 return -EINVAL;
35 }
36
37 q->ordered = ordered;
38 q->next_ordered = ordered;
39
40 return 0;
41}
42EXPORT_SYMBOL(blk_queue_ordered);
43
44/*
45 * Cache flushing for ordered writes handling
46 */
47unsigned blk_ordered_cur_seq(struct request_queue *q)
48{
49 if (!q->ordseq)
50 return 0;
51 return 1 << ffz(q->ordseq);
52}
53
54unsigned blk_ordered_req_seq(struct request *rq)
55{
56 struct request_queue *q = rq->q;
57
58 BUG_ON(q->ordseq == 0);
59
60 if (rq == &q->pre_flush_rq)
61 return QUEUE_ORDSEQ_PREFLUSH;
62 if (rq == &q->bar_rq)
63 return QUEUE_ORDSEQ_BAR;
64 if (rq == &q->post_flush_rq)
65 return QUEUE_ORDSEQ_POSTFLUSH;
66
67 /*
68 * !fs requests don't need to follow barrier ordering. Always
69 * put them at the front. This fixes the following deadlock.
70 *
71 * http://thread.gmane.org/gmane.linux.kernel/537473
72 */
73 if (rq->cmd_type != REQ_TYPE_FS)
74 return QUEUE_ORDSEQ_DRAIN;
75
76 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
77 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
78 return QUEUE_ORDSEQ_DRAIN;
79 else
80 return QUEUE_ORDSEQ_DONE;
81}
82
83bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
84{
85 struct request *rq;
86
87 if (error && !q->orderr)
88 q->orderr = error;
89
90 BUG_ON(q->ordseq & seq);
91 q->ordseq |= seq;
92
93 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
94 return false;
95
96 /*
97 * Okay, sequence complete.
98 */
99 q->ordseq = 0;
100 rq = q->orig_bar_rq;
101 __blk_end_request_all(rq, q->orderr);
102 return true;
103}
104
105static void pre_flush_end_io(struct request *rq, int error)
106{
107 elv_completed_request(rq->q, rq);
108 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
109}
110
111static void bar_end_io(struct request *rq, int error)
112{
113 elv_completed_request(rq->q, rq);
114 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
115}
116
117static void post_flush_end_io(struct request *rq, int error)
118{
119 elv_completed_request(rq->q, rq);
120 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
121}
122
123static void queue_flush(struct request_queue *q, unsigned which)
124{
125 struct request *rq;
126 rq_end_io_fn *end_io;
127
128 if (which == QUEUE_ORDERED_DO_PREFLUSH) {
129 rq = &q->pre_flush_rq;
130 end_io = pre_flush_end_io;
131 } else {
132 rq = &q->post_flush_rq;
133 end_io = post_flush_end_io;
134 }
135
136 blk_rq_init(q, rq);
137 rq->cmd_type = REQ_TYPE_FS;
138 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
139 rq->rq_disk = q->orig_bar_rq->rq_disk;
140 rq->end_io = end_io;
141
142 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
143}
144
145static inline bool start_ordered(struct request_queue *q, struct request **rqp)
146{
147 struct request *rq = *rqp;
148 unsigned skip = 0;
149
150 q->orderr = 0;
151 q->ordered = q->next_ordered;
152 q->ordseq |= QUEUE_ORDSEQ_STARTED;
153
154 /*
155 * For an empty barrier, there's no actual BAR request, which
156 * in turn makes POSTFLUSH unnecessary. Mask them off.
157 */
158 if (!blk_rq_sectors(rq)) {
159 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
160 QUEUE_ORDERED_DO_POSTFLUSH);
161 /*
162 * Empty barrier on a write-through device w/ ordered
163 * tag has no command to issue and without any command
164 * to issue, ordering by tag can't be used. Drain
165 * instead.
166 */
167 if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
168 !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
169 q->ordered &= ~QUEUE_ORDERED_BY_TAG;
170 q->ordered |= QUEUE_ORDERED_BY_DRAIN;
171 }
172 }
173
174 /* stash away the original request */
175 blk_dequeue_request(rq);
176 q->orig_bar_rq = rq;
177 rq = NULL;
178
179 /*
180 * Queue ordered sequence. As we stack them at the head, we
181 * need to queue in reverse order. Note that we rely on that
182 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
183 * request gets inbetween ordered sequence.
184 */
185 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
186 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
187 rq = &q->post_flush_rq;
188 } else
189 skip |= QUEUE_ORDSEQ_POSTFLUSH;
190
191 if (q->ordered & QUEUE_ORDERED_DO_BAR) {
192 rq = &q->bar_rq;
193
194 /* initialize proxy request and queue it */
195 blk_rq_init(q, rq);
196 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
197 rq->cmd_flags |= REQ_WRITE;
198 if (q->ordered & QUEUE_ORDERED_DO_FUA)
199 rq->cmd_flags |= REQ_FUA;
200 init_request_from_bio(rq, q->orig_bar_rq->bio);
201 rq->end_io = bar_end_io;
202
203 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
204 } else
205 skip |= QUEUE_ORDSEQ_BAR;
206
207 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
208 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
209 rq = &q->pre_flush_rq;
210 } else
211 skip |= QUEUE_ORDSEQ_PREFLUSH;
212
213 if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
214 rq = NULL;
215 else
216 skip |= QUEUE_ORDSEQ_DRAIN;
217
218 *rqp = rq;
219
220 /*
221 * Complete skipped sequences. If whole sequence is complete,
222 * return false to tell elevator that this request is gone.
223 */
224 return !blk_ordered_complete_seq(q, skip, 0);
225}
226
227bool blk_do_ordered(struct request_queue *q, struct request **rqp)
228{
229 struct request *rq = *rqp;
230 const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
231 (rq->cmd_flags & REQ_HARDBARRIER);
232
233 if (!q->ordseq) {
234 if (!is_barrier)
235 return true;
236
237 if (q->next_ordered != QUEUE_ORDERED_NONE)
238 return start_ordered(q, rqp);
239 else {
240 /*
241 * Queue ordering not supported. Terminate
242 * with prejudice.
243 */
244 blk_dequeue_request(rq);
245 __blk_end_request_all(rq, -EOPNOTSUPP);
246 *rqp = NULL;
247 return false;
248 }
249 }
250
251 /*
252 * Ordered sequence in progress
253 */
254
255 /* Special requests are not subject to ordering rules. */
256 if (rq->cmd_type != REQ_TYPE_FS &&
257 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
258 return true;
259
260 if (q->ordered & QUEUE_ORDERED_BY_TAG) {
261 /* Ordered by tag. Blocking the next barrier is enough. */
262 if (is_barrier && rq != &q->bar_rq)
263 *rqp = NULL;
264 } else {
265 /* Ordered by draining. Wait for turn. */
266 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
267 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
268 *rqp = NULL;
269 }
270
271 return true;
272}
273
274static void bio_end_empty_barrier(struct bio *bio, int err)
275{
276 if (err) {
277 if (err == -EOPNOTSUPP)
278 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
279 clear_bit(BIO_UPTODATE, &bio->bi_flags);
280 }
281 if (bio->bi_private)
282 complete(bio->bi_private);
283 bio_put(bio);
284}
285
286/**
287 * blkdev_issue_flush - queue a flush
288 * @bdev: blockdev to issue flush for
289 * @gfp_mask: memory allocation flags (for bio_alloc)
290 * @error_sector: error sector
291 * @flags: BLKDEV_IFL_* flags to control behaviour
292 *
293 * Description:
294 * Issue a flush for the block device in question. Caller can supply
295 * room for storing the error offset in case of a flush error, if they
296 * wish to. If WAIT flag is not passed then caller may check only what
297 * request was pushed in some internal queue for later handling.
298 */
299int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
300 sector_t *error_sector, unsigned long flags)
301{
302 DECLARE_COMPLETION_ONSTACK(wait);
303 struct request_queue *q;
304 struct bio *bio;
305 int ret = 0;
306
307 if (bdev->bd_disk == NULL)
308 return -ENXIO;
309
310 q = bdev_get_queue(bdev);
311 if (!q)
312 return -ENXIO;
313
314 /*
315 * some block devices may not have their queue correctly set up here
316 * (e.g. loop device without a backing file) and so issuing a flush
317 * here will panic. Ensure there is a request function before issuing
318 * the barrier.
319 */
320 if (!q->make_request_fn)
321 return -ENXIO;
322
323 bio = bio_alloc(gfp_mask, 0);
324 bio->bi_end_io = bio_end_empty_barrier;
325 bio->bi_bdev = bdev;
326 if (test_bit(BLKDEV_WAIT, &flags))
327 bio->bi_private = &wait;
328
329 bio_get(bio);
330 submit_bio(WRITE_BARRIER, bio);
331 if (test_bit(BLKDEV_WAIT, &flags)) {
332 wait_for_completion(&wait);
333 /*
334 * The driver must store the error location in ->bi_sector, if
335 * it supports it. For non-stacked drivers, this should be
336 * copied from blk_rq_pos(rq).
337 */
338 if (error_sector)
339 *error_sector = bio->bi_sector;
340 }
341
342 if (bio_flagged(bio, BIO_EOPNOTSUPP))
343 ret = -EOPNOTSUPP;
344 else if (!bio_flagged(bio, BIO_UPTODATE))
345 ret = -EIO;
346
347 bio_put(bio);
348 return ret;
349}
350EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-core.c b/block/blk-core.c
index 500eb859886..45141469e89 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -139,7 +139,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
139{ 139{
140 struct request_queue *q = rq->q; 140 struct request_queue *q = rq->q;
141 141
142 if (&q->bar_rq != rq) { 142 if (&q->flush_rq != rq) {
143 if (error) 143 if (error)
144 clear_bit(BIO_UPTODATE, &bio->bi_flags); 144 clear_bit(BIO_UPTODATE, &bio->bi_flags);
145 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 145 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -163,13 +163,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
163 if (bio->bi_size == 0) 163 if (bio->bi_size == 0)
164 bio_endio(bio, error); 164 bio_endio(bio, error);
165 } else { 165 } else {
166
167 /* 166 /*
168 * Okay, this is the barrier request in progress, just 167 * Okay, this is the sequenced flush request in
169 * record the error; 168 * progress, just record the error;
170 */ 169 */
171 if (error && !q->orderr) 170 if (error && !q->flush_err)
172 q->orderr = error; 171 q->flush_err = error;
173 } 172 }
174} 173}
175 174
@@ -531,6 +530,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
531 init_timer(&q->unplug_timer); 530 init_timer(&q->unplug_timer);
532 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 531 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
533 INIT_LIST_HEAD(&q->timeout_list); 532 INIT_LIST_HEAD(&q->timeout_list);
533 INIT_LIST_HEAD(&q->pending_flushes);
534 INIT_WORK(&q->unplug_work, blk_unplug_work); 534 INIT_WORK(&q->unplug_work, blk_unplug_work);
535 535
536 kobject_init(&q->kobj, &blk_queue_ktype); 536 kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1053,22 +1053,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
1053} 1053}
1054EXPORT_SYMBOL(blk_insert_request); 1054EXPORT_SYMBOL(blk_insert_request);
1055 1055
1056/*
1057 * add-request adds a request to the linked list.
1058 * queue lock is held and interrupts disabled, as we muck with the
1059 * request queue list.
1060 */
1061static inline void add_request(struct request_queue *q, struct request *req)
1062{
1063 drive_stat_acct(req, 1);
1064
1065 /*
1066 * elevator indicated where it wants this request to be
1067 * inserted at elevator_merge time
1068 */
1069 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1070}
1071
1072static void part_round_stats_single(int cpu, struct hd_struct *part, 1056static void part_round_stats_single(int cpu, struct hd_struct *part,
1073 unsigned long now) 1057 unsigned long now)
1074{ 1058{
@@ -1217,13 +1201,16 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1217 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1201 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1218 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); 1202 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
1219 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; 1203 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
1204 int where = ELEVATOR_INSERT_SORT;
1220 int rw_flags; 1205 int rw_flags;
1221 1206
1222 if ((bio->bi_rw & REQ_HARDBARRIER) && 1207 /* REQ_HARDBARRIER is no more */
1223 (q->next_ordered == QUEUE_ORDERED_NONE)) { 1208 if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
1209 "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
1224 bio_endio(bio, -EOPNOTSUPP); 1210 bio_endio(bio, -EOPNOTSUPP);
1225 return 0; 1211 return 0;
1226 } 1212 }
1213
1227 /* 1214 /*
1228 * low level driver can indicate that it wants pages above a 1215 * low level driver can indicate that it wants pages above a
1229 * certain limit bounced to low memory (ie for highmem, or even 1216 * certain limit bounced to low memory (ie for highmem, or even
@@ -1233,7 +1220,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1233 1220
1234 spin_lock_irq(q->queue_lock); 1221 spin_lock_irq(q->queue_lock);
1235 1222
1236 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) 1223 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1224 where = ELEVATOR_INSERT_FRONT;
1225 goto get_rq;
1226 }
1227
1228 if (elv_queue_empty(q))
1237 goto get_rq; 1229 goto get_rq;
1238 1230
1239 el_ret = elv_merge(q, &req, bio); 1231 el_ret = elv_merge(q, &req, bio);
@@ -1330,7 +1322,10 @@ get_rq:
1330 req->cpu = blk_cpu_to_group(smp_processor_id()); 1322 req->cpu = blk_cpu_to_group(smp_processor_id());
1331 if (queue_should_plug(q) && elv_queue_empty(q)) 1323 if (queue_should_plug(q) && elv_queue_empty(q))
1332 blk_plug_device(q); 1324 blk_plug_device(q);
1333 add_request(q, req); 1325
1326 /* insert the request into the elevator */
1327 drive_stat_acct(req, 1);
1328 __elv_add_request(q, req, where, 0);
1334out: 1329out:
1335 if (unplug || !queue_should_plug(q)) 1330 if (unplug || !queue_should_plug(q))
1336 __generic_unplug_device(q); 1331 __generic_unplug_device(q);
@@ -1530,6 +1525,19 @@ static inline void __generic_make_request(struct bio *bio)
1530 if (bio_check_eod(bio, nr_sectors)) 1525 if (bio_check_eod(bio, nr_sectors))
1531 goto end_io; 1526 goto end_io;
1532 1527
1528 /*
1529 * Filter flush bio's early so that make_request based
1530 * drivers without flush support don't have to worry
1531 * about them.
1532 */
1533 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1534 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1535 if (!nr_sectors) {
1536 err = 0;
1537 goto end_io;
1538 }
1539 }
1540
1533 if ((bio->bi_rw & REQ_DISCARD) && 1541 if ((bio->bi_rw & REQ_DISCARD) &&
1534 (!blk_queue_discard(q) || 1542 (!blk_queue_discard(q) ||
1535 ((bio->bi_rw & REQ_SECURE) && 1543 ((bio->bi_rw & REQ_SECURE) &&
@@ -1794,11 +1802,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1794static void blk_account_io_done(struct request *req) 1802static void blk_account_io_done(struct request *req)
1795{ 1803{
1796 /* 1804 /*
1797 * Account IO completion. bar_rq isn't accounted as a normal 1805 * Account IO completion. flush_rq isn't accounted as a
1798 * IO on queueing nor completion. Accounting the containing 1806 * normal IO on queueing nor completion. Accounting the
1799 * request is enough. 1807 * containing request is enough.
1800 */ 1808 */
1801 if (blk_do_io_stat(req) && req != &req->q->bar_rq) { 1809 if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
1802 unsigned long duration = jiffies - req->start_time; 1810 unsigned long duration = jiffies - req->start_time;
1803 const int rw = rq_data_dir(req); 1811 const int rw = rq_data_dir(req);
1804 struct hd_struct *part; 1812 struct hd_struct *part;
@@ -2523,9 +2531,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2523static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2531static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2524{ 2532{
2525 dst->cpu = src->cpu; 2533 dst->cpu = src->cpu;
2526 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); 2534 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2527 if (src->cmd_flags & REQ_DISCARD)
2528 dst->cmd_flags |= REQ_DISCARD;
2529 dst->cmd_type = src->cmd_type; 2535 dst->cmd_type = src->cmd_type;
2530 dst->__sector = blk_rq_pos(src); 2536 dst->__sector = blk_rq_pos(src);
2531 dst->__data_len = blk_rq_bytes(src); 2537 dst->__data_len = blk_rq_bytes(src);
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 00000000000..54b123d6563
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,262 @@
1/*
2 * Functions to sequence FLUSH and FUA writes.
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/* FLUSH/FUA sequences */
13enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
18 QUEUE_FSEQ_DONE = (1 << 4),
19};
20
21static struct request *queue_next_fseq(struct request_queue *q);
22
23unsigned blk_flush_cur_seq(struct request_queue *q)
24{
25 if (!q->flush_seq)
26 return 0;
27 return 1 << ffz(q->flush_seq);
28}
29
30static struct request *blk_flush_complete_seq(struct request_queue *q,
31 unsigned seq, int error)
32{
33 struct request *next_rq = NULL;
34
35 if (error && !q->flush_err)
36 q->flush_err = error;
37
38 BUG_ON(q->flush_seq & seq);
39 q->flush_seq |= seq;
40
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
42 /* not complete yet, queue the next flush sequence */
43 next_rq = queue_next_fseq(q);
44 } else {
45 /* complete this flush request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err);
47 q->orig_flush_rq = NULL;
48 q->flush_seq = 0;
49
50 /* dispatch the next flush if there's one */
51 if (!list_empty(&q->pending_flushes)) {
52 next_rq = list_entry_rq(q->pending_flushes.next);
53 list_move(&next_rq->queuelist, &q->queue_head);
54 }
55 }
56 return next_rq;
57}
58
59static void blk_flush_complete_seq_end_io(struct request_queue *q,
60 unsigned seq, int error)
61{
62 bool was_empty = elv_queue_empty(q);
63 struct request *next_rq;
64
65 next_rq = blk_flush_complete_seq(q, seq, error);
66
67 /*
68 * Moving a request silently to empty queue_head may stall the
69 * queue. Kick the queue in those cases.
70 */
71 if (was_empty && next_rq)
72 __blk_run_queue(q);
73}
74
75static void pre_flush_end_io(struct request *rq, int error)
76{
77 elv_completed_request(rq->q, rq);
78 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
79}
80
81static void flush_data_end_io(struct request *rq, int error)
82{
83 elv_completed_request(rq->q, rq);
84 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
85}
86
87static void post_flush_end_io(struct request *rq, int error)
88{
89 elv_completed_request(rq->q, rq);
90 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
91}
92
93static void init_flush_request(struct request *rq, struct gendisk *disk)
94{
95 rq->cmd_type = REQ_TYPE_FS;
96 rq->cmd_flags = WRITE_FLUSH;
97 rq->rq_disk = disk;
98}
99
100static struct request *queue_next_fseq(struct request_queue *q)
101{
102 struct request *orig_rq = q->orig_flush_rq;
103 struct request *rq = &q->flush_rq;
104
105 blk_rq_init(q, rq);
106
107 switch (blk_flush_cur_seq(q)) {
108 case QUEUE_FSEQ_PREFLUSH:
109 init_flush_request(rq, orig_rq->rq_disk);
110 rq->end_io = pre_flush_end_io;
111 break;
112 case QUEUE_FSEQ_DATA:
113 init_request_from_bio(rq, orig_rq->bio);
114 /*
115 * orig_rq->rq_disk may be different from
116 * bio->bi_bdev->bd_disk if orig_rq got here through
117 * remapping drivers. Make sure rq->rq_disk points
118 * to the same one as orig_rq.
119 */
120 rq->rq_disk = orig_rq->rq_disk;
121 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
122 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
123 rq->end_io = flush_data_end_io;
124 break;
125 case QUEUE_FSEQ_POSTFLUSH:
126 init_flush_request(rq, orig_rq->rq_disk);
127 rq->end_io = post_flush_end_io;
128 break;
129 default:
130 BUG();
131 }
132
133 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
134 return rq;
135}
136
137struct request *blk_do_flush(struct request_queue *q, struct request *rq)
138{
139 unsigned int fflags = q->flush_flags; /* may change, cache it */
140 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
141 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
142 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
143 unsigned skip = 0;
144
145 /*
146 * Special case. If there's data but flush is not necessary,
147 * the request can be issued directly.
148 *
149 * Flush w/o data should be able to be issued directly too but
150 * currently some drivers assume that rq->bio contains
151 * non-zero data if it isn't NULL and empty FLUSH requests
152 * getting here usually have bio's without data.
153 */
154 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
155 rq->cmd_flags &= ~REQ_FLUSH;
156 if (!has_fua)
157 rq->cmd_flags &= ~REQ_FUA;
158 return rq;
159 }
160
161 /*
162 * Sequenced flushes can't be processed in parallel. If
163 * another one is already in progress, queue for later
164 * processing.
165 */
166 if (q->flush_seq) {
167 list_move_tail(&rq->queuelist, &q->pending_flushes);
168 return NULL;
169 }
170
171 /*
172 * Start a new flush sequence
173 */
174 q->flush_err = 0;
175 q->flush_seq |= QUEUE_FSEQ_STARTED;
176
177 /* adjust FLUSH/FUA of the original request and stash it away */
178 rq->cmd_flags &= ~REQ_FLUSH;
179 if (!has_fua)
180 rq->cmd_flags &= ~REQ_FUA;
181 blk_dequeue_request(rq);
182 q->orig_flush_rq = rq;
183
 184	/* skip unneeded sequences and return the first one */
185 if (!do_preflush)
186 skip |= QUEUE_FSEQ_PREFLUSH;
187 if (!blk_rq_sectors(rq))
188 skip |= QUEUE_FSEQ_DATA;
189 if (!do_postflush)
190 skip |= QUEUE_FSEQ_POSTFLUSH;
191 return blk_flush_complete_seq(q, skip, 0);
192}
193
194static void bio_end_flush(struct bio *bio, int err)
195{
196 if (err)
197 clear_bit(BIO_UPTODATE, &bio->bi_flags);
198 if (bio->bi_private)
199 complete(bio->bi_private);
200 bio_put(bio);
201}
202
203/**
204 * blkdev_issue_flush - queue a flush
205 * @bdev: blockdev to issue flush for
206 * @gfp_mask: memory allocation flags (for bio_alloc)
207 * @error_sector: error sector
208 *
209 * Description:
210 * Issue a flush for the block device in question. Caller can supply
211 * room for storing the error offset in case of a flush error, if they
212 * wish to. If WAIT flag is not passed then caller may check only what
213 * request was pushed in some internal queue for later handling.
214 */
215int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
216 sector_t *error_sector)
217{
218 DECLARE_COMPLETION_ONSTACK(wait);
219 struct request_queue *q;
220 struct bio *bio;
221 int ret = 0;
222
223 if (bdev->bd_disk == NULL)
224 return -ENXIO;
225
226 q = bdev_get_queue(bdev);
227 if (!q)
228 return -ENXIO;
229
230 /*
231 * some block devices may not have their queue correctly set up here
232 * (e.g. loop device without a backing file) and so issuing a flush
233 * here will panic. Ensure there is a request function before issuing
234 * the flush.
235 */
236 if (!q->make_request_fn)
237 return -ENXIO;
238
239 bio = bio_alloc(gfp_mask, 0);
240 bio->bi_end_io = bio_end_flush;
241 bio->bi_bdev = bdev;
242 bio->bi_private = &wait;
243
244 bio_get(bio);
245 submit_bio(WRITE_FLUSH, bio);
246 wait_for_completion(&wait);
247
248 /*
249 * The driver must store the error location in ->bi_sector, if
250 * it supports it. For non-stacked drivers, this should be
251 * copied from blk_rq_pos(rq).
252 */
253 if (error_sector)
254 *error_sector = bio->bi_sector;
255
256 if (!bio_flagged(bio, BIO_UPTODATE))
257 ret = -EIO;
258
259 bio_put(bio);
260 return ret;
261}
262EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index c392029a104..1a320d2406b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39{ 39{
40 DECLARE_COMPLETION_ONSTACK(wait); 40 DECLARE_COMPLETION_ONSTACK(wait);
41 struct request_queue *q = bdev_get_queue(bdev); 41 struct request_queue *q = bdev_get_queue(bdev);
42 int type = flags & BLKDEV_IFL_BARRIER ? 42 int type = REQ_WRITE | REQ_DISCARD;
43 DISCARD_BARRIER : DISCARD_NOBARRIER;
44 unsigned int max_discard_sectors; 43 unsigned int max_discard_sectors;
45 struct bio *bio; 44 struct bio *bio;
46 int ret = 0; 45 int ret = 0;
@@ -62,10 +61,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
62 max_discard_sectors &= ~(disc_sects - 1); 61 max_discard_sectors &= ~(disc_sects - 1);
63 } 62 }
64 63
65 if (flags & BLKDEV_IFL_SECURE) { 64 if (flags & BLKDEV_DISCARD_SECURE) {
66 if (!blk_queue_secdiscard(q)) 65 if (!blk_queue_secdiscard(q))
67 return -EOPNOTSUPP; 66 return -EOPNOTSUPP;
68 type |= DISCARD_SECURE; 67 type |= REQ_SECURE;
69 } 68 }
70 69
71 while (nr_sects && !ret) { 70 while (nr_sects && !ret) {
@@ -78,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
78 bio->bi_sector = sector; 77 bio->bi_sector = sector;
79 bio->bi_end_io = blkdev_discard_end_io; 78 bio->bi_end_io = blkdev_discard_end_io;
80 bio->bi_bdev = bdev; 79 bio->bi_bdev = bdev;
81 if (flags & BLKDEV_IFL_WAIT) 80 bio->bi_private = &wait;
82 bio->bi_private = &wait;
83 81
84 if (nr_sects > max_discard_sectors) { 82 if (nr_sects > max_discard_sectors) {
85 bio->bi_size = max_discard_sectors << 9; 83 bio->bi_size = max_discard_sectors << 9;
@@ -93,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
93 bio_get(bio); 91 bio_get(bio);
94 submit_bio(type, bio); 92 submit_bio(type, bio);
95 93
96 if (flags & BLKDEV_IFL_WAIT) 94 wait_for_completion(&wait);
97 wait_for_completion(&wait);
98 95
99 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 96 if (bio_flagged(bio, BIO_EOPNOTSUPP))
100 ret = -EOPNOTSUPP; 97 ret = -EOPNOTSUPP;
@@ -140,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
140 * @sector: start sector 137 * @sector: start sector
141 * @nr_sects: number of sectors to write 138 * @nr_sects: number of sectors to write
142 * @gfp_mask: memory allocation flags (for bio_alloc) 139 * @gfp_mask: memory allocation flags (for bio_alloc)
143 * @flags: BLKDEV_IFL_* flags to control behaviour
144 * 140 *
145 * Description: 141 * Description:
146 * Generate and issue number of bios with zerofiled pages. 142 * Generate and issue number of bios with zerofiled pages.
@@ -149,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
149 */ 145 */
150 146
151int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 147int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
152 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) 148 sector_t nr_sects, gfp_t gfp_mask)
153{ 149{
154 int ret; 150 int ret;
155 struct bio *bio; 151 struct bio *bio;
@@ -162,12 +158,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
162 bb.wait = &wait; 158 bb.wait = &wait;
163 bb.end_io = NULL; 159 bb.end_io = NULL;
164 160
165 if (flags & BLKDEV_IFL_BARRIER) {
166 /* issue async barrier before the data */
167 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
168 if (ret)
169 return ret;
170 }
171submit: 161submit:
172 ret = 0; 162 ret = 0;
173 while (nr_sects != 0) { 163 while (nr_sects != 0) {
@@ -181,8 +171,7 @@ submit:
181 bio->bi_sector = sector; 171 bio->bi_sector = sector;
182 bio->bi_bdev = bdev; 172 bio->bi_bdev = bdev;
183 bio->bi_end_io = bio_batch_end_io; 173 bio->bi_end_io = bio_batch_end_io;
184 if (flags & BLKDEV_IFL_WAIT) 174 bio->bi_private = &bb;
185 bio->bi_private = &bb;
186 175
187 while (nr_sects != 0) { 176 while (nr_sects != 0) {
188 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 177 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -199,18 +188,10 @@ submit:
199 issued++; 188 issued++;
200 submit_bio(WRITE, bio); 189 submit_bio(WRITE, bio);
201 } 190 }
202 /*
203 * When all data bios are in flight. Send final barrier if requeted.
204 */
205 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
206 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
207 flags & BLKDEV_IFL_WAIT);
208
209 191
210 if (flags & BLKDEV_IFL_WAIT) 192 /* Wait for bios in-flight */
211 /* Wait for bios in-flight */ 193 while (issued != atomic_read(&bb.done))
212 while ( issued != atomic_read(&bb.done)) 194 wait_for_completion(&wait);
213 wait_for_completion(&wait);
214 195
215 if (!test_bit(BIO_UPTODATE, &bb.flags)) 196 if (!test_bit(BIO_UPTODATE, &bb.flags))
216 /* One of bios in the batch was completed with error.*/ 197 /* One of bios in the batch was completed with error.*/
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 315b88c8cbb..701859fb964 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -792,6 +792,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
792} 792}
793EXPORT_SYMBOL(blk_queue_update_dma_alignment); 793EXPORT_SYMBOL(blk_queue_update_dma_alignment);
794 794
795/**
796 * blk_queue_flush - configure queue's cache flush capability
797 * @q: the request queue for the device
798 * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
799 *
800 * Tell block layer cache flush capability of @q. If it supports
801 * flushing, REQ_FLUSH should be set. If it supports bypassing
802 * write cache for individual writes, REQ_FUA should be set.
803 */
804void blk_queue_flush(struct request_queue *q, unsigned int flush)
805{
806 WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
807
808 if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
809 flush &= ~REQ_FUA;
810
811 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
812}
813EXPORT_SYMBOL_GPL(blk_queue_flush);
814
795static int __init blk_settings_init(void) 815static int __init blk_settings_init(void)
796{ 816{
797 blk_max_low_pfn = max_low_pfn - 1; 817 blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk.h b/block/blk.h
index f864012ec30..1e675e5ade0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 51 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 53
54struct request *blk_do_flush(struct request_queue *q, struct request *rq);
55
54static inline struct request *__elv_next_request(struct request_queue *q) 56static inline struct request *__elv_next_request(struct request_queue *q)
55{ 57{
56 struct request *rq; 58 struct request *rq;
@@ -58,7 +60,11 @@ static inline struct request *__elv_next_request(struct request_queue *q)
58 while (1) { 60 while (1) {
59 while (!list_empty(&q->queue_head)) { 61 while (!list_empty(&q->queue_head)) {
60 rq = list_entry_rq(q->queue_head.next); 62 rq = list_entry_rq(q->queue_head.next);
61 if (blk_do_ordered(q, &rq)) 63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
62 return rq; 68 return rq;
63 } 69 }
64 70
diff --git a/block/elevator.c b/block/elevator.c
index 4e11559aa2b..282e8308f7e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q)
617 617
618void elv_insert(struct request_queue *q, struct request *rq, int where) 618void elv_insert(struct request_queue *q, struct request *rq, int where)
619{ 619{
620 struct list_head *pos;
621 unsigned ordseq;
622 int unplug_it = 1; 620 int unplug_it = 1;
623 621
624 trace_block_rq_insert(q, rq); 622 trace_block_rq_insert(q, rq);
@@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
626 rq->q = q; 624 rq->q = q;
627 625
628 switch (where) { 626 switch (where) {
627 case ELEVATOR_INSERT_REQUEUE:
628 /*
629 * Most requeues happen because of a busy condition,
630 * don't force unplug of the queue for that case.
631 * Clear unplug_it and fall through.
632 */
633 unplug_it = 0;
634
629 case ELEVATOR_INSERT_FRONT: 635 case ELEVATOR_INSERT_FRONT:
630 rq->cmd_flags |= REQ_SOFTBARRIER; 636 rq->cmd_flags |= REQ_SOFTBARRIER;
631
632 list_add(&rq->queuelist, &q->queue_head); 637 list_add(&rq->queuelist, &q->queue_head);
633 break; 638 break;
634 639
@@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
668 q->elevator->ops->elevator_add_req_fn(q, rq); 673 q->elevator->ops->elevator_add_req_fn(q, rq);
669 break; 674 break;
670 675
671 case ELEVATOR_INSERT_REQUEUE:
672 /*
673 * If ordered flush isn't in progress, we do front
674 * insertion; otherwise, requests should be requeued
675 * in ordseq order.
676 */
677 rq->cmd_flags |= REQ_SOFTBARRIER;
678
679 /*
680 * Most requeues happen because of a busy condition,
681 * don't force unplug of the queue for that case.
682 */
683 unplug_it = 0;
684
685 if (q->ordseq == 0) {
686 list_add(&rq->queuelist, &q->queue_head);
687 break;
688 }
689
690 ordseq = blk_ordered_req_seq(rq);
691
692 list_for_each(pos, &q->queue_head) {
693 struct request *pos_rq = list_entry_rq(pos);
694 if (ordseq <= blk_ordered_req_seq(pos_rq))
695 break;
696 }
697
698 list_add_tail(&rq->queuelist, pos);
699 break;
700
701 default: 676 default:
702 printk(KERN_ERR "%s: bad insertion point %d\n", 677 printk(KERN_ERR "%s: bad insertion point %d\n",
703 __func__, where); 678 __func__, where);
@@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
716void __elv_add_request(struct request_queue *q, struct request *rq, int where, 691void __elv_add_request(struct request_queue *q, struct request *rq, int where,
717 int plug) 692 int plug)
718{ 693{
719 if (q->ordcolor)
720 rq->cmd_flags |= REQ_ORDERED_COLOR;
721
722 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 694 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
723 /* 695 /* barriers are scheduling boundary, update end_sector */
724 * toggle ordered color
725 */
726 if (rq->cmd_flags & REQ_HARDBARRIER)
727 q->ordcolor ^= 1;
728
729 /*
730 * barriers implicitly indicate back insertion
731 */
732 if (where == ELEVATOR_INSERT_SORT)
733 where = ELEVATOR_INSERT_BACK;
734
735 /*
736 * this request is scheduling boundary, update
737 * end_sector
738 */
739 if (rq->cmd_type == REQ_TYPE_FS || 696 if (rq->cmd_type == REQ_TYPE_FS ||
740 (rq->cmd_flags & REQ_DISCARD)) { 697 (rq->cmd_flags & REQ_DISCARD)) {
741 q->end_sector = rq_end_sector(rq); 698 q->end_sector = rq_end_sector(rq);
@@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
855 e->ops->elevator_completed_req_fn) 812 e->ops->elevator_completed_req_fn)
856 e->ops->elevator_completed_req_fn(q, rq); 813 e->ops->elevator_completed_req_fn(q, rq);
857 } 814 }
858
859 /*
860 * Check if the queue is waiting for fs requests to be
861 * drained for flush sequence.
862 */
863 if (unlikely(q->ordseq)) {
864 struct request *next = NULL;
865
866 if (!list_empty(&q->queue_head))
867 next = list_entry_rq(q->queue_head.next);
868
869 if (!queue_in_flight(q) &&
870 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
871 (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
872 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
873 __blk_run_queue(q);
874 }
875 }
876} 815}
877 816
878#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 817#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
diff --git a/block/ioctl.c b/block/ioctl.c
index 2c15fe0912c..d724ceb1d46 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev)
116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
117 uint64_t len, int secure) 117 uint64_t len, int secure)
118{ 118{
119 unsigned long flags = BLKDEV_IFL_WAIT; 119 unsigned long flags = 0;
120 120
121 if (start & 511) 121 if (start & 511)
122 return -EINVAL; 122 return -EINVAL;
@@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
128 if (start + len > (bdev->bd_inode->i_size >> 9)) 128 if (start + len > (bdev->bd_inode->i_size >> 9))
129 return -EINVAL; 129 return -EINVAL;
130 if (secure) 130 if (secure)
131 flags |= BLKDEV_IFL_SECURE; 131 flags |= BLKDEV_DISCARD_SECURE;
132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
133} 133}
134 134
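
A sketch of what a blkdev_issue_discard() caller looks like after this change; example_discard() is an assumed wrapper name. With BLKDEV_IFL_WAIT gone the call always waits for completion, and BLKDEV_DISCARD_SECURE is the only flag the ioctl path still passes.

#include <linux/blkdev.h>

/* Hypothetical wrapper: discard a range, optionally as a secure erase. */
static int example_discard(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects, bool secure)
{
	unsigned long flags = 0;

	if (secure)
		flags |= BLKDEV_DISCARD_SECURE;

	return blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags);
}
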
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 82bfd5bb4a9..b7f51e4594f 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -483,7 +483,6 @@ static struct brd_device *brd_alloc(int i)
483 if (!brd->brd_queue) 483 if (!brd->brd_queue)
484 goto out_free_dev; 484 goto out_free_dev;
485 blk_queue_make_request(brd->brd_queue, brd_make_request); 485 blk_queue_make_request(brd->brd_queue, brd_make_request);
486 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
487 blk_queue_max_hw_sectors(brd->brd_queue, 1024); 486 blk_queue_max_hw_sectors(brd->brd_queue, 1024);
488 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 487 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
489 488
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c07c370c4c8..9bdcf4393c0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2409,8 +2409,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2409 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2409 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2410 return; 2410 return;
2411 2411
2412 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, 2412 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
2413 BLKDEV_IFL_WAIT);
2414 if (r) { 2413 if (r) {
2415 set_bit(MD_NO_BARRIER, &mdev->flags); 2414 set_bit(MD_NO_BARRIER, &mdev->flags);
2416 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2415 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 760ae0df925..efd6169acf2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -987,7 +987,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
987 987
988 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 988 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
989 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, 989 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
990 NULL, BLKDEV_IFL_WAIT); 990 NULL);
991 if (rv) { 991 if (rv) {
992 dev_err(DEV, "local disk flush failed with status %d\n", rv); 992 dev_err(DEV, "local disk flush failed with status %d\n", rv);
993 /* would rather check on EOPNOTSUPP, but that is not reliable. 993 /* would rather check on EOPNOTSUPP, but that is not reliable.
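
The drbd hunks above show the simplified flush call; as a sketch of the general pattern (example_flush_bdev() is an assumed name), blkdev_issue_flush() now always waits, so the flags argument is gone and the return value is all a caller needs to check.

#include <linux/blkdev.h>

/* Hypothetical helper: flush a device's volatile write cache. */
static int example_flush_bdev(struct block_device *bdev)
{
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}
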
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index de3083b0a4f..6c48b3545f8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -479,17 +479,17 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
479 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; 479 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
480 480
481 if (bio_rw(bio) == WRITE) { 481 if (bio_rw(bio) == WRITE) {
482 bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
483 struct file *file = lo->lo_backing_file; 482 struct file *file = lo->lo_backing_file;
484 483
485 if (barrier) { 484 /* REQ_HARDBARRIER is deprecated */
486 if (unlikely(!file->f_op->fsync)) { 485 if (bio->bi_rw & REQ_HARDBARRIER) {
487 ret = -EOPNOTSUPP; 486 ret = -EOPNOTSUPP;
488 goto out; 487 goto out;
489 } 488 }
490 489
490 if (bio->bi_rw & REQ_FLUSH) {
491 ret = vfs_fsync(file, 0); 491 ret = vfs_fsync(file, 0);
492 if (unlikely(ret)) { 492 if (unlikely(ret && ret != -EINVAL)) {
493 ret = -EIO; 493 ret = -EIO;
494 goto out; 494 goto out;
495 } 495 }
@@ -497,9 +497,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
497 497
498 ret = lo_send(lo, bio, pos); 498 ret = lo_send(lo, bio, pos);
499 499
500 if (barrier && !ret) { 500 if ((bio->bi_rw & REQ_FUA) && !ret) {
501 ret = vfs_fsync(file, 0); 501 ret = vfs_fsync(file, 0);
502 if (unlikely(ret)) 502 if (unlikely(ret && ret != -EINVAL))
503 ret = -EIO; 503 ret = -EIO;
504 } 504 }
505 } else 505 } else
@@ -931,7 +931,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
931 lo->lo_queue->unplug_fn = loop_unplug; 931 lo->lo_queue->unplug_fn = loop_unplug;
932 932
933 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 933 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
934 blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN); 934 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
935 935
936 set_capacity(lo->lo_disk, size); 936 set_capacity(lo->lo_disk, size);
937 bd_set_size(bdev, size << 9); 937 bd_set_size(bdev, size << 9);
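
The loop conversion above suggests the general pattern for bio-based, file-backed drivers: flush the backing file when REQ_FLUSH is set, write the data, then flush again when REQ_FUA asks for durability. A sketch under that assumption; example_write_payload() stands in for the driver's real data path and is not from this series.

#include <linux/fs.h>
#include <linux/bio.h>

/* Stand-in for the driver-specific data path (assumed for this sketch). */
static int example_write_payload(struct file *file, struct bio *bio, loff_t pos)
{
	return 0;
}

static int example_handle_write(struct file *file, struct bio *bio, loff_t pos)
{
	int ret;

	if (bio->bi_rw & REQ_FLUSH) {
		/* preflush: make earlier completed writes stable */
		ret = vfs_fsync(file, 0);
		if (ret && ret != -EINVAL)
			return -EIO;
	}

	ret = example_write_payload(file, bio, pos);

	if (!ret && (bio->bi_rw & REQ_FUA)) {
		/* FUA: make this write itself stable before completing */
		ret = vfs_fsync(file, 0);
		if (ret && ret != -EINVAL)
			ret = -EIO;
	}

	return ret;
}
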
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c6..87311ebac0d 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
310 break; 310 break;
311 311
312 /* filter out block requests we don't understand */ 312 /* filter out block requests we don't understand */
313 if (rq->cmd_type != REQ_TYPE_FS && 313 if (rq->cmd_type != REQ_TYPE_FS) {
314 !(rq->cmd_flags & REQ_HARDBARRIER)) {
315 blk_end_request_all(rq, 0); 314 blk_end_request_all(rq, 0);
316 continue; 315 continue;
317 } 316 }
@@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
439 blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); 438 blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
440 439
441 blk_queue_prep_rq(q, blk_queue_start_tag); 440 blk_queue_prep_rq(q, blk_queue_start_tag);
442 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); 441 blk_queue_flush(q, REQ_FLUSH);
443 442
444 disk->queue = q; 443 disk->queue = q;
445 444
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ef58fccadad..19b3568e932 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
753 753
754 rq->timeout = 60*HZ; 754 rq->timeout = 60*HZ;
755 rq->cmd_type = REQ_TYPE_BLOCK_PC; 755 rq->cmd_type = REQ_TYPE_BLOCK_PC;
756 rq->cmd_flags |= REQ_HARDBARRIER;
757 if (cgc->quiet) 756 if (cgc->quiet)
758 rq->cmd_flags |= REQ_QUIET; 757 rq->cmd_flags |= REQ_QUIET;
759 758
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 03688c2da31..8e1ce2e2916 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
468 blk_queue_dma_alignment(queue, dev->blk_size-1); 468 blk_queue_dma_alignment(queue, dev->blk_size-1);
469 blk_queue_logical_block_size(queue, dev->blk_size); 469 blk_queue_logical_block_size(queue, dev->blk_size);
470 470
471 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH); 471 blk_queue_flush(queue, REQ_FLUSH);
472 472
473 blk_queue_max_segments(queue, -1); 473 blk_queue_max_segments(queue, -1);
474 blk_queue_max_segment_size(queue, dev->bounce_size); 474 blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8320490226b..6ecf89cdf00 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -127,9 +127,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
127 } 127 }
128 } 128 }
129 129
130 if (vbr->req->cmd_flags & REQ_HARDBARRIER)
131 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
132
133 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 130 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
134 131
135 /* 132 /*
@@ -379,31 +376,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
379 vblk->disk->driverfs_dev = &vdev->dev; 376 vblk->disk->driverfs_dev = &vdev->dev;
380 index++; 377 index++;
381 378
382 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) { 379 /* configure queue flush support */
383 /* 380 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
384 * If the FLUSH feature is supported we do have support for 381 blk_queue_flush(q, REQ_FLUSH);
385 * flushing a volatile write cache on the host. Use that
386 * to implement write barrier support.
387 */
388 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
389 } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
390 /*
391 * If the BARRIER feature is supported the host expects us
392 * to order request by tags. This implies there is not
393 * volatile write cache on the host, and that the host
394 * never re-orders outstanding I/O. This feature is not
395 * useful for real-life scenarios and is deprecated.
396 */
397 blk_queue_ordered(q, QUEUE_ORDERED_TAG);
398 } else {
399 /*
400 * If the FLUSH feature is not supported we must assume that
401 * the host does not perform any kind of volatile write
402 * caching. We still need to drain the queue to provide
403 * proper barrier semantics.
404 */
405 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
406 }
407 382
408 /* If disk is read-only in the host, the guest should obey */ 383 /* If disk is read-only in the host, the guest should obey */
409 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 384 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -522,9 +497,9 @@ static const struct virtio_device_id id_table[] = {
522}; 497};
523 498
524static unsigned int features[] = { 499static unsigned int features[] = {
525 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, 500 VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
526 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, 501 VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
527 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY 502 VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
528}; 503};
529 504
530/* 505/*
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 3ff06f475ee..4b33a18c32e 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -96,7 +96,7 @@ struct blkfront_info
96 struct gnttab_free_callback callback; 96 struct gnttab_free_callback callback;
97 struct blk_shadow shadow[BLK_RING_SIZE]; 97 struct blk_shadow shadow[BLK_RING_SIZE];
98 unsigned long shadow_free; 98 unsigned long shadow_free;
99 int feature_barrier; 99 unsigned int feature_flush;
100 int is_ready; 100 int is_ready;
101}; 101};
102 102
@@ -419,26 +419,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
419} 419}
420 420
421 421
422static int xlvbd_barrier(struct blkfront_info *info) 422static void xlvbd_flush(struct blkfront_info *info)
423{ 423{
424 int err; 424 blk_queue_flush(info->rq, info->feature_flush);
425 const char *barrier;
426
427 switch (info->feature_barrier) {
428 case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
429 case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
430 case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
431 default: return -EINVAL;
432 }
433
434 err = blk_queue_ordered(info->rq, info->feature_barrier);
435
436 if (err)
437 return err;
438
439 printk(KERN_INFO "blkfront: %s: barriers %s\n", 425 printk(KERN_INFO "blkfront: %s: barriers %s\n",
440 info->gd->disk_name, barrier); 426 info->gd->disk_name,
441 return 0; 427 info->feature_flush ? "enabled" : "disabled");
442} 428}
443 429
444 430
@@ -517,7 +503,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
517 info->rq = gd->queue; 503 info->rq = gd->queue;
518 info->gd = gd; 504 info->gd = gd;
519 505
520 xlvbd_barrier(info); 506 xlvbd_flush(info);
521 507
522 if (vdisk_info & VDISK_READONLY) 508 if (vdisk_info & VDISK_READONLY)
523 set_disk_ro(gd, 1); 509 set_disk_ro(gd, 1);
@@ -663,8 +649,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
663 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 649 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
664 info->gd->disk_name); 650 info->gd->disk_name);
665 error = -EOPNOTSUPP; 651 error = -EOPNOTSUPP;
666 info->feature_barrier = QUEUE_ORDERED_NONE; 652 info->feature_flush = 0;
667 xlvbd_barrier(info); 653 xlvbd_flush(info);
668 } 654 }
669 /* fall through */ 655 /* fall through */
670 case BLKIF_OP_READ: 656 case BLKIF_OP_READ:
@@ -1077,20 +1063,20 @@ static void blkfront_connect(struct blkfront_info *info)
1077 /* 1063 /*
1078 * If there's no "feature-barrier" defined, then it means 1064 * If there's no "feature-barrier" defined, then it means
1079 * we're dealing with a very old backend which writes 1065 * we're dealing with a very old backend which writes
1080 * synchronously; draining will do what needs to get done. 1066 * synchronously; nothing to do.
1081 * 1067 *
1082 * If there are barriers, then we can do full queued writes 1068 * If there are barriers, then we use flush.
1083 * with tagged barriers.
1084 *
1085 * If barriers are not supported, then there's not much we can
1086 * do, so just set ordering to NONE.
1087 */ 1069 */
1088 if (err) 1070 info->feature_flush = 0;
1089 info->feature_barrier = QUEUE_ORDERED_DRAIN; 1071
1090 else if (barrier) 1072 /*
1091 info->feature_barrier = QUEUE_ORDERED_TAG; 1073 * The driver doesn't properly handle empty flushes, so
1092 else 1074 * let's disable barrier support for now.
1093 info->feature_barrier = QUEUE_ORDERED_NONE; 1075 */
1076#if 0
1077 if (!err && barrier)
1078 info->feature_flush = REQ_FLUSH;
1079#endif
1094 1080
1095 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1081 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1096 if (err) { 1082 if (err) {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 7433e07de30..7c5b01ce51d 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -516,10 +516,10 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect)
516 return ide_no_data_taskfile(drive, &cmd); 516 return ide_no_data_taskfile(drive, &cmd);
517} 517}
518 518
519static void update_ordered(ide_drive_t *drive) 519static void update_flush(ide_drive_t *drive)
520{ 520{
521 u16 *id = drive->id; 521 u16 *id = drive->id;
522 unsigned ordered = QUEUE_ORDERED_NONE; 522 unsigned flush = 0;
523 523
524 if (drive->dev_flags & IDE_DFLAG_WCACHE) { 524 if (drive->dev_flags & IDE_DFLAG_WCACHE) {
525 unsigned long long capacity; 525 unsigned long long capacity;
@@ -543,13 +543,12 @@ static void update_ordered(ide_drive_t *drive)
543 drive->name, barrier ? "" : "not "); 543 drive->name, barrier ? "" : "not ");
544 544
545 if (barrier) { 545 if (barrier) {
546 ordered = QUEUE_ORDERED_DRAIN_FLUSH; 546 flush = REQ_FLUSH;
547 blk_queue_prep_rq(drive->queue, idedisk_prep_fn); 547 blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
548 } 548 }
549 } else 549 }
550 ordered = QUEUE_ORDERED_DRAIN;
551 550
552 blk_queue_ordered(drive->queue, ordered); 551 blk_queue_flush(drive->queue, flush);
553} 552}
554 553
555ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); 554ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
@@ -572,7 +571,7 @@ static int set_wcache(ide_drive_t *drive, int arg)
572 } 571 }
573 } 572 }
574 573
575 update_ordered(drive); 574 update_flush(drive);
576 575
577 return err; 576 return err;
578} 577}
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index a381be81407..999dac054bc 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -441,19 +441,6 @@ void do_ide_request(struct request_queue *q)
441 struct request *rq = NULL; 441 struct request *rq = NULL;
442 ide_startstop_t startstop; 442 ide_startstop_t startstop;
443 443
444 /*
445 * drive is doing pre-flush, ordered write, post-flush sequence. even
446 * though that is 3 requests, it must be seen as a single transaction.
447 * we must not preempt this drive until that is complete
448 */
449 if (blk_queue_flushing(q))
450 /*
451 * small race where queue could get replugged during
452 * the 3-request flush cycle, just yank the plug since
453 * we want it to finish asap
454 */
455 blk_remove_plug(q);
456
457 spin_unlock_irq(q->queue_lock); 444 spin_unlock_irq(q->queue_lock);
458 445
459 /* HLD do_request() callback might sleep, make sure it's okay */ 446 /* HLD do_request() callback might sleep, make sure it's okay */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f70..d5b0e4c0e70 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1278,7 +1278,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1278 struct dm_crypt_io *io; 1278 struct dm_crypt_io *io;
1279 struct crypt_config *cc; 1279 struct crypt_config *cc;
1280 1280
1281 if (unlikely(bio_empty_barrier(bio))) { 1281 if (bio->bi_rw & REQ_FLUSH) {
1282 cc = ti->private; 1282 cc = ti->private;
1283 bio->bi_bdev = cc->dev->bdev; 1283 bio->bi_bdev = cc->dev->bdev;
1284 return DM_MAPIO_REMAPPED; 1284 return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab..136d4f71a11 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@ struct dm_io_client {
31 */ 31 */
32struct io { 32struct io {
33 unsigned long error_bits; 33 unsigned long error_bits;
34 unsigned long eopnotsupp_bits;
35 atomic_t count; 34 atomic_t count;
36 struct task_struct *sleeper; 35 struct task_struct *sleeper;
37 struct dm_io_client *client; 36 struct dm_io_client *client;
@@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
130 *---------------------------------------------------------------*/ 129 *---------------------------------------------------------------*/
131static void dec_count(struct io *io, unsigned int region, int error) 130static void dec_count(struct io *io, unsigned int region, int error)
132{ 131{
133 if (error) { 132 if (error)
134 set_bit(region, &io->error_bits); 133 set_bit(region, &io->error_bits);
135 if (error == -EOPNOTSUPP)
136 set_bit(region, &io->eopnotsupp_bits);
137 }
138 134
139 if (atomic_dec_and_test(&io->count)) { 135 if (atomic_dec_and_test(&io->count)) {
140 if (io->sleeper) 136 if (io->sleeper)
@@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
310 sector_t remaining = where->count; 306 sector_t remaining = where->count;
311 307
312 /* 308 /*
313 * where->count may be zero if rw holds a write barrier and we 309 * where->count may be zero if rw holds a flush and we need to
314 * need to send a zero-sized barrier. 310 * send a zero-sized flush.
315 */ 311 */
316 do { 312 do {
317 /* 313 /*
@@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
364 */ 360 */
365 for (i = 0; i < num_regions; i++) { 361 for (i = 0; i < num_regions; i++) {
366 *dp = old_pages; 362 *dp = old_pages;
367 if (where[i].count || (rw & REQ_HARDBARRIER)) 363 if (where[i].count || (rw & REQ_FLUSH))
368 do_region(rw, i, where + i, dp, io); 364 do_region(rw, i, where + i, dp, io);
369 } 365 }
370 366
@@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
393 return -EIO; 389 return -EIO;
394 } 390 }
395 391
396retry:
397 io->error_bits = 0; 392 io->error_bits = 0;
398 io->eopnotsupp_bits = 0;
399 atomic_set(&io->count, 1); /* see dispatch_io() */ 393 atomic_set(&io->count, 1); /* see dispatch_io() */
400 io->sleeper = current; 394 io->sleeper = current;
401 io->client = client; 395 io->client = client;
@@ -412,11 +406,6 @@ retry:
412 } 406 }
413 set_current_state(TASK_RUNNING); 407 set_current_state(TASK_RUNNING);
414 408
415 if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
416 rw &= ~REQ_HARDBARRIER;
417 goto retry;
418 }
419
420 if (error_bits) 409 if (error_bits)
421 *error_bits = io->error_bits; 410 *error_bits = io->error_bits;
422 411
@@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
437 426
438 io = mempool_alloc(client->pool, GFP_NOIO); 427 io = mempool_alloc(client->pool, GFP_NOIO);
439 io->error_bits = 0; 428 io->error_bits = 0;
440 io->eopnotsupp_bits = 0;
441 atomic_set(&io->count, 1); /* see dispatch_io() */ 429 atomic_set(&io->count, 1); /* see dispatch_io() */
442 io->sleeper = NULL; 430 io->sleeper = NULL;
443 io->client = client; 431 io->client = client;
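
With the -EOPNOTSUPP retry path removed, an empty flush through dm-io is simply a zero-sector region submitted with WRITE_FLUSH, as the dm-log and dm-raid1 hunks below do; a sketch with an assumed helper name and caller-supplied client/bdev.

#include <linux/fs.h>
#include <linux/dm-io.h>

/* Hypothetical helper: ask dm-io to send an empty flush to one device. */
static int example_issue_flush(struct dm_io_client *client,
			       struct block_device *bdev)
{
	struct dm_io_region where = {
		.bdev = bdev,
		.sector = 0,
		.count = 0,		/* zero-sized region: flush only */
	};
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = client,
	};

	return dm_io(&io_req, 1, &where, NULL);
}
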
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222d..33420e68d15 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
300 .count = 0, 300 .count = 0,
301 }; 301 };
302 302
303 lc->io_req.bi_rw = WRITE_BARRIER; 303 lc->io_req.bi_rw = WRITE_FLUSH;
304 304
305 return dm_io(&lc->io_req, 1, &null_location, NULL); 305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306} 306}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bcbc3c..19a59b041c2 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti)
259 struct dm_io_region io[ms->nr_mirrors]; 259 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m; 260 struct mirror *m;
261 struct dm_io_request io_req = { 261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER, 262 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 264 .mem.ptr.bvec = NULL,
265 .client = ms->io_client, 265 .client = ms->io_client,
@@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
629 struct dm_io_region io[ms->nr_mirrors], *dest = io; 629 struct dm_io_region io[ms->nr_mirrors], *dest = io;
630 struct mirror *m; 630 struct mirror *m;
631 struct dm_io_request io_req = { 631 struct dm_io_request io_req = {
632 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), 632 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
633 .mem.type = DM_IO_BVEC, 633 .mem.type = DM_IO_BVEC,
634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
635 .notify.fn = write_callback, 635 .notify.fn = write_callback,
@@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 670 bio_list_init(&requeue);
671 671
672 while ((bio = bio_list_pop(writes))) { 672 while ((bio = bio_list_pop(writes))) {
673 if (unlikely(bio_empty_barrier(bio))) { 673 if (bio->bi_rw & REQ_FLUSH) {
674 bio_list_add(&sync, bio); 674 bio_list_add(&sync, bio);
675 continue; 675 continue;
676 } 676 }
@@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1203 * We need to dec pending if this was a write. 1203 * We need to dec pending if this was a write.
1204 */ 1204 */
1205 if (rw == WRITE) { 1205 if (rw == WRITE) {
1206 if (likely(!bio_empty_barrier(bio))) 1206 if (!(bio->bi_rw & REQ_FLUSH))
1207 dm_rh_dec(ms->rh, map_context->ll); 1207 dm_rh_dec(ms->rh, map_context->ll);
1208 return error; 1208 return error;
1209 } 1209 }
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b2886..dad011aed0c 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
81 struct list_head failed_recovered_regions; 81 struct list_head failed_recovered_regions;
82 82
83 /* 83 /*
84 * If there was a barrier failure no regions can be marked clean. 84 * If there was a flush failure no regions can be marked clean.
85 */ 85 */
86 int barrier_failure; 86 int flush_failure;
87 87
88 void *context; 88 void *context;
89 sector_t target_begin; 89 sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
217 INIT_LIST_HEAD(&rh->quiesced_regions); 217 INIT_LIST_HEAD(&rh->quiesced_regions);
218 INIT_LIST_HEAD(&rh->recovered_regions); 218 INIT_LIST_HEAD(&rh->recovered_regions);
219 INIT_LIST_HEAD(&rh->failed_recovered_regions); 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220 rh->barrier_failure = 0; 220 rh->flush_failure = 0;
221 221
222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
223 sizeof(struct dm_region)); 223 sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
399 region_t region = dm_rh_bio_to_region(rh, bio); 399 region_t region = dm_rh_bio_to_region(rh, bio);
400 int recovering = 0; 400 int recovering = 0;
401 401
402 if (bio_empty_barrier(bio)) { 402 if (bio->bi_rw & REQ_FLUSH) {
403 rh->barrier_failure = 1; 403 rh->flush_failure = 1;
404 return; 404 return;
405 } 405 }
406 406
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
524 struct bio *bio; 524 struct bio *bio;
525 525
526 for (bio = bios->head; bio; bio = bio->bi_next) { 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527 if (bio_empty_barrier(bio)) 527 if (bio->bi_rw & REQ_FLUSH)
528 continue; 528 continue;
529 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 } 530 }
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
555 */ 555 */
556 556
557 /* do nothing for DM_RH_NOSYNC */ 557 /* do nothing for DM_RH_NOSYNC */
558 if (unlikely(rh->barrier_failure)) { 558 if (unlikely(rh->flush_failure)) {
559 /* 559 /*
560 * If a write barrier failed some time ago, we 560 * If a write flush failed some time ago, we
561 * don't know whether or not this write made it 561 * don't know whether or not this write made it
562 * to the disk, so we must resync the device. 562 * to the disk, so we must resync the device.
563 */ 563 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb83f9a..0b61792a278 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
687 /* 687 /*
688 * Commit exceptions to disk. 688 * Commit exceptions to disk.
689 */ 689 */
690 if (ps->valid && area_io(ps, WRITE_BARRIER)) 690 if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
691 ps->valid = 0; 691 ps->valid = 0;
692 692
693 /* 693 /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f30f6e8d594..53cf79d8bcb 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1585,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1585 chunk_t chunk; 1585 chunk_t chunk;
1586 struct dm_snap_pending_exception *pe = NULL; 1586 struct dm_snap_pending_exception *pe = NULL;
1587 1587
1588 if (unlikely(bio_empty_barrier(bio))) { 1588 if (bio->bi_rw & REQ_FLUSH) {
1589 bio->bi_bdev = s->cow->bdev; 1589 bio->bi_bdev = s->cow->bdev;
1590 return DM_MAPIO_REMAPPED; 1590 return DM_MAPIO_REMAPPED;
1591 } 1591 }
@@ -1689,7 +1689,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1689 int r = DM_MAPIO_REMAPPED; 1689 int r = DM_MAPIO_REMAPPED;
1690 chunk_t chunk; 1690 chunk_t chunk;
1691 1691
1692 if (unlikely(bio_empty_barrier(bio))) { 1692 if (bio->bi_rw & REQ_FLUSH) {
1693 if (!map_context->target_request_nr) 1693 if (!map_context->target_request_nr)
1694 bio->bi_bdev = s->origin->bdev; 1694 bio->bi_bdev = s->origin->bdev;
1695 else 1695 else
@@ -2133,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
2133 struct dm_dev *dev = ti->private; 2133 struct dm_dev *dev = ti->private;
2134 bio->bi_bdev = dev->bdev; 2134 bio->bi_bdev = dev->bdev;
2135 2135
2136 if (unlikely(bio_empty_barrier(bio))) 2136 if (bio->bi_rw & REQ_FLUSH)
2137 return DM_MAPIO_REMAPPED; 2137 return DM_MAPIO_REMAPPED;
2138 2138
2139 /* Only tell snapshots if this is a write */ 2139 /* Only tell snapshots if this is a write */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6da91e..f0371b4c4fb 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -271,7 +271,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
271 uint32_t stripe; 271 uint32_t stripe;
272 unsigned target_request_nr; 272 unsigned target_request_nr;
273 273
274 if (unlikely(bio_empty_barrier(bio))) { 274 if (bio->bi_rw & REQ_FLUSH) {
275 target_request_nr = map_context->target_request_nr; 275 target_request_nr = map_context->target_request_nr;
276 BUG_ON(target_request_nr >= sc->stripes); 276 BUG_ON(target_request_nr >= sc->stripes);
277 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; 277 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7967eca5a2d..7cb1352f7e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
110#define DMF_FREEING 3 110#define DMF_FREEING 3
111#define DMF_DELETING 4 111#define DMF_DELETING 4
112#define DMF_NOFLUSH_SUSPENDING 5 112#define DMF_NOFLUSH_SUSPENDING 5
113#define DMF_QUEUE_IO_TO_THREAD 6
114 113
115/* 114/*
116 * Work processed by per-device workqueue. 115 * Work processed by per-device workqueue.
@@ -144,24 +143,9 @@ struct mapped_device {
144 spinlock_t deferred_lock; 143 spinlock_t deferred_lock;
145 144
146 /* 145 /*
147 * An error from the barrier request currently being processed. 146 * Processing queue (flush)
148 */
149 int barrier_error;
150
151 /*
152 * Protect barrier_error from concurrent endio processing
153 * in request-based dm.
154 */
155 spinlock_t barrier_error_lock;
156
157 /*
158 * Processing queue (flush/barriers)
159 */ 147 */
160 struct workqueue_struct *wq; 148 struct workqueue_struct *wq;
161 struct work_struct barrier_work;
162
163 /* A pointer to the currently processing pre/post flush request */
164 struct request *flush_request;
165 149
166 /* 150 /*
167 * The current mapping. 151 * The current mapping.
@@ -200,8 +184,8 @@ struct mapped_device {
200 /* sysfs handle */ 184 /* sysfs handle */
201 struct kobject kobj; 185 struct kobject kobj;
202 186
203 /* zero-length barrier that will be cloned and submitted to targets */ 187 /* zero-length flush that will be cloned and submitted to targets */
204 struct bio barrier_bio; 188 struct bio flush_bio;
205}; 189};
206 190
207/* 191/*
@@ -512,7 +496,7 @@ static void end_io_acct(struct dm_io *io)
512 496
513 /* 497 /*
514 * After this is decremented the bio must not be touched if it is 498 * After this is decremented the bio must not be touched if it is
515 * a barrier. 499 * a flush.
516 */ 500 */
517 dm_disk(md)->part0.in_flight[rw] = pending = 501 dm_disk(md)->part0.in_flight[rw] = pending =
518 atomic_dec_return(&md->pending[rw]); 502 atomic_dec_return(&md->pending[rw]);
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
528 */ 512 */
529static void queue_io(struct mapped_device *md, struct bio *bio) 513static void queue_io(struct mapped_device *md, struct bio *bio)
530{ 514{
531 down_write(&md->io_lock); 515 unsigned long flags;
532 516
533 spin_lock_irq(&md->deferred_lock); 517 spin_lock_irqsave(&md->deferred_lock, flags);
534 bio_list_add(&md->deferred, bio); 518 bio_list_add(&md->deferred, bio);
535 spin_unlock_irq(&md->deferred_lock); 519 spin_unlock_irqrestore(&md->deferred_lock, flags);
536 520 queue_work(md->wq, &md->work);
537 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
538 queue_work(md->wq, &md->work);
539
540 up_write(&md->io_lock);
541} 521}
542 522
543/* 523/*
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
625 * Target requested pushing back the I/O. 605 * Target requested pushing back the I/O.
626 */ 606 */
627 spin_lock_irqsave(&md->deferred_lock, flags); 607 spin_lock_irqsave(&md->deferred_lock, flags);
628 if (__noflush_suspending(md)) { 608 if (__noflush_suspending(md))
629 if (!(io->bio->bi_rw & REQ_HARDBARRIER)) 609 bio_list_add_head(&md->deferred, io->bio);
630 bio_list_add_head(&md->deferred, 610 else
631 io->bio);
632 } else
633 /* noflush suspend was interrupted. */ 611 /* noflush suspend was interrupted. */
634 io->error = -EIO; 612 io->error = -EIO;
635 spin_unlock_irqrestore(&md->deferred_lock, flags); 613 spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
637 615
638 io_error = io->error; 616 io_error = io->error;
639 bio = io->bio; 617 bio = io->bio;
618 end_io_acct(io);
619 free_io(md, io);
620
621 if (io_error == DM_ENDIO_REQUEUE)
622 return;
640 623
641 if (bio->bi_rw & REQ_HARDBARRIER) { 624 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
642 /* 625 /*
643 * There can be just one barrier request so we use 626 * Preflush done for flush with data, reissue
644 * a per-device variable for error reporting. 627 * without REQ_FLUSH.
645 * Note that you can't touch the bio after end_io_acct
646 *
647 * We ignore -EOPNOTSUPP for empty flush reported by
648 * underlying devices. We assume that if the device
649 * doesn't support empty barriers, it doesn't need
650 * cache flushing commands.
651 */ 628 */
652 if (!md->barrier_error && 629 bio->bi_rw &= ~REQ_FLUSH;
653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) 630 queue_io(md, bio);
654 md->barrier_error = io_error;
655 end_io_acct(io);
656 free_io(md, io);
657 } else { 631 } else {
658 end_io_acct(io); 632 /* done with normal IO or empty flush */
659 free_io(md, io); 633 trace_block_bio_complete(md->queue, bio);
660 634 bio_endio(bio, io_error);
661 if (io_error != DM_ENDIO_REQUEUE) {
662 trace_block_bio_complete(md->queue, bio);
663
664 bio_endio(bio, io_error);
665 }
666 } 635 }
667 } 636 }
668} 637}
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
755 blk_update_request(tio->orig, 0, nr_bytes); 724 blk_update_request(tio->orig, 0, nr_bytes);
756} 725}
757 726
758static void store_barrier_error(struct mapped_device *md, int error)
759{
760 unsigned long flags;
761
762 spin_lock_irqsave(&md->barrier_error_lock, flags);
763 /*
764 * Basically, the first error is taken, but:
765 * -EOPNOTSUPP supersedes any I/O error.
766 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
767 */
768 if (!md->barrier_error || error == -EOPNOTSUPP ||
769 (md->barrier_error != -EOPNOTSUPP &&
770 error == DM_ENDIO_REQUEUE))
771 md->barrier_error = error;
772 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
773}
774
775/* 727/*
776 * Don't touch any member of the md after calling this function because 728 * Don't touch any member of the md after calling this function because
777 * the md may be freed in dm_put() at the end of this function. 729 * the md may be freed in dm_put() at the end of this function.
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone)
809static void dm_end_request(struct request *clone, int error) 761static void dm_end_request(struct request *clone, int error)
810{ 762{
811 int rw = rq_data_dir(clone); 763 int rw = rq_data_dir(clone);
812 int run_queue = 1;
813 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
814 struct dm_rq_target_io *tio = clone->end_io_data; 764 struct dm_rq_target_io *tio = clone->end_io_data;
815 struct mapped_device *md = tio->md; 765 struct mapped_device *md = tio->md;
816 struct request *rq = tio->orig; 766 struct request *rq = tio->orig;
817 767
818 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { 768 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
819 rq->errors = clone->errors; 769 rq->errors = clone->errors;
820 rq->resid_len = clone->resid_len; 770 rq->resid_len = clone->resid_len;
821 771
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
829 } 779 }
830 780
831 free_rq_clone(clone); 781 free_rq_clone(clone);
832 782 blk_end_request_all(rq, error);
833 if (unlikely(is_barrier)) { 783 rq_completed(md, rw, true);
834 if (unlikely(error))
835 store_barrier_error(md, error);
836 run_queue = 0;
837 } else
838 blk_end_request_all(rq, error);
839
840 rq_completed(md, rw, run_queue);
841} 784}
842 785
843static void dm_unprep_request(struct request *rq) 786static void dm_unprep_request(struct request *rq)
@@ -862,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone)
862 struct request_queue *q = rq->q; 805 struct request_queue *q = rq->q;
863 unsigned long flags; 806 unsigned long flags;
864 807
865 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
866 /*
867 * Barrier clones share an original request.
868 * Leave it to dm_end_request(), which handles this special
869 * case.
870 */
871 dm_end_request(clone, DM_ENDIO_REQUEUE);
872 return;
873 }
874
875 dm_unprep_request(rq); 808 dm_unprep_request(rq);
876 809
877 spin_lock_irqsave(q->queue_lock, flags); 810 spin_lock_irqsave(q->queue_lock, flags);
@@ -961,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error)
961 struct dm_rq_target_io *tio = clone->end_io_data; 894 struct dm_rq_target_io *tio = clone->end_io_data;
962 struct request *rq = tio->orig; 895 struct request *rq = tio->orig;
963 896
964 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
965 /*
966 * Barrier clones share an original request. So can't use
967 * softirq_done with the original.
968 * Pass the clone to dm_done() directly in this special case.
969 * It is safe (even if clone->q->queue_lock is held here)
970 * because there is no I/O dispatching during the completion
971 * of barrier clone.
972 */
973 dm_done(clone, error, true);
974 return;
975 }
976
977 tio->error = error; 897 tio->error = error;
978 rq->completion_data = clone; 898 rq->completion_data = clone;
979 blk_complete_request(rq); 899 blk_complete_request(rq);
@@ -990,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
990 struct dm_rq_target_io *tio = clone->end_io_data; 910 struct dm_rq_target_io *tio = clone->end_io_data;
991 struct request *rq = tio->orig; 911 struct request *rq = tio->orig;
992 912
993 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
994 /*
995 * Barrier clones share an original request.
996 * Leave it to dm_end_request(), which handles this special
997 * case.
998 */
999 BUG_ON(error > 0);
1000 dm_end_request(clone, error);
1001 return;
1002 }
1003
1004 rq->cmd_flags |= REQ_FAILED; 913 rq->cmd_flags |= REQ_FAILED;
1005 dm_complete_request(clone, error); 914 dm_complete_request(clone, error);
1006} 915}
@@ -1119,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio)
1119} 1028}
1120 1029
1121/* 1030/*
1122 * Creates a little bio that is just does part of a bvec. 1031 * Creates a little bio that just does part of a bvec.
1123 */ 1032 */
1124static struct bio *split_bvec(struct bio *bio, sector_t sector, 1033static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 unsigned short idx, unsigned int offset, 1034 unsigned short idx, unsigned int offset,
@@ -1134,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
1134 1043
1135 clone->bi_sector = sector; 1044 clone->bi_sector = sector;
1136 clone->bi_bdev = bio->bi_bdev; 1045 clone->bi_bdev = bio->bi_bdev;
1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1046 clone->bi_rw = bio->bi_rw;
1138 clone->bi_vcnt = 1; 1047 clone->bi_vcnt = 1;
1139 clone->bi_size = to_bytes(len); 1048 clone->bi_size = to_bytes(len);
1140 clone->bi_io_vec->bv_offset = offset; 1049 clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
1161 1070
1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1071 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 __bio_clone(clone, bio); 1072 __bio_clone(clone, bio);
1164 clone->bi_rw &= ~REQ_HARDBARRIER;
1165 clone->bi_destructor = dm_bio_destructor; 1073 clone->bi_destructor = dm_bio_destructor;
1166 clone->bi_sector = sector; 1074 clone->bi_sector = sector;
1167 clone->bi_idx = idx; 1075 clone->bi_idx = idx;
@@ -1225,16 +1133,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1225 __issue_target_request(ci, ti, request_nr, len); 1133 __issue_target_request(ci, ti, request_nr, len);
1226} 1134}
1227 1135
1228static int __clone_and_map_empty_barrier(struct clone_info *ci) 1136static int __clone_and_map_empty_flush(struct clone_info *ci)
1229{ 1137{
1230 unsigned target_nr = 0; 1138 unsigned target_nr = 0;
1231 struct dm_target *ti; 1139 struct dm_target *ti;
1232 1140
1141 BUG_ON(bio_has_data(ci->bio));
1233 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1142 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1234 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1143 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1235 1144
1236 ci->sector_count = 0;
1237
1238 return 0; 1145 return 0;
1239} 1146}
1240 1147
@@ -1289,9 +1196,6 @@ static int __clone_and_map(struct clone_info *ci)
1289 sector_t len = 0, max; 1196 sector_t len = 0, max;
1290 struct dm_target_io *tio; 1197 struct dm_target_io *tio;
1291 1198
1292 if (unlikely(bio_empty_barrier(bio)))
1293 return __clone_and_map_empty_barrier(ci);
1294
1295 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1199 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 return __clone_and_map_discard(ci); 1200 return __clone_and_map_discard(ci);
1297 1201
@@ -1383,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1383 1287
1384 ci.map = dm_get_live_table(md); 1288 ci.map = dm_get_live_table(md);
1385 if (unlikely(!ci.map)) { 1289 if (unlikely(!ci.map)) {
1386 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1290 bio_io_error(bio);
1387 bio_io_error(bio);
1388 else
1389 if (!md->barrier_error)
1390 md->barrier_error = -EIO;
1391 return; 1291 return;
1392 } 1292 }
1393 1293
1394 ci.md = md; 1294 ci.md = md;
1395 ci.bio = bio;
1396 ci.io = alloc_io(md); 1295 ci.io = alloc_io(md);
1397 ci.io->error = 0; 1296 ci.io->error = 0;
1398 atomic_set(&ci.io->io_count, 1); 1297 atomic_set(&ci.io->io_count, 1);
@@ -1400,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1400 ci.io->md = md; 1299 ci.io->md = md;
1401 spin_lock_init(&ci.io->endio_lock); 1300 spin_lock_init(&ci.io->endio_lock);
1402 ci.sector = bio->bi_sector; 1301 ci.sector = bio->bi_sector;
1403 ci.sector_count = bio_sectors(bio);
1404 if (unlikely(bio_empty_barrier(bio)))
1405 ci.sector_count = 1;
1406 ci.idx = bio->bi_idx; 1302 ci.idx = bio->bi_idx;
1407 1303
1408 start_io_acct(ci.io); 1304 start_io_acct(ci.io);
1409 while (ci.sector_count && !error) 1305 if (bio->bi_rw & REQ_FLUSH) {
1410 error = __clone_and_map(&ci); 1306 ci.bio = &ci.md->flush_bio;
1307 ci.sector_count = 0;
1308 error = __clone_and_map_empty_flush(&ci);
1309 /* dec_pending submits any data associated with flush */
1310 } else {
1311 ci.bio = bio;
1312 ci.sector_count = bio_sectors(bio);
1313 while (ci.sector_count && !error)
1314 error = __clone_and_map(&ci);
1315 }
1411 1316
1412 /* drop the extra reference count */ 1317 /* drop the extra reference count */
1413 dec_pending(ci.io, error); 1318 dec_pending(ci.io, error);
@@ -1491,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
1491 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1396 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1492 part_stat_unlock(); 1397 part_stat_unlock();
1493 1398
1494 /* 1399 /* if we're suspended, we have to queue this io for later */
1495 * If we're suspended or the thread is processing barriers 1400 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1496 * we have to queue this io for later.
1497 */
1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
1500 up_read(&md->io_lock); 1401 up_read(&md->io_lock);
1501 1402
1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1403 if (bio_rw(bio) != READA)
1503 bio_rw(bio) == READA) { 1404 queue_io(md, bio);
1405 else
1504 bio_io_error(bio); 1406 bio_io_error(bio);
1505 return 0;
1506 }
1507
1508 queue_io(md, bio);
1509
1510 return 0; 1407 return 0;
1511 } 1408 }
1512 1409
@@ -1537,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1537 return _dm_request(q, bio); 1434 return _dm_request(q, bio);
1538} 1435}
1539 1436
1540static bool dm_rq_is_flush_request(struct request *rq)
1541{
1542 if (rq->cmd_flags & REQ_FLUSH)
1543 return true;
1544 else
1545 return false;
1546}
1547
1548void dm_dispatch_request(struct request *rq) 1437void dm_dispatch_request(struct request *rq)
1549{ 1438{
1550 int r; 1439 int r;
@@ -1592,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq,
1592{ 1481{
1593 int r; 1482 int r;
1594 1483
1595 if (dm_rq_is_flush_request(rq)) { 1484 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1596 blk_rq_init(NULL, clone); 1485 dm_rq_bio_constructor, tio);
1597 clone->cmd_type = REQ_TYPE_FS; 1486 if (r)
1598 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1487 return r;
1599 } else {
1600 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1601 dm_rq_bio_constructor, tio);
1602 if (r)
1603 return r;
1604
1605 clone->cmd = rq->cmd;
1606 clone->cmd_len = rq->cmd_len;
1607 clone->sense = rq->sense;
1608 clone->buffer = rq->buffer;
1609 }
1610 1488
1489 clone->cmd = rq->cmd;
1490 clone->cmd_len = rq->cmd_len;
1491 clone->sense = rq->sense;
1492 clone->buffer = rq->buffer;
1611 clone->end_io = end_clone_request; 1493 clone->end_io = end_clone_request;
1612 clone->end_io_data = tio; 1494 clone->end_io_data = tio;
1613 1495
@@ -1648,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1648 struct mapped_device *md = q->queuedata; 1530 struct mapped_device *md = q->queuedata;
1649 struct request *clone; 1531 struct request *clone;
1650 1532
1651 if (unlikely(dm_rq_is_flush_request(rq)))
1652 return BLKPREP_OK;
1653
1654 if (unlikely(rq->special)) { 1533 if (unlikely(rq->special)) {
1655 DMWARN("Already has something in rq->special."); 1534 DMWARN("Already has something in rq->special.");
1656 return BLKPREP_KILL; 1535 return BLKPREP_KILL;
@@ -1727,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q)
1727 struct dm_table *map = dm_get_live_table(md); 1606 struct dm_table *map = dm_get_live_table(md);
1728 struct dm_target *ti; 1607 struct dm_target *ti;
1729 struct request *rq, *clone; 1608 struct request *rq, *clone;
1609 sector_t pos;
1730 1610
1731 /* 1611 /*
1732 * For suspend, check blk_queue_stopped() and increment 1612 * For suspend, check blk_queue_stopped() and increment
@@ -1739,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q)
1739 if (!rq) 1619 if (!rq)
1740 goto plug_and_out; 1620 goto plug_and_out;
1741 1621
1742 if (unlikely(dm_rq_is_flush_request(rq))) { 1622 /* always use block 0 to find the target for flushes for now */
1743 BUG_ON(md->flush_request); 1623 pos = 0;
1744 md->flush_request = rq; 1624 if (!(rq->cmd_flags & REQ_FLUSH))
1745 blk_start_request(rq); 1625 pos = blk_rq_pos(rq);
1746 queue_work(md->wq, &md->barrier_work); 1626
1747 goto out; 1627 ti = dm_table_find_target(map, pos);
1748 } 1628 BUG_ON(!dm_target_is_valid(ti));
1749 1629
1750 ti = dm_table_find_target(map, blk_rq_pos(rq));
1751 if (ti->type->busy && ti->type->busy(ti)) 1630 if (ti->type->busy && ti->type->busy(ti))
1752 goto plug_and_out; 1631 goto plug_and_out;
1753 1632
@@ -1918,7 +1797,6 @@ out:
1918static const struct block_device_operations dm_blk_dops; 1797static const struct block_device_operations dm_blk_dops;
1919 1798
1920static void dm_wq_work(struct work_struct *work); 1799static void dm_wq_work(struct work_struct *work);
1921static void dm_rq_barrier_work(struct work_struct *work);
1922 1800
1923static void dm_init_md_queue(struct mapped_device *md) 1801static void dm_init_md_queue(struct mapped_device *md)
1924{ 1802{
@@ -1940,6 +1818,7 @@ static void dm_init_md_queue(struct mapped_device *md)
1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1818 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 md->queue->unplug_fn = dm_unplug_all; 1819 md->queue->unplug_fn = dm_unplug_all;
1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1820 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1821 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1943} 1822}
1944 1823
1945/* 1824/*
@@ -1972,7 +1851,6 @@ static struct mapped_device *alloc_dev(int minor)
1972 mutex_init(&md->suspend_lock); 1851 mutex_init(&md->suspend_lock);
1973 mutex_init(&md->type_lock); 1852 mutex_init(&md->type_lock);
1974 spin_lock_init(&md->deferred_lock); 1853 spin_lock_init(&md->deferred_lock);
1975 spin_lock_init(&md->barrier_error_lock);
1976 rwlock_init(&md->map_lock); 1854 rwlock_init(&md->map_lock);
1977 atomic_set(&md->holders, 1); 1855 atomic_set(&md->holders, 1);
1978 atomic_set(&md->open_count, 0); 1856 atomic_set(&md->open_count, 0);
@@ -1995,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor)
1995 atomic_set(&md->pending[1], 0); 1873 atomic_set(&md->pending[1], 0);
1996 init_waitqueue_head(&md->wait); 1874 init_waitqueue_head(&md->wait);
1997 INIT_WORK(&md->work, dm_wq_work); 1875 INIT_WORK(&md->work, dm_wq_work);
1998 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1999 init_waitqueue_head(&md->eventq); 1876 init_waitqueue_head(&md->eventq);
2000 1877
2001 md->disk->major = _major; 1878 md->disk->major = _major;
@@ -2015,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor)
2015 if (!md->bdev) 1892 if (!md->bdev)
2016 goto bad_bdev; 1893 goto bad_bdev;
2017 1894
1895 bio_init(&md->flush_bio);
1896 md->flush_bio.bi_bdev = md->bdev;
1897 md->flush_bio.bi_rw = WRITE_FLUSH;
1898
2018 /* Populate the mapping, nobody knows we exist yet */ 1899 /* Populate the mapping, nobody knows we exist yet */
2019 spin_lock(&_minor_lock); 1900 spin_lock(&_minor_lock);
2020 old_md = idr_replace(&_minor_idr, md, minor); 1901 old_md = idr_replace(&_minor_idr, md, minor);
@@ -2245,7 +2126,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2245 blk_queue_softirq_done(md->queue, dm_softirq_done); 2126 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 blk_queue_prep_rq(md->queue, dm_prep_fn); 2127 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 blk_queue_lld_busy(md->queue, dm_lld_busy); 2128 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
2249 2129
2250 elv_register_queue(md->queue); 2130 elv_register_queue(md->queue);
2251 2131
@@ -2406,43 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2406 return r; 2286 return r;
2407} 2287}
2408 2288
2409static void dm_flush(struct mapped_device *md)
2410{
2411 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2412
2413 bio_init(&md->barrier_bio);
2414 md->barrier_bio.bi_bdev = md->bdev;
2415 md->barrier_bio.bi_rw = WRITE_BARRIER;
2416 __split_and_process_bio(md, &md->barrier_bio);
2417
2418 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419}
2420
2421static void process_barrier(struct mapped_device *md, struct bio *bio)
2422{
2423 md->barrier_error = 0;
2424
2425 dm_flush(md);
2426
2427 if (!bio_empty_barrier(bio)) {
2428 __split_and_process_bio(md, bio);
2429 /*
2430 * If the request isn't supported, don't waste time with
2431 * the second flush.
2432 */
2433 if (md->barrier_error != -EOPNOTSUPP)
2434 dm_flush(md);
2435 }
2436
2437 if (md->barrier_error != DM_ENDIO_REQUEUE)
2438 bio_endio(bio, md->barrier_error);
2439 else {
2440 spin_lock_irq(&md->deferred_lock);
2441 bio_list_add_head(&md->deferred, bio);
2442 spin_unlock_irq(&md->deferred_lock);
2443 }
2444}
2445
2446/* 2289/*
2447 * Process the deferred bios 2290 * Process the deferred bios
2448 */ 2291 */
@@ -2452,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work)
2452 work); 2295 work);
2453 struct bio *c; 2296 struct bio *c;
2454 2297
2455 down_write(&md->io_lock); 2298 down_read(&md->io_lock);
2456 2299
2457 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2300 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2458 spin_lock_irq(&md->deferred_lock); 2301 spin_lock_irq(&md->deferred_lock);
2459 c = bio_list_pop(&md->deferred); 2302 c = bio_list_pop(&md->deferred);
2460 spin_unlock_irq(&md->deferred_lock); 2303 spin_unlock_irq(&md->deferred_lock);
2461 2304
2462 if (!c) { 2305 if (!c)
2463 clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2464 break; 2306 break;
2465 }
2466 2307
2467 up_write(&md->io_lock); 2308 up_read(&md->io_lock);
2468 2309
2469 if (dm_request_based(md)) 2310 if (dm_request_based(md))
2470 generic_make_request(c); 2311 generic_make_request(c);
2471 else { 2312 else
2472 if (c->bi_rw & REQ_HARDBARRIER) 2313 __split_and_process_bio(md, c);
2473 process_barrier(md, c);
2474 else
2475 __split_and_process_bio(md, c);
2476 }
2477 2314
2478 down_write(&md->io_lock); 2315 down_read(&md->io_lock);
2479 } 2316 }
2480 2317
2481 up_write(&md->io_lock); 2318 up_read(&md->io_lock);
2482} 2319}
2483 2320
2484static void dm_queue_flush(struct mapped_device *md) 2321static void dm_queue_flush(struct mapped_device *md)
@@ -2488,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md)
2488 queue_work(md->wq, &md->work); 2325 queue_work(md->wq, &md->work);
2489} 2326}
2490 2327
2491static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
2492{
2493 struct dm_rq_target_io *tio = clone->end_io_data;
2494
2495 tio->info.target_request_nr = request_nr;
2496}
2497
2498/* Issue barrier requests to targets and wait for their completion. */
2499static int dm_rq_barrier(struct mapped_device *md)
2500{
2501 int i, j;
2502 struct dm_table *map = dm_get_live_table(md);
2503 unsigned num_targets = dm_table_get_num_targets(map);
2504 struct dm_target *ti;
2505 struct request *clone;
2506
2507 md->barrier_error = 0;
2508
2509 for (i = 0; i < num_targets; i++) {
2510 ti = dm_table_get_target(map, i);
2511 for (j = 0; j < ti->num_flush_requests; j++) {
2512 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2513 dm_rq_set_target_request_nr(clone, j);
2514 atomic_inc(&md->pending[rq_data_dir(clone)]);
2515 map_request(ti, clone, md);
2516 }
2517 }
2518
2519 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2520 dm_table_put(map);
2521
2522 return md->barrier_error;
2523}
2524
2525static void dm_rq_barrier_work(struct work_struct *work)
2526{
2527 int error;
2528 struct mapped_device *md = container_of(work, struct mapped_device,
2529 barrier_work);
2530 struct request_queue *q = md->queue;
2531 struct request *rq;
2532 unsigned long flags;
2533
2534 /*
2535 * Hold the md reference here and leave it at the last part so that
2536 * the md can't be deleted by device opener when the barrier request
2537 * completes.
2538 */
2539 dm_get(md);
2540
2541 error = dm_rq_barrier(md);
2542
2543 rq = md->flush_request;
2544 md->flush_request = NULL;
2545
2546 if (error == DM_ENDIO_REQUEUE) {
2547 spin_lock_irqsave(q->queue_lock, flags);
2548 blk_requeue_request(q, rq);
2549 spin_unlock_irqrestore(q->queue_lock, flags);
2550 } else
2551 blk_end_request_all(rq, error);
2552
2553 blk_run_queue(q);
2554
2555 dm_put(md);
2556}
2557
2558/* 2328/*
2559 * Swap in a new table, returning the old one for the caller to destroy. 2329 * Swap in a new table, returning the old one for the caller to destroy.
2560 */ 2330 */
@@ -2677,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2677 * 2447 *
2678 * To get all processes out of __split_and_process_bio in dm_request, 2448 * To get all processes out of __split_and_process_bio in dm_request,
2679 * we take the write lock. To prevent any process from reentering 2449 * we take the write lock. To prevent any process from reentering
2680 * __split_and_process_bio from dm_request, we set 2450 * __split_and_process_bio from dm_request and quiesce the thread
2681 * DMF_QUEUE_IO_TO_THREAD. 2451 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2682 * 2452 * flush_workqueue(md->wq).
2683 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2684 * and call flush_workqueue(md->wq). flush_workqueue will wait until
2685 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2686 * further calls to __split_and_process_bio from dm_wq_work.
2687 */ 2453 */
2688 down_write(&md->io_lock); 2454 down_write(&md->io_lock);
2689 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2455 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2690 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2691 up_write(&md->io_lock); 2456 up_write(&md->io_lock);
2692 2457
2693 /* 2458 /*
2694 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which 2459 * Stop md->queue before flushing md->wq in case request-based
2695 * can be kicked until md->queue is stopped. So stop md->queue before 2460 * dm defers requests to md->wq from md->queue.
2696 * flushing md->wq.
2697 */ 2461 */
2698 if (dm_request_based(md)) 2462 if (dm_request_based(md))
2699 stop_queue(md->queue); 2463 stop_queue(md->queue);
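
The device-mapper hunks above replace the whole barrier path (dm_flush(), process_barrier(), dm_rq_barrier_work()) with a preallocated flush bio that is simply resubmitted as WRITE_FLUSH, with no -EOPNOTSUPP fallback left to handle. A minimal, hypothetical sketch of that calling convention follows; the helper name is invented, while bio_alloc(), submit_bio() and WRITE_FLUSH are the interfaces the patch itself relies on, and error handling is omitted.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/*
 * Sketch only: push cached data to media by issuing an empty preflush
 * bio and let the completion callback observe the result.
 */
static void issue_empty_flush(struct block_device *bdev,
			      bio_end_io_t *done, void *private)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 0);	/* no data pages attached */

	bio->bi_bdev = bdev;
	bio->bi_end_io = done;
	bio->bi_private = private;
	submit_bio(WRITE_FLUSH, bio);	/* empty write with REQ_FLUSH set */
}
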
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3..8a2f767f26d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
294 dev_info_t *tmp_dev; 294 dev_info_t *tmp_dev;
295 sector_t start_sector; 295 sector_t start_sector;
296 296
297 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 297 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
298 md_barrier_request(mddev, bio); 298 md_flush_request(mddev, bio);
299 return 0; 299 return 0;
300 } 300 }
301 301
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dbf822df942..225815197a3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -227,12 +227,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
227 return 0; 227 return 0;
228 } 228 }
229 rcu_read_lock(); 229 rcu_read_lock();
230 if (mddev->suspended || mddev->barrier) { 230 if (mddev->suspended) {
231 DEFINE_WAIT(__wait); 231 DEFINE_WAIT(__wait);
232 for (;;) { 232 for (;;) {
233 prepare_to_wait(&mddev->sb_wait, &__wait, 233 prepare_to_wait(&mddev->sb_wait, &__wait,
234 TASK_UNINTERRUPTIBLE); 234 TASK_UNINTERRUPTIBLE);
235 if (!mddev->suspended && !mddev->barrier) 235 if (!mddev->suspended)
236 break; 236 break;
237 rcu_read_unlock(); 237 rcu_read_unlock();
238 schedule(); 238 schedule();
@@ -283,40 +283,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
283 283
284int mddev_congested(mddev_t *mddev, int bits) 284int mddev_congested(mddev_t *mddev, int bits)
285{ 285{
286 if (mddev->barrier)
287 return 1;
288 return mddev->suspended; 286 return mddev->suspended;
289} 287}
290EXPORT_SYMBOL(mddev_congested); 288EXPORT_SYMBOL(mddev_congested);
291 289
292/* 290/*
293 * Generic barrier handling for md 291 * Generic flush handling for md
294 */ 292 */
295 293
296#define POST_REQUEST_BARRIER ((void*)1) 294static void md_end_flush(struct bio *bio, int err)
297
298static void md_end_barrier(struct bio *bio, int err)
299{ 295{
300 mdk_rdev_t *rdev = bio->bi_private; 296 mdk_rdev_t *rdev = bio->bi_private;
301 mddev_t *mddev = rdev->mddev; 297 mddev_t *mddev = rdev->mddev;
302 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
303 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
304 298
305 rdev_dec_pending(rdev, mddev); 299 rdev_dec_pending(rdev, mddev);
306 300
307 if (atomic_dec_and_test(&mddev->flush_pending)) { 301 if (atomic_dec_and_test(&mddev->flush_pending)) {
308 if (mddev->barrier == POST_REQUEST_BARRIER) { 302 /* The pre-request flush has finished */
309 /* This was a post-request barrier */ 303 schedule_work(&mddev->flush_work);
310 mddev->barrier = NULL;
311 wake_up(&mddev->sb_wait);
312 } else
313 /* The pre-request barrier has finished */
314 schedule_work(&mddev->barrier_work);
315 } 304 }
316 bio_put(bio); 305 bio_put(bio);
317} 306}
318 307
319static void submit_barriers(mddev_t *mddev) 308static void submit_flushes(mddev_t *mddev)
320{ 309{
321 mdk_rdev_t *rdev; 310 mdk_rdev_t *rdev;
322 311
@@ -333,60 +322,56 @@ static void submit_barriers(mddev_t *mddev)
333 atomic_inc(&rdev->nr_pending); 322 atomic_inc(&rdev->nr_pending);
334 rcu_read_unlock(); 323 rcu_read_unlock();
335 bi = bio_alloc(GFP_KERNEL, 0); 324 bi = bio_alloc(GFP_KERNEL, 0);
336 bi->bi_end_io = md_end_barrier; 325 bi->bi_end_io = md_end_flush;
337 bi->bi_private = rdev; 326 bi->bi_private = rdev;
338 bi->bi_bdev = rdev->bdev; 327 bi->bi_bdev = rdev->bdev;
339 atomic_inc(&mddev->flush_pending); 328 atomic_inc(&mddev->flush_pending);
340 submit_bio(WRITE_BARRIER, bi); 329 submit_bio(WRITE_FLUSH, bi);
341 rcu_read_lock(); 330 rcu_read_lock();
342 rdev_dec_pending(rdev, mddev); 331 rdev_dec_pending(rdev, mddev);
343 } 332 }
344 rcu_read_unlock(); 333 rcu_read_unlock();
345} 334}
346 335
347static void md_submit_barrier(struct work_struct *ws) 336static void md_submit_flush_data(struct work_struct *ws)
348{ 337{
349 mddev_t *mddev = container_of(ws, mddev_t, barrier_work); 338 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
350 struct bio *bio = mddev->barrier; 339 struct bio *bio = mddev->flush_bio;
351 340
352 atomic_set(&mddev->flush_pending, 1); 341 atomic_set(&mddev->flush_pending, 1);
353 342
354 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 343 if (bio->bi_size == 0)
355 bio_endio(bio, -EOPNOTSUPP);
356 else if (bio->bi_size == 0)
357 /* an empty barrier - all done */ 344 /* an empty barrier - all done */
358 bio_endio(bio, 0); 345 bio_endio(bio, 0);
359 else { 346 else {
360 bio->bi_rw &= ~REQ_HARDBARRIER; 347 bio->bi_rw &= ~REQ_FLUSH;
361 if (mddev->pers->make_request(mddev, bio)) 348 if (mddev->pers->make_request(mddev, bio))
362 generic_make_request(bio); 349 generic_make_request(bio);
363 mddev->barrier = POST_REQUEST_BARRIER;
364 submit_barriers(mddev);
365 } 350 }
366 if (atomic_dec_and_test(&mddev->flush_pending)) { 351 if (atomic_dec_and_test(&mddev->flush_pending)) {
367 mddev->barrier = NULL; 352 mddev->flush_bio = NULL;
368 wake_up(&mddev->sb_wait); 353 wake_up(&mddev->sb_wait);
369 } 354 }
370} 355}
371 356
372void md_barrier_request(mddev_t *mddev, struct bio *bio) 357void md_flush_request(mddev_t *mddev, struct bio *bio)
373{ 358{
374 spin_lock_irq(&mddev->write_lock); 359 spin_lock_irq(&mddev->write_lock);
375 wait_event_lock_irq(mddev->sb_wait, 360 wait_event_lock_irq(mddev->sb_wait,
376 !mddev->barrier, 361 !mddev->flush_bio,
377 mddev->write_lock, /*nothing*/); 362 mddev->write_lock, /*nothing*/);
378 mddev->barrier = bio; 363 mddev->flush_bio = bio;
379 spin_unlock_irq(&mddev->write_lock); 364 spin_unlock_irq(&mddev->write_lock);
380 365
381 atomic_set(&mddev->flush_pending, 1); 366 atomic_set(&mddev->flush_pending, 1);
382 INIT_WORK(&mddev->barrier_work, md_submit_barrier); 367 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
383 368
384 submit_barriers(mddev); 369 submit_flushes(mddev);
385 370
386 if (atomic_dec_and_test(&mddev->flush_pending)) 371 if (atomic_dec_and_test(&mddev->flush_pending))
387 schedule_work(&mddev->barrier_work); 372 schedule_work(&mddev->flush_work);
388} 373}
389EXPORT_SYMBOL(md_barrier_request); 374EXPORT_SYMBOL(md_flush_request);
390 375
391/* Support for plugging. 376/* Support for plugging.
392 * This mirrors the plugging support in request_queue, but does not 377 * This mirrors the plugging support in request_queue, but does not
@@ -697,31 +682,6 @@ static void super_written(struct bio *bio, int error)
697 bio_put(bio); 682 bio_put(bio);
698} 683}
699 684
700static void super_written_barrier(struct bio *bio, int error)
701{
702 struct bio *bio2 = bio->bi_private;
703 mdk_rdev_t *rdev = bio2->bi_private;
704 mddev_t *mddev = rdev->mddev;
705
706 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
707 error == -EOPNOTSUPP) {
708 unsigned long flags;
709 /* barriers don't appear to be supported :-( */
710 set_bit(BarriersNotsupp, &rdev->flags);
711 mddev->barriers_work = 0;
712 spin_lock_irqsave(&mddev->write_lock, flags);
713 bio2->bi_next = mddev->biolist;
714 mddev->biolist = bio2;
715 spin_unlock_irqrestore(&mddev->write_lock, flags);
716 wake_up(&mddev->sb_wait);
717 bio_put(bio);
718 } else {
719 bio_put(bio2);
720 bio->bi_private = rdev;
721 super_written(bio, error);
722 }
723}
724
725void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 685void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
726 sector_t sector, int size, struct page *page) 686 sector_t sector, int size, struct page *page)
727{ 687{
@@ -730,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
730 * and decrement it on completion, waking up sb_wait 690 * and decrement it on completion, waking up sb_wait
731 * if zero is reached. 691 * if zero is reached.
732 * If an error occurred, call md_error 692 * If an error occurred, call md_error
733 *
734 * As we might need to resubmit the request if REQ_HARDBARRIER
735 * causes ENOTSUPP, we allocate a spare bio...
736 */ 693 */
737 struct bio *bio = bio_alloc(GFP_NOIO, 1); 694 struct bio *bio = bio_alloc(GFP_NOIO, 1);
738 int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
739 695
740 bio->bi_bdev = rdev->bdev; 696 bio->bi_bdev = rdev->bdev;
741 bio->bi_sector = sector; 697 bio->bi_sector = sector;
742 bio_add_page(bio, page, size, 0); 698 bio_add_page(bio, page, size, 0);
743 bio->bi_private = rdev; 699 bio->bi_private = rdev;
744 bio->bi_end_io = super_written; 700 bio->bi_end_io = super_written;
745 bio->bi_rw = rw;
746 701
747 atomic_inc(&mddev->pending_writes); 702 atomic_inc(&mddev->pending_writes);
748 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 703 submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
749 struct bio *rbio; 704 bio);
750 rw |= REQ_HARDBARRIER;
751 rbio = bio_clone(bio, GFP_NOIO);
752 rbio->bi_private = bio;
753 rbio->bi_end_io = super_written_barrier;
754 submit_bio(rw, rbio);
755 } else
756 submit_bio(rw, bio);
757} 705}
758 706
759void md_super_wait(mddev_t *mddev) 707void md_super_wait(mddev_t *mddev)
760{ 708{
761 /* wait for all superblock writes that were scheduled to complete. 709 /* wait for all superblock writes that were scheduled to complete */
762 * if any had to be retried (due to BARRIER problems), retry them
763 */
764 DEFINE_WAIT(wq); 710 DEFINE_WAIT(wq);
765 for(;;) { 711 for(;;) {
766 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 712 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
767 if (atomic_read(&mddev->pending_writes)==0) 713 if (atomic_read(&mddev->pending_writes)==0)
768 break; 714 break;
769 while (mddev->biolist) {
770 struct bio *bio;
771 spin_lock_irq(&mddev->write_lock);
772 bio = mddev->biolist;
773 mddev->biolist = bio->bi_next ;
774 bio->bi_next = NULL;
775 spin_unlock_irq(&mddev->write_lock);
776 submit_bio(bio->bi_rw, bio);
777 }
778 schedule(); 715 schedule();
779 } 716 }
780 finish_wait(&mddev->sb_wait, &wq); 717 finish_wait(&mddev->sb_wait, &wq);
@@ -1071,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1071 clear_bit(Faulty, &rdev->flags); 1008 clear_bit(Faulty, &rdev->flags);
1072 clear_bit(In_sync, &rdev->flags); 1009 clear_bit(In_sync, &rdev->flags);
1073 clear_bit(WriteMostly, &rdev->flags); 1010 clear_bit(WriteMostly, &rdev->flags);
1074 clear_bit(BarriersNotsupp, &rdev->flags);
1075 1011
1076 if (mddev->raid_disks == 0) { 1012 if (mddev->raid_disks == 0) {
1077 mddev->major_version = 0; 1013 mddev->major_version = 0;
@@ -1486,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1486 clear_bit(Faulty, &rdev->flags); 1422 clear_bit(Faulty, &rdev->flags);
1487 clear_bit(In_sync, &rdev->flags); 1423 clear_bit(In_sync, &rdev->flags);
1488 clear_bit(WriteMostly, &rdev->flags); 1424 clear_bit(WriteMostly, &rdev->flags);
1489 clear_bit(BarriersNotsupp, &rdev->flags);
1490 1425
1491 if (mddev->raid_disks == 0) { 1426 if (mddev->raid_disks == 0) {
1492 mddev->major_version = 1; 1427 mddev->major_version = 1;
@@ -4505,7 +4440,6 @@ int md_run(mddev_t *mddev)
4505 /* may be over-ridden by personality */ 4440 /* may be over-ridden by personality */
4506 mddev->resync_max_sectors = mddev->dev_sectors; 4441 mddev->resync_max_sectors = mddev->dev_sectors;
4507 4442
4508 mddev->barriers_work = 1;
4509 mddev->ok_start_degraded = start_dirty_degraded; 4443 mddev->ok_start_degraded = start_dirty_degraded;
4510 4444
4511 if (start_readonly && mddev->ro == 0) 4445 if (start_readonly && mddev->ro == 0)
@@ -4684,7 +4618,6 @@ static void md_clean(mddev_t *mddev)
4684 mddev->recovery = 0; 4618 mddev->recovery = 0;
4685 mddev->in_sync = 0; 4619 mddev->in_sync = 0;
4686 mddev->degraded = 0; 4620 mddev->degraded = 0;
4687 mddev->barriers_work = 0;
4688 mddev->safemode = 0; 4621 mddev->safemode = 0;
4689 mddev->bitmap_info.offset = 0; 4622 mddev->bitmap_info.offset = 0;
4690 mddev->bitmap_info.default_offset = 0; 4623 mddev->bitmap_info.default_offset = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3931299788d..112a2c32db0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
87#define Faulty 1 /* device is known to have a fault */ 87#define Faulty 1 /* device is known to have a fault */
88#define In_sync 2 /* device is in_sync with rest of array */ 88#define In_sync 2 /* device is in_sync with rest of array */
89#define WriteMostly 4 /* Avoid reading if at all possible */ 89#define WriteMostly 4 /* Avoid reading if at all possible */
90#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
91#define AllReserved 6 /* If whole device is reserved for 90#define AllReserved 6 /* If whole device is reserved for
92 * one array */ 91 * one array */
93#define AutoDetected 7 /* added by auto-detect */ 92#define AutoDetected 7 /* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
273 int degraded; /* whether md should consider 272 int degraded; /* whether md should consider
274 * adding a spare 273 * adding a spare
275 */ 274 */
276 int barriers_work; /* initialised to true, cleared as soon
277 * as a barrier request to slave
278 * fails. Only supported
279 */
280 struct bio *biolist; /* bios that need to be retried
281 * because REQ_HARDBARRIER is not supported
282 */
283 275
284 atomic_t recovery_active; /* blocks scheduled, but not written */ 276 atomic_t recovery_active; /* blocks scheduled, but not written */
285 wait_queue_head_t recovery_wait; 277 wait_queue_head_t recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
339 struct attribute_group *to_remove; 331 struct attribute_group *to_remove;
340 struct plug_handle *plug; /* if used by personality */ 332 struct plug_handle *plug; /* if used by personality */
341 333
342 /* Generic barrier handling. 334 /* Generic flush handling.
343 * If there is a pending barrier request, all other 335 * The last preflush to finish schedules a worker to submit
344 * writes are blocked while the devices are flushed. 336 * the rest of the request (without the REQ_FLUSH flag).
345 * The last to finish a flush schedules a worker to
346 * submit the barrier request (without the barrier flag),
347 * then submit more flush requests.
348 */ 337 */
349 struct bio *barrier; 338 struct bio *flush_bio;
350 atomic_t flush_pending; 339 atomic_t flush_pending;
351 struct work_struct barrier_work; 340 struct work_struct flush_work;
352 struct work_struct event_work; /* used by dm to report failure event */ 341 struct work_struct event_work; /* used by dm to report failure event */
353}; 342};
354 343
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
502extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 491extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
503 492
504extern int mddev_congested(mddev_t *mddev, int bits); 493extern int mddev_congested(mddev_t *mddev, int bits);
505extern void md_barrier_request(mddev_t *mddev, struct bio *bio); 494extern void md_flush_request(mddev_t *mddev, struct bio *bio);
506extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 495extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
507 sector_t sector, int size, struct page *page); 496 sector_t sector, int size, struct page *page);
508extern void md_super_wait(mddev_t *mddev); 497extern void md_super_wait(mddev_t *mddev);
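
The md.h comment above is the whole contract: md_flush_request() parks the incoming bio, submit_flushes() sends an empty WRITE_FLUSH to each member device while counting them in flush_pending, and the last completion schedules md_submit_flush_data(), which resubmits the original bio with REQ_FLUSH cleared. A stripped-down sketch of that counting pattern, with invented example_* names and the per-device submission loop elided:

#include <linux/bio.h>
#include <linux/workqueue.h>

struct example_flush {
	struct bio *flush_bio;		/* original REQ_FLUSH bio */
	atomic_t flush_pending;		/* preflushes still in flight */
	struct work_struct flush_work;	/* resubmits the data portion */
};

static void example_end_flush(struct bio *bio, int err)
{
	struct example_flush *f = bio->bi_private;

	if (atomic_dec_and_test(&f->flush_pending))
		schedule_work(&f->flush_work);	/* last preflush finished */
	bio_put(bio);
}

static void example_submit_data(struct work_struct *ws)
{
	struct example_flush *f = container_of(ws, struct example_flush,
					       flush_work);
	struct bio *bio = f->flush_bio;

	if (bio->bi_size == 0)
		bio_endio(bio, 0);		/* empty flush, nothing more to do */
	else {
		bio->bi_rw &= ~REQ_FLUSH;	/* preflush already done */
		generic_make_request(bio);
	}
}

static void example_start_flush(struct example_flush *f, struct bio *bio)
{
	f->flush_bio = bio;
	INIT_WORK(&f->flush_work, example_submit_data);
	atomic_set(&f->flush_pending, 1);	/* bias so we cannot finish early */

	/*
	 * Here the real code (submit_flushes() above) sends one empty
	 * WRITE_FLUSH bio per member device, bumping flush_pending and
	 * setting bi_private/bi_end_io for each.
	 */

	if (atomic_dec_and_test(&f->flush_pending))	/* drop the bias */
		schedule_work(&f->flush_work);
}
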
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a..6d7ddf32ef2 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
142 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 143 struct multipath_info *multipath;
144 144
145 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 145 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
146 md_barrier_request(mddev, bio); 146 md_flush_request(mddev, bio);
147 return 0; 147 return 0;
148 } 148 }
149 149
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623..a39f4c355e5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
483 struct strip_zone *zone; 483 struct strip_zone *zone;
484 mdk_rdev_t *tmp_dev; 484 mdk_rdev_t *tmp_dev;
485 485
486 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 486 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
487 md_barrier_request(mddev, bio); 487 md_flush_request(mddev, bio);
488 return 0; 488 return 0;
489 } 489 }
490 490
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8..378a25894c5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 319 if (r1_bio->bios[mirror] == bio)
320 break; 320 break;
321 321
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 322 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 323 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 324 */
325 r1_bio->mddev->barriers_work = 0; 325 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 326 to_put = bio;
327 } else { 327 if (!uptodate) {
328 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
329 /* an I/O failed, we can't clear the bitmap */
330 set_bit(R1BIO_Degraded, &r1_bio->state);
331 } else
328 /* 332 /*
329 * this branch is our 'one mirror IO has finished' event handler: 333 * Set R1BIO_Uptodate in our master bio, so that we
334 * will return a good error code to the higher
335 * levels even if IO on some other mirrored buffer
336 * fails.
337 *
338 * The 'master' represents the composite IO operation
339 * to user-side. So if something waits for IO, then it
340 * will wait for the 'master' bio.
330 */ 341 */
331 r1_bio->bios[mirror] = NULL; 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 343
333 if (!uptodate) { 344 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 345
335 /* an I/O failed, we can't clear the bitmap */ 346 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 348 atomic_dec(&r1_bio->behind_remaining);
338 /* 349
339 * Set R1BIO_Uptodate in our master bio, so that 350 /*
340 * we will return a good error code for to the higher 351 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 352 * has safely reached all non-writemostly
342 * 353 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 354 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 355 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 356 */
346 */ 357 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 358 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 359 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 360 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 361 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 362 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 363 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 364 (unsigned long long) mbio->bi_sector +
354 365 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 366 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 367 }
372 } 368 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 369 }
370 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
371
375 /* 372 /*
376 *
377 * Let's see if all mirrored write operations have finished 373 * Let's see if all mirrored write operations have finished
378 * already. 374 * already.
379 */ 375 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) 377 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 reschedule_retry(r1_bio); 378 /* free extra copy of the data pages */
383 else { 379 int i = bio->bi_vcnt;
384 /* it really is the end of this request */ 380 while (i--)
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 381 safe_put_page(bio->bi_io_vec[i].bv_page);
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 } 382 }
383 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
385 r1_bio->sectors,
386 !test_bit(R1BIO_Degraded, &r1_bio->state),
387 behind);
388 md_write_end(r1_bio->mddev);
389 raid_end_bio_io(r1_bio);
399 } 390 }
400 391
401 if (to_put) 392 if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
788 struct page **behind_pages = NULL; 779 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 781 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 782 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 783 mdk_rdev_t *blocked_rdev;
793 784
794 /* 785 /*
795 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 787 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 788 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 789 */
802 790
803 md_write_start(mddev, bio); /* wait on superblock update early */ 791 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 809 }
822 finish_wait(&conf->wait_barrier, &w); 810 finish_wait(&conf->wait_barrier, &w);
823 } 811 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 812
832 wait_barrier(conf); 813 wait_barrier(conf);
833 814
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
959 atomic_set(&r1_bio->remaining, 0); 940 atomic_set(&r1_bio->remaining, 0);
960 atomic_set(&r1_bio->behind_remaining, 0); 941 atomic_set(&r1_bio->behind_remaining, 0);
961 942
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER;
963 if (do_barriers)
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl); 943 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 944 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 945 struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 952 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 954 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 955 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 956 mbio->bi_private = r1_bio;
980 957
981 if (behind_pages) { 958 if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1611 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1635 sync_request_write(mddev, r1_bio); 1612 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1613 unplug = 1;
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else { 1614 } else {
1673 int disk; 1615 int disk;
1674 1616
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28..adf8cfd7331 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
117#define R1BIO_IsSync 1 117#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 118#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 119#define R1BIO_BehindIO 3
120#define R1BIO_Barrier 4
121#define R1BIO_BarrierRetry 5
122/* For write-behind requests, we call bi_end_io when 120/* For write-behind requests, we call bi_end_io when
123 * the last non-write-behind device completes, providing 121 * the last non-write-behind device completes, providing
124 * any write was successful. Otherwise we call when 122 * any write was successful. Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124..f0d082f749b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
803 struct bio_list bl; 804 struct bio_list bl;
804 unsigned long flags; 805 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 806 mdk_rdev_t *blocked_rdev;
806 807
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 808 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 809 md_flush_request(mddev, bio);
809 return 0; 810 return 0;
810 } 811 }
811 812
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
965 conf->mirrors[d].rdev->data_offset; 966 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 967 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 968 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 969 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 970 mbio->bi_private = r10_bio;
970 971
971 atomic_inc(&r10_bio->remaining); 972 atomic_inc(&r10_bio->remaining);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43..31140d1259d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
506 int rw; 506 int rw;
507 struct bio *bi; 507 struct bio *bi;
508 mdk_rdev_t *rdev; 508 mdk_rdev_t *rdev;
509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510 rw = WRITE; 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 511 rw = WRITE_FUA;
512 else
513 rw = WRITE;
514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
512 rw = READ; 515 rw = READ;
513 else 516 else
514 continue; 517 continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1031 1034
1032 while (wbi && wbi->bi_sector < 1035 while (wbi && wbi->bi_sector <
1033 dev->sector + STRIPE_SECTORS) { 1036 dev->sector + STRIPE_SECTORS) {
1037 if (wbi->bi_rw & REQ_FUA)
1038 set_bit(R5_WantFUA, &dev->flags);
1034 tx = async_copy_data(1, wbi, dev->page, 1039 tx = async_copy_data(1, wbi, dev->page,
1035 dev->sector, tx); 1040 dev->sector, tx);
1036 wbi = r5_next_bio(wbi, dev->sector); 1041 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1048 int pd_idx = sh->pd_idx; 1053 int pd_idx = sh->pd_idx;
1049 int qd_idx = sh->qd_idx; 1054 int qd_idx = sh->qd_idx;
1050 int i; 1055 int i;
1056 bool fua = false;
1051 1057
1052 pr_debug("%s: stripe %llu\n", __func__, 1058 pr_debug("%s: stripe %llu\n", __func__,
1053 (unsigned long long)sh->sector); 1059 (unsigned long long)sh->sector);
1054 1060
1061 for (i = disks; i--; )
1062 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1063
1055 for (i = disks; i--; ) { 1064 for (i = disks; i--; ) {
1056 struct r5dev *dev = &sh->dev[i]; 1065 struct r5dev *dev = &sh->dev[i];
1057 1066
1058 if (dev->written || i == pd_idx || i == qd_idx) 1067 if (dev->written || i == pd_idx || i == qd_idx) {
1059 set_bit(R5_UPTODATE, &dev->flags); 1068 set_bit(R5_UPTODATE, &dev->flags);
1069 if (fua)
1070 set_bit(R5_WantFUA, &dev->flags);
1071 }
1060 } 1072 }
1061 1073
1062 if (sh->reconstruct_state == reconstruct_state_drain_run) 1074 if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
3281 3293
3282 if (dec_preread_active) { 3294 if (dec_preread_active) {
3283 /* We delay this until after ops_run_io so that if make_request 3295 /* We delay this until after ops_run_io so that if make_request
3284 * is waiting on a barrier, it won't continue until the writes 3296 * is waiting on a flush, it won't continue until the writes
3285 * have actually been submitted. 3297 * have actually been submitted.
3286 */ 3298 */
3287 atomic_dec(&conf->preread_active_stripes); 3299 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
3583 3595
3584 if (dec_preread_active) { 3596 if (dec_preread_active) {
3585 /* We delay this until after ops_run_io so that if make_request 3597 /* We delay this until after ops_run_io so that if make_request
3586 * is waiting on a barrier, it won't continue until the writes 3598 * is waiting on a flush, it won't continue until the writes
3587 * have actually been submitted. 3599 * have actually been submitted.
3588 */ 3600 */
3589 atomic_dec(&conf->preread_active_stripes); 3601 atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
3978 const int rw = bio_data_dir(bi); 3990 const int rw = bio_data_dir(bi);
3979 int remaining; 3991 int remaining;
3980 3992
3981 if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { 3993 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3982 /* Drain all pending writes. We only really need 3994 md_flush_request(mddev, bi);
3983 * to ensure they have been submitted, but this is
3984 * easier.
3985 */
3986 mddev->pers->quiesce(mddev, 1);
3987 mddev->pers->quiesce(mddev, 0);
3988 md_barrier_request(mddev, bi);
3989 return 0; 3995 return 0;
3990 } 3996 }
3991 3997
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4103 finish_wait(&conf->wait_for_overlap, &w); 4109 finish_wait(&conf->wait_for_overlap, &w);
4104 set_bit(STRIPE_HANDLE, &sh->state); 4110 set_bit(STRIPE_HANDLE, &sh->state);
4105 clear_bit(STRIPE_DELAYED, &sh->state); 4111 clear_bit(STRIPE_DELAYED, &sh->state);
4106 if (mddev->barrier && 4112 if ((bi->bi_rw & REQ_SYNC) &&
4107 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4113 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4108 atomic_inc(&conf->preread_active_stripes); 4114 atomic_inc(&conf->preread_active_stripes);
4109 release_stripe(sh); 4115 release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4126 bio_endio(bi, 0); 4132 bio_endio(bi, 0);
4127 } 4133 }
4128 4134
4129 if (mddev->barrier) {
4130 /* We need to wait for the stripes to all be handled.
4131 * So: wait for preread_active_stripes to drop to 0.
4132 */
4133 wait_event(mddev->thread->wqueue,
4134 atomic_read(&conf->preread_active_stripes) == 0);
4135 }
4136 return 0; 4135 return 0;
4137} 4136}
4138 4137
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6..2ace0582b40 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
275 * filling 275 * filling
276 */ 276 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */
278/* 279/*
279 * Write method 280 * Write method
280 */ 281 */
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index e876678176b..9c0b42bfe08 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
128 mq->req = NULL; 128 mq->req = NULL;
129 129
130 blk_queue_prep_rq(mq->queue, mmc_prep_request); 130 blk_queue_prep_rq(mq->queue, mmc_prep_request);
131 blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
132 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); 131 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
133 if (mmc_can_erase(card)) { 132 if (mmc_can_erase(card)) {
134 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue); 133 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 38e6fa9a201..aa95f100176 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2196,7 +2196,6 @@ static void dasd_setup_queue(struct dasd_block *block)
2196 */ 2196 */
2197 blk_queue_max_segment_size(block->request_queue, PAGE_SIZE); 2197 blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
2198 blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1); 2198 blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
2199 blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
2200} 2199}
2201 2200
2202/* 2201/*
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index 93984c9dfe1..aee73fafccc 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -2850,12 +2850,6 @@ aic7xxx_done(struct aic7xxx_host *p, struct aic7xxx_scb *scb)
2850 aic_dev->r_total++; 2850 aic_dev->r_total++;
2851 ptr = aic_dev->r_bins; 2851 ptr = aic_dev->r_bins;
2852 } 2852 }
2853 if(cmd->device->simple_tags && cmd->request->cmd_flags & REQ_HARDBARRIER)
2854 {
2855 aic_dev->barrier_total++;
2856 if(scb->tag_action == MSG_ORDERED_Q_TAG)
2857 aic_dev->ordered_total++;
2858 }
2859 x = scb->sg_length; 2853 x = scb->sg_length;
2860 x >>= 10; 2854 x >>= 10;
2861 for(i=0; i<6; i++) 2855 for(i=0; i<6; i++)
@@ -10125,7 +10119,6 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd,
10125 struct aic_dev_data *aic_dev = cmd->device->hostdata; 10119 struct aic_dev_data *aic_dev = cmd->device->hostdata;
10126 struct scsi_device *sdptr = cmd->device; 10120 struct scsi_device *sdptr = cmd->device;
10127 unsigned char tindex = TARGET_INDEX(cmd); 10121 unsigned char tindex = TARGET_INDEX(cmd);
10128 struct request *req = cmd->request;
10129 int use_sg; 10122 int use_sg;
10130 10123
10131 mask = (0x01 << tindex); 10124 mask = (0x01 << tindex);
@@ -10144,19 +10137,8 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd,
10144 /* We always force TEST_UNIT_READY to untagged */ 10137 /* We always force TEST_UNIT_READY to untagged */
10145 if (cmd->cmnd[0] != TEST_UNIT_READY && sdptr->simple_tags) 10138 if (cmd->cmnd[0] != TEST_UNIT_READY && sdptr->simple_tags)
10146 { 10139 {
10147 if (req->cmd_flags & REQ_HARDBARRIER) 10140 hscb->control |= MSG_SIMPLE_Q_TAG;
10148 { 10141 scb->tag_action = MSG_SIMPLE_Q_TAG;
10149 if(sdptr->ordered_tags)
10150 {
10151 hscb->control |= MSG_ORDERED_Q_TAG;
10152 scb->tag_action = MSG_ORDERED_Q_TAG;
10153 }
10154 }
10155 else
10156 {
10157 hscb->control |= MSG_SIMPLE_Q_TAG;
10158 scb->tag_action = MSG_SIMPLE_Q_TAG;
10159 }
10160 } 10142 }
10161 } 10143 }
10162 if ( !(aic_dev->dtr_pending) && 10144 if ( !(aic_dev->dtr_pending) &&
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index f0cfba9a1fc..535085cd27e 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -130,17 +130,6 @@ static void sas_scsi_task_done(struct sas_task *task)
130 sc->scsi_done(sc); 130 sc->scsi_done(sc);
131} 131}
132 132
133static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd)
134{
135 enum task_attribute ta = TASK_ATTR_SIMPLE;
136 if (cmd->request && blk_rq_tagged(cmd->request)) {
137 if (cmd->device->ordered_tags &&
138 (cmd->request->cmd_flags & REQ_HARDBARRIER))
139 ta = TASK_ATTR_ORDERED;
140 }
141 return ta;
142}
143
144static struct sas_task *sas_create_task(struct scsi_cmnd *cmd, 133static struct sas_task *sas_create_task(struct scsi_cmnd *cmd,
145 struct domain_device *dev, 134 struct domain_device *dev,
146 gfp_t gfp_flags) 135 gfp_t gfp_flags)
@@ -160,7 +149,7 @@ static struct sas_task *sas_create_task(struct scsi_cmnd *cmd,
160 task->ssp_task.retry_count = 1; 149 task->ssp_task.retry_count = 1;
161 int_to_scsilun(cmd->device->lun, &lun); 150 int_to_scsilun(cmd->device->lun, &lun);
162 memcpy(task->ssp_task.LUN, &lun.scsi_lun, 8); 151 memcpy(task->ssp_task.LUN, &lun.scsi_lun, 8);
163 task->ssp_task.task_attr = sas_scsi_get_task_attr(cmd); 152 task->ssp_task.task_attr = TASK_ATTR_SIMPLE;
164 memcpy(task->ssp_task.cdb, cmd->cmnd, 16); 153 memcpy(task->ssp_task.cdb, cmd->cmnd, 16);
165 154
166 task->scatter = scsi_sglist(cmd); 155 task->scatter = scsi_sglist(cmd);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ffa0689ee84..20514c47a5a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2109,7 +2109,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
2109 struct scsi_disk *sdkp = scsi_disk(disk); 2109 struct scsi_disk *sdkp = scsi_disk(disk);
2110 struct scsi_device *sdp = sdkp->device; 2110 struct scsi_device *sdp = sdkp->device;
2111 unsigned char *buffer; 2111 unsigned char *buffer;
2112 unsigned ordered; 2112 unsigned flush = 0;
2113 2113
2114 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, 2114 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
2115 "sd_revalidate_disk\n")); 2115 "sd_revalidate_disk\n"));
@@ -2151,17 +2151,15 @@ static int sd_revalidate_disk(struct gendisk *disk)
2151 2151
2152 /* 2152 /*
2153 * We now have all cache related info, determine how we deal 2153 * We now have all cache related info, determine how we deal
2154 * with ordered requests. Note that as the current SCSI 2154 * with flush requests.
2155 * dispatch function can alter request order, we cannot use
2156 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
2157 */ 2155 */
2158 if (sdkp->WCE) 2156 if (sdkp->WCE) {
2159 ordered = sdkp->DPOFUA 2157 flush |= REQ_FLUSH;
2160 ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH; 2158 if (sdkp->DPOFUA)
2161 else 2159 flush |= REQ_FUA;
2162 ordered = QUEUE_ORDERED_DRAIN; 2160 }
2163 2161
2164 blk_queue_ordered(sdkp->disk->queue, ordered); 2162 blk_queue_flush(sdkp->disk->queue, flush);
2165 2163
2166 set_capacity(disk, sdkp->capacity); 2164 set_capacity(disk, sdkp->capacity);
2167 kfree(buffer); 2165 kfree(buffer);
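
With the QUEUE_ORDERED_* modes gone, a driver describes its cache semantics to the block layer solely through blk_queue_flush(): set REQ_FLUSH if the device has a volatile write cache that can be flushed, and additionally REQ_FUA if it honours forced-unit-access writes. A hedged sketch of the same pattern sd follows above, with invented capability arguments:

#include <linux/blkdev.h>

/*
 * Sketch: advertise cache-flush capabilities under the new API.
 * write_cache/fua stand in for whatever the real driver learns from
 * its hardware (WCE and DPOFUA in sd's case).
 */
static void example_setup_flush(struct request_queue *q,
				bool write_cache, bool fua)
{
	unsigned int flush = 0;

	if (write_cache) {
		flush |= REQ_FLUSH;		/* empty flushes reach the device */
		if (fua)
			flush |= REQ_FUA;	/* FUA writes are honoured */
	}
	blk_queue_flush(q, flush);
}
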
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582fa..b737451e2e9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -370,7 +370,7 @@ int blkdev_fsync(struct file *filp, int datasync)
370 */ 370 */
371 mutex_unlock(&bd_inode->i_mutex); 371 mutex_unlock(&bd_inode->i_mutex);
372 372
373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); 373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
374 if (error == -EOPNOTSUPP) 374 if (error == -EOPNOTSUPP)
375 error = 0; 375 error = 0;
376 376
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f04..5e789f4a3ed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2063,7 +2063,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2063 if (uptodate) { 2063 if (uptodate) {
2064 set_buffer_uptodate(bh); 2064 set_buffer_uptodate(bh);
2065 } else { 2065 } else {
2066 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2066 if (printk_ratelimit()) {
2067 printk(KERN_WARNING "lost page write due to " 2067 printk(KERN_WARNING "lost page write due to "
2068 "I/O error on %s\n", 2068 "I/O error on %s\n",
2069 bdevname(bh->b_bdev, b)); 2069 bdevname(bh->b_bdev, b));
@@ -2200,21 +2200,10 @@ static int write_dev_supers(struct btrfs_device *device,
2200 bh->b_end_io = btrfs_end_buffer_write_sync; 2200 bh->b_end_io = btrfs_end_buffer_write_sync;
2201 } 2201 }
2202 2202
2203 if (i == last_barrier && do_barriers && device->barriers) { 2203 if (i == last_barrier && do_barriers)
2204 ret = submit_bh(WRITE_BARRIER, bh); 2204 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2205 if (ret == -EOPNOTSUPP) { 2205 else
2206 printk("btrfs: disabling barriers on dev %s\n",
2207 device->name);
2208 set_buffer_uptodate(bh);
2209 device->barriers = 0;
2210 /* one reference for submit_bh */
2211 get_bh(bh);
2212 lock_buffer(bh);
2213 ret = submit_bh(WRITE_SYNC, bh);
2214 }
2215 } else {
2216 ret = submit_bh(WRITE_SYNC, bh); 2206 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 2207
2219 if (ret) 2208 if (ret)
2220 errors++; 2209 errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a5..0b81ecdb101 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1695static void btrfs_issue_discard(struct block_device *bdev, 1695static void btrfs_issue_discard(struct block_device *bdev,
1696 u64 start, u64 len) 1696 u64 start, u64 len)
1697{ 1697{
1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1700} 1699}
1701 1700
1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1701static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b..e25e46a8b4e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
398 device->work.func = pending_bios_fn; 398 device->work.func = pending_bios_fn;
399 memcpy(device->uuid, disk_super->dev_item.uuid, 399 memcpy(device->uuid, disk_super->dev_item.uuid,
400 BTRFS_UUID_SIZE); 400 BTRFS_UUID_SIZE);
401 device->barriers = 1;
402 spin_lock_init(&device->io_lock); 401 spin_lock_init(&device->io_lock);
403 device->name = kstrdup(path, GFP_NOFS); 402 device->name = kstrdup(path, GFP_NOFS);
404 if (!device->name) { 403 if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
462 device->devid = orig_dev->devid; 461 device->devid = orig_dev->devid;
463 device->work.func = pending_bios_fn; 462 device->work.func = pending_bios_fn;
464 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 463 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
465 device->barriers = 1;
466 spin_lock_init(&device->io_lock); 464 spin_lock_init(&device->io_lock);
467 INIT_LIST_HEAD(&device->dev_list); 465 INIT_LIST_HEAD(&device->dev_list);
468 INIT_LIST_HEAD(&device->dev_alloc_list); 466 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1489 trans = btrfs_start_transaction(root, 0); 1487 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1488 lock_chunks(root);
1491 1489
1492 device->barriers = 1;
1493 device->writeable = 1; 1490 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1491 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1492 generate_random_uuid(device->uuid);
@@ -3084,7 +3081,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3081 return NULL;
3085 list_add(&device->dev_list, 3082 list_add(&device->dev_list,
3086 &fs_devices->devices); 3083 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3084 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3085 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3086 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2e..2b638b6e4ee 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
42 int running_pending; 42 int running_pending;
43 u64 generation; 43 u64 generation;
44 44
45 int barriers;
46 int writeable; 45 int writeable;
47 int in_fs_metadata; 46 int in_fs_metadata;
48 47
diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1..7f0b9b083f7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
156 if (uptodate) { 156 if (uptodate) {
157 set_buffer_uptodate(bh); 157 set_buffer_uptodate(bh);
158 } else { 158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { 159 if (!quiet_error(bh)) {
160 buffer_io_error(bh); 160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to " 161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n", 162 "I/O error on %s\n",
@@ -2891,7 +2891,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2891 2891
2892 if (err == -EOPNOTSUPP) { 2892 if (err == -EOPNOTSUPP) {
2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2893 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2894 set_bit(BH_Eopnotsupp, &bh->b_state);
2895 } 2894 }
2896 2895
2897 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) 2896 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3030,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3031 bh->b_end_io = end_buffer_write_sync; 3030 bh->b_end_io = end_buffer_write_sync;
3032 ret = submit_bh(rw, bh); 3031 ret = submit_bh(rw, bh);
3033 wait_on_buffer(bh); 3032 wait_on_buffer(bh);
3034 if (buffer_eopnotsupp(bh)) {
3035 clear_buffer_eopnotsupp(bh);
3036 ret = -EOPNOTSUPP;
3037 }
3038 if (!ret && !buffer_uptodate(bh)) 3033 if (!ret && !buffer_uptodate(bh))
3039 ret = -EIO; 3034 ret = -EIO;
3040 } else { 3035 } else {
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a..09b13bb34c9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
94 BLKDEV_IFL_WAIT);
95 return ret; 94 return ret;
96} 95}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546..3f3ff5ee8f9 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync)
128 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
129 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
131 NULL, BLKDEV_IFL_WAIT); 131 NULL);
132 ret = jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
133 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
135 BLKDEV_IFL_WAIT);
136 return ret; 135 return ret;
137} 136}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce5..19aa0d44d82 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,7 +2566,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
2566 discard_block = block + ext4_group_first_block_no(sb, block_group); 2566 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb, 2567 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count); 2568 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count); 2569 ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2570 if (ret == EOPNOTSUPP) { 2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling"); 2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); 2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a..b47d2c9f4fa 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,8 @@ int fat_free_clusters(struct inode *inode, int cluster)
577 577
578 sb_issue_discard(sb, 578 sb_issue_discard(sb,
579 fat_clus_to_blknr(sbi, first_cl), 579 fat_clus_to_blknr(sbi, first_cl),
580 nr_clus * sbi->sec_per_clus); 580 nr_clus * sbi->sec_per_clus,
581 GFP_NOFS, 0);
581 582
582 first_cl = cluster; 583 first_cl = cluster;
583 } 584 }
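
Both discard call sites above switch to the new helper signature, which takes an explicit gfp mask and a flags word; the old BLKDEV_IFL_WAIT/BLKDEV_IFL_BARRIER bits are gone, and waiting for completion is now the default. Assuming the prototype introduced earlier in this series, a caller looks roughly like this (the wrapper name is invented):

#include <linux/blkdev.h>
#include <linux/fs.h>

/*
 * Sketch of a post-series discard of nr_blocks filesystem blocks
 * starting at "block"; 0 in the flags argument means a plain discard.
 */
static int example_discard_range(struct super_block *sb, sector_t block,
				 sector_t nr_blocks)
{
	return sb_issue_discard(sb, block, nr_blocks, GFP_NOFS, 0);
}
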
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f235638..970e682ea75 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
255 255
256 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
257 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
258 if (buffer_eopnotsupp(bhs[i])) { 258 if (!err && !buffer_uptodate(bhs[i]))
259 clear_buffer_eopnotsupp(bhs[i]);
260 err = -EOPNOTSUPP;
261 } else if (!err && !buffer_uptodate(bhs[i]))
262 err = -EIO; 259 err = -EIO;
263 } 260 }
264 return err; 261 return err;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ac750bd31a6..eb01f3575e1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
592 lh->lh_hash = cpu_to_be32(hash); 592 lh->lh_hash = cpu_to_be32(hash);
593 593
594 bh->b_end_io = end_buffer_write_sync; 594 bh->b_end_io = end_buffer_write_sync;
595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
596 goto skip_barrier;
597 get_bh(bh); 595 get_bh(bh);
598 submit_bh(WRITE_BARRIER | REQ_META, bh); 596 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
599 wait_on_buffer(bh);
600 if (buffer_eopnotsupp(bh)) {
601 clear_buffer_eopnotsupp(bh);
602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
605 lock_buffer(bh);
606skip_barrier:
607 get_bh(bh);
608 submit_bh(WRITE_SYNC | REQ_META, bh); 597 submit_bh(WRITE_SYNC | REQ_META, bh);
609 wait_on_buffer(bh); 598 else
610 } 599 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
600 wait_on_buffer(bh);
601
611 if (!buffer_uptodate(bh)) 602 if (!buffer_uptodate(bh))
612 gfs2_io_error_bh(sdp, bh); 603 gfs2_io_error_bh(sdp, bh);
613 brelse(bh); 604 brelse(bh);
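
For illustration, a condensed sketch (not part of the patch) of the new gfs2 log-header write path shown above: an explicit flush + FUA submission replaces the old WRITE_BARRIER submission and its EOPNOTSUPP fallback. example_write_log_header() is hypothetical and assumes the buffer is locked and prepared by the caller.

/* Illustrative sketch only; example_write_log_header() is hypothetical. */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static int example_write_log_header(struct buffer_head *bh, bool flush_ok)
{
        /* bh is assumed locked, uptodate and prepared by the caller */
        bh->b_end_io = end_buffer_write_sync;
        get_bh(bh);
        if (flush_ok)
                submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
        else
                submit_bh(WRITE_SYNC | REQ_META, bh);
        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
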
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fb67f593f40..bef3ab6cf5c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -866,8 +866,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
866 if ((start + nr_sects) != blk) { 866 if ((start + nr_sects) != blk) {
867 rv = blkdev_issue_discard(bdev, start, 867 rv = blkdev_issue_discard(bdev, start,
868 nr_sects, GFP_NOFS, 868 nr_sects, GFP_NOFS,
869 BLKDEV_IFL_WAIT | 869 0);
870 BLKDEV_IFL_BARRIER);
871 if (rv) 870 if (rv)
872 goto fail; 871 goto fail;
873 nr_sects = 0; 872 nr_sects = 0;
@@ -881,8 +880,7 @@ start_new_extent:
881 } 880 }
882 } 881 }
883 if (nr_sects) { 882 if (nr_sects) {
884 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 883 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
885 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
886 if (rv) 884 if (rv)
887 goto fail; 885 goto fail;
888 } 886 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3f030e9efea..85a6883c0ac 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
137 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
138 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139 139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER)
141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
142 142 else
143 /*
144 * Is it possible for another commit to fail at roughly
145 * the same time as this one? If so, we don't want to
146 * trust the barrier flag in the super, but instead want
147 * to remember if we sent a barrier request
148 */
149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 143 ret = sync_dirty_buffer(bh);
167 }
168 144
169 put_bh(bh); /* One for getblk() */ 145 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor); 146 journal_put_journal_head(descriptor);
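
For illustration, a sketch (not part of the patch) of the simplified jbd commit-record write: __sync_dirty_buffer() with WRITE_SYNC | WRITE_FLUSH_FUA replaces the barrier submission and the retry-on-EOPNOTSUPP path. example_write_commit_record() is hypothetical.

/* Illustrative sketch only; example_write_commit_record() is hypothetical. */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static int example_write_commit_record(struct buffer_head *bh, bool barrier)
{
        set_buffer_dirty(bh);
        if (barrier)
                return __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
        return sync_dirty_buffer(bh);
}
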
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb..6571a056e55 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -532,8 +532,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
532 */ 532 */
533 if ((journal->j_fs_dev != journal->j_dev) && 533 if ((journal->j_fs_dev != journal->j_dev) &&
534 (journal->j_flags & JBD2_BARRIER)) 534 (journal->j_flags & JBD2_BARRIER))
535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
536 BLKDEV_IFL_WAIT);
537 if (!(journal->j_flags & JBD2_ABORT)) 536 if (!(journal->j_flags & JBD2_ABORT))
538 jbd2_journal_update_superblock(journal, 1); 537 jbd2_journal_update_superblock(journal, 1);
539 return 0; 538 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 80910f51d4b..bc6be8bda1c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -134,25 +134,11 @@ static int journal_submit_commit_record(journal_t *journal,
134 134
135 if (journal->j_flags & JBD2_BARRIER && 135 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 136 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); 138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139 if (ret == -EOPNOTSUPP) { 139 else
140 printk(KERN_WARNING
141 "JBD2: Disabling barriers on %s, "
142 "not supported by device\n", journal->j_devname);
143 write_lock(&journal->j_state_lock);
144 journal->j_flags &= ~JBD2_BARRIER;
145 write_unlock(&journal->j_state_lock);
146
147 /* And try again, without the barrier */
148 lock_buffer(bh);
149 set_buffer_uptodate(bh);
150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
154 ret = submit_bh(WRITE_SYNC_PLUG, bh); 140 ret = submit_bh(WRITE_SYNC_PLUG, bh);
155 } 141
156 *cbh = bh; 142 *cbh = bh;
157 return ret; 143 return ret;
158} 144}
@@ -166,29 +152,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
166{ 152{
167 int ret = 0; 153 int ret = 0;
168 154
169retry:
170 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
171 wait_on_buffer(bh); 156 wait_on_buffer(bh);
172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
173 printk(KERN_WARNING
174 "JBD2: %s: disabling barries on %s - not supported "
175 "by device\n", __func__, journal->j_devname);
176 write_lock(&journal->j_state_lock);
177 journal->j_flags &= ~JBD2_BARRIER;
178 write_unlock(&journal->j_state_lock);
179
180 lock_buffer(bh);
181 clear_buffer_dirty(bh);
182 set_buffer_uptodate(bh);
183 bh->b_end_io = journal_end_buffer_io_sync;
184
185 ret = submit_bh(WRITE_SYNC_PLUG, bh);
186 if (ret) {
187 unlock_buffer(bh);
188 return ret;
189 }
190 goto retry;
191 }
192 157
193 if (unlikely(!buffer_uptodate(bh))) 158 if (unlikely(!buffer_uptodate(bh)))
194 ret = -EIO; 159 ret = -EIO;
@@ -701,6 +666,16 @@ start_journal_io:
701 } 666 }
702 } 667 }
703 668
669 err = journal_finish_inode_data_buffers(journal, commit_transaction);
670 if (err) {
671 printk(KERN_WARNING
672 "JBD2: Detected IO errors while flushing file data "
673 "on %s\n", journal->j_devname);
674 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
675 jbd2_journal_abort(journal, err);
676 err = 0;
677 }
678
704 /* 679 /*
705 * If the journal is not located on the file system device, 680 * If the journal is not located on the file system device,
706 * then we must flush the file system device before we issue 681 * then we must flush the file system device before we issue
@@ -709,8 +684,7 @@ start_journal_io:
709 if (commit_transaction->t_flushed_data_blocks && 684 if (commit_transaction->t_flushed_data_blocks &&
710 (journal->j_fs_dev != journal->j_dev) && 685 (journal->j_fs_dev != journal->j_dev) &&
711 (journal->j_flags & JBD2_BARRIER)) 686 (journal->j_flags & JBD2_BARRIER))
712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, 687 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
713 BLKDEV_IFL_WAIT);
714 688
715 /* Done it all: now write the commit record asynchronously. */ 689 /* Done it all: now write the commit record asynchronously. */
716 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 690 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -719,19 +693,6 @@ start_journal_io:
719 &cbh, crc32_sum); 693 &cbh, crc32_sum);
720 if (err) 694 if (err)
721 __jbd2_journal_abort_hard(journal); 695 __jbd2_journal_abort_hard(journal);
722 if (journal->j_flags & JBD2_BARRIER)
723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
725 }
726
727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
728 if (err) {
729 printk(KERN_WARNING
730 "JBD2: Detected IO errors while flushing file data "
731 "on %s\n", journal->j_devname);
732 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
733 jbd2_journal_abort(journal, err);
734 err = 0;
735 } 696 }
736 697
737 /* Lo and behold: we have just managed to send a transaction to 698 /* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +806,11 @@ wait_for_iobuf:
845 } 806 }
846 if (!err && !is_journal_aborted(journal)) 807 if (!err && !is_journal_aborted(journal))
847 err = journal_wait_on_commit_record(journal, cbh); 808 err = journal_wait_on_commit_record(journal, cbh);
809 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
810 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
811 journal->j_flags & JBD2_BARRIER) {
812 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
813 }
848 814
849 if (err) 815 if (err)
850 jbd2_journal_abort(journal, err); 816 jbd2_journal_abort(journal, err);
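
For illustration, a sketch (not part of the patch) of the reordered jbd2 flush sequence for an external journal with async commits: the file-system device is flushed before the commit record is submitted, and the journal device is flushed only after the commit record has been waited for, instead of relying on queue draining at submission time. example_async_commit_flushes() and its bdev parameters are hypothetical placeholders.

/* Illustrative sketch only; example_async_commit_flushes() is hypothetical. */
#include <linux/blkdev.h>

static void example_async_commit_flushes(struct block_device *fs_bdev,
                                         struct block_device *journal_bdev)
{
        blkdev_issue_flush(fs_bdev, GFP_KERNEL, NULL);
        /* ...submit the commit record and wait for it here... */
        blkdev_issue_flush(journal_bdev, GFP_KERNEL, NULL);
}
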
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 9f4913f7840..f3b75206e95 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -177,17 +177,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
177 177
178 retry: 178 retry:
179 set_buffer_dirty(nilfs->ns_sbh[0]); 179 set_buffer_dirty(nilfs->ns_sbh[0]);
180
181 if (nilfs_test_opt(sbi, BARRIER)) { 180 if (nilfs_test_opt(sbi, BARRIER)) {
182 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 181 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
183 WRITE_SYNC | WRITE_BARRIER); 182 WRITE_SYNC | WRITE_FLUSH_FUA);
184 if (err == -EOPNOTSUPP) {
185 nilfs_warning(sbi->s_super, __func__,
186 "barrier-based sync failed. "
187 "disabling barriers\n");
188 nilfs_clear_opt(sbi, BARRIER);
189 goto retry;
190 }
191 } else { 183 } else {
192 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 184 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
193 } 185 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ba7c10c917f..d2771510337 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -775,9 +775,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 ret = blkdev_issue_discard(nilfs->ns_bdev, 775 ret = blkdev_issue_discard(nilfs->ns_bdev,
776 start * sects_per_block, 776 start * sects_per_block,
777 nblocks * sects_per_block, 777 nblocks * sects_per_block,
778 GFP_NOFS, 778 GFP_NOFS, 0);
779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
781 if (ret < 0) 779 if (ret < 0)
782 return ret; 780 return ret;
783 nblocks = 0; 781 nblocks = 0;
@@ -787,8 +785,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
787 ret = blkdev_issue_discard(nilfs->ns_bdev, 785 ret = blkdev_issue_discard(nilfs->ns_bdev,
788 start * sects_per_block, 786 start * sects_per_block,
789 nblocks * sects_per_block, 787 nblocks * sects_per_block,
790 GFP_NOFS, 788 GFP_NOFS, 0);
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
792 return ret; 789 return ret;
793} 790}
794 791
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b..91f080cc76c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
152 barrier_done = reiserfs_commit_for_inode(inode); 152 barrier_done = reiserfs_commit_for_inode(inode);
153 reiserfs_write_unlock(inode->i_sb); 153 reiserfs_write_unlock(inode->i_sb);
154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
156 BLKDEV_IFL_WAIT);
157 if (barrier_done < 0) 156 if (barrier_done < 0)
158 return barrier_done; 157 return barrier_done;
159 return (err < 0) ? -EIO : 0; 158 return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa2..076c8b19468 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
138 return 0; 138 return 0;
139} 139}
140 140
141static void disable_barrier(struct super_block *s)
142{
143 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
144 printk("reiserfs: disabling flush barriers on %s\n",
145 reiserfs_bdevname(s));
146}
147
148static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block 141static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
149 *sb) 142 *sb)
150{ 143{
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
677 submit_bh(WRITE, bh); 670 submit_bh(WRITE, bh);
678} 671}
679 672
680static int submit_barrier_buffer(struct buffer_head *bh)
681{
682 get_bh(bh);
683 bh->b_end_io = reiserfs_end_ordered_io;
684 clear_buffer_dirty(bh);
685 if (!buffer_uptodate(bh))
686 BUG();
687 return submit_bh(WRITE_BARRIER, bh);
688}
689
690static void check_barrier_completion(struct super_block *s,
691 struct buffer_head *bh)
692{
693 if (buffer_eopnotsupp(bh)) {
694 clear_buffer_eopnotsupp(bh);
695 disable_barrier(s);
696 set_buffer_uptodate(bh);
697 set_buffer_dirty(bh);
698 reiserfs_write_unlock(s);
699 sync_dirty_buffer(bh);
700 reiserfs_write_lock(s);
701 }
702}
703
704#define CHUNK_SIZE 32 673#define CHUNK_SIZE 32
705struct buffer_chunk { 674struct buffer_chunk {
706 struct buffer_head *bh[CHUNK_SIZE]; 675 struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
1009 struct buffer_head *tbh = NULL; 978 struct buffer_head *tbh = NULL;
1010 unsigned int trans_id = jl->j_trans_id; 979 unsigned int trans_id = jl->j_trans_id;
1011 struct reiserfs_journal *journal = SB_JOURNAL(s); 980 struct reiserfs_journal *journal = SB_JOURNAL(s);
1012 int barrier = 0;
1013 int retval = 0; 981 int retval = 0;
1014 int write_len; 982 int write_len;
1015 983
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
1094 } 1062 }
1095 atomic_dec(&journal->j_async_throttle); 1063 atomic_dec(&journal->j_async_throttle);
1096 1064
1097 /* We're skipping the commit if there's an error */
1098 if (retval || reiserfs_is_journal_aborted(journal))
1099 barrier = 0;
1100
1101 /* wait on everything written so far before writing the commit
1102 * if we are in barrier mode, send the commit down now
1103 */
1104 barrier = reiserfs_barrier_flush(s);
1105 if (barrier) {
1106 int ret;
1107 lock_buffer(jl->j_commit_bh);
1108 ret = submit_barrier_buffer(jl->j_commit_bh);
1109 if (ret == -EOPNOTSUPP) {
1110 set_buffer_uptodate(jl->j_commit_bh);
1111 disable_barrier(s);
1112 barrier = 0;
1113 }
1114 }
1115 for (i = 0; i < (jl->j_len + 1); i++) { 1065 for (i = 0; i < (jl->j_len + 1); i++) {
1116 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1066 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1117 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1067 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
1143 1093
1144 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); 1094 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1145 1095
1146 if (!barrier) { 1096 /* If there was a write error in the journal - we can't commit
1147 /* If there was a write error in the journal - we can't commit 1097 * this transaction - it will be invalid and, if successful,
1148 * this transaction - it will be invalid and, if successful, 1098 * will just end up propagating the write error out to
1149 * will just end up propagating the write error out to 1099 * the file system. */
1150 * the file system. */ 1100 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
1151 if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { 1101 if (buffer_dirty(jl->j_commit_bh))
1152 if (buffer_dirty(jl->j_commit_bh)) 1102 BUG();
1153 BUG(); 1103 mark_buffer_dirty(jl->j_commit_bh) ;
1154 mark_buffer_dirty(jl->j_commit_bh) ;
1155 reiserfs_write_unlock(s);
1156 sync_dirty_buffer(jl->j_commit_bh) ;
1157 reiserfs_write_lock(s);
1158 }
1159 } else {
1160 reiserfs_write_unlock(s); 1104 reiserfs_write_unlock(s);
1161 wait_on_buffer(jl->j_commit_bh); 1105 if (reiserfs_barrier_flush(s))
1106 __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
1107 else
1108 sync_dirty_buffer(jl->j_commit_bh);
1162 reiserfs_write_lock(s); 1109 reiserfs_write_lock(s);
1163 } 1110 }
1164 1111
1165 check_barrier_completion(s, jl->j_commit_bh);
1166
1167 /* If there was a write error in the journal - we can't commit this 1112 /* If there was a write error in the journal - we can't commit this
1168 * transaction - it will be invalid and, if successful, will just end 1113 * transaction - it will be invalid and, if successful, will just end
1169 * up propagating the write error out to the filesystem. */ 1114 * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
1319 jh->j_first_unflushed_offset = cpu_to_le32(offset); 1264 jh->j_first_unflushed_offset = cpu_to_le32(offset);
1320 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1265 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1321 1266
1322 if (reiserfs_barrier_flush(sb)) { 1267 set_buffer_dirty(journal->j_header_bh);
1323 int ret; 1268 reiserfs_write_unlock(sb);
1324 lock_buffer(journal->j_header_bh); 1269
1325 ret = submit_barrier_buffer(journal->j_header_bh); 1270 if (reiserfs_barrier_flush(sb))
1326 if (ret == -EOPNOTSUPP) { 1271 __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
1327 set_buffer_uptodate(journal->j_header_bh); 1272 else
1328 disable_barrier(sb);
1329 goto sync;
1330 }
1331 reiserfs_write_unlock(sb);
1332 wait_on_buffer(journal->j_header_bh);
1333 reiserfs_write_lock(sb);
1334 check_barrier_completion(sb, journal->j_header_bh);
1335 } else {
1336 sync:
1337 set_buffer_dirty(journal->j_header_bh);
1338 reiserfs_write_unlock(sb);
1339 sync_dirty_buffer(journal->j_header_bh); 1273 sync_dirty_buffer(journal->j_header_bh);
1340 reiserfs_write_lock(sb); 1274
1341 } 1275 reiserfs_write_lock(sb);
1342 if (!buffer_uptodate(journal->j_header_bh)) { 1276 if (!buffer_uptodate(journal->j_header_bh)) {
1343 reiserfs_warning(sb, "journal-837", 1277 reiserfs_warning(sb, "journal-837",
1344 "IO error during journal replay"); 1278 "IO error during journal replay");
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21da..1846a0dd703 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -924,19 +924,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 924 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 925 container_of(work, xfs_buf_t, b_iodone_work);
926 926
927 /* 927 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 928 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 929 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 930 xfs_buf_relse(bp);
@@ -1195,7 +1183,7 @@ _xfs_buf_ioapply(
1195 1183
1196 if (bp->b_flags & XBF_ORDERED) { 1184 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1185 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1186 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1187 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1188 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1189 bp->b_flags &= ~_XBF_RUN_QUEUES;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b9..9d021c73ea5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -86,14 +86,6 @@ typedef enum {
86 */ 86 */
87#define _XBF_PAGE_LOCKED (1 << 22) 87#define _XBF_PAGE_LOCKED (1 << 22)
88 88
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 89typedef unsigned int xfs_buf_flags_t;
98 90
99#define XFS_BUF_FLAGS \ 91#define XFS_BUF_FLAGS \
@@ -114,8 +106,7 @@ typedef unsigned int xfs_buf_flags_t;
114 { _XBF_PAGES, "PAGES" }, \ 106 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 107 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 108 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 109 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119 110
120 111
121typedef enum { 112typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955..08fd3102128 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -693,8 +693,7 @@ void
693xfs_blkdev_issue_flush( 693xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 694 xfs_buftarg_t *buftarg)
695{ 695{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 697}
699 698
700STATIC void 699STATIC void
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a..8fe311a456e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -325,7 +325,6 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a4..ba8e36e0b4e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -917,19 +917,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 917 l = iclog->ic_log;
918 918
919 /* 919 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 920 * Race to shutdown the filesystem if we see an error.
934 */ 921 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 922 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d36629620a4..0437ab6bb54 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -146,7 +146,6 @@ enum rq_flag_bits {
146 __REQ_FAILED, /* set if the request failed */ 146 __REQ_FAILED, /* set if the request failed */
147 __REQ_QUIET, /* don't worry about errors */ 147 __REQ_QUIET, /* don't worry about errors */
148 __REQ_PREEMPT, /* set for "ide_preempt" requests */ 148 __REQ_PREEMPT, /* set for "ide_preempt" requests */
149 __REQ_ORDERED_COLOR, /* is before or after barrier */
150 __REQ_ALLOCED, /* request came from our alloc pool */ 149 __REQ_ALLOCED, /* request came from our alloc pool */
151 __REQ_COPY_USER, /* contains copies of user pages */ 150 __REQ_COPY_USER, /* contains copies of user pages */
152 __REQ_FLUSH, /* request for cache flush */ 151 __REQ_FLUSH, /* request for cache flush */
@@ -170,7 +169,8 @@ enum rq_flag_bits {
170 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) 169 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
171#define REQ_COMMON_MASK \ 170#define REQ_COMMON_MASK \
172 (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \ 171 (REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
173 REQ_META| REQ_DISCARD | REQ_NOIDLE) 172 REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
173#define REQ_CLONE_MASK REQ_COMMON_MASK
174 174
175#define REQ_UNPLUG (1 << __REQ_UNPLUG) 175#define REQ_UNPLUG (1 << __REQ_UNPLUG)
176#define REQ_RAHEAD (1 << __REQ_RAHEAD) 176#define REQ_RAHEAD (1 << __REQ_RAHEAD)
@@ -187,7 +187,6 @@ enum rq_flag_bits {
187#define REQ_FAILED (1 << __REQ_FAILED) 187#define REQ_FAILED (1 << __REQ_FAILED)
188#define REQ_QUIET (1 << __REQ_QUIET) 188#define REQ_QUIET (1 << __REQ_QUIET)
189#define REQ_PREEMPT (1 << __REQ_PREEMPT) 189#define REQ_PREEMPT (1 << __REQ_PREEMPT)
190#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
191#define REQ_ALLOCED (1 << __REQ_ALLOCED) 190#define REQ_ALLOCED (1 << __REQ_ALLOCED)
192#define REQ_COPY_USER (1 << __REQ_COPY_USER) 191#define REQ_COPY_USER (1 << __REQ_COPY_USER)
193#define REQ_FLUSH (1 << __REQ_FLUSH) 192#define REQ_FLUSH (1 << __REQ_FLUSH)
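
For illustration, a sketch (not part of the patch) of how a request-based driver can key off the surviving flags: REQ_FLUSH asks for the volatile write cache to be flushed, and REQ_FUA asks that the data be on non-volatile media when the request completes. example_prep_request() is hypothetical.

/* Illustrative sketch only; example_prep_request() is hypothetical. */
#include <linux/blkdev.h>

static void example_prep_request(struct request *rq)
{
        if (rq->cmd_flags & REQ_FLUSH) {
                /* flush the volatile write cache before any data in rq */
        }
        if (rq->cmd_flags & REQ_FUA) {
                /* data must be on stable media before completing rq */
        }
}
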
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 16f7f1be1ac..009b80e49f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -360,12 +360,14 @@ struct request_queue
360 struct blk_trace *blk_trace; 360 struct blk_trace *blk_trace;
361#endif 361#endif
362 /* 362 /*
363 * reserved for flush operations 363 * for flush operations
364 */ 364 */
365 unsigned int ordered, next_ordered, ordseq; 365 unsigned int flush_flags;
366 int orderr, ordcolor; 366 unsigned int flush_seq;
367 struct request pre_flush_rq, bar_rq, post_flush_rq; 367 int flush_err;
368 struct request *orig_bar_rq; 368 struct request flush_rq;
369 struct request *orig_flush_rq;
370 struct list_head pending_flushes;
369 371
370 struct mutex sysfs_lock; 372 struct mutex sysfs_lock;
371 373
@@ -472,56 +474,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
472 __clear_bit(flag, &q->queue_flags); 474 __clear_bit(flag, &q->queue_flags);
473} 475}
474 476
475enum {
476 /*
477 * Hardbarrier is supported with one of the following methods.
478 *
479 * NONE : hardbarrier unsupported
480 * DRAIN : ordering by draining is enough
481 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
482 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
483 * TAG : ordering by tag is enough
484 * TAG_FLUSH : ordering by tag w/ pre and post flushes
485 * TAG_FUA : ordering by tag w/ pre flush and FUA write
486 */
487 QUEUE_ORDERED_BY_DRAIN = 0x01,
488 QUEUE_ORDERED_BY_TAG = 0x02,
489 QUEUE_ORDERED_DO_PREFLUSH = 0x10,
490 QUEUE_ORDERED_DO_BAR = 0x20,
491 QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
492 QUEUE_ORDERED_DO_FUA = 0x80,
493
494 QUEUE_ORDERED_NONE = 0x00,
495
496 QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN |
497 QUEUE_ORDERED_DO_BAR,
498 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
499 QUEUE_ORDERED_DO_PREFLUSH |
500 QUEUE_ORDERED_DO_POSTFLUSH,
501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
502 QUEUE_ORDERED_DO_PREFLUSH |
503 QUEUE_ORDERED_DO_FUA,
504
505 QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG |
506 QUEUE_ORDERED_DO_BAR,
507 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
508 QUEUE_ORDERED_DO_PREFLUSH |
509 QUEUE_ORDERED_DO_POSTFLUSH,
510 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
511 QUEUE_ORDERED_DO_PREFLUSH |
512 QUEUE_ORDERED_DO_FUA,
513
514 /*
515 * Ordered operation sequence
516 */
517 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
518 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
519 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
520 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
521 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
522 QUEUE_ORDSEQ_DONE = 0x20,
523};
524
525#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 477#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
526#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 478#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
527#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 479#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
@@ -531,7 +483,6 @@ enum {
531#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) 483#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
532#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) 484#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
533#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) 485#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
534#define blk_queue_flushing(q) ((q)->ordseq)
535#define blk_queue_stackable(q) \ 486#define blk_queue_stackable(q) \
536 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) 487 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
537#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) 488#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@ -602,7 +553,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync)
602 * it already be started by driver. 553 * it already be started by driver.
603 */ 554 */
604#define RQ_NOMERGE_FLAGS \ 555#define RQ_NOMERGE_FLAGS \
605 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) 556 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
557 REQ_FLUSH | REQ_FUA)
606#define rq_mergeable(rq) \ 558#define rq_mergeable(rq) \
607 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ 559 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
608 (((rq)->cmd_flags & REQ_DISCARD) || \ 560 (((rq)->cmd_flags & REQ_DISCARD) || \
@@ -891,12 +843,8 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
891extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); 843extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
892extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); 844extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
893extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); 845extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
846extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
894extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 847extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
895extern int blk_queue_ordered(struct request_queue *, unsigned);
896extern bool blk_do_ordered(struct request_queue *, struct request **);
897extern unsigned blk_ordered_cur_seq(struct request_queue *);
898extern unsigned blk_ordered_req_seq(struct request *);
899extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
900 848
901extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 849extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
902extern void blk_dump_rq_flags(struct request *, char *); 850extern void blk_dump_rq_flags(struct request *, char *);
@@ -929,27 +877,20 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
929 return NULL; 877 return NULL;
930 return bqt->tag_index[tag]; 878 return bqt->tag_index[tag];
931} 879}
932enum{ 880
933 BLKDEV_WAIT, /* wait for completion */ 881#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */
934 BLKDEV_BARRIER, /* issue request with barrier */ 882
935 BLKDEV_SECURE, /* secure discard */ 883extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
936};
937#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
938#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
939#define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE)
940extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
941 unsigned long);
942extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, 884extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
943 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 885 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
944extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 886extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
945 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 887 sector_t nr_sects, gfp_t gfp_mask);
946static inline int sb_issue_discard(struct super_block *sb, 888static inline int sb_issue_discard(struct super_block *sb, sector_t block,
947 sector_t block, sector_t nr_blocks) 889 sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
948{ 890{
949 block <<= (sb->s_blocksize_bits - 9); 891 return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
950 nr_blocks <<= (sb->s_blocksize_bits - 9); 892 nr_blocks << (sb->s_blocksize_bits - 9),
951 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS, 893 gfp_mask, flags);
952 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
953} 894}
954 895
955extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 896extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
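
For illustration, a sketch (not part of the patch) of the replacement driver-side setup: instead of selecting a QUEUE_ORDERED_* mode via blk_queue_ordered(), a driver now declares its cache-control capabilities with blk_queue_flush(). example_init_queue() and its parameters are hypothetical.

/* Illustrative sketch only; example_init_queue() is hypothetical. */
#include <linux/blkdev.h>

static void example_init_queue(struct request_queue *q, bool has_cache,
                               bool has_fua)
{
        unsigned int flush = 0;

        if (has_cache)
                flush |= REQ_FLUSH;
        if (has_cache && has_fua)
                flush |= REQ_FUA;
        blk_queue_flush(q, flush);
}
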
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ec94c12f21d..dd1b25b2641 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,6 @@ enum bh_state_bits {
32 BH_Delay, /* Buffer is not yet allocated on disk */ 32 BH_Delay, /* Buffer is not yet allocated on disk */
33 BH_Boundary, /* Block is followed by a discontiguity */ 33 BH_Boundary, /* Block is followed by a discontiguity */
34 BH_Write_EIO, /* I/O error on write */ 34 BH_Write_EIO, /* I/O error on write */
35 BH_Eopnotsupp, /* operation not supported (barrier) */
36 BH_Unwritten, /* Buffer is allocated on disk but not written */ 35 BH_Unwritten, /* Buffer is allocated on disk but not written */
37 BH_Quiet, /* Buffer Error Prinks to be quiet */ 36 BH_Quiet, /* Buffer Error Prinks to be quiet */
38 37
@@ -124,7 +123,6 @@ BUFFER_FNS(Async_Write, async_write)
124BUFFER_FNS(Delay, delay) 123BUFFER_FNS(Delay, delay)
125BUFFER_FNS(Boundary, boundary) 124BUFFER_FNS(Boundary, boundary)
126BUFFER_FNS(Write_EIO, write_io_error) 125BUFFER_FNS(Write_EIO, write_io_error)
127BUFFER_FNS(Eopnotsupp, eopnotsupp)
128BUFFER_FNS(Unwritten, unwritten) 126BUFFER_FNS(Unwritten, unwritten)
129 127
130#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 128#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0a81b87ea15..4f34ff6e555 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,12 +135,12 @@ struct inodes_stat_t {
135 * immediately after submission. The write equivalent 135 * immediately after submission. The write equivalent
136 * of READ_SYNC. 136 * of READ_SYNC.
137 * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. 137 * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
138 * WRITE_BARRIER Like WRITE_SYNC, but tells the block layer that all 138 * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush.
139 * previously submitted writes must be safely on storage 139 * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on
140 * before this one is started. Also guarantees that when 140 * non-volatile media on completion.
141 * this write is complete, it itself is also safely on 141 * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
142 * storage. Prevents reordering of writes on both sides 142 * by a cache flush and data is guaranteed to be on
143 * of this IO. 143 * non-volatile media on completion.
144 * 144 *
145 */ 145 */
146#define RW_MASK REQ_WRITE 146#define RW_MASK REQ_WRITE
@@ -156,16 +156,12 @@ struct inodes_stat_t {
156#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) 156#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
157#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) 157#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC)
158#define WRITE_META (WRITE | REQ_META) 158#define WRITE_META (WRITE | REQ_META)
159#define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ 159#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
160 REQ_HARDBARRIER) 160 REQ_FLUSH)
161 161#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
162/* 162 REQ_FUA)
163 * These aren't really reads or writes, they pass down information about 163#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
164 * parts of device that are now unused by the file system. 164 REQ_FLUSH | REQ_FUA)
165 */
166#define DISCARD_NOBARRIER (WRITE | REQ_DISCARD)
167#define DISCARD_BARRIER (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
168#define DISCARD_SECURE (DISCARD_NOBARRIER | REQ_SECURE)
169 165
170#define SEL_IN 1 166#define SEL_IN 1
171#define SEL_OUT 2 167#define SEL_OUT 2
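
For illustration, a sketch (not part of the patch) of choosing among the three new write variants defined above; example_submit() is hypothetical and assumes a locked, prepared buffer.

/* Illustrative sketch only; example_submit() is hypothetical. */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static void example_submit(struct buffer_head *bh)
{
        /* WRITE_FLUSH:     preceding cache flush, no FUA on the write   */
        /* WRITE_FUA:       FUA write only, no preceding flush           */
        /* WRITE_FLUSH_FUA: preceding flush and FUA write combined       */
        submit_bh(WRITE_FLUSH_FUA, bh);
}
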
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index 17231385cb3..d6e7994aa63 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -97,13 +97,9 @@ static inline void scsi_deactivate_tcq(struct scsi_device *sdev, int depth)
97static inline int scsi_populate_tag_msg(struct scsi_cmnd *cmd, char *msg) 97static inline int scsi_populate_tag_msg(struct scsi_cmnd *cmd, char *msg)
98{ 98{
99 struct request *req = cmd->request; 99 struct request *req = cmd->request;
100 struct scsi_device *sdev = cmd->device;
101 100
102 if (blk_rq_tagged(req)) { 101 if (blk_rq_tagged(req)) {
103 if (sdev->ordered_tags && req->cmd_flags & REQ_HARDBARRIER) 102 *msg++ = MSG_SIMPLE_TAG;
104 *msg++ = MSG_ORDERED_TAG;
105 else
106 *msg++ = MSG_SIMPLE_TAG;
107 *msg++ = req->tag; 103 *msg++ = req->tag;
108 return 2; 104 return 2;
109 } 105 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36..9fc7bac7db0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 142 nr_blocks, GFP_KERNEL, 0);
143 if (err) 143 if (err)
144 return err; 144 return err;
145 cond_resched(); 145 cond_resched();
@@ -150,7 +150,7 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 151
152 err = blkdev_issue_discard(si->bdev, start_block, 152 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 153 nr_blocks, GFP_KERNEL, 0);
154 if (err) 154 if (err)
155 break; 155 break;
156 156
@@ -189,7 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 189 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 190 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 191 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) 192 nr_blocks, GFP_NOIO, 0))
193 break; 193 break;
194 } 194 }
195 195
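
For illustration, a sketch (not part of the patch) of the final blkdev_issue_discard() interface used above: the BLKDEV_IFL_* bits are gone, the call always waits for completion, and the only remaining flag is BLKDEV_DISCARD_SECURE. example_discard() is hypothetical.

/* Illustrative sketch only; example_discard() is hypothetical. */
#include <linux/blkdev.h>

static int example_discard(struct block_device *bdev, sector_t start,
                           sector_t nr_sects, bool secure)
{
        unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

        return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL, flags);
}
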