diff options
-rw-r--r-- | Documentation/block/barrier.txt | 271 | ||||
-rw-r--r-- | block/elevator.c | 4 |
2 files changed, 273 insertions, 2 deletions
diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt new file mode 100644 index 00000000000..03971518b22 --- /dev/null +++ b/Documentation/block/barrier.txt | |||
@@ -0,0 +1,271 @@ | |||
1 | I/O Barriers | ||
2 | ============ | ||
3 | Tejun Heo <htejun@gmail.com>, July 22 2005 | ||
4 | |||
5 | I/O barrier requests are used to guarantee ordering around the barrier | ||
6 | requests. Unless you're crazy enough to use disk drives for | ||
7 | implementing synchronization constructs (wow, sounds interesting...), | ||
8 | the ordering is meaningful only for write requests for things like | ||
9 | journal checkpoints. All requests queued before a barrier request | ||
10 | must be finished (made it to the physical medium) before the barrier | ||
11 | request is started, and all requests queued after the barrier request | ||
12 | must be started only after the barrier request is finished (again, | ||
13 | made it to the physical medium). | ||
14 | |||
15 | In other words, I/O barrier requests have the following two properties. | ||
16 | |||
17 | 1. Request ordering | ||
18 | |||
19 | Requests cannot pass the barrier request. Preceding requests are | ||
20 | processed before the barrier and following requests after. | ||
21 | |||
22 | Depending on what features a drive supports, this can be done in one | ||
23 | of the following three ways. | ||
24 | |||
25 | i. For devices which have queue depth greater than 1 (TCQ devices) and | ||
26 | support ordered tags, block layer can just issue the barrier as an | ||
27 | ordered request and the lower level driver, controller and drive | ||
28 | itself are responsible for making sure that the ordering constraint is | ||
29 | met. Most modern SCSI controllers/drives should support this. | ||
30 | |||
31 | NOTE: SCSI ordered tag isn't currently used due to limitation in the | ||
32 | SCSI midlayer, see the following random notes section. | ||
33 | |||
34 | ii. For devices which have queue depth greater than 1 but don't | ||
35 | support ordered tags, block layer ensures that the requests preceding | ||
36 | a barrier request finish before issuing the barrier request. Also, | ||
37 | it defers requests following the barrier until the barrier request is | ||
38 | finished. Older SCSI controllers/drives and SATA drives fall in this | ||
39 | category. | ||
40 | |||
41 | iii. Devices which have queue depth of 1. This is a degenerate case | ||
42 | of ii. Just keeping issue order suffices. Ancient SCSI | ||
43 | controllers/drives and IDE drives are in this category. | ||
44 | |||
45 | 2. Forced flushing to physical medium | ||
46 | |||
47 | Again, if you're not gonna do synchronization with disk drives (dang, | ||
48 | it sounds even more appealing now!), the reason you use I/O barriers | ||
49 | is mainly to protect filesystem integrity when power failure or some | ||
50 | other events abruptly stop the drive from operating and possibly make | ||
51 | the drive lose data in its cache. So, I/O barriers need to guarantee | ||
52 | that requests actually get written to non-volatile medium in order. | ||
53 | |||
54 | There are four cases, | ||
55 | |||
56 | i. No write-back cache. Keeping requests ordered is enough. | ||
57 | |||
58 | ii. Write-back cache but no flush operation. There's no way to | ||
59 | guarantee physical-medium commit order. This kind of device can't do | ||
60 | I/O barriers. | ||
61 | |||
62 | iii. Write-back cache and flush operation but no FUA (forced unit | ||
63 | access). We need two cache flushes - before and after the barrier | ||
64 | request. | ||
65 | |||
66 | iv. Write-back cache, flush operation and FUA. We still need one | ||
67 | flush to make sure requests preceding a barrier are written to medium, | ||
68 | but post-barrier flush can be avoided by using FUA write on the | ||
69 | barrier itself. | ||
70 | |||
71 | |||
72 | How to support barrier requests in drivers | ||
73 | ------------------------------------------ | ||
74 | |||
75 | All barrier handling is done inside block layer proper. All low level | ||
76 | drivers have to do is implement their prepare_flush_fn and use one of | ||
77 | the following two functions to indicate what barrier type they support | ||
78 | and how to prepare flush requests. Note that the term 'ordered' is | ||
79 | used to indicate the whole sequence of performing barrier requests | ||
80 | including draining and flushing. | ||
81 | |||
82 | typedef void (prepare_flush_fn)(request_queue_t *q, struct request *rq); | ||
83 | |||
84 | int blk_queue_ordered(request_queue_t *q, unsigned ordered, | ||
85 | prepare_flush_fn *prepare_flush_fn, | ||
86 | unsigned gfp_mask); | ||
87 | |||
88 | int blk_queue_ordered_locked(request_queue_t *q, unsigned ordered, | ||
89 | prepare_flush_fn *prepare_flush_fn, | ||
90 | unsigned gfp_mask); | ||
91 | |||
92 | The only difference between the two functions is whether or not the | ||
93 | caller is holding q->queue_lock on entry. The latter expects the | ||
94 | caller is holding the lock. | ||
95 | |||
96 | @q : the queue in question | ||
97 | @ordered : the ordered mode the driver/device supports | ||
98 | @prepare_flush_fn : this function should prepare @rq such that it | ||
99 | flushes cache to physical medium when executed | ||
100 | @gfp_mask : gfp_mask used when allocating data structures | ||
101 | for ordered processing | ||
102 | |||
103 | For example, SCSI disk driver's prepare_flush_fn looks like the | ||
104 | following. | ||
105 | |||
106 | static void sd_prepare_flush(request_queue_t *q, struct request *rq) | ||
107 | { | ||
108 | memset(rq->cmd, 0, sizeof(rq->cmd)); | ||
109 | rq->flags |= REQ_BLOCK_PC; | ||
110 | rq->timeout = SD_TIMEOUT; | ||
111 | rq->cmd[0] = SYNCHRONIZE_CACHE; | ||
112 | } | ||
113 | |||
114 | The following seven ordered modes are supported. The following table | ||
115 | shows which mode should be used depending on what features a | ||
116 | device/driver supports. In the leftmost column of table, | ||
117 | QUEUE_ORDERED_ prefix is omitted from the mode names to save space. | ||
118 | |||
119 | The table is followed by description of each mode. Note that in the | ||
120 | descriptions of QUEUE_ORDERED_DRAIN*, '=>' is used whereas '->' is | ||
121 | used for QUEUE_ORDERED_TAG* descriptions. '=>' indicates that the | ||
122 | preceding step must be complete before proceeding to the next step. | ||
123 | '->' indicates that the next step can start as soon as the previous | ||
124 | step is issued. | ||
125 | |||
126 | write-back cache ordered tag flush FUA | ||
127 | ----------------------------------------------------------------------- | ||
128 | NONE yes/no N/A no N/A | ||
129 | DRAIN no no N/A N/A | ||
130 | DRAIN_FLUSH yes no yes no | ||
131 | DRAIN_FUA yes no yes yes | ||
132 | TAG no yes N/A N/A | ||
133 | TAG_FLUSH yes yes yes no | ||
134 | TAG_FUA yes yes yes yes | ||
135 | |||
136 | |||
137 | QUEUE_ORDERED_NONE | ||
138 | I/O barriers are not needed and/or supported. | ||
139 | |||
140 | Sequence: N/A | ||
141 | |||
142 | QUEUE_ORDERED_DRAIN | ||
143 | Requests are ordered by draining the request queue and cache | ||
144 | flushing isn't needed. | ||
145 | |||
146 | Sequence: drain => barrier | ||
147 | |||
148 | QUEUE_ORDERED_DRAIN_FLUSH | ||
149 | Requests are ordered by draining the request queue and both | ||
150 | pre-barrier and post-barrier cache flushings are needed. | ||
151 | |||
152 | Sequence: drain => preflush => barrier => postflush | ||
153 | |||
154 | QUEUE_ORDERED_DRAIN_FUA | ||
155 | Requests are ordered by draining the request queue and | ||
156 | pre-barrier cache flushing is needed. By using FUA on barrier | ||
157 | request, post-barrier flushing can be skipped. | ||
158 | |||
159 | Sequence: drain => preflush => barrier | ||
160 | |||
161 | QUEUE_ORDERED_TAG | ||
162 | Requests are ordered by ordered tag and cache flushing isn't | ||
163 | needed. | ||
164 | |||
165 | Sequence: barrier | ||
166 | |||
167 | QUEUE_ORDERED_TAG_FLUSH | ||
168 | Requests are ordered by ordered tag and both pre-barrier and | ||
169 | post-barrier cache flushings are needed. | ||
170 | |||
171 | Sequence: preflush -> barrier -> postflush | ||
172 | |||
173 | QUEUE_ORDERED_TAG_FUA | ||
174 | Requests are ordered by ordered tag and pre-barrier cache | ||
175 | flushing is needed. By using FUA on barrier request, | ||
176 | post-barrier flushing can be skipped. | ||
177 | |||
178 | Sequence: preflush -> barrier | ||
179 | |||
180 | |||
181 | Random notes/caveats | ||
182 | -------------------- | ||
183 | |||
184 | * SCSI layer currently can't use TAG ordering even if the drive, | ||
185 | controller and driver support it. The problem is that SCSI midlayer | ||
186 | request dispatch function is not atomic. It releases queue lock and | ||
187 | switches to SCSI host lock during issue and it's possible and likely to | ||
188 | happen in time that requests change their relative positions. Once | ||
189 | this problem is solved, TAG ordering can be enabled. | ||
190 | |||
191 | * Currently, no matter which ordered mode is used, there can be only | ||
192 | one barrier request in progress. All I/O barriers are held off by | ||
193 | block layer until the previous I/O barrier is complete. This doesn't | ||
194 | make any difference for DRAIN ordered devices, but, for TAG ordered | ||
195 | devices with very high command latency, passing multiple I/O barriers | ||
196 | to low level *might* be helpful if they are very frequent. Well, this | ||
197 | certainly is a non-issue. I'm writing this just to make clear that no | ||
198 | two I/O barriers are ever passed to low-level driver at the same time. | ||
199 | |||
200 | * Completion order. Requests in ordered sequence are issued in order | ||
201 | but not required to finish in order. Barrier implementation can | ||
202 | handle out-of-order completion of ordered sequence. IOW, the requests | ||
203 | MUST be processed in order but the hardware/software completion paths | ||
204 | are allowed to reorder completion notifications - eg. current SCSI | ||
205 | midlayer doesn't preserve completion order during error handling. | ||
206 | |||
207 | * Requeueing order. Low-level drivers are free to requeue any request | ||
208 | after they removed it from the request queue with | ||
209 | blkdev_dequeue_request(). As barrier sequence should be kept in order | ||
210 | when requeued, generic elevator code takes care of putting requests in | ||
211 | order around barrier. See blk_ordered_req_seq() and | ||
212 | ELEVATOR_INSERT_REQUEUE handling in __elv_add_request() for details. | ||
213 | |||
214 | Note that block drivers must not requeue preceding requests while | ||
215 | completing latter requests in an ordered sequence. Currently, no | ||
216 | error checking is done against this. | ||
217 | |||
218 | * Error handling. Currently, block layer will report error to upper | ||
219 | layer if any of requests in an ordered sequence fails. Unfortunately, | ||
220 | this doesn't seem to be enough. Look at the following request flow. | ||
221 | QUEUE_ORDERED_TAG_FLUSH is in use. | ||
222 | |||
223 | [0] [1] [2] [3] [pre] [barrier] [post] < [4] [5] [6] ... > | ||
224 | still in elevator | ||
225 | |||
226 | Let's say request [2], [3] are write requests to update file system | ||
227 | metadata (journal or whatever) and [barrier] is used to mark that | ||
228 | those updates are valid. Consider the following sequence. | ||
229 | |||
230 | i. Requests [0] ~ [post] leave the request queue and enter | ||
231 | low-level driver. | ||
232 | ii. After a while, unfortunately, something goes wrong and the | ||
233 | drive fails [2]. Note that any of [0], [1] and [3] could have | ||
234 | completed by this time, but [pre] couldn't have been finished | ||
235 | as the drive must process it in order and it failed before | ||
236 | processing that command. | ||
237 | iii. Error handling kicks in and determines that the error is | ||
238 | unrecoverable and fails [2], and resumes operation. | ||
239 | iv. [pre] [barrier] [post] get processed. | ||
240 | v. *BOOM* power fails | ||
241 | |||
242 | The problem here is that the barrier request is *supposed* to indicate | ||
243 | that filesystem update requests [2] and [3] made it safely to the | ||
244 | physical medium and, if the machine crashes after the barrier is | ||
245 | written, filesystem recovery code can depend on that. Sadly, that | ||
246 | isn't true in this case anymore. IOW, the success of an I/O barrier | ||
247 | should also be dependent on success of some of the preceding requests, | ||
248 | where only upper layer (filesystem) knows what 'some' is. | ||
249 | |||
250 | This can be solved by implementing a way to tell the block layer which | ||
251 | requests affect the success of the following barrier request and | ||
252 | making lower level drivers resume operation on error only after | ||
253 | block layer tells it to do so. | ||
254 | |||
255 | As the probability of this happening is very low and the drive should | ||
256 | be faulty, implementing the fix is probably an overkill. But, still, | ||
257 | it's there. | ||
258 | |||
259 | * In previous drafts of barrier implementation, there was fallback | ||
260 | mechanism such that, if FUA or ordered TAG fails, less fancy ordered | ||
261 | mode can be selected and the failed barrier request is retried | ||
262 | automatically. The rationale for this feature was that as FUA is | ||
263 | pretty new in ATA world and ordered tag was never used widely, there | ||
264 | could be devices which report to support those features but choke when | ||
265 | actually given such requests. | ||
266 | |||
267 | This was removed for two reasons: 1. it's overkill, and 2. it's | ||
268 | impossible to implement properly when TAG ordering is used as low | ||
269 | level drivers resume after an error automatically. If it's ever | ||
270 | needed, adding it back and modifying low level drivers accordingly | ||
271 | shouldn't be difficult. | ||
diff --git a/block/elevator.c b/block/elevator.c index e8025b2ec54..c9f424d5399 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -157,12 +157,12 @@ static void elevator_setup_default(void) | |||
157 | strcpy(chosen_elevator, "anticipatory"); | 157 | strcpy(chosen_elevator, "anticipatory"); |
158 | 158 | ||
159 | /* | 159 | /* |
160 | * If the given scheduler is not available, fall back to no-op. | 160 | * If the given scheduler is not available, fall back to the default |
161 | */ | 161 | */ |
162 | if ((e = elevator_find(chosen_elevator))) | 162 | if ((e = elevator_find(chosen_elevator))) |
163 | elevator_put(e); | 163 | elevator_put(e); |
164 | else | 164 | else |
165 | strcpy(chosen_elevator, "noop"); | 165 | strcpy(chosen_elevator, CONFIG_DEFAULT_IOSCHED); |
166 | } | 166 | } |
167 | 167 | ||
168 | static int __init elevator_setup(char *str) | 168 | static int __init elevator_setup(char *str) |