author    Jan Kara <jack@suse.cz>                      2006-09-26 02:30:53 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>        2006-09-26 11:48:44 -0400
commit    3998b9301d3d55be8373add22b6bc5e11c1d9b71 (patch)
tree      9ec65c5b492a9bffc46467f4210739cd07ac510c /fs
parent    632bbfeee4f042c05bc65150b4433a297d3fe387 (diff)
[PATCH] jbd: fix commit of ordered data buffers
The original commit code assumes that when a buffer on the BJ_SyncData list is locked, it is being written to disk. But this is not true, and hence it can lead to potential data loss on a crash. The code also did not account for the fact that journal_dirty_data() can steal buffers from a committing transaction, and hence could write buffers that no longer belong to the committing transaction. Finally, it could happen that we tried to write out one buffer several times.

The patch below solves these problems by a complete rewrite of the data commit code. We go through the buffers on t_sync_datalist, lock the buffers that need write-out, and store them in an array. Buffers are also immediately refiled to the BJ_Locked list or unfiled (if the write-out is already complete). When the array is full or we have to block on a buffer lock, we submit all accumulated buffers for IO.

[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe]

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
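Stripped of the locking, the batching scheme described above reduces to the following stand-alone sketch. Here struct buffer, submit_batch(), and WBUF_SIZE are toy stand-ins for struct buffer_head, journal_do_submit_data(), and journal->j_wbufsize; this is only an illustration of the accumulate-and-flush pattern, not kernel code.

#include <stdio.h>
#include <stddef.h>

#define WBUF_SIZE 4                      /* stands in for journal->j_wbufsize */

struct buffer {                          /* toy stand-in for struct buffer_head */
        int id;
        int dirty;
        struct buffer *next;
};

/* Stand-in for journal_do_submit_data(): submit one accumulated batch. */
static void submit_batch(struct buffer **wbuf, int bufs)
{
        for (int i = 0; i < bufs; i++)
                printf("submit IO for buffer %d\n", wbuf[i]->id);
}

/* Walk the data list, batching dirty buffers and flushing when the array fills. */
static void write_out_data(struct buffer *list)
{
        struct buffer *wbuf[WBUF_SIZE];
        int bufs = 0;

        for (struct buffer *bh = list; bh; bh = bh->next) {
                if (!bh->dirty)
                        continue;        /* the real code unfiles clean buffers here */
                bh->dirty = 0;           /* cf. test_clear_buffer_dirty() */
                wbuf[bufs++] = bh;
                if (bufs == WBUF_SIZE) { /* array full: submit everything now */
                        submit_batch(wbuf, bufs);
                        bufs = 0;
                }
        }
        submit_batch(wbuf, bufs);        /* submit the final partial batch */
}

int main(void)
{
        struct buffer b[6];

        for (int i = 0; i < 6; i++) {
                b[i].id = i;
                b[i].dirty = 1;
                b[i].next = (i < 5) ? &b[i + 1] : NULL;
        }
        write_out_data(b);
        return 0;
}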
Diffstat (limited to 'fs')
-rw-r--r--    fs/jbd/commit.c    182
1 file changed, 113 insertions(+), 69 deletions(-)
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 42da60784311..32a8caf0c41e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -160,6 +160,117 @@ static int journal_write_commit_record(journal_t *journal,
         return (ret == -EIO);
 }
 
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+        int i;
+
+        for (i = 0; i < bufs; i++) {
+                wbuf[i]->b_end_io = end_buffer_write_sync;
+                /* We use-up our safety reference in submit_bh() */
+                submit_bh(WRITE, wbuf[i]);
+        }
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+                                transaction_t *commit_transaction)
+{
+        struct journal_head *jh;
+        struct buffer_head *bh;
+        int locked;
+        int bufs = 0;
+        struct buffer_head **wbuf = journal->j_wbuf;
+
+        /*
+         * Whenever we unlock the journal and sleep, things can get added
+         * onto ->t_sync_datalist, so we have to keep looping back to
+         * write_out_data until we *know* that the list is empty.
+         *
+         * Cleanup any flushed data buffers from the data list. Even in
+         * abort mode, we want to flush this out as soon as possible.
+         */
+write_out_data:
+        cond_resched();
+        spin_lock(&journal->j_list_lock);
+
+        while (commit_transaction->t_sync_datalist) {
+                jh = commit_transaction->t_sync_datalist;
+                bh = jh2bh(jh);
+                locked = 0;
+
+                /* Get reference just to make sure buffer does not disappear
+                 * when we are forced to drop various locks */
+                get_bh(bh);
+                /* If the buffer is dirty, we need to submit IO and hence
+                 * we need the buffer lock. We try to lock the buffer without
+                 * blocking. If we fail, we need to drop j_list_lock and do
+                 * blocking lock_buffer().
+                 */
+                if (buffer_dirty(bh)) {
+                        if (test_set_buffer_locked(bh)) {
+                                BUFFER_TRACE(bh, "needs blocking lock");
+                                spin_unlock(&journal->j_list_lock);
+                                /* Write out all data to prevent deadlocks */
+                                journal_do_submit_data(wbuf, bufs);
+                                bufs = 0;
+                                lock_buffer(bh);
+                                spin_lock(&journal->j_list_lock);
+                        }
+                        locked = 1;
+                }
+                /* We have to get bh_state lock. Again out of order, sigh. */
+                if (!inverted_lock(journal, bh)) {
+                        jbd_lock_bh_state(bh);
+                        spin_lock(&journal->j_list_lock);
+                }
+                /* Someone already cleaned up the buffer? */
+                if (!buffer_jbd(bh)
+                        || jh->b_transaction != commit_transaction
+                        || jh->b_jlist != BJ_SyncData) {
+                        jbd_unlock_bh_state(bh);
+                        if (locked)
+                                unlock_buffer(bh);
+                        BUFFER_TRACE(bh, "already cleaned up");
+                        put_bh(bh);
+                        continue;
+                }
+                if (locked && test_clear_buffer_dirty(bh)) {
+                        BUFFER_TRACE(bh, "needs writeout, adding to array");
+                        wbuf[bufs++] = bh;
+                        __journal_file_buffer(jh, commit_transaction,
+                                                BJ_Locked);
+                        jbd_unlock_bh_state(bh);
+                        if (bufs == journal->j_wbufsize) {
+                                spin_unlock(&journal->j_list_lock);
+                                journal_do_submit_data(wbuf, bufs);
+                                bufs = 0;
+                                goto write_out_data;
+                        }
+                }
+                else {
+                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        __journal_unfile_buffer(jh);
+                        jbd_unlock_bh_state(bh);
+                        if (locked)
+                                unlock_buffer(bh);
+                        journal_remove_journal_head(bh);
+                        /* Once for our safety reference, once for
+                         * journal_remove_journal_head() */
+                        put_bh(bh);
+                        put_bh(bh);
+                }
+
+                if (lock_need_resched(&journal->j_list_lock)) {
+                        spin_unlock(&journal->j_list_lock);
+                        goto write_out_data;
+                }
+        }
+        spin_unlock(&journal->j_list_lock);
+        journal_do_submit_data(wbuf, bufs);
+}
+
 /*
  * journal_commit_transaction
  *
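One detail worth calling out in journal_submit_data_buffers() above is the lock-ordering dance: while holding j_list_lock the code only *tries* to take the buffer lock (test_set_buffer_locked()); on failure it drops j_list_lock, submits every buffer already accumulated, and only then blocks in lock_buffer(). Submitting first matters because the array holds buffers that are already locked but whose IO has not yet been issued. A minimal user-space analogue of that shape, using POSIX mutexes purely for illustration (list_lock, buf_lock, and flush_batch() are hypothetical stand-ins, not kernel API):

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* plays j_list_lock */
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;  /* plays the buffer lock */

static void flush_batch(void)
{
        /* would submit the accumulated buffers for IO */
}

static void lock_buffer_without_deadlock(void)
{
        pthread_mutex_lock(&list_lock);

        /* Try to take the buffer lock without blocking... */
        if (pthread_mutex_trylock(&buf_lock) != 0) {
                /* ...and on failure, release the list lock and push out the
                 * pending batch before sleeping, so no one ends up waiting
                 * on buffers we have locked but not yet submitted. */
                pthread_mutex_unlock(&list_lock);
                flush_batch();
                pthread_mutex_lock(&buf_lock);
                pthread_mutex_lock(&list_lock);
        }

        /* Both locks held here: refile the buffer, then release. */
        pthread_mutex_unlock(&buf_lock);
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        lock_buffer_without_deadlock();
        return 0;
}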
@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_t *journal)
          * Now start flushing things to disk, in the order they appear
          * on the transaction lists. Data blocks go first.
          */
-
         err = 0;
-        /*
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         */
-        bufs = 0;
-        /*
-         * Cleanup any flushed data buffers from the data list. Even in
-         * abort mode, we want to flush this out as soon as possible.
-         */
-write_out_data:
-        cond_resched();
-        spin_lock(&journal->j_list_lock);
-
-        while (commit_transaction->t_sync_datalist) {
-                struct buffer_head *bh;
-
-                jh = commit_transaction->t_sync_datalist;
-                commit_transaction->t_sync_datalist = jh->b_tnext;
-                bh = jh2bh(jh);
-                if (buffer_locked(bh)) {
-                        BUFFER_TRACE(bh, "locked");
-                        if (!inverted_lock(journal, bh))
-                                goto write_out_data;
-                        __journal_temp_unlink_buffer(jh);
-                        __journal_file_buffer(jh, commit_transaction,
-                                                BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (lock_need_resched(&journal->j_list_lock)) {
-                                spin_unlock(&journal->j_list_lock);
-                                goto write_out_data;
-                        }
-                } else {
-                        if (buffer_dirty(bh)) {
-                                BUFFER_TRACE(bh, "start journal writeout");
-                                get_bh(bh);
-                                wbuf[bufs++] = bh;
-                                if (bufs == journal->j_wbufsize) {
-                                        jbd_debug(2, "submit %d writes\n",
-                                                        bufs);
-                                        spin_unlock(&journal->j_list_lock);
-                                        ll_rw_block(SWRITE, bufs, wbuf);
-                                        journal_brelse_array(wbuf, bufs);
-                                        bufs = 0;
-                                        goto write_out_data;
-                                }
-                        } else {
-                                BUFFER_TRACE(bh, "writeout complete: unfile");
-                                if (!inverted_lock(journal, bh))
-                                        goto write_out_data;
-                                __journal_unfile_buffer(jh);
-                                jbd_unlock_bh_state(bh);
-                                journal_remove_journal_head(bh);
-                                put_bh(bh);
-                                if (lock_need_resched(&journal->j_list_lock)) {
-                                        spin_unlock(&journal->j_list_lock);
-                                        goto write_out_data;
-                                }
-                        }
-                }
-        }
-
-        if (bufs) {
-                spin_unlock(&journal->j_list_lock);
-                ll_rw_block(SWRITE, bufs, wbuf);
-                journal_brelse_array(wbuf, bufs);
-                spin_lock(&journal->j_list_lock);
-        }
+        journal_submit_data_buffers(journal, commit_transaction);
 
         /*
          * Wait for all previously submitted IO to complete.
          */
+        spin_lock(&journal->j_list_lock);
         while (commit_transaction->t_locked_list) {
                 struct buffer_head *bh;
 