author     Zach Brown <zach.brown@oracle.com>                    2007-03-16 17:38:11 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-03-16 22:25:04 -0400
commit     65b8291c4000e5f38fc94fb2ca0cb7e8683c8a1b
tree       aeebdb654ce5db3cf02a9110e3b2db7e7aef06f0
parent     00e9fa2d6421fbbefb4c02821a1e779a3ce47781
[PATCH] dio: invalidate clean pages before dio write
This patch fixes a user-triggerable oops that was reported by Leonid
Ananiev as archived at http://lkml.org/lkml/2007/2/8/337.

dio writes invalidate clean pages that intersect the written region so
that subsequent buffered reads go to disk to read the new data.  If this
invalidation fails, the interface tries to tell the caller that the cache
is inconsistent with the data on disk by returning -EIO.

Before this patch we had the problem where this invalidation failure would
clobber -EIOCBQUEUED as it made its way from fs/direct-io.c to fs/aio.c.
Both fs/aio.c and bio completion then call aio_complete() and we reference
freed memory, usually oopsing.

This patch addresses the problem by invalidating before the write so that
we can cleanly return -EIO before ->direct_IO() has had a chance to return
-EIOCBQUEUED.

There is a compromise here.  During the dio write we can fault in mmap()ed
pages which intersect the written range with get_user_pages() if the user
provided them for the source buffer.  This is a crazy thing to do, but we
can make it mostly work in most cases by trying the invalidation again.
The compromise is that we won't return an error if this second invalidation
fails when it's an AIO write and we have -EIOCBQUEUED.

This was tested by having two processes race performing large O_DIRECT and
buffered ordered writes.  Within minutes, a race between ext3_releasepage()
and jbd holding a reference on ordered data buffers would cause invalidation
to fail, panicking the box.  The test can be found in the 'aio_dio_bugs'
test group in test.kernel.org/autotest.  After this patch the test passes.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Cc: Leonid Ananiev <leonid.i.ananiev@linux.intel.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
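As background for the compromise described above, here is a minimal userspace
sketch (not part of this patch, the kernel tree, or autotest) of the case the
second invalidation covers: an O_DIRECT write whose source buffer is an
mmap()ed region of the same file range being written, so get_user_pages() can
pull page-cache pages back in during the dio.  The file name and sizes are
arbitrary choices for illustration.

/* mmap_dio_overlap.c - hypothetical demo of a dio write sourced from an
 * mmap() of the same file range; illustration only. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN (1024 * 1024)	/* 1 MiB: aligned for any sane block size */

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "dio-overlap.dat";
	int fd = open(path, O_CREAT | O_RDWR | O_DIRECT, 0644);

	if (fd < 0) {
		perror("open(O_DIRECT)");
		return 1;
	}

	/* Size the file and fault its pages into the page cache via mmap(). */
	if (ftruncate(fd, LEN)) {
		perror("ftruncate");
		return 1;
	}
	char *map = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(map, 'x', LEN);

	/*
	 * O_DIRECT write whose source buffer is the mmap()ed mapping of the
	 * very range being written.  get_user_pages() in the dio path can
	 * fault these pages back into the page cache after the pre-write
	 * invalidation, which is why generic_file_direct_IO() retries the
	 * invalidation once ->direct_IO() has returned.
	 */
	ssize_t ret = pwrite(fd, map, LEN, 0);
	if (ret < 0)
		perror("pwrite");
	else
		printf("wrote %zd bytes from an mmap() of the written range\n", ret);

	munmap(map, LEN);
	close(fd);
	return 0;
}

Built with something like gcc -O2 -o mmap_dio_overlap mmap_dio_overlap.c and
run on a filesystem that supports O_DIRECT, the pwrite() source pages are the
very page-cache pages the write must invalidate; with this patch a failure of
that second invalidation is only reported when it would not clobber
-EIOCBQUEUED.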
-rw-r--r--  mm/filemap.c  46
1 file changed, 35 insertions(+), 11 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index d1060b8d3cd6..5dfc093ceb3d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2379,7 +2379,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
-	size_t write_len = 0;
+	size_t write_len;
+	pgoff_t end = 0; /* silence gcc */
 
 	/*
 	 * If it's a write, unmap all mmappings of the file up-front. This
@@ -2388,23 +2389,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	 */
 	if (rw == WRITE) {
 		write_len = iov_length(iov, nr_segs);
+		end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
 		if (mapping_mapped(mapping))
 			unmap_mapping_range(mapping, offset, write_len, 0);
 	}
 
 	retval = filemap_write_and_wait(mapping);
-	if (retval == 0) {
-		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
-					offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages) {
-			pgoff_t end = (offset + write_len - 1)
-						>> PAGE_CACHE_SHIFT;
-			int err = invalidate_inode_pages2_range(mapping,
+	if (retval)
+		goto out;
+
+	/*
+	 * After a write we want buffered reads to be sure to go to disk to get
+	 * the new data.  We invalidate clean cached page from the region we're
+	 * about to write.  We do this *before* the write so that we can return
+	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 */
+	if (rw == WRITE && mapping->nrpages) {
+		retval = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT, end);
-			if (err)
-				retval = err;
-		}
+		if (retval)
+			goto out;
 	}
+
+	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
+	if (retval)
+		goto out;
+
+	/*
+	 * Finally, try again to invalidate clean pages which might have been
+	 * faulted in by get_user_pages() if the source of the write was an
+	 * mmap()ed region of the file we're writing.  That's a pretty crazy
+	 * thing to do, so we don't support it 100%.  If this invalidation
+	 * fails and we have -EIOCBQUEUED we ignore the failure.
+	 */
+	if (rw == WRITE && mapping->nrpages) {
+		int err = invalidate_inode_pages2_range(mapping,
+					offset >> PAGE_CACHE_SHIFT, end);
+		if (err && retval >= 0)
+			retval = err;
+	}
+out:
 	return retval;
 }
 
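The commit message above says the bug was reproduced by racing large O_DIRECT
writes against buffered (ordered-mode) writes to the same file until
invalidation failed.  The fragment below is a rough sketch of that kind of
workload; it is not the actual 'aio_dio_bugs' test from
test.kernel.org/autotest, and the file name, chunk size, and iteration count
are invented for illustration.

/* dio_buffered_race.c - hypothetical sketch of racing O_DIRECT and buffered
 * writers against one file; not the autotest reproducer. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

#define CHUNK	(4 * 1024 * 1024)	/* 4 MiB per write */
#define ROUNDS	256

static void writer(const char *path, int dio)
{
	int flags = O_CREAT | O_WRONLY | (dio ? O_DIRECT : 0);
	int fd = open(path, flags, 0644);
	void *buf = NULL;

	if (fd < 0 || posix_memalign(&buf, 4096, CHUNK)) {
		perror(dio ? "O_DIRECT writer" : "buffered writer");
		_exit(1);
	}
	memset(buf, dio ? 'D' : 'B', CHUNK);

	/*
	 * Hammer the same file region so dio invalidation keeps racing with
	 * buffered page-cache pages (and, on ext3 data=ordered, with jbd
	 * holding references on ordered-data buffers).
	 */
	for (int i = 0; i < ROUNDS; i++)
		if (pwrite(fd, buf, CHUNK, (i % 8) * (off_t)CHUNK) < 0)
			perror("pwrite");

	free(buf);
	close(fd);
	_exit(0);
}

int main(void)
{
	const char *path = "dio-race.dat";

	if (fork() == 0)
		writer(path, 1);	/* O_DIRECT writes */
	if (fork() == 0)
		writer(path, 0);	/* buffered writes */

	wait(NULL);
	wait(NULL);
	return 0;
}

Before this patch, an invalidation failure after an AIO ->direct_IO() could
overwrite -EIOCBQUEUED with -EIO, leading to the double aio_complete() and
use of freed memory described in the commit message; with the invalidation
moved before the write, the same failure surfaces as a clean -EIO before the
dio is ever issued.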