aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2009-12-15 19:47:50 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-12-16 10:20:13 -0500
commit5fe878ae7f82fbf0830dbfaee4c5ca18f3aee442 (patch)
tree7e7ad290cfb30705948d8ebeb46b013afa913f42 /fs
parent23aee091d804efa8cc732a31c1ae5d625e1ec886 (diff)
direct-io: cleanup blockdev_direct_IO locking
Currently the locking in blockdev_direct_IO is a mess, we have three different locking types and very confusing checks for some of them. The most complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be used. This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case is unused anyway, and the write side is almost identical to DIO_NO_LOCKING. The difference is that DIO_NO_LOCKING always sets the create argument for the get_blocks callback to zero, but we can easily move that to the actual get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode: gfs already ignores the create argument and thus is fine with the new version, ocfs2 only errors out if create were ever set, and we can remove this dead code now, the block device code only ever uses create for an error message if we are fully beyond the device which can never happen, and last but not least XFS will need the new behavour for writes. Now we can replace the lock_type variable with a flags one, where no flag means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first flag. Separate out the check for not allowing to fill holes into a separate flag, although for now both flags always get set at the same time. Also revamp the documentation of the locking scheme to actually make sense. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Badari Pulavarty <pbadari@us.ibm.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Jens Axboe <jens.axboe@oracle.com> Cc: Zach Brown <zach.brown@oracle.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Alex Elder <aelder@sgi.com> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <joel.becker@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/direct-io.c129
-rw-r--r--fs/ocfs2/aops.c34
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c20
3 files changed, 63 insertions, 120 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f34bb9b1ecb..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
246 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
247 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
248 dio->map_bh.b_private); 241 dio->map_bh.b_private);
249 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
250 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
251 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
252 246
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
521 map_bh->b_state = 0; 515 map_bh->b_state = 0;
522 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
523 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
524 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
525 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
526 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
527 dio->blkbits)) 532 dio->blkbits))
528 create = 0; 533 create = 0;
529 } else if (dio->lock_type == DIO_NO_LOCKING) {
530 create = 0;
531 } 534 }
532 535
533 /*
534 * For writes inside i_size we forbid block creations: only
535 * overwrites are permitted. We fall back to buffered writes
536 * at a higher level for inside-i_size block-instantiating
537 * writes.
538 */
539 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
540 map_bh, create); 537 map_bh, create);
541 } 538 }
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1045 * we can let i_mutex go now that its achieved its purpose 1042 * we can let i_mutex go now that its achieved its purpose
1046 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1047 */ 1044 */
1048 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1049 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1050 1047
1051 /* 1048 /*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1092 1089
1093/* 1090/*
1094 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1095 * The locking rules are governed by the dio_lock_type parameter.
1096 * 1092 *
1097 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1098 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1099 * 1102 *
1100 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1101 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1102 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1103 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1104 * returning. 1107 * entry and are never taken.
1105 *
1106 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1107 * uninitialised data, allowing parallel direct readers and writers)
1108 * For writes we are called without i_mutex, return without it, never touch it.
1109 * For reads we are called under i_mutex and return with i_mutex held, even
1110 * though it may be internally dropped.
1111 *
1112 * Additional i_alloc_sem locking requirements described inline below.
1113 */ 1108 */
1114ssize_t 1109ssize_t
1115__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1116 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1117 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1118 int dio_lock_type) 1113 int flags)
1119{ 1114{
1120 int seg; 1115 int seg;
1121 size_t size; 1116 size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1127 loff_t end = offset; 1122 loff_t end = offset;
1128 struct dio *dio; 1123 struct dio *dio;
1129 int release_i_mutex = 0;
1130 int acquire_i_mutex = 0;
1131 1124
1132 if (rw & WRITE) 1125 if (rw & WRITE)
1133 rw = WRITE_ODIRECT_PLUG; 1126 rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1168 */ 1161 */
1169 memset(dio, 0, offsetof(struct dio, pages)); 1162 memset(dio, 0, offsetof(struct dio, pages));
1170 1163
1171 /* 1164 dio->flags = flags;
1172 * For block device access DIO_NO_LOCKING is used, 1165 if (dio->flags & DIO_LOCKING) {
1173 * neither readers nor writers do any locking at all
1174 * For regular files using DIO_LOCKING,
1175 * readers need to grab i_mutex and i_alloc_sem
1176 * writers need to grab i_alloc_sem only (i_mutex is already held)
1177 * For regular files using DIO_OWN_LOCKING,
1178 * neither readers nor writers take any locks here
1179 */
1180 dio->lock_type = dio_lock_type;
1181 if (dio_lock_type != DIO_NO_LOCKING) {
1182 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1183 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1184 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1185 1170
1186 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1187 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1188 mutex_lock(&inode->i_mutex);
1189 release_i_mutex = 1;
1190 }
1191 1173
1192 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1193 end - 1); 1175 end - 1);
1194 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1195 kfree(dio); 1178 kfree(dio);
1196 goto out; 1179 goto out;
1197 } 1180 }
1198
1199 if (dio_lock_type == DIO_OWN_LOCKING) {
1200 mutex_unlock(&inode->i_mutex);
1201 acquire_i_mutex = 1;
1202 }
1203 } 1181 }
1204 1182
1205 if (dio_lock_type == DIO_LOCKING) 1183 /*
1206 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1207 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1208 } 1188 }
1209 1189
1210 /* 1190 /*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1222 /* 1202 /*
1223 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1224 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1225 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1226 * it's own meaner. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1227 */ 1208 */
1228 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (dio->flags & DIO_LOCKING) {
1229 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1230 1211 loff_t isize = i_size_read(inode);
1231 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1232 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1233 } 1215 }
1234 1216
1235 if (rw == READ && dio_lock_type == DIO_LOCKING)
1236 release_i_mutex = 0;
1237
1238out: 1217out:
1239 if (release_i_mutex)
1240 mutex_unlock(&inode->i_mutex);
1241 else if (acquire_i_mutex)
1242 mutex_lock(&inode->i_mutex);
1243 return retval; 1218 return retval;
1244} 1219}
1245EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
586 ocfs2_error(inode->i_sb,
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593
594 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 582 /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 587 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 588 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 589 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 590 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 591 clear_buffer_mapped(bh_result);
617 }
618 592
619 /* make sure we don't map more than max_blocks blocks here as 593 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 594 that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
1474 1474
1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1476 1476
1477 if (rw == WRITE) { 1477 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1478 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1478 IOMAP_UNWRITTEN : IOMAP_READ);
1479 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1479
1480 bdev, iov, offset, nr_segs, 1480 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1481 xfs_get_blocks_direct, 1481 offset, nr_segs,
1482 xfs_end_io_direct); 1482 xfs_get_blocks_direct,
1483 } else { 1483 xfs_end_io_direct);
1484 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1485 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1486 bdev, iov, offset, nr_segs,
1487 xfs_get_blocks_direct,
1488 xfs_end_io_direct);
1489 }
1490 1484
1491 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1485 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1492 xfs_destroy_ioend(iocb->private); 1486 xfs_destroy_ioend(iocb->private);