aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Christoph Hellwig <hch@lst.de> 2009-11-03 10:44:53 -0500
committer Al Viro <viro@zeniv.linux.org.uk> 2009-12-16 12:16:49 -0500
commit 1e431f5ce78f3ae8254d725060288b78ff74f086 (patch)
tree a144fd7b6120ec61958c82023b25620a18aa3d6d
parent 1c7c474c31aea6d5cb2fb35f31d9e9e91ae466b1 (diff)
cleanup blockdev_direct_IO locking
Currently the locking in blockdev_direct_IO is a mess: we have three different locking types and very confusing checks for some of them. The most complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be used. This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case is unused anyway, and the write side is almost identical to DIO_NO_LOCKING. The difference is that DIO_NO_LOCKING always sets the create argument for the get_blocks callback to zero, but we can easily move that to the actual get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode: gfs already ignores the create argument and thus is fine with the new version, ocfs2 only errors out if create were ever set, and we can remove this dead code now, the block device code only ever uses create for an error message if we are fully beyond the device, which can never happen, and last but not least XFS will need the new behaviour for writes. Now we can replace the lock_type variable with a flags one, where no flag means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first flag. Separate out the check for not allowing to fill holes into a separate flag, although for now both flags always get set at the same time. Also revamp the documentation of the locking scheme to actually make sense. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- fs/direct-io.c 129
-rw-r--r-- fs/ocfs2/aops.c 34
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c 20
-rw-r--r-- include/linux/fs.h 22
4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b912270942fa..7dde0df8e8b6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 233 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 234 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 235 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 236
237 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 238 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 239 up_read_non_owner(&dio->inode->i_alloc_sem);
246 240
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 509 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 510 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 511
512 /*
513 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
514 * forbid block creations: only overwrites are permitted.
515 * We will return early to the caller once we see an
516 * unmapped buffer head returned, and the caller will fall
517 * back to buffered I/O.
518 *
519 * Otherwise the decision is left to the get_blocks method,
520 * which may decide to handle it or also return an unmapped
521 * buffer head.
522 */
518 create = dio->rw & WRITE; 523 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 524 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 525 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 526 dio->blkbits))
522 create = 0; 527 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 528 }
526 529
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 530 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 531 map_bh, create);
535 } 532 }
@@ -1039,7 +1036,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1039 * we can let i_mutex go now that its achieved its purpose 1036 * we can let i_mutex go now that its achieved its purpose
1040 * of protecting us from looking up uninitialized blocks. 1037 * of protecting us from looking up uninitialized blocks.
1041 */ 1038 */
1042 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1039 if (rw == READ && (dio->flags & DIO_LOCKING))
1043 mutex_unlock(&dio->inode->i_mutex); 1040 mutex_unlock(&dio->inode->i_mutex);
1044 1041
1045 /* 1042 /*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1083
1087/* 1084/*
1088 * This is a library function for use by filesystem drivers. 1085 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1086 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1087 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1088 * - if the flags value contains DIO_LOCKING we use a fancy locking
1089 * scheme for dumb filesystems.
1090 * For writes this function is called under i_mutex and returns with
1091 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1092 * taken and dropped again before returning.
1093 * For reads and writes i_alloc_sem is taken in shared mode and released
1094 * on I/O completion (which may happen asynchronously after returning to
1095 * the caller).
1093 * 1096 *
1094 * DIO_LOCKING (simple locking for regular files) 1097 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1098 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1099 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1100 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1101 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1102 */
1108ssize_t 1103ssize_t
1109__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1104__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1105 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1106 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1107 int flags)
1113{ 1108{
1114 int seg; 1109 int seg;
1115 size_t size; 1110 size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1115 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1116 loff_t end = offset;
1122 struct dio *dio; 1117 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1118
1126 if (rw & WRITE) 1119 if (rw & WRITE)
1127 rw = WRITE_ODIRECT_PLUG; 1120 rw = WRITE_ODIRECT_PLUG;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1156 if (!dio) 1149 if (!dio)
1157 goto out; 1150 goto out;
1158 1151
1159 /* 1152 dio->flags = flags;
1160 * For block device access DIO_NO_LOCKING is used, 1153 if (dio->flags & DIO_LOCKING) {
1161 * neither readers nor writers do any locking at all
1162 * For regular files using DIO_LOCKING,
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */
1168 dio->lock_type = dio_lock_type;
1169 if (dio_lock_type != DIO_NO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1154 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1155 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1156 struct address_space *mapping =
1157 iocb->ki_filp->f_mapping;
1173 1158
1174 mapping = iocb->ki_filp->f_mapping; 1159 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1160 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1161
1180 retval = filemap_write_and_wait_range(mapping, offset, 1162 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1163 end - 1);
1182 if (retval) { 1164 if (retval) {
1165 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1166 kfree(dio);
1184 goto out; 1167 goto out;
1185 } 1168 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1169 }
1192 1170
1193 if (dio_lock_type == DIO_LOCKING) 1171 /*
1194 /* lockdep: not the owner will release it */ 1172 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1173 * different thread.
1174 */
1175 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1176 }
1197 1177
1198 /* 1178 /*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1190 /*
1211 * In case of error extending write may have instantiated a few 1191 * In case of error extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1192 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1193 *
1214 * it's own meaner. 1194 * NOTE: filesystems with their own locking have to handle this
1195 * on their own.
1215 */ 1196 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1197 if (dio->flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1198 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1199 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1200 if (end > isize )
1220 vmtruncate(inode, isize); 1201 vmtruncate(inode, isize);
1202 }
1221 } 1203 }
1222 1204
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226out: 1205out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1206 return retval;
1232} 1207}
1233EXPORT_SYMBOL(__blockdev_direct_IO); 1208EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
586 ocfs2_error(inode->i_sb,
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593
594 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 582 /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 587 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 588 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 589 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 590 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 591 clear_buffer_mapped(bh_result);
617 }
618 592
619 /* make sure we don't map more than max_blocks blocks here as 593 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 594 that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
1474 1474
1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1476 1476
1477 if (rw == WRITE) { 1477 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1478 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1478 IOMAP_UNWRITTEN : IOMAP_READ);
1479 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1479
1480 bdev, iov, offset, nr_segs, 1480 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1481 xfs_get_blocks_direct, 1481 offset, nr_segs,
1482 xfs_end_io_direct); 1482 xfs_get_blocks_direct,
1483 } else { 1483 xfs_end_io_direct);
1484 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1485 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1486 bdev, iov, offset, nr_segs,
1487 xfs_get_blocks_direct,
1488 xfs_end_io_direct);
1489 }
1490 1484
1491 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1485 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1492 xfs_destroy_ioend(iocb->private); 1486 xfs_destroy_ioend(iocb->private);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdc23be4edde..7c8ff12d1995 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2263,9 +2263,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2263 int lock_type); 2263 int lock_type);
2264 2264
2265enum { 2265enum {
2266 DIO_LOCKING = 1, /* need locking between buffered and direct access */ 2266 /* need locking between buffered and direct access */
2267 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ 2267 DIO_LOCKING = 0x01,
2268 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ 2268
2269 /* filesystem does not support filling holes */
2270 DIO_SKIP_HOLES = 0x02,
2269}; 2271};
2270 2272
2271static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2273static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2274,7 +2276,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2274 dio_iodone_t end_io) 2276 dio_iodone_t end_io)
2275{ 2277{
2276 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2278 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2277 nr_segs, get_block, end_io, DIO_LOCKING); 2279 nr_segs, get_block, end_io,
2280 DIO_LOCKING | DIO_SKIP_HOLES);
2278} 2281}
2279 2282
2280static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, 2283static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2283,16 +2286,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
2283 dio_iodone_t end_io) 2286 dio_iodone_t end_io)
2284{ 2287{
2285 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2288 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2286 nr_segs, get_block, end_io, DIO_NO_LOCKING); 2289 nr_segs, get_block, end_io, 0);
2287}
2288
2289static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
2290 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2291 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2292 dio_iodone_t end_io)
2293{
2294 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2295 nr_segs, get_block, end_io, DIO_OWN_LOCKING);
2296} 2290}
2297#endif 2291#endif
2298 2292