aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/direct-io.c129
-rw-r--r--fs/ocfs2/aops.c34
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c20
-rw-r--r--include/linux/fs.h22
4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f34bb9b1ecb..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
246 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
247 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
248 dio->map_bh.b_private); 241 dio->map_bh.b_private);
249 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
250 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
251 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
252 246
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
521 map_bh->b_state = 0; 515 map_bh->b_state = 0;
522 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
523 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
524 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
525 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
526 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
527 dio->blkbits)) 532 dio->blkbits))
528 create = 0; 533 create = 0;
529 } else if (dio->lock_type == DIO_NO_LOCKING) {
530 create = 0;
531 } 534 }
532 535
533 /*
534 * For writes inside i_size we forbid block creations: only
535 * overwrites are permitted. We fall back to buffered writes
536 * at a higher level for inside-i_size block-instantiating
537 * writes.
538 */
539 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
540 map_bh, create); 537 map_bh, create);
541 } 538 }
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1045 * we can let i_mutex go now that its achieved its purpose 1042 * we can let i_mutex go now that its achieved its purpose
1046 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1047 */ 1044 */
1048 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1049 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1050 1047
1051 /* 1048 /*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1092 1089
1093/* 1090/*
1094 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1095 * The locking rules are governed by the dio_lock_type parameter.
1096 * 1092 *
1097 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1098 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1099 * 1102 *
1100 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1101 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1102 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1103 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1104 * returning. 1107 * entry and are never taken.
1105 *
1106 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1107 * uninitialised data, allowing parallel direct readers and writers)
1108 * For writes we are called without i_mutex, return without it, never touch it.
1109 * For reads we are called under i_mutex and return with i_mutex held, even
1110 * though it may be internally dropped.
1111 *
1112 * Additional i_alloc_sem locking requirements described inline below.
1113 */ 1108 */
1114ssize_t 1109ssize_t
1115__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1116 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1117 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1118 int dio_lock_type) 1113 int flags)
1119{ 1114{
1120 int seg; 1115 int seg;
1121 size_t size; 1116 size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1127 loff_t end = offset; 1122 loff_t end = offset;
1128 struct dio *dio; 1123 struct dio *dio;
1129 int release_i_mutex = 0;
1130 int acquire_i_mutex = 0;
1131 1124
1132 if (rw & WRITE) 1125 if (rw & WRITE)
1133 rw = WRITE_ODIRECT_PLUG; 1126 rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1168 */ 1161 */
1169 memset(dio, 0, offsetof(struct dio, pages)); 1162 memset(dio, 0, offsetof(struct dio, pages));
1170 1163
1171 /* 1164 dio->flags = flags;
1172 * For block device access DIO_NO_LOCKING is used, 1165 if (dio->flags & DIO_LOCKING) {
1173 * neither readers nor writers do any locking at all
1174 * For regular files using DIO_LOCKING,
1175 * readers need to grab i_mutex and i_alloc_sem
1176 * writers need to grab i_alloc_sem only (i_mutex is already held)
1177 * For regular files using DIO_OWN_LOCKING,
1178 * neither readers nor writers take any locks here
1179 */
1180 dio->lock_type = dio_lock_type;
1181 if (dio_lock_type != DIO_NO_LOCKING) {
1182 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1183 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1184 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1185 1170
1186 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1187 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1188 mutex_lock(&inode->i_mutex);
1189 release_i_mutex = 1;
1190 }
1191 1173
1192 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1193 end - 1); 1175 end - 1);
1194 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1195 kfree(dio); 1178 kfree(dio);
1196 goto out; 1179 goto out;
1197 } 1180 }
1198
1199 if (dio_lock_type == DIO_OWN_LOCKING) {
1200 mutex_unlock(&inode->i_mutex);
1201 acquire_i_mutex = 1;
1202 }
1203 } 1181 }
1204 1182
1205 if (dio_lock_type == DIO_LOCKING) 1183 /*
1206 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1207 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1208 } 1188 }
1209 1189
1210 /* 1190 /*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1222 /* 1202 /*
1223 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1224 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1225 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1226 * it's own meaner. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1227 */ 1208 */
1228 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (dio->flags & DIO_LOCKING) {
1229 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1230 1211 loff_t isize = i_size_read(inode);
1231 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1232 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1233 } 1215 }
1234 1216
1235 if (rw == READ && dio_lock_type == DIO_LOCKING)
1236 release_i_mutex = 0;
1237
1238out: 1217out:
1239 if (release_i_mutex)
1240 mutex_unlock(&inode->i_mutex);
1241 else if (acquire_i_mutex)
1242 mutex_lock(&inode->i_mutex);
1243 return retval; 1218 return retval;
1244} 1219}
1245EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
586 ocfs2_error(inode->i_sb,
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593
594 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 582 /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 587 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 588 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 589 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 590 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 591 clear_buffer_mapped(bh_result);
617 }
618 592
619 /* make sure we don't map more than max_blocks blocks here as 593 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 594 that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
1474 1474
1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1476 1476
1477 if (rw == WRITE) { 1477 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1478 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1478 IOMAP_UNWRITTEN : IOMAP_READ);
1479 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1479
1480 bdev, iov, offset, nr_segs, 1480 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1481 xfs_get_blocks_direct, 1481 offset, nr_segs,
1482 xfs_end_io_direct); 1482 xfs_get_blocks_direct,
1483 } else { 1483 xfs_end_io_direct);
1484 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1485 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1486 bdev, iov, offset, nr_segs,
1487 xfs_get_blocks_direct,
1488 xfs_end_io_direct);
1489 }
1490 1484
1491 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1485 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1492 xfs_destroy_ioend(iocb->private); 1486 xfs_destroy_ioend(iocb->private);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a057f48eb156..b23a7018eb90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2264,9 +2264,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2264 int lock_type); 2264 int lock_type);
2265 2265
2266enum { 2266enum {
2267 DIO_LOCKING = 1, /* need locking between buffered and direct access */ 2267 /* need locking between buffered and direct access */
2268 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ 2268 DIO_LOCKING = 0x01,
2269 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ 2269
2270 /* filesystem does not support filling holes */
2271 DIO_SKIP_HOLES = 0x02,
2270}; 2272};
2271 2273
2272static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2274static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2275,7 +2277,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2275 dio_iodone_t end_io) 2277 dio_iodone_t end_io)
2276{ 2278{
2277 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2279 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2278 nr_segs, get_block, end_io, DIO_LOCKING); 2280 nr_segs, get_block, end_io,
2281 DIO_LOCKING | DIO_SKIP_HOLES);
2279} 2282}
2280 2283
2281static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, 2284static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2284,16 +2287,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
2284 dio_iodone_t end_io) 2287 dio_iodone_t end_io)
2285{ 2288{
2286 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2289 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2287 nr_segs, get_block, end_io, DIO_NO_LOCKING); 2290 nr_segs, get_block, end_io, 0);
2288}
2289
2290static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
2291 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2292 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2293 dio_iodone_t end_io)
2294{
2295 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2296 nr_segs, get_block, end_io, DIO_OWN_LOCKING);
2297} 2291}
2298#endif 2292#endif
2299 2293