aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- fs/direct-io.c 129
-rw-r--r-- fs/ocfs2/aops.c 34
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c 20
-rw-r--r-- include/linux/fs.h 22
4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b912270942fa..7dde0df8e8b6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 233 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 234 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 235 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 236
237 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 238 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 239 up_read_non_owner(&dio->inode->i_alloc_sem);
246 240
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 509 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 510 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 511
512 /*
513 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
514 * forbid block creations: only overwrites are permitted.
515 * We will return early to the caller once we see an
516 * unmapped buffer head returned, and the caller will fall
517 * back to buffered I/O.
518 *
519 * Otherwise the decision is left to the get_blocks method,
520 * which may decide to handle it or also return an unmapped
521 * buffer head.
522 */
518 create = dio->rw & WRITE; 523 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 524 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 525 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 526 dio->blkbits))
522 create = 0; 527 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 528 }
526 529
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 530 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 531 map_bh, create);
535 } 532 }
@@ -1039,7 +1036,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1039 * we can let i_mutex go now that it has achieved its purpose 1036 * we can let i_mutex go now that it has achieved its purpose
1040 * of protecting us from looking up uninitialized blocks. 1037 * of protecting us from looking up uninitialized blocks.
1041 */ 1038 */
1042 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1039 if (rw == READ && (dio->flags & DIO_LOCKING))
1043 mutex_unlock(&dio->inode->i_mutex); 1040 mutex_unlock(&dio->inode->i_mutex);
1044 1041
1045 /* 1042 /*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1083
1087/* 1084/*
1088 * This is a library function for use by filesystem drivers. 1085 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1086 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1087 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1088 * - if the flags value contains DIO_LOCKING we use a fancy locking
1089 * scheme for dumb filesystems.
1090 * For writes this function is called under i_mutex and returns with
1091 * i_mutex held; for reads, i_mutex is not held on entry, but it is
1092 * taken and dropped again before returning.
1093 * For reads and writes i_alloc_sem is taken in shared mode and released
1094 * on I/O completion (which may happen asynchronously after returning to
1095 * the caller).
1093 * 1096 *
1094 * DIO_LOCKING (simple locking for regular files) 1097 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1098 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1099 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1100 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1101 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1102 */
1108ssize_t 1103ssize_t
1109__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1104__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1105 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1106 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1107 int flags)
1113{ 1108{
1114 int seg; 1109 int seg;
1115 size_t size; 1110 size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1115 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1116 loff_t end = offset;
1122 struct dio *dio; 1117 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1118
1126 if (rw & WRITE) 1119 if (rw & WRITE)
1127 rw = WRITE_ODIRECT_PLUG; 1120 rw = WRITE_ODIRECT_PLUG;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1156 if (!dio) 1149 if (!dio)
1157 goto out; 1150 goto out;
1158 1151
1159 /* 1152 dio->flags = flags;
1160 * For block device access DIO_NO_LOCKING is used, 1153 if (dio->flags & DIO_LOCKING) {
1161 * neither readers nor writers do any locking at all
1162 * For regular files using DIO_LOCKING,
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */
1168 dio->lock_type = dio_lock_type;
1169 if (dio_lock_type != DIO_NO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1154 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1155 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1156 struct address_space *mapping =
1157 iocb->ki_filp->f_mapping;
1173 1158
1174 mapping = iocb->ki_filp->f_mapping; 1159 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1160 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1161
1180 retval = filemap_write_and_wait_range(mapping, offset, 1162 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1163 end - 1);
1182 if (retval) { 1164 if (retval) {
1165 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1166 kfree(dio);
1184 goto out; 1167 goto out;
1185 } 1168 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1169 }
1192 1170
1193 if (dio_lock_type == DIO_LOCKING) 1171 /*
1194 /* lockdep: not the owner will release it */ 1172 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1173 * different thread.
1174 */
1175 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1176 }
1197 1177
1198 /* 1178 /*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1190 /*
1211 * In case of error, an extending write may have instantiated a few 1191 * In case of error, an extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1192 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1193 *
1214 * it's own meaner. 1194 * NOTE: filesystems with their own locking have to handle this
1195 * on their own.
1215 */ 1196 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1197 if (dio->flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1198 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1199 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1200 if (end > isize )
1220 vmtruncate(inode, isize); 1201 vmtruncate(inode, isize);
1202 }
1221 } 1203 }
1222 1204
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226out: 1205out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1206 return retval;
1232} 1207}
1233EXPORT_SYMBOL(__blockdev_direct_IO); 1208EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
547 * 547 *
548 * called like this: dio->get_blocks(dio->inode, fs_startblk, 548 * called like this: dio->get_blocks(dio->inode, fs_startblk,
549 * fs_count, map_bh, dio->rw == WRITE); 549 * fs_count, map_bh, dio->rw == WRITE);
550 *
551 * Note that we never bother to allocate blocks here, and thus ignore the
552 * create argument.
550 */ 553 */
551static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 554static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
552 struct buffer_head *bh_result, int create) 555 struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
563 566
564 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 567 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
565 568
566 /*
567 * Any write past EOF is not allowed because we'd be extending.
568 */
569 if (create && (iblock + max_blocks) > inode_blocks) {
570 ret = -EIO;
571 goto bail;
572 }
573
574 /* This figures out the size of the next contiguous block, and 569 /* This figures out the size of the next contiguous block, and
575 * our logical offset */ 570 * our logical offset */
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 571 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
582 goto bail; 577 goto bail;
583 } 578 }
584 579
585 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
586 ocfs2_error(inode->i_sb,
587 "Inode %llu has a hole at block %llu\n",
588 (unsigned long long)OCFS2_I(inode)->ip_blkno,
589 (unsigned long long)iblock);
590 ret = -EROFS;
591 goto bail;
592 }
593
594 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
596 /* 582 /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
601 */ 587 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 588 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno); 589 map_bh(bh_result, inode->i_sb, p_blkno);
604 else { 590 else
605 /*
606 * ocfs2_prepare_inode_for_write() should have caught
607 * the case where we'd be filling a hole and triggered
608 * a buffered write instead.
609 */
610 if (create) {
611 ret = -EIO;
612 mlog_errno(ret);
613 goto bail;
614 }
615
616 clear_buffer_mapped(bh_result); 591 clear_buffer_mapped(bh_result);
617 }
618 592
619 /* make sure we don't map more than max_blocks blocks here as 593 /* make sure we don't map more than max_blocks blocks here as
620 that's all the kernel will handle at this point. */ 594 that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
1474 1474
1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1475 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1476 1476
1477 if (rw == WRITE) { 1477 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1478 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1478 IOMAP_UNWRITTEN : IOMAP_READ);
1479 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1479
1480 bdev, iov, offset, nr_segs, 1480 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1481 xfs_get_blocks_direct, 1481 offset, nr_segs,
1482 xfs_end_io_direct); 1482 xfs_get_blocks_direct,
1483 } else { 1483 xfs_end_io_direct);
1484 iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
1485 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1486 bdev, iov, offset, nr_segs,
1487 xfs_get_blocks_direct,
1488 xfs_end_io_direct);
1489 }
1490 1484
1491 if (unlikely(ret != -EIOCBQUEUED && iocb->private)) 1485 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1492 xfs_destroy_ioend(iocb->private); 1486 xfs_destroy_ioend(iocb->private);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdc23be4edde..7c8ff12d1995 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2263,9 +2263,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
2263 int lock_type); 2263 int lock_type);
2264 2264
2265enum { 2265enum {
2266 DIO_LOCKING = 1, /* need locking between buffered and direct access */ 2266 /* need locking between buffered and direct access */
2267 DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ 2267 DIO_LOCKING = 0x01,
2268 DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ 2268
2269 /* filesystem does not support filling holes */
2270 DIO_SKIP_HOLES = 0x02,
2269}; 2271};
2270 2272
2271static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, 2273static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2274,7 +2276,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
2274 dio_iodone_t end_io) 2276 dio_iodone_t end_io)
2275{ 2277{
2276 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2278 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2277 nr_segs, get_block, end_io, DIO_LOCKING); 2279 nr_segs, get_block, end_io,
2280 DIO_LOCKING | DIO_SKIP_HOLES);
2278} 2281}
2279 2282
2280static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, 2283static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2283,16 +2286,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
2283 dio_iodone_t end_io) 2286 dio_iodone_t end_io)
2284{ 2287{
2285 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 2288 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2286 nr_segs, get_block, end_io, DIO_NO_LOCKING); 2289 nr_segs, get_block, end_io, 0);
2287}
2288
2289static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
2290 struct inode *inode, struct block_device *bdev, const struct iovec *iov,
2291 loff_t offset, unsigned long nr_segs, get_block_t get_block,
2292 dio_iodone_t end_io)
2293{
2294 return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
2295 nr_segs, get_block, end_io, DIO_OWN_LOCKING);
2296} 2290}
2297#endif 2291#endif
2298 2292