diff options
-rw-r--r-- | fs/direct-io.c | 129 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 34 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 20 | ||||
-rw-r--r-- | include/linux/fs.h | 22 |
4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 9f34bb9b1ecb..4012885d027f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -53,13 +53,6 @@ | |||
53 | * | 53 | * |
54 | * If blkfactor is zero then the user's request was aligned to the filesystem's | 54 | * If blkfactor is zero then the user's request was aligned to the filesystem's |
55 | * blocksize. | 55 | * blocksize. |
56 | * | ||
57 | * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. | ||
58 | * This determines whether we need to do the fancy locking which prevents | ||
59 | * direct-IO from being able to read uninitialised disk blocks. If its zero | ||
60 | * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is | ||
61 | * not held for the entire direct write (taken briefly, initially, during a | ||
62 | * direct read though, but its never held for the duration of a direct-IO). | ||
63 | */ | 56 | */ |
64 | 57 | ||
65 | struct dio { | 58 | struct dio { |
@@ -68,7 +61,7 @@ struct dio { | |||
68 | struct inode *inode; | 61 | struct inode *inode; |
69 | int rw; | 62 | int rw; |
70 | loff_t i_size; /* i_size when submitted */ | 63 | loff_t i_size; /* i_size when submitted */ |
71 | int lock_type; /* doesn't change */ | 64 | int flags; /* doesn't change */ |
72 | unsigned blkbits; /* doesn't change */ | 65 | unsigned blkbits; /* doesn't change */ |
73 | unsigned blkfactor; /* When we're using an alignment which | 66 | unsigned blkfactor; /* When we're using an alignment which |
74 | is finer than the filesystem's soft | 67 | is finer than the filesystem's soft |
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) | |||
246 | if (dio->end_io && dio->result) | 239 | if (dio->end_io && dio->result) |
247 | dio->end_io(dio->iocb, offset, transferred, | 240 | dio->end_io(dio->iocb, offset, transferred, |
248 | dio->map_bh.b_private); | 241 | dio->map_bh.b_private); |
249 | if (dio->lock_type == DIO_LOCKING) | 242 | |
243 | if (dio->flags & DIO_LOCKING) | ||
250 | /* lockdep: non-owner release */ | 244 | /* lockdep: non-owner release */ |
251 | up_read_non_owner(&dio->inode->i_alloc_sem); | 245 | up_read_non_owner(&dio->inode->i_alloc_sem); |
252 | 246 | ||
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio) | |||
521 | map_bh->b_state = 0; | 515 | map_bh->b_state = 0; |
522 | map_bh->b_size = fs_count << dio->inode->i_blkbits; | 516 | map_bh->b_size = fs_count << dio->inode->i_blkbits; |
523 | 517 | ||
518 | /* | ||
519 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we | ||
520 | * forbid block creations: only overwrites are permitted. | ||
521 | * We will return early to the caller once we see an | ||
522 | * unmapped buffer head returned, and the caller will fall | ||
523 | * back to buffered I/O. | ||
524 | * | ||
525 | * Otherwise the decision is left to the get_blocks method, | ||
526 | * which may decide to handle it or also return an unmapped | ||
527 | * buffer head. | ||
528 | */ | ||
524 | create = dio->rw & WRITE; | 529 | create = dio->rw & WRITE; |
525 | if (dio->lock_type == DIO_LOCKING) { | 530 | if (dio->flags & DIO_SKIP_HOLES) { |
526 | if (dio->block_in_file < (i_size_read(dio->inode) >> | 531 | if (dio->block_in_file < (i_size_read(dio->inode) >> |
527 | dio->blkbits)) | 532 | dio->blkbits)) |
528 | create = 0; | 533 | create = 0; |
529 | } else if (dio->lock_type == DIO_NO_LOCKING) { | ||
530 | create = 0; | ||
531 | } | 534 | } |
532 | 535 | ||
533 | /* | ||
534 | * For writes inside i_size we forbid block creations: only | ||
535 | * overwrites are permitted. We fall back to buffered writes | ||
536 | * at a higher level for inside-i_size block-instantiating | ||
537 | * writes. | ||
538 | */ | ||
539 | ret = (*dio->get_block)(dio->inode, fs_startblk, | 536 | ret = (*dio->get_block)(dio->inode, fs_startblk, |
540 | map_bh, create); | 537 | map_bh, create); |
541 | } | 538 | } |
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1045 | * we can let i_mutex go now that it has achieved its purpose | 1042 | * we can let i_mutex go now that it has achieved its purpose |
1046 | * of protecting us from looking up uninitialized blocks. | 1043 | * of protecting us from looking up uninitialized blocks. |
1047 | */ | 1044 | */ |
1048 | if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) | 1045 | if (rw == READ && (dio->flags & DIO_LOCKING)) |
1049 | mutex_unlock(&dio->inode->i_mutex); | 1046 | mutex_unlock(&dio->inode->i_mutex); |
1050 | 1047 | ||
1051 | /* | 1048 | /* |
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1092 | 1089 | ||
1093 | /* | 1090 | /* |
1094 | * This is a library function for use by filesystem drivers. | 1091 | * This is a library function for use by filesystem drivers. |
1095 | * The locking rules are governed by the dio_lock_type parameter. | ||
1096 | * | 1092 | * |
1097 | * DIO_NO_LOCKING (no locking, for raw block device access) | 1093 | * The locking rules are governed by the flags parameter: |
1098 | * For writes, i_mutex is not held on entry; it is never taken. | 1094 | * - if the flags value contains DIO_LOCKING we use a fancy locking |
1095 | * scheme for dumb filesystems. | ||
1096 | * For writes this function is called under i_mutex and returns with | ||
1097 | * i_mutex held; for reads, i_mutex is not held on entry, but it is | ||
1098 | * taken and dropped again before returning. | ||
1099 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1100 | * on I/O completion (which may happen asynchronously after returning to | ||
1101 | * the caller). | ||
1099 | * | 1102 | * |
1100 | * DIO_LOCKING (simple locking for regular files) | 1103 | * - if the flags value does NOT contain DIO_LOCKING we don't use any |
1101 | * For writes we are called under i_mutex and return with i_mutex held, even | 1104 | * internal locking but rather rely on the filesystem to synchronize |
1102 | * though it is internally dropped. | 1105 | * direct I/O reads/writes versus each other and truncate. |
1103 | * For reads, i_mutex is not held on entry, but it is taken and dropped before | 1106 | * For reads and writes both i_mutex and i_alloc_sem are not held on |
1104 | * returning. | 1107 | * entry and are never taken. |
1105 | * | ||
1106 | * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of | ||
1107 | * uninitialised data, allowing parallel direct readers and writers) | ||
1108 | * For writes we are called without i_mutex, return without it, never touch it. | ||
1109 | * For reads we are called under i_mutex and return with i_mutex held, even | ||
1110 | * though it may be internally dropped. | ||
1111 | * | ||
1112 | * Additional i_alloc_sem locking requirements described inline below. | ||
1113 | */ | 1108 | */ |
1114 | ssize_t | 1109 | ssize_t |
1115 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1110 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
1116 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1111 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1117 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1112 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1118 | int dio_lock_type) | 1113 | int flags) |
1119 | { | 1114 | { |
1120 | int seg; | 1115 | int seg; |
1121 | size_t size; | 1116 | size_t size; |
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1126 | ssize_t retval = -EINVAL; | 1121 | ssize_t retval = -EINVAL; |
1127 | loff_t end = offset; | 1122 | loff_t end = offset; |
1128 | struct dio *dio; | 1123 | struct dio *dio; |
1129 | int release_i_mutex = 0; | ||
1130 | int acquire_i_mutex = 0; | ||
1131 | 1124 | ||
1132 | if (rw & WRITE) | 1125 | if (rw & WRITE) |
1133 | rw = WRITE_ODIRECT_PLUG; | 1126 | rw = WRITE_ODIRECT_PLUG; |
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1168 | */ | 1161 | */ |
1169 | memset(dio, 0, offsetof(struct dio, pages)); | 1162 | memset(dio, 0, offsetof(struct dio, pages)); |
1170 | 1163 | ||
1171 | /* | 1164 | dio->flags = flags; |
1172 | * For block device access DIO_NO_LOCKING is used, | 1165 | if (dio->flags & DIO_LOCKING) { |
1173 | * neither readers nor writers do any locking at all | ||
1174 | * For regular files using DIO_LOCKING, | ||
1175 | * readers need to grab i_mutex and i_alloc_sem | ||
1176 | * writers need to grab i_alloc_sem only (i_mutex is already held) | ||
1177 | * For regular files using DIO_OWN_LOCKING, | ||
1178 | * neither readers nor writers take any locks here | ||
1179 | */ | ||
1180 | dio->lock_type = dio_lock_type; | ||
1181 | if (dio_lock_type != DIO_NO_LOCKING) { | ||
1182 | /* watch out for a 0 len io from a tricksy fs */ | 1166 | /* watch out for a 0 len io from a tricksy fs */ |
1183 | if (rw == READ && end > offset) { | 1167 | if (rw == READ && end > offset) { |
1184 | struct address_space *mapping; | 1168 | struct address_space *mapping = |
1169 | iocb->ki_filp->f_mapping; | ||
1185 | 1170 | ||
1186 | mapping = iocb->ki_filp->f_mapping; | 1171 | /* will be released by direct_io_worker */ |
1187 | if (dio_lock_type != DIO_OWN_LOCKING) { | 1172 | mutex_lock(&inode->i_mutex); |
1188 | mutex_lock(&inode->i_mutex); | ||
1189 | release_i_mutex = 1; | ||
1190 | } | ||
1191 | 1173 | ||
1192 | retval = filemap_write_and_wait_range(mapping, offset, | 1174 | retval = filemap_write_and_wait_range(mapping, offset, |
1193 | end - 1); | 1175 | end - 1); |
1194 | if (retval) { | 1176 | if (retval) { |
1177 | mutex_unlock(&inode->i_mutex); | ||
1195 | kfree(dio); | 1178 | kfree(dio); |
1196 | goto out; | 1179 | goto out; |
1197 | } | 1180 | } |
1198 | |||
1199 | if (dio_lock_type == DIO_OWN_LOCKING) { | ||
1200 | mutex_unlock(&inode->i_mutex); | ||
1201 | acquire_i_mutex = 1; | ||
1202 | } | ||
1203 | } | 1181 | } |
1204 | 1182 | ||
1205 | if (dio_lock_type == DIO_LOCKING) | 1183 | /* |
1206 | /* lockdep: not the owner will release it */ | 1184 | * Will be released at I/O completion, possibly in a |
1207 | down_read_non_owner(&inode->i_alloc_sem); | 1185 | * different thread. |
1186 | */ | ||
1187 | down_read_non_owner(&inode->i_alloc_sem); | ||
1208 | } | 1188 | } |
1209 | 1189 | ||
1210 | /* | 1190 | /* |
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1222 | /* | 1202 | /* |
1223 | * In case of error, an extending write may have instantiated a few | 1203 | * In case of error, an extending write may have instantiated a few |
1224 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | 1204 | * blocks outside i_size. Trim these off again for DIO_LOCKING. |
1225 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by | 1205 | * |
1226 | * it's own meaner. | 1206 | * NOTE: filesystems with their own locking have to handle this |
1207 | * on their own. | ||
1227 | */ | 1208 | */ |
1228 | if (unlikely(retval < 0 && (rw & WRITE))) { | 1209 | if (dio->flags & DIO_LOCKING) { |
1229 | loff_t isize = i_size_read(inode); | 1210 | if (unlikely((rw & WRITE) && retval < 0)) { |
1230 | 1211 | loff_t isize = i_size_read(inode); | |
1231 | if (end > isize && dio_lock_type == DIO_LOCKING) | 1212 | if (end > isize) |
1232 | vmtruncate(inode, isize); | 1213 | vmtruncate(inode, isize); |
1214 | } | ||
1233 | } | 1215 | } |
1234 | 1216 | ||
1235 | if (rw == READ && dio_lock_type == DIO_LOCKING) | ||
1236 | release_i_mutex = 0; | ||
1237 | |||
1238 | out: | 1217 | out: |
1239 | if (release_i_mutex) | ||
1240 | mutex_unlock(&inode->i_mutex); | ||
1241 | else if (acquire_i_mutex) | ||
1242 | mutex_lock(&inode->i_mutex); | ||
1243 | return retval; | 1218 | return retval; |
1244 | } | 1219 | } |
1245 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1220 | EXPORT_SYMBOL(__blockdev_direct_IO); |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index deb2b132ae5e..3dae4a13f6e4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -547,6 +547,9 @@ bail: | |||
547 | * | 547 | * |
548 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | 548 | * called like this: dio->get_blocks(dio->inode, fs_startblk, |
549 | * fs_count, map_bh, dio->rw == WRITE); | 549 | * fs_count, map_bh, dio->rw == WRITE); |
550 | * | ||
551 | * Note that we never bother to allocate blocks here, and thus ignore the | ||
552 | * create argument. | ||
550 | */ | 553 | */ |
551 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | 554 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, |
552 | struct buffer_head *bh_result, int create) | 555 | struct buffer_head *bh_result, int create) |
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
563 | 566 | ||
564 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 567 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
565 | 568 | ||
566 | /* | ||
567 | * Any write past EOF is not allowed because we'd be extending. | ||
568 | */ | ||
569 | if (create && (iblock + max_blocks) > inode_blocks) { | ||
570 | ret = -EIO; | ||
571 | goto bail; | ||
572 | } | ||
573 | |||
574 | /* This figures out the size of the next contiguous block, and | 569 | /* This figures out the size of the next contiguous block, and |
575 | * our logical offset */ | 570 | * our logical offset */ |
576 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | 571 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
582 | goto bail; | 577 | goto bail; |
583 | } | 578 | } |
584 | 579 | ||
585 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { | ||
586 | ocfs2_error(inode->i_sb, | ||
587 | "Inode %llu has a hole at block %llu\n", | ||
588 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
589 | (unsigned long long)iblock); | ||
590 | ret = -EROFS; | ||
591 | goto bail; | ||
592 | } | ||
593 | |||
594 | /* We should already CoW the refcounted extent. */ | 580 | /* We should already CoW the refcounted extent. */ |
595 | BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); | 581 | BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); |
596 | /* | 582 | /* |
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
601 | */ | 587 | */ |
602 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | 588 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) |
603 | map_bh(bh_result, inode->i_sb, p_blkno); | 589 | map_bh(bh_result, inode->i_sb, p_blkno); |
604 | else { | 590 | else |
605 | /* | ||
606 | * ocfs2_prepare_inode_for_write() should have caught | ||
607 | * the case where we'd be filling a hole and triggered | ||
608 | * a buffered write instead. | ||
609 | */ | ||
610 | if (create) { | ||
611 | ret = -EIO; | ||
612 | mlog_errno(ret); | ||
613 | goto bail; | ||
614 | } | ||
615 | |||
616 | clear_buffer_mapped(bh_result); | 591 | clear_buffer_mapped(bh_result); |
617 | } | ||
618 | 592 | ||
619 | /* make sure we don't map more than max_blocks blocks here as | 593 | /* make sure we don't map more than max_blocks blocks here as |
620 | that's all the kernel will handle at this point. */ | 594 | that's all the kernel will handle at this point. */ |
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d798c54296eb..66abe36c1213 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO( | |||
1474 | 1474 | ||
1475 | bdev = xfs_find_bdev_for_inode(XFS_I(inode)); | 1475 | bdev = xfs_find_bdev_for_inode(XFS_I(inode)); |
1476 | 1476 | ||
1477 | if (rw == WRITE) { | 1477 | iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? |
1478 | iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); | 1478 | IOMAP_UNWRITTEN : IOMAP_READ); |
1479 | ret = blockdev_direct_IO_own_locking(rw, iocb, inode, | 1479 | |
1480 | bdev, iov, offset, nr_segs, | 1480 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, |
1481 | xfs_get_blocks_direct, | 1481 | offset, nr_segs, |
1482 | xfs_end_io_direct); | 1482 | xfs_get_blocks_direct, |
1483 | } else { | 1483 | xfs_end_io_direct); |
1484 | iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); | ||
1485 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | ||
1486 | bdev, iov, offset, nr_segs, | ||
1487 | xfs_get_blocks_direct, | ||
1488 | xfs_end_io_direct); | ||
1489 | } | ||
1490 | 1484 | ||
1491 | if (unlikely(ret != -EIOCBQUEUED && iocb->private)) | 1485 | if (unlikely(ret != -EIOCBQUEUED && iocb->private)) |
1492 | xfs_destroy_ioend(iocb->private); | 1486 | xfs_destroy_ioend(iocb->private); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index a057f48eb156..b23a7018eb90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2264,9 +2264,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
2264 | int lock_type); | 2264 | int lock_type); |
2265 | 2265 | ||
2266 | enum { | 2266 | enum { |
2267 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ | 2267 | /* need locking between buffered and direct access */ |
2268 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ | 2268 | DIO_LOCKING = 0x01, |
2269 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ | 2269 | |
2270 | /* filesystem does not support filling holes */ | ||
2271 | DIO_SKIP_HOLES = 0x02, | ||
2270 | }; | 2272 | }; |
2271 | 2273 | ||
2272 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 2274 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
@@ -2275,7 +2277,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | |||
2275 | dio_iodone_t end_io) | 2277 | dio_iodone_t end_io) |
2276 | { | 2278 | { |
2277 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2279 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2278 | nr_segs, get_block, end_io, DIO_LOCKING); | 2280 | nr_segs, get_block, end_io, |
2281 | DIO_LOCKING | DIO_SKIP_HOLES); | ||
2279 | } | 2282 | } |
2280 | 2283 | ||
2281 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | 2284 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, |
@@ -2284,16 +2287,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | |||
2284 | dio_iodone_t end_io) | 2287 | dio_iodone_t end_io) |
2285 | { | 2288 | { |
2286 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2289 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2287 | nr_segs, get_block, end_io, DIO_NO_LOCKING); | 2290 | nr_segs, get_block, end_io, 0); |
2288 | } | ||
2289 | |||
2290 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, | ||
2291 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | ||
2292 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | ||
2293 | dio_iodone_t end_io) | ||
2294 | { | ||
2295 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | ||
2296 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); | ||
2297 | } | 2291 | } |
2298 | #endif | 2292 | #endif |
2299 | 2293 | ||