diff options
-rw-r--r-- | fs/direct-io.c | 129 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 34 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 20 | ||||
-rw-r--r-- | include/linux/fs.h | 22 |
4 files changed, 71 insertions, 134 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index b912270942fa..7dde0df8e8b6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -53,13 +53,6 @@ | |||
53 | * | 53 | * |
54 | * If blkfactor is zero then the user's request was aligned to the filesystem's | 54 | * If blkfactor is zero then the user's request was aligned to the filesystem's |
55 | * blocksize. | 55 | * blocksize. |
56 | * | ||
57 | * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. | ||
58 | * This determines whether we need to do the fancy locking which prevents | ||
59 | * direct-IO from being able to read uninitialised disk blocks. If its zero | ||
60 | * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is | ||
61 | * not held for the entire direct write (taken briefly, initially, during a | ||
62 | * direct read though, but its never held for the duration of a direct-IO). | ||
63 | */ | 56 | */ |
64 | 57 | ||
65 | struct dio { | 58 | struct dio { |
@@ -68,7 +61,7 @@ struct dio { | |||
68 | struct inode *inode; | 61 | struct inode *inode; |
69 | int rw; | 62 | int rw; |
70 | loff_t i_size; /* i_size when submitted */ | 63 | loff_t i_size; /* i_size when submitted */ |
71 | int lock_type; /* doesn't change */ | 64 | int flags; /* doesn't change */ |
72 | unsigned blkbits; /* doesn't change */ | 65 | unsigned blkbits; /* doesn't change */ |
73 | unsigned blkfactor; /* When we're using an alignment which | 66 | unsigned blkfactor; /* When we're using an alignment which |
74 | is finer than the filesystem's soft | 67 | is finer than the filesystem's soft |
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) | |||
240 | if (dio->end_io && dio->result) | 233 | if (dio->end_io && dio->result) |
241 | dio->end_io(dio->iocb, offset, transferred, | 234 | dio->end_io(dio->iocb, offset, transferred, |
242 | dio->map_bh.b_private); | 235 | dio->map_bh.b_private); |
243 | if (dio->lock_type == DIO_LOCKING) | 236 | |
237 | if (dio->flags & DIO_LOCKING) | ||
244 | /* lockdep: non-owner release */ | 238 | /* lockdep: non-owner release */ |
245 | up_read_non_owner(&dio->inode->i_alloc_sem); | 239 | up_read_non_owner(&dio->inode->i_alloc_sem); |
246 | 240 | ||
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *dio) | |||
515 | map_bh->b_state = 0; | 509 | map_bh->b_state = 0; |
516 | map_bh->b_size = fs_count << dio->inode->i_blkbits; | 510 | map_bh->b_size = fs_count << dio->inode->i_blkbits; |
517 | 511 | ||
512 | /* | ||
513 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we | ||
514 | * forbid block creations: only overwrites are permitted. | ||
515 | * We will return early to the caller once we see an | ||
516 | * unmapped buffer head returned, and the caller will fall | ||
517 | * back to buffered I/O. | ||
518 | * | ||
519 | * Otherwise the decision is left to the get_blocks method, | ||
520 | * which may decide to handle it or also return an unmapped | ||
521 | * buffer head. | ||
522 | */ | ||
518 | create = dio->rw & WRITE; | 523 | create = dio->rw & WRITE; |
519 | if (dio->lock_type == DIO_LOCKING) { | 524 | if (dio->flags & DIO_SKIP_HOLES) { |
520 | if (dio->block_in_file < (i_size_read(dio->inode) >> | 525 | if (dio->block_in_file < (i_size_read(dio->inode) >> |
521 | dio->blkbits)) | 526 | dio->blkbits)) |
522 | create = 0; | 527 | create = 0; |
523 | } else if (dio->lock_type == DIO_NO_LOCKING) { | ||
524 | create = 0; | ||
525 | } | 528 | } |
526 | 529 | ||
527 | /* | ||
528 | * For writes inside i_size we forbid block creations: only | ||
529 | * overwrites are permitted. We fall back to buffered writes | ||
530 | * at a higher level for inside-i_size block-instantiating | ||
531 | * writes. | ||
532 | */ | ||
533 | ret = (*dio->get_block)(dio->inode, fs_startblk, | 530 | ret = (*dio->get_block)(dio->inode, fs_startblk, |
534 | map_bh, create); | 531 | map_bh, create); |
535 | } | 532 | } |
@@ -1039,7 +1036,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1039 | * we can let i_mutex go now that it has achieved its purpose | 1036 | * we can let i_mutex go now that it has achieved its purpose |
1040 | * of protecting us from looking up uninitialized blocks. | 1037 | * of protecting us from looking up uninitialized blocks. |
1041 | */ | 1038 | */ |
1042 | if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) | 1039 | if (rw == READ && (dio->flags & DIO_LOCKING)) |
1043 | mutex_unlock(&dio->inode->i_mutex); | 1040 | mutex_unlock(&dio->inode->i_mutex); |
1044 | 1041 | ||
1045 | /* | 1042 | /* |
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1086 | 1083 | ||
1087 | /* | 1084 | /* |
1088 | * This is a library function for use by filesystem drivers. | 1085 | * This is a library function for use by filesystem drivers. |
1089 | * The locking rules are governed by the dio_lock_type parameter. | ||
1090 | * | 1086 | * |
1091 | * DIO_NO_LOCKING (no locking, for raw block device access) | 1087 | * The locking rules are governed by the flags parameter: |
1092 | * For writes, i_mutex is not held on entry; it is never taken. | 1088 | * - if the flags value contains DIO_LOCKING we use a fancy locking |
1089 | * scheme for dumb filesystems. | ||
1090 | * For writes this function is called under i_mutex and returns with | ||
1091 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | ||
1092 | * taken and dropped again before returning. | ||
1093 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1094 | * on I/O completion (which may happen asynchronously after returning to | ||
1095 | * the caller). | ||
1093 | * | 1096 | * |
1094 | * DIO_LOCKING (simple locking for regular files) | 1097 | * - if the flags value does NOT contain DIO_LOCKING we don't use any |
1095 | * For writes we are called under i_mutex and return with i_mutex held, even | 1098 | * internal locking but rather rely on the filesystem to synchronize |
1096 | * though it is internally dropped. | 1099 | * direct I/O reads/writes versus each other and truncate. |
1097 | * For reads, i_mutex is not held on entry, but it is taken and dropped before | 1100 | * For reads and writes neither i_mutex nor i_alloc_sem is held on |
1098 | * returning. | 1101 | * entry, and neither is ever taken. |
1099 | * | ||
1100 | * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of | ||
1101 | * uninitialised data, allowing parallel direct readers and writers) | ||
1102 | * For writes we are called without i_mutex, return without it, never touch it. | ||
1103 | * For reads we are called under i_mutex and return with i_mutex held, even | ||
1104 | * though it may be internally dropped. | ||
1105 | * | ||
1106 | * Additional i_alloc_sem locking requirements described inline below. | ||
1107 | */ | 1102 | */ |
1108 | ssize_t | 1103 | ssize_t |
1109 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1104 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
1110 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1105 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1111 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1106 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1112 | int dio_lock_type) | 1107 | int flags) |
1113 | { | 1108 | { |
1114 | int seg; | 1109 | int seg; |
1115 | size_t size; | 1110 | size_t size; |
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1120 | ssize_t retval = -EINVAL; | 1115 | ssize_t retval = -EINVAL; |
1121 | loff_t end = offset; | 1116 | loff_t end = offset; |
1122 | struct dio *dio; | 1117 | struct dio *dio; |
1123 | int release_i_mutex = 0; | ||
1124 | int acquire_i_mutex = 0; | ||
1125 | 1118 | ||
1126 | if (rw & WRITE) | 1119 | if (rw & WRITE) |
1127 | rw = WRITE_ODIRECT_PLUG; | 1120 | rw = WRITE_ODIRECT_PLUG; |
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1156 | if (!dio) | 1149 | if (!dio) |
1157 | goto out; | 1150 | goto out; |
1158 | 1151 | ||
1159 | /* | 1152 | dio->flags = flags; |
1160 | * For block device access DIO_NO_LOCKING is used, | 1153 | if (dio->flags & DIO_LOCKING) { |
1161 | * neither readers nor writers do any locking at all | ||
1162 | * For regular files using DIO_LOCKING, | ||
1163 | * readers need to grab i_mutex and i_alloc_sem | ||
1164 | * writers need to grab i_alloc_sem only (i_mutex is already held) | ||
1165 | * For regular files using DIO_OWN_LOCKING, | ||
1166 | * neither readers nor writers take any locks here | ||
1167 | */ | ||
1168 | dio->lock_type = dio_lock_type; | ||
1169 | if (dio_lock_type != DIO_NO_LOCKING) { | ||
1170 | /* watch out for a 0 len io from a tricksy fs */ | 1154 | /* watch out for a 0 len io from a tricksy fs */ |
1171 | if (rw == READ && end > offset) { | 1155 | if (rw == READ && end > offset) { |
1172 | struct address_space *mapping; | 1156 | struct address_space *mapping = |
1157 | iocb->ki_filp->f_mapping; | ||
1173 | 1158 | ||
1174 | mapping = iocb->ki_filp->f_mapping; | 1159 | /* will be released by direct_io_worker */ |
1175 | if (dio_lock_type != DIO_OWN_LOCKING) { | 1160 | mutex_lock(&inode->i_mutex); |
1176 | mutex_lock(&inode->i_mutex); | ||
1177 | release_i_mutex = 1; | ||
1178 | } | ||
1179 | 1161 | ||
1180 | retval = filemap_write_and_wait_range(mapping, offset, | 1162 | retval = filemap_write_and_wait_range(mapping, offset, |
1181 | end - 1); | 1163 | end - 1); |
1182 | if (retval) { | 1164 | if (retval) { |
1165 | mutex_unlock(&inode->i_mutex); | ||
1183 | kfree(dio); | 1166 | kfree(dio); |
1184 | goto out; | 1167 | goto out; |
1185 | } | 1168 | } |
1186 | |||
1187 | if (dio_lock_type == DIO_OWN_LOCKING) { | ||
1188 | mutex_unlock(&inode->i_mutex); | ||
1189 | acquire_i_mutex = 1; | ||
1190 | } | ||
1191 | } | 1169 | } |
1192 | 1170 | ||
1193 | if (dio_lock_type == DIO_LOCKING) | 1171 | /* |
1194 | /* lockdep: not the owner will release it */ | 1172 | * Will be released at I/O completion, possibly in a |
1195 | down_read_non_owner(&inode->i_alloc_sem); | 1173 | * different thread. |
1174 | */ | ||
1175 | down_read_non_owner(&inode->i_alloc_sem); | ||
1196 | } | 1176 | } |
1197 | 1177 | ||
1198 | /* | 1178 | /* |
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1210 | /* | 1190 | /* |
1211 | * In case of error, an extending write may have instantiated a few | 1191 | * In case of error, an extending write may have instantiated a few |
1212 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | 1192 | * blocks outside i_size. Trim these off again for DIO_LOCKING. |
1213 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by | 1193 | * |
1214 | * it's own meaner. | 1194 | * NOTE: filesystems with their own locking have to handle this |
1195 | * on their own. | ||
1215 | */ | 1196 | */ |
1216 | if (unlikely(retval < 0 && (rw & WRITE))) { | 1197 | if (dio->flags & DIO_LOCKING) { |
1217 | loff_t isize = i_size_read(inode); | 1198 | if (unlikely((rw & WRITE) && retval < 0)) { |
1218 | 1199 | loff_t isize = i_size_read(inode); | |
1219 | if (end > isize && dio_lock_type == DIO_LOCKING) | 1200 | if (end > isize ) |
1220 | vmtruncate(inode, isize); | 1201 | vmtruncate(inode, isize); |
1202 | } | ||
1221 | } | 1203 | } |
1222 | 1204 | ||
1223 | if (rw == READ && dio_lock_type == DIO_LOCKING) | ||
1224 | release_i_mutex = 0; | ||
1225 | |||
1226 | out: | 1205 | out: |
1227 | if (release_i_mutex) | ||
1228 | mutex_unlock(&inode->i_mutex); | ||
1229 | else if (acquire_i_mutex) | ||
1230 | mutex_lock(&inode->i_mutex); | ||
1231 | return retval; | 1206 | return retval; |
1232 | } | 1207 | } |
1233 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1208 | EXPORT_SYMBOL(__blockdev_direct_IO); |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index deb2b132ae5e..3dae4a13f6e4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -547,6 +547,9 @@ bail: | |||
547 | * | 547 | * |
548 | * called like this: (*dio->get_block)(dio->inode, fs_startblk, | 548 | * called like this: (*dio->get_block)(dio->inode, fs_startblk, |
549 | * map_bh, create); | 549 | * map_bh, create); |
550 | * | ||
551 | * Note that we never bother to allocate blocks here, and thus ignore the | ||
552 | * create argument. | ||
550 | */ | 553 | */ |
551 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | 554 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, |
552 | struct buffer_head *bh_result, int create) | 555 | struct buffer_head *bh_result, int create) |
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
563 | 566 | ||
564 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 567 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
565 | 568 | ||
566 | /* | ||
567 | * Any write past EOF is not allowed because we'd be extending. | ||
568 | */ | ||
569 | if (create && (iblock + max_blocks) > inode_blocks) { | ||
570 | ret = -EIO; | ||
571 | goto bail; | ||
572 | } | ||
573 | |||
574 | /* This figures out the size of the next contiguous block, and | 569 | /* This figures out the size of the next contiguous block, and |
575 | * our logical offset */ | 570 | * our logical offset */ |
576 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, | 571 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
582 | goto bail; | 577 | goto bail; |
583 | } | 578 | } |
584 | 579 | ||
585 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { | ||
586 | ocfs2_error(inode->i_sb, | ||
587 | "Inode %llu has a hole at block %llu\n", | ||
588 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
589 | (unsigned long long)iblock); | ||
590 | ret = -EROFS; | ||
591 | goto bail; | ||
592 | } | ||
593 | |||
594 | /* We should already CoW the refcounted extent. */ | 580 | /* We should already CoW the refcounted extent. */ |
595 | BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); | 581 | BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); |
596 | /* | 582 | /* |
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
601 | */ | 587 | */ |
602 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | 588 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) |
603 | map_bh(bh_result, inode->i_sb, p_blkno); | 589 | map_bh(bh_result, inode->i_sb, p_blkno); |
604 | else { | 590 | else |
605 | /* | ||
606 | * ocfs2_prepare_inode_for_write() should have caught | ||
607 | * the case where we'd be filling a hole and triggered | ||
608 | * a buffered write instead. | ||
609 | */ | ||
610 | if (create) { | ||
611 | ret = -EIO; | ||
612 | mlog_errno(ret); | ||
613 | goto bail; | ||
614 | } | ||
615 | |||
616 | clear_buffer_mapped(bh_result); | 591 | clear_buffer_mapped(bh_result); |
617 | } | ||
618 | 592 | ||
619 | /* make sure we don't map more than max_blocks blocks here as | 593 | /* make sure we don't map more than max_blocks blocks here as |
620 | that's all the kernel will handle at this point. */ | 594 | that's all the kernel will handle at this point. */ |
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d798c54296eb..66abe36c1213 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO( | |||
1474 | 1474 | ||
1475 | bdev = xfs_find_bdev_for_inode(XFS_I(inode)); | 1475 | bdev = xfs_find_bdev_for_inode(XFS_I(inode)); |
1476 | 1476 | ||
1477 | if (rw == WRITE) { | 1477 | iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? |
1478 | iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); | 1478 | IOMAP_UNWRITTEN : IOMAP_READ); |
1479 | ret = blockdev_direct_IO_own_locking(rw, iocb, inode, | 1479 | |
1480 | bdev, iov, offset, nr_segs, | 1480 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, |
1481 | xfs_get_blocks_direct, | 1481 | offset, nr_segs, |
1482 | xfs_end_io_direct); | 1482 | xfs_get_blocks_direct, |
1483 | } else { | 1483 | xfs_end_io_direct); |
1484 | iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); | ||
1485 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | ||
1486 | bdev, iov, offset, nr_segs, | ||
1487 | xfs_get_blocks_direct, | ||
1488 | xfs_end_io_direct); | ||
1489 | } | ||
1490 | 1484 | ||
1491 | if (unlikely(ret != -EIOCBQUEUED && iocb->private)) | 1485 | if (unlikely(ret != -EIOCBQUEUED && iocb->private)) |
1492 | xfs_destroy_ioend(iocb->private); | 1486 | xfs_destroy_ioend(iocb->private); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index cdc23be4edde..7c8ff12d1995 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2263,9 +2263,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
2263 | int lock_type); | 2263 | int lock_type); |
2264 | 2264 | ||
2265 | enum { | 2265 | enum { |
2266 | DIO_LOCKING = 1, /* need locking between buffered and direct access */ | 2266 | /* need locking between buffered and direct access */ |
2267 | DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ | 2267 | DIO_LOCKING = 0x01, |
2268 | DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ | 2268 | |
2269 | /* filesystem does not support filling holes */ | ||
2270 | DIO_SKIP_HOLES = 0x02, | ||
2269 | }; | 2271 | }; |
2270 | 2272 | ||
2271 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 2273 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
@@ -2274,7 +2276,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | |||
2274 | dio_iodone_t end_io) | 2276 | dio_iodone_t end_io) |
2275 | { | 2277 | { |
2276 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2278 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2277 | nr_segs, get_block, end_io, DIO_LOCKING); | 2279 | nr_segs, get_block, end_io, |
2280 | DIO_LOCKING | DIO_SKIP_HOLES); | ||
2278 | } | 2281 | } |
2279 | 2282 | ||
2280 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | 2283 | static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, |
@@ -2283,16 +2286,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, | |||
2283 | dio_iodone_t end_io) | 2286 | dio_iodone_t end_io) |
2284 | { | 2287 | { |
2285 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | 2288 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, |
2286 | nr_segs, get_block, end_io, DIO_NO_LOCKING); | 2289 | nr_segs, get_block, end_io, 0); |
2287 | } | ||
2288 | |||
2289 | static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, | ||
2290 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | ||
2291 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | ||
2292 | dio_iodone_t end_io) | ||
2293 | { | ||
2294 | return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, | ||
2295 | nr_segs, get_block, end_io, DIO_OWN_LOCKING); | ||
2296 | } | 2290 | } |
2297 | #endif | 2291 | #endif |
2298 | 2292 | ||