aboutsummaryrefslogtreecommitdiffstats
path: root/fs/direct-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- fs/direct-io.c 129
1 files changed, 52 insertions, 77 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f34bb9b1ecb..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If it is zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but it's never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
246 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
247 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
248 dio->map_bh.b_private); 241 dio->map_bh.b_private);
249 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
250 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
251 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
252 246
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
521 map_bh->b_state = 0; 515 map_bh->b_state = 0;
522 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
523 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
524 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
525 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
526 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
527 dio->blkbits)) 532 dio->blkbits))
528 create = 0; 533 create = 0;
529 } else if (dio->lock_type == DIO_NO_LOCKING) {
530 create = 0;
531 } 534 }
532 535
533 /*
534 * For writes inside i_size we forbid block creations: only
535 * overwrites are permitted. We fall back to buffered writes
536 * at a higher level for inside-i_size block-instantiating
537 * writes.
538 */
539 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
540 map_bh, create); 537 map_bh, create);
541 } 538 }
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1045 * we can let i_mutex go now that it's achieved its purpose 1042 * we can let i_mutex go now that it's achieved its purpose
1046 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1047 */ 1044 */
1048 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1049 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1050 1047
1051 /* 1048 /*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1092 1089
1093/* 1090/*
1094 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1095 * The locking rules are governed by the dio_lock_type parameter.
1096 * 1092 *
1097 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1098 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1099 * 1102 *
1100 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1101 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1102 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1103 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1104 * returning. 1107 * entry and are never taken.
1105 *
1106 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1107 * uninitialised data, allowing parallel direct readers and writers)
1108 * For writes we are called without i_mutex, return without it, never touch it.
1109 * For reads we are called under i_mutex and return with i_mutex held, even
1110 * though it may be internally dropped.
1111 *
1112 * Additional i_alloc_sem locking requirements described inline below.
1113 */ 1108 */
1114ssize_t 1109ssize_t
1115__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1116 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1117 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1118 int dio_lock_type) 1113 int flags)
1119{ 1114{
1120 int seg; 1115 int seg;
1121 size_t size; 1116 size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1127 loff_t end = offset; 1122 loff_t end = offset;
1128 struct dio *dio; 1123 struct dio *dio;
1129 int release_i_mutex = 0;
1130 int acquire_i_mutex = 0;
1131 1124
1132 if (rw & WRITE) 1125 if (rw & WRITE)
1133 rw = WRITE_ODIRECT_PLUG; 1126 rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1168 */ 1161 */
1169 memset(dio, 0, offsetof(struct dio, pages)); 1162 memset(dio, 0, offsetof(struct dio, pages));
1170 1163
1171 /* 1164 dio->flags = flags;
1172 * For block device access DIO_NO_LOCKING is used, 1165 if (dio->flags & DIO_LOCKING) {
1173 * neither readers nor writers do any locking at all
1174 * For regular files using DIO_LOCKING,
1175 * readers need to grab i_mutex and i_alloc_sem
1176 * writers need to grab i_alloc_sem only (i_mutex is already held)
1177 * For regular files using DIO_OWN_LOCKING,
1178 * neither readers nor writers take any locks here
1179 */
1180 dio->lock_type = dio_lock_type;
1181 if (dio_lock_type != DIO_NO_LOCKING) {
1182 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1183 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1184 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1185 1170
1186 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1187 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1188 mutex_lock(&inode->i_mutex);
1189 release_i_mutex = 1;
1190 }
1191 1173
1192 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1193 end - 1); 1175 end - 1);
1194 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1195 kfree(dio); 1178 kfree(dio);
1196 goto out; 1179 goto out;
1197 } 1180 }
1198
1199 if (dio_lock_type == DIO_OWN_LOCKING) {
1200 mutex_unlock(&inode->i_mutex);
1201 acquire_i_mutex = 1;
1202 }
1203 } 1181 }
1204 1182
1205 if (dio_lock_type == DIO_LOCKING) 1183 /*
1206 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1207 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1208 } 1188 }
1209 1189
1210 /* 1190 /*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1222 /* 1202 /*
1223 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1224 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1225 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1226 * their own means. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1227 */ 1208 */
1228 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (dio->flags & DIO_LOCKING) {
1229 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1230 1211 loff_t isize = i_size_read(inode);
1231 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1232 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1233 } 1215 }
1234 1216
1235 if (rw == READ && dio_lock_type == DIO_LOCKING)
1236 release_i_mutex = 0;
1237
1238out: 1217out:
1239 if (release_i_mutex)
1240 mutex_unlock(&inode->i_mutex);
1241 else if (acquire_i_mutex)
1242 mutex_lock(&inode->i_mutex);
1243 return retval; 1218 return retval;
1244} 1219}
1245EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);