aboutsummaryrefslogtreecommitdiffstats
path: root/fs/direct-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- fs/direct-io.c 129
1 files changed, 52 insertions, 77 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b912270942fa..7dde0df8e8b6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65 struct dio { 58 struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 233 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 234 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 235 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 236
237 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 238 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 239 up_read_non_owner(&dio->inode->i_alloc_sem);
246 240
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 509 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 510 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 511
512 /*
513 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
514 * forbid block creations: only overwrites are permitted.
515 * We will return early to the caller once we see an
516 * unmapped buffer head returned, and the caller will fall
517 * back to buffered I/O.
518 *
519 * Otherwise the decision is left to the get_blocks method,
520 * which may decide to handle it or also return an unmapped
521 * buffer head.
522 */
518 create = dio->rw & WRITE; 523 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 524 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 525 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 526 dio->blkbits))
522 create = 0; 527 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 528 }
526 529
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 530 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 531 map_bh, create);
535 } 532 }
@@ -1039,7 +1036,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1039 * we can let i_mutex go now that its achieved its purpose 1036 * we can let i_mutex go now that its achieved its purpose
1040 * of protecting us from looking up uninitialized blocks. 1037 * of protecting us from looking up uninitialized blocks.
1041 */ 1038 */
1042 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1039 if (rw == READ && (dio->flags & DIO_LOCKING))
1043 mutex_unlock(&dio->inode->i_mutex); 1040 mutex_unlock(&dio->inode->i_mutex);
1044 1041
1045 /* 1042 /*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1083
1087 /* 1084 /*
1088 * This is a library function for use by filesystem drivers. 1085 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1086 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1087 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1088 * - if the flags value contains DIO_LOCKING we use a fancy locking
1089 * scheme for dumb filesystems.
1090 * For writes this function is called under i_mutex and returns with
1091 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1092 * taken and dropped again before returning.
1093 * For reads and writes i_alloc_sem is taken in shared mode and released
1094 * on I/O completion (which may happen asynchronously after returning to
1095 * the caller).
1093 * 1096 *
1094 * DIO_LOCKING (simple locking for regular files) 1097 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1098 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1099 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1100 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1101 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1102 */
1108 ssize_t 1103 ssize_t
1109 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1104 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1105 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1106 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1107 int flags)
1113 { 1108 {
1114 int seg; 1109 int seg;
1115 size_t size; 1110 size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1115 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1116 loff_t end = offset;
1122 struct dio *dio; 1117 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1118
1126 if (rw & WRITE) 1119 if (rw & WRITE)
1127 rw = WRITE_ODIRECT_PLUG; 1120 rw = WRITE_ODIRECT_PLUG;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1156 if (!dio) 1149 if (!dio)
1157 goto out; 1150 goto out;
1158 1151
1159 /* 1152 dio->flags = flags;
1160 * For block device access DIO_NO_LOCKING is used, 1153 if (dio->flags & DIO_LOCKING) {
1161 * neither readers nor writers do any locking at all
1162 * For regular files using DIO_LOCKING,
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */
1168 dio->lock_type = dio_lock_type;
1169 if (dio_lock_type != DIO_NO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1154 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1155 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1156 struct address_space *mapping =
1157 iocb->ki_filp->f_mapping;
1173 1158
1174 mapping = iocb->ki_filp->f_mapping; 1159 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1160 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1161
1180 retval = filemap_write_and_wait_range(mapping, offset, 1162 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1163 end - 1);
1182 if (retval) { 1164 if (retval) {
1165 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1166 kfree(dio);
1184 goto out; 1167 goto out;
1185 } 1168 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1169 }
1192 1170
1193 if (dio_lock_type == DIO_LOCKING) 1171 /*
1194 /* lockdep: not the owner will release it */ 1172 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1173 * different thread.
1174 */
1175 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1176 }
1197 1177
1198 /* 1178 /*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1190 /*
1211 * In case of error extending write may have instantiated a few 1191 * In case of error extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1192 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1193 *
1214 * it's own meaner. 1194 * NOTE: filesystems with their own locking have to handle this
1195 * on their own.
1215 */ 1196 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1197 if (dio->flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1198 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1199 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1200 if (end > isize )
1220 vmtruncate(inode, isize); 1201 vmtruncate(inode, isize);
1202 }
1221 } 1203 }
1222 1204
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226 out: 1205 out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1206 return retval;
1232 } 1207 }
1233 EXPORT_SYMBOL(__blockdev_direct_IO); 1208 EXPORT_SYMBOL(__blockdev_direct_IO);