diff options
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- | fs/direct-io.c | 129 |
1 files changed, 52 insertions, 77 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 9f34bb9b1ecb..4012885d027f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -53,13 +53,6 @@ | |||
53 | * | 53 | * |
54 | * If blkfactor is zero then the user's request was aligned to the filesystem's | 54 | * If blkfactor is zero then the user's request was aligned to the filesystem's |
55 | * blocksize. | 55 | * blocksize. |
56 | * | ||
57 | * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. | ||
58 | * This determines whether we need to do the fancy locking which prevents | ||
59 | * direct-IO from being able to read uninitialised disk blocks. If its zero | ||
60 | * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is | ||
61 | * not held for the entire direct write (taken briefly, initially, during a | ||
62 | * direct read though, but its never held for the duration of a direct-IO). | ||
63 | */ | 56 | */ |
64 | 57 | ||
65 | struct dio { | 58 | struct dio { |
@@ -68,7 +61,7 @@ struct dio { | |||
68 | struct inode *inode; | 61 | struct inode *inode; |
69 | int rw; | 62 | int rw; |
70 | loff_t i_size; /* i_size when submitted */ | 63 | loff_t i_size; /* i_size when submitted */ |
71 | int lock_type; /* doesn't change */ | 64 | int flags; /* doesn't change */ |
72 | unsigned blkbits; /* doesn't change */ | 65 | unsigned blkbits; /* doesn't change */ |
73 | unsigned blkfactor; /* When we're using an alignment which | 66 | unsigned blkfactor; /* When we're using an alignment which |
74 | is finer than the filesystem's soft | 67 | is finer than the filesystem's soft |
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) | |||
246 | if (dio->end_io && dio->result) | 239 | if (dio->end_io && dio->result) |
247 | dio->end_io(dio->iocb, offset, transferred, | 240 | dio->end_io(dio->iocb, offset, transferred, |
248 | dio->map_bh.b_private); | 241 | dio->map_bh.b_private); |
249 | if (dio->lock_type == DIO_LOCKING) | 242 | |
243 | if (dio->flags & DIO_LOCKING) | ||
250 | /* lockdep: non-owner release */ | 244 | /* lockdep: non-owner release */ |
251 | up_read_non_owner(&dio->inode->i_alloc_sem); | 245 | up_read_non_owner(&dio->inode->i_alloc_sem); |
252 | 246 | ||
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio) | |||
521 | map_bh->b_state = 0; | 515 | map_bh->b_state = 0; |
522 | map_bh->b_size = fs_count << dio->inode->i_blkbits; | 516 | map_bh->b_size = fs_count << dio->inode->i_blkbits; |
523 | 517 | ||
518 | /* | ||
519 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we | ||
520 | * forbid block creations: only overwrites are permitted. | ||
521 | * We will return early to the caller once we see an | ||
522 | * unmapped buffer head returned, and the caller will fall | ||
523 | * back to buffered I/O. | ||
524 | * | ||
525 | * Otherwise the decision is left to the get_blocks method, | ||
526 | * which may decide to handle it or also return an unmapped | ||
527 | * buffer head. | ||
528 | */ | ||
524 | create = dio->rw & WRITE; | 529 | create = dio->rw & WRITE; |
525 | if (dio->lock_type == DIO_LOCKING) { | 530 | if (dio->flags & DIO_SKIP_HOLES) { |
526 | if (dio->block_in_file < (i_size_read(dio->inode) >> | 531 | if (dio->block_in_file < (i_size_read(dio->inode) >> |
527 | dio->blkbits)) | 532 | dio->blkbits)) |
528 | create = 0; | 533 | create = 0; |
529 | } else if (dio->lock_type == DIO_NO_LOCKING) { | ||
530 | create = 0; | ||
531 | } | 534 | } |
532 | 535 | ||
533 | /* | ||
534 | * For writes inside i_size we forbid block creations: only | ||
535 | * overwrites are permitted. We fall back to buffered writes | ||
536 | * at a higher level for inside-i_size block-instantiating | ||
537 | * writes. | ||
538 | */ | ||
539 | ret = (*dio->get_block)(dio->inode, fs_startblk, | 536 | ret = (*dio->get_block)(dio->inode, fs_startblk, |
540 | map_bh, create); | 537 | map_bh, create); |
541 | } | 538 | } |
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1045 | * we can let i_mutex go now that it's achieved its purpose | 1042 | * we can let i_mutex go now that it's achieved its purpose |
1046 | * of protecting us from looking up uninitialized blocks. | 1043 | * of protecting us from looking up uninitialized blocks. |
1047 | */ | 1044 | */ |
1048 | if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) | 1045 | if (rw == READ && (dio->flags & DIO_LOCKING)) |
1049 | mutex_unlock(&dio->inode->i_mutex); | 1046 | mutex_unlock(&dio->inode->i_mutex); |
1050 | 1047 | ||
1051 | /* | 1048 | /* |
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1092 | 1089 | ||
1093 | /* | 1090 | /* |
1094 | * This is a library function for use by filesystem drivers. | 1091 | * This is a library function for use by filesystem drivers. |
1095 | * The locking rules are governed by the dio_lock_type parameter. | ||
1096 | * | 1092 | * |
1097 | * DIO_NO_LOCKING (no locking, for raw block device access) | 1093 | * The locking rules are governed by the flags parameter: |
1098 | * For writes, i_mutex is not held on entry; it is never taken. | 1094 | * - if the flags value contains DIO_LOCKING we use a fancy locking |
1095 | * scheme for dumb filesystems. | ||
1096 | * For writes this function is called under i_mutex and returns with | ||
1097 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | ||
1098 | * taken and dropped again before returning. | ||
1099 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1100 | * on I/O completion (which may happen asynchronously after returning to | ||
1101 | * the caller). | ||
1099 | * | 1102 | * |
1100 | * DIO_LOCKING (simple locking for regular files) | 1103 | * - if the flags value does NOT contain DIO_LOCKING we don't use any |
1101 | * For writes we are called under i_mutex and return with i_mutex held, even | 1104 | * internal locking but rather rely on the filesystem to synchronize |
1102 | * though it is internally dropped. | 1105 | * direct I/O reads/writes versus each other and truncate. |
1103 | * For reads, i_mutex is not held on entry, but it is taken and dropped before | 1106 | * For reads and writes both i_mutex and i_alloc_sem are not held on |
1104 | * returning. | 1107 | * entry and are never taken. |
1105 | * | ||
1106 | * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of | ||
1107 | * uninitialised data, allowing parallel direct readers and writers) | ||
1108 | * For writes we are called without i_mutex, return without it, never touch it. | ||
1109 | * For reads we are called under i_mutex and return with i_mutex held, even | ||
1110 | * though it may be internally dropped. | ||
1111 | * | ||
1112 | * Additional i_alloc_sem locking requirements described inline below. | ||
1113 | */ | 1108 | */ |
1114 | ssize_t | 1109 | ssize_t |
1115 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1110 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
1116 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1111 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1117 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1112 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1118 | int dio_lock_type) | 1113 | int flags) |
1119 | { | 1114 | { |
1120 | int seg; | 1115 | int seg; |
1121 | size_t size; | 1116 | size_t size; |
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1126 | ssize_t retval = -EINVAL; | 1121 | ssize_t retval = -EINVAL; |
1127 | loff_t end = offset; | 1122 | loff_t end = offset; |
1128 | struct dio *dio; | 1123 | struct dio *dio; |
1129 | int release_i_mutex = 0; | ||
1130 | int acquire_i_mutex = 0; | ||
1131 | 1124 | ||
1132 | if (rw & WRITE) | 1125 | if (rw & WRITE) |
1133 | rw = WRITE_ODIRECT_PLUG; | 1126 | rw = WRITE_ODIRECT_PLUG; |
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1168 | */ | 1161 | */ |
1169 | memset(dio, 0, offsetof(struct dio, pages)); | 1162 | memset(dio, 0, offsetof(struct dio, pages)); |
1170 | 1163 | ||
1171 | /* | 1164 | dio->flags = flags; |
1172 | * For block device access DIO_NO_LOCKING is used, | 1165 | if (dio->flags & DIO_LOCKING) { |
1173 | * neither readers nor writers do any locking at all | ||
1174 | * For regular files using DIO_LOCKING, | ||
1175 | * readers need to grab i_mutex and i_alloc_sem | ||
1176 | * writers need to grab i_alloc_sem only (i_mutex is already held) | ||
1177 | * For regular files using DIO_OWN_LOCKING, | ||
1178 | * neither readers nor writers take any locks here | ||
1179 | */ | ||
1180 | dio->lock_type = dio_lock_type; | ||
1181 | if (dio_lock_type != DIO_NO_LOCKING) { | ||
1182 | /* watch out for a 0 len io from a tricksy fs */ | 1166 | /* watch out for a 0 len io from a tricksy fs */ |
1183 | if (rw == READ && end > offset) { | 1167 | if (rw == READ && end > offset) { |
1184 | struct address_space *mapping; | 1168 | struct address_space *mapping = |
1169 | iocb->ki_filp->f_mapping; | ||
1185 | 1170 | ||
1186 | mapping = iocb->ki_filp->f_mapping; | 1171 | /* will be released by direct_io_worker */ |
1187 | if (dio_lock_type != DIO_OWN_LOCKING) { | 1172 | mutex_lock(&inode->i_mutex); |
1188 | mutex_lock(&inode->i_mutex); | ||
1189 | release_i_mutex = 1; | ||
1190 | } | ||
1191 | 1173 | ||
1192 | retval = filemap_write_and_wait_range(mapping, offset, | 1174 | retval = filemap_write_and_wait_range(mapping, offset, |
1193 | end - 1); | 1175 | end - 1); |
1194 | if (retval) { | 1176 | if (retval) { |
1177 | mutex_unlock(&inode->i_mutex); | ||
1195 | kfree(dio); | 1178 | kfree(dio); |
1196 | goto out; | 1179 | goto out; |
1197 | } | 1180 | } |
1198 | |||
1199 | if (dio_lock_type == DIO_OWN_LOCKING) { | ||
1200 | mutex_unlock(&inode->i_mutex); | ||
1201 | acquire_i_mutex = 1; | ||
1202 | } | ||
1203 | } | 1181 | } |
1204 | 1182 | ||
1205 | if (dio_lock_type == DIO_LOCKING) | 1183 | /* |
1206 | /* lockdep: not the owner will release it */ | 1184 | * Will be released at I/O completion, possibly in a |
1207 | down_read_non_owner(&inode->i_alloc_sem); | 1185 | * different thread. |
1186 | */ | ||
1187 | down_read_non_owner(&inode->i_alloc_sem); | ||
1208 | } | 1188 | } |
1209 | 1189 | ||
1210 | /* | 1190 | /* |
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1222 | /* | 1202 | /* |
1223 | * In case of error extending write may have instantiated a few | 1203 | * In case of error extending write may have instantiated a few |
1224 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | 1204 | * blocks outside i_size. Trim these off again for DIO_LOCKING. |
1225 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by | 1205 | * |
1226 | * it's own meaner. | 1206 | * NOTE: filesystems with their own locking have to handle this |
1207 | * on their own. | ||
1227 | */ | 1208 | */ |
1228 | if (unlikely(retval < 0 && (rw & WRITE))) { | 1209 | if (dio->flags & DIO_LOCKING) { |
1229 | loff_t isize = i_size_read(inode); | 1210 | if (unlikely((rw & WRITE) && retval < 0)) { |
1230 | 1211 | loff_t isize = i_size_read(inode); | |
1231 | if (end > isize && dio_lock_type == DIO_LOCKING) | 1212 | if (end > isize) |
1232 | vmtruncate(inode, isize); | 1213 | vmtruncate(inode, isize); |
1214 | } | ||
1233 | } | 1215 | } |
1234 | 1216 | ||
1235 | if (rw == READ && dio_lock_type == DIO_LOCKING) | ||
1236 | release_i_mutex = 0; | ||
1237 | |||
1238 | out: | 1217 | out: |
1239 | if (release_i_mutex) | ||
1240 | mutex_unlock(&inode->i_mutex); | ||
1241 | else if (acquire_i_mutex) | ||
1242 | mutex_lock(&inode->i_mutex); | ||
1243 | return retval; | 1218 | return retval; |
1244 | } | 1219 | } |
1245 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1220 | EXPORT_SYMBOL(__blockdev_direct_IO); |