aboutsummaryrefslogtreecommitdiffstats
path: root/fs/direct-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--fs/direct-io.c165
1 files changed, 76 insertions, 89 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b912270942fa..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
53 * 53 *
54 * If blkfactor is zero then the user's request was aligned to the filesystem's 54 * If blkfactor is zero then the user's request was aligned to the filesystem's
55 * blocksize. 55 * blocksize.
56 *
57 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
58 * This determines whether we need to do the fancy locking which prevents
59 * direct-IO from being able to read uninitialised disk blocks. If its zero
60 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
61 * not held for the entire direct write (taken briefly, initially, during a
62 * direct read though, but its never held for the duration of a direct-IO).
63 */ 56 */
64 57
65struct dio { 58struct dio {
@@ -68,7 +61,7 @@ struct dio {
68 struct inode *inode; 61 struct inode *inode;
69 int rw; 62 int rw;
70 loff_t i_size; /* i_size when submitted */ 63 loff_t i_size; /* i_size when submitted */
71 int lock_type; /* doesn't change */ 64 int flags; /* doesn't change */
72 unsigned blkbits; /* doesn't change */ 65 unsigned blkbits; /* doesn't change */
73 unsigned blkfactor; /* When we're using an alignment which 66 unsigned blkfactor; /* When we're using an alignment which
74 is finer than the filesystem's soft 67 is finer than the filesystem's soft
@@ -104,6 +97,18 @@ struct dio {
104 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
105 sector_t cur_page_block; /* Where it starts */ 98 sector_t cur_page_block; /* Where it starts */
106 99
100 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */
102 unsigned long refcount; /* direct_io_worker() and bios */
103 struct bio *bio_list; /* singly linked via bi_private */
104 struct task_struct *waiter; /* waiting task (NULL if none) */
105
106 /* AIO related stuff */
107 struct kiocb *iocb; /* kiocb */
108 int is_async; /* is IO async ? */
109 int io_error; /* IO error in completion path */
110 ssize_t result; /* IO result */
111
107 /* 112 /*
108 * Page fetching state. These variables belong to dio_refill_pages(). 113 * Page fetching state. These variables belong to dio_refill_pages().
109 */ 114 */
@@ -115,22 +120,16 @@ struct dio {
115 * Page queue. These variables belong to dio_refill_pages() and 120 * Page queue. These variables belong to dio_refill_pages() and
116 * dio_get_page(). 121 * dio_get_page().
117 */ 122 */
118 struct page *pages[DIO_PAGES]; /* page buffer */
119 unsigned head; /* next page to process */ 123 unsigned head; /* next page to process */
120 unsigned tail; /* last valid page + 1 */ 124 unsigned tail; /* last valid page + 1 */
121 int page_errors; /* errno from get_user_pages() */ 125 int page_errors; /* errno from get_user_pages() */
122 126
123 /* BIO completion state */ 127 /*
124 spinlock_t bio_lock; /* protects BIO fields below */ 128 * pages[] (and any fields placed after it) are not zeroed out at
125 unsigned long refcount; /* direct_io_worker() and bios */ 129 * allocation time. Don't add new fields after pages[] unless you
126 struct bio *bio_list; /* singly linked via bi_private */ 130 * wish that they not be zeroed.
127 struct task_struct *waiter; /* waiting task (NULL if none) */ 131 */
128 132 struct page *pages[DIO_PAGES]; /* page buffer */
129 /* AIO related stuff */
130 struct kiocb *iocb; /* kiocb */
131 int is_async; /* is IO async ? */
132 int io_error; /* IO error in completion path */
133 ssize_t result; /* IO result */
134}; 133};
135 134
136/* 135/*
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
240 if (dio->end_io && dio->result) 239 if (dio->end_io && dio->result)
241 dio->end_io(dio->iocb, offset, transferred, 240 dio->end_io(dio->iocb, offset, transferred,
242 dio->map_bh.b_private); 241 dio->map_bh.b_private);
243 if (dio->lock_type == DIO_LOCKING) 242
243 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */ 244 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem); 245 up_read_non_owner(&dio->inode->i_alloc_sem);
246 246
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
515 map_bh->b_state = 0; 515 map_bh->b_state = 0;
516 map_bh->b_size = fs_count << dio->inode->i_blkbits; 516 map_bh->b_size = fs_count << dio->inode->i_blkbits;
517 517
518 /*
519 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
520 * forbid block creations: only overwrites are permitted.
521 * We will return early to the caller once we see an
522 * unmapped buffer head returned, and the caller will fall
523 * back to buffered I/O.
524 *
525 * Otherwise the decision is left to the get_blocks method,
526 * which may decide to handle it or also return an unmapped
527 * buffer head.
528 */
518 create = dio->rw & WRITE; 529 create = dio->rw & WRITE;
519 if (dio->lock_type == DIO_LOCKING) { 530 if (dio->flags & DIO_SKIP_HOLES) {
520 if (dio->block_in_file < (i_size_read(dio->inode) >> 531 if (dio->block_in_file < (i_size_read(dio->inode) >>
521 dio->blkbits)) 532 dio->blkbits))
522 create = 0; 533 create = 0;
523 } else if (dio->lock_type == DIO_NO_LOCKING) {
524 create = 0;
525 } 534 }
526 535
527 /*
528 * For writes inside i_size we forbid block creations: only
529 * overwrites are permitted. We fall back to buffered writes
530 * at a higher level for inside-i_size block-instantiating
531 * writes.
532 */
533 ret = (*dio->get_block)(dio->inode, fs_startblk, 536 ret = (*dio->get_block)(dio->inode, fs_startblk,
534 map_bh, create); 537 map_bh, create);
535 } 538 }
@@ -1039,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1039 * we can let i_mutex go now that its achieved its purpose 1042 * we can let i_mutex go now that its achieved its purpose
1040 * of protecting us from looking up uninitialized blocks. 1043 * of protecting us from looking up uninitialized blocks.
1041 */ 1044 */
1042 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1045 if (rw == READ && (dio->flags & DIO_LOCKING))
1043 mutex_unlock(&dio->inode->i_mutex); 1046 mutex_unlock(&dio->inode->i_mutex);
1044 1047
1045 /* 1048 /*
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1086 1089
1087/* 1090/*
1088 * This is a library function for use by filesystem drivers. 1091 * This is a library function for use by filesystem drivers.
1089 * The locking rules are governed by the dio_lock_type parameter.
1090 * 1092 *
1091 * DIO_NO_LOCKING (no locking, for raw block device access) 1093 * The locking rules are governed by the flags parameter:
1092 * For writes, i_mutex is not held on entry; it is never taken. 1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1093 * 1102 *
1094 * DIO_LOCKING (simple locking for regular files) 1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1095 * For writes we are called under i_mutex and return with i_mutex held, even 1104 * internal locking but rather rely on the filesystem to synchronize
1096 * though it is internally dropped. 1105 * direct I/O reads/writes versus each other and truncate.
1097 * For reads, i_mutex is not held on entry, but it is taken and dropped before 1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1098 * returning. 1107 * entry and are never taken.
1099 *
1100 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1101 * uninitialised data, allowing parallel direct readers and writers)
1102 * For writes we are called without i_mutex, return without it, never touch it.
1103 * For reads we are called under i_mutex and return with i_mutex held, even
1104 * though it may be internally dropped.
1105 *
1106 * Additional i_alloc_sem locking requirements described inline below.
1107 */ 1108 */
1108ssize_t 1109ssize_t
1109__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1110 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1111 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1111 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1112 int dio_lock_type) 1113 int flags)
1113{ 1114{
1114 int seg; 1115 int seg;
1115 size_t size; 1116 size_t size;
@@ -1120,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1120 ssize_t retval = -EINVAL; 1121 ssize_t retval = -EINVAL;
1121 loff_t end = offset; 1122 loff_t end = offset;
1122 struct dio *dio; 1123 struct dio *dio;
1123 int release_i_mutex = 0;
1124 int acquire_i_mutex = 0;
1125 1124
1126 if (rw & WRITE) 1125 if (rw & WRITE)
1127 rw = WRITE_ODIRECT_PLUG; 1126 rw = WRITE_ODIRECT_PLUG;
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1151 } 1150 }
1152 } 1151 }
1153 1152
1154 dio = kzalloc(sizeof(*dio), GFP_KERNEL); 1153 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
1155 retval = -ENOMEM; 1154 retval = -ENOMEM;
1156 if (!dio) 1155 if (!dio)
1157 goto out; 1156 goto out;
1158
1159 /* 1157 /*
1160 * For block device access DIO_NO_LOCKING is used, 1158 * Believe it or not, zeroing out the page array caused a .5%
1161 * neither readers nor writers do any locking at all 1159 * performance regression in a database benchmark. So, we take
1162 * For regular files using DIO_LOCKING, 1160 * care to only zero out what's needed.
1163 * readers need to grab i_mutex and i_alloc_sem
1164 * writers need to grab i_alloc_sem only (i_mutex is already held)
1165 * For regular files using DIO_OWN_LOCKING,
1166 * neither readers nor writers take any locks here
1167 */ 1161 */
1168 dio->lock_type = dio_lock_type; 1162 memset(dio, 0, offsetof(struct dio, pages));
1169 if (dio_lock_type != DIO_NO_LOCKING) { 1163
1164 dio->flags = flags;
1165 if (dio->flags & DIO_LOCKING) {
1170 /* watch out for a 0 len io from a tricksy fs */ 1166 /* watch out for a 0 len io from a tricksy fs */
1171 if (rw == READ && end > offset) { 1167 if (rw == READ && end > offset) {
1172 struct address_space *mapping; 1168 struct address_space *mapping =
1169 iocb->ki_filp->f_mapping;
1173 1170
1174 mapping = iocb->ki_filp->f_mapping; 1171 /* will be released by direct_io_worker */
1175 if (dio_lock_type != DIO_OWN_LOCKING) { 1172 mutex_lock(&inode->i_mutex);
1176 mutex_lock(&inode->i_mutex);
1177 release_i_mutex = 1;
1178 }
1179 1173
1180 retval = filemap_write_and_wait_range(mapping, offset, 1174 retval = filemap_write_and_wait_range(mapping, offset,
1181 end - 1); 1175 end - 1);
1182 if (retval) { 1176 if (retval) {
1177 mutex_unlock(&inode->i_mutex);
1183 kfree(dio); 1178 kfree(dio);
1184 goto out; 1179 goto out;
1185 } 1180 }
1186
1187 if (dio_lock_type == DIO_OWN_LOCKING) {
1188 mutex_unlock(&inode->i_mutex);
1189 acquire_i_mutex = 1;
1190 }
1191 } 1181 }
1192 1182
1193 if (dio_lock_type == DIO_LOCKING) 1183 /*
1194 /* lockdep: not the owner will release it */ 1184 * Will be released at I/O completion, possibly in a
1195 down_read_non_owner(&inode->i_alloc_sem); 1185 * different thread.
1186 */
1187 down_read_non_owner(&inode->i_alloc_sem);
1196 } 1188 }
1197 1189
1198 /* 1190 /*
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1210 /* 1202 /*
1211 * In case of error extending write may have instantiated a few 1203 * In case of error extending write may have instantiated a few
1212 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1213 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by 1205 *
1214 * it's own meaner. 1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1215 */ 1208 */
1216 if (unlikely(retval < 0 && (rw & WRITE))) { 1209 if (dio->flags & DIO_LOCKING) {
1217 loff_t isize = i_size_read(inode); 1210 if (unlikely((rw & WRITE) && retval < 0)) {
1218 1211 loff_t isize = i_size_read(inode);
1219 if (end > isize && dio_lock_type == DIO_LOCKING) 1212 if (end > isize)
1220 vmtruncate(inode, isize); 1213 vmtruncate(inode, isize);
1214 }
1221 } 1215 }
1222 1216
1223 if (rw == READ && dio_lock_type == DIO_LOCKING)
1224 release_i_mutex = 0;
1225
1226out: 1217out:
1227 if (release_i_mutex)
1228 mutex_unlock(&inode->i_mutex);
1229 else if (acquire_i_mutex)
1230 mutex_lock(&inode->i_mutex);
1231 return retval; 1218 return retval;
1232} 1219}
1233EXPORT_SYMBOL(__blockdev_direct_IO); 1220EXPORT_SYMBOL(__blockdev_direct_IO);