diff options
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- | fs/direct-io.c | 165 |
1 files changed, 76 insertions, 89 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index b912270942fa..4012885d027f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -53,13 +53,6 @@ | |||
53 | * | 53 | * |
54 | * If blkfactor is zero then the user's request was aligned to the filesystem's | 54 | * If blkfactor is zero then the user's request was aligned to the filesystem's |
55 | * blocksize. | 55 | * blocksize. |
56 | * | ||
57 | * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. | ||
58 | * This determines whether we need to do the fancy locking which prevents | ||
59 | * direct-IO from being able to read uninitialised disk blocks. If its zero | ||
60 | * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is | ||
61 | * not held for the entire direct write (taken briefly, initially, during a | ||
62 | * direct read though, but its never held for the duration of a direct-IO). | ||
63 | */ | 56 | */ |
64 | 57 | ||
65 | struct dio { | 58 | struct dio { |
@@ -68,7 +61,7 @@ struct dio { | |||
68 | struct inode *inode; | 61 | struct inode *inode; |
69 | int rw; | 62 | int rw; |
70 | loff_t i_size; /* i_size when submitted */ | 63 | loff_t i_size; /* i_size when submitted */ |
71 | int lock_type; /* doesn't change */ | 64 | int flags; /* doesn't change */ |
72 | unsigned blkbits; /* doesn't change */ | 65 | unsigned blkbits; /* doesn't change */ |
73 | unsigned blkfactor; /* When we're using an alignment which | 66 | unsigned blkfactor; /* When we're using an alignment which |
74 | is finer than the filesystem's soft | 67 | is finer than the filesystem's soft |
@@ -104,6 +97,18 @@ struct dio { | |||
104 | unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ | 97 | unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ |
105 | sector_t cur_page_block; /* Where it starts */ | 98 | sector_t cur_page_block; /* Where it starts */ |
106 | 99 | ||
100 | /* BIO completion state */ | ||
101 | spinlock_t bio_lock; /* protects BIO fields below */ | ||
102 | unsigned long refcount; /* direct_io_worker() and bios */ | ||
103 | struct bio *bio_list; /* singly linked via bi_private */ | ||
104 | struct task_struct *waiter; /* waiting task (NULL if none) */ | ||
105 | |||
106 | /* AIO related stuff */ | ||
107 | struct kiocb *iocb; /* kiocb */ | ||
108 | int is_async; /* is IO async ? */ | ||
109 | int io_error; /* IO error in completion path */ | ||
110 | ssize_t result; /* IO result */ | ||
111 | |||
107 | /* | 112 | /* |
108 | * Page fetching state. These variables belong to dio_refill_pages(). | 113 | * Page fetching state. These variables belong to dio_refill_pages(). |
109 | */ | 114 | */ |
@@ -115,22 +120,16 @@ struct dio { | |||
115 | * Page queue. These variables belong to dio_refill_pages() and | 120 | * Page queue. These variables belong to dio_refill_pages() and |
116 | * dio_get_page(). | 121 | * dio_get_page(). |
117 | */ | 122 | */ |
118 | struct page *pages[DIO_PAGES]; /* page buffer */ | ||
119 | unsigned head; /* next page to process */ | 123 | unsigned head; /* next page to process */ |
120 | unsigned tail; /* last valid page + 1 */ | 124 | unsigned tail; /* last valid page + 1 */ |
121 | int page_errors; /* errno from get_user_pages() */ | 125 | int page_errors; /* errno from get_user_pages() */ |
122 | 126 | ||
123 | /* BIO completion state */ | 127 | /* |
124 | spinlock_t bio_lock; /* protects BIO fields below */ | 128 | * pages[] (and any fields placed after it) are not zeroed out at |
125 | unsigned long refcount; /* direct_io_worker() and bios */ | 129 | * allocation time. Don't add new fields after pages[] unless you |
126 | struct bio *bio_list; /* singly linked via bi_private */ | 130 | * wish that they not be zeroed. |
127 | struct task_struct *waiter; /* waiting task (NULL if none) */ | 131 | */ |
128 | 132 | struct page *pages[DIO_PAGES]; /* page buffer */ | |
129 | /* AIO related stuff */ | ||
130 | struct kiocb *iocb; /* kiocb */ | ||
131 | int is_async; /* is IO async ? */ | ||
132 | int io_error; /* IO error in completion path */ | ||
133 | ssize_t result; /* IO result */ | ||
134 | }; | 133 | }; |
135 | 134 | ||
136 | /* | 135 | /* |
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) | |||
240 | if (dio->end_io && dio->result) | 239 | if (dio->end_io && dio->result) |
241 | dio->end_io(dio->iocb, offset, transferred, | 240 | dio->end_io(dio->iocb, offset, transferred, |
242 | dio->map_bh.b_private); | 241 | dio->map_bh.b_private); |
243 | if (dio->lock_type == DIO_LOCKING) | 242 | |
243 | if (dio->flags & DIO_LOCKING) | ||
244 | /* lockdep: non-owner release */ | 244 | /* lockdep: non-owner release */ |
245 | up_read_non_owner(&dio->inode->i_alloc_sem); | 245 | up_read_non_owner(&dio->inode->i_alloc_sem); |
246 | 246 | ||
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio) | |||
515 | map_bh->b_state = 0; | 515 | map_bh->b_state = 0; |
516 | map_bh->b_size = fs_count << dio->inode->i_blkbits; | 516 | map_bh->b_size = fs_count << dio->inode->i_blkbits; |
517 | 517 | ||
518 | /* | ||
519 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we | ||
520 | * forbid block creations: only overwrites are permitted. | ||
521 | * We will return early to the caller once we see an | ||
522 | * unmapped buffer head returned, and the caller will fall | ||
523 | * back to buffered I/O. | ||
524 | * | ||
525 | * Otherwise the decision is left to the get_blocks method, | ||
526 | * which may decide to handle it or also return an unmapped | ||
527 | * buffer head. | ||
528 | */ | ||
518 | create = dio->rw & WRITE; | 529 | create = dio->rw & WRITE; |
519 | if (dio->lock_type == DIO_LOCKING) { | 530 | if (dio->flags & DIO_SKIP_HOLES) { |
520 | if (dio->block_in_file < (i_size_read(dio->inode) >> | 531 | if (dio->block_in_file < (i_size_read(dio->inode) >> |
521 | dio->blkbits)) | 532 | dio->blkbits)) |
522 | create = 0; | 533 | create = 0; |
523 | } else if (dio->lock_type == DIO_NO_LOCKING) { | ||
524 | create = 0; | ||
525 | } | 534 | } |
526 | 535 | ||
527 | /* | ||
528 | * For writes inside i_size we forbid block creations: only | ||
529 | * overwrites are permitted. We fall back to buffered writes | ||
530 | * at a higher level for inside-i_size block-instantiating | ||
531 | * writes. | ||
532 | */ | ||
533 | ret = (*dio->get_block)(dio->inode, fs_startblk, | 536 | ret = (*dio->get_block)(dio->inode, fs_startblk, |
534 | map_bh, create); | 537 | map_bh, create); |
535 | } | 538 | } |
@@ -1039,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1039 | * we can let i_mutex go now that it's achieved its purpose | 1042 | * we can let i_mutex go now that it's achieved its purpose |
1040 | * of protecting us from looking up uninitialized blocks. | 1043 | * of protecting us from looking up uninitialized blocks. |
1041 | */ | 1044 | */ |
1042 | if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) | 1045 | if (rw == READ && (dio->flags & DIO_LOCKING)) |
1043 | mutex_unlock(&dio->inode->i_mutex); | 1046 | mutex_unlock(&dio->inode->i_mutex); |
1044 | 1047 | ||
1045 | /* | 1048 | /* |
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1086 | 1089 | ||
1087 | /* | 1090 | /* |
1088 | * This is a library function for use by filesystem drivers. | 1091 | * This is a library function for use by filesystem drivers. |
1089 | * The locking rules are governed by the dio_lock_type parameter. | ||
1090 | * | 1092 | * |
1091 | * DIO_NO_LOCKING (no locking, for raw block device access) | 1093 | * The locking rules are governed by the flags parameter: |
1092 | * For writes, i_mutex is not held on entry; it is never taken. | 1094 | * - if the flags value contains DIO_LOCKING we use a fancy locking |
1095 | * scheme for dumb filesystems. | ||
1096 | * For writes this function is called under i_mutex and returns with | ||
1097 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | ||
1098 | * taken and dropped again before returning. | ||
1099 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1100 | * on I/O completion (which may happen asynchronously after returning to | ||
1101 | * the caller). | ||
1093 | * | 1102 | * |
1094 | * DIO_LOCKING (simple locking for regular files) | 1103 | * - if the flags value does NOT contain DIO_LOCKING we don't use any |
1095 | * For writes we are called under i_mutex and return with i_mutex held, even | 1104 | * internal locking but rather rely on the filesystem to synchronize |
1096 | * though it is internally dropped. | 1105 | * direct I/O reads/writes versus each other and truncate. |
1097 | * For reads, i_mutex is not held on entry, but it is taken and dropped before | 1106 | * For reads and writes both i_mutex and i_alloc_sem are not held on |
1098 | * returning. | 1107 | * entry and are never taken. |
1099 | * | ||
1100 | * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of | ||
1101 | * uninitialised data, allowing parallel direct readers and writers) | ||
1102 | * For writes we are called without i_mutex, return without it, never touch it. | ||
1103 | * For reads we are called under i_mutex and return with i_mutex held, even | ||
1104 | * though it may be internally dropped. | ||
1105 | * | ||
1106 | * Additional i_alloc_sem locking requirements described inline below. | ||
1107 | */ | 1108 | */ |
1108 | ssize_t | 1109 | ssize_t |
1109 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1110 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
1110 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1111 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1111 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1112 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1112 | int dio_lock_type) | 1113 | int flags) |
1113 | { | 1114 | { |
1114 | int seg; | 1115 | int seg; |
1115 | size_t size; | 1116 | size_t size; |
@@ -1120,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1120 | ssize_t retval = -EINVAL; | 1121 | ssize_t retval = -EINVAL; |
1121 | loff_t end = offset; | 1122 | loff_t end = offset; |
1122 | struct dio *dio; | 1123 | struct dio *dio; |
1123 | int release_i_mutex = 0; | ||
1124 | int acquire_i_mutex = 0; | ||
1125 | 1124 | ||
1126 | if (rw & WRITE) | 1125 | if (rw & WRITE) |
1127 | rw = WRITE_ODIRECT_PLUG; | 1126 | rw = WRITE_ODIRECT_PLUG; |
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1151 | } | 1150 | } |
1152 | } | 1151 | } |
1153 | 1152 | ||
1154 | dio = kzalloc(sizeof(*dio), GFP_KERNEL); | 1153 | dio = kmalloc(sizeof(*dio), GFP_KERNEL); |
1155 | retval = -ENOMEM; | 1154 | retval = -ENOMEM; |
1156 | if (!dio) | 1155 | if (!dio) |
1157 | goto out; | 1156 | goto out; |
1158 | |||
1159 | /* | 1157 | /* |
1160 | * For block device access DIO_NO_LOCKING is used, | 1158 | * Believe it or not, zeroing out the page array caused a .5% |
1161 | * neither readers nor writers do any locking at all | 1159 | * performance regression in a database benchmark. So, we take |
1162 | * For regular files using DIO_LOCKING, | 1160 | * care to only zero out what's needed. |
1163 | * readers need to grab i_mutex and i_alloc_sem | ||
1164 | * writers need to grab i_alloc_sem only (i_mutex is already held) | ||
1165 | * For regular files using DIO_OWN_LOCKING, | ||
1166 | * neither readers nor writers take any locks here | ||
1167 | */ | 1161 | */ |
1168 | dio->lock_type = dio_lock_type; | 1162 | memset(dio, 0, offsetof(struct dio, pages)); |
1169 | if (dio_lock_type != DIO_NO_LOCKING) { | 1163 | |
1164 | dio->flags = flags; | ||
1165 | if (dio->flags & DIO_LOCKING) { | ||
1170 | /* watch out for a 0 len io from a tricksy fs */ | 1166 | /* watch out for a 0 len io from a tricksy fs */ |
1171 | if (rw == READ && end > offset) { | 1167 | if (rw == READ && end > offset) { |
1172 | struct address_space *mapping; | 1168 | struct address_space *mapping = |
1169 | iocb->ki_filp->f_mapping; | ||
1173 | 1170 | ||
1174 | mapping = iocb->ki_filp->f_mapping; | 1171 | /* will be released by direct_io_worker */ |
1175 | if (dio_lock_type != DIO_OWN_LOCKING) { | 1172 | mutex_lock(&inode->i_mutex); |
1176 | mutex_lock(&inode->i_mutex); | ||
1177 | release_i_mutex = 1; | ||
1178 | } | ||
1179 | 1173 | ||
1180 | retval = filemap_write_and_wait_range(mapping, offset, | 1174 | retval = filemap_write_and_wait_range(mapping, offset, |
1181 | end - 1); | 1175 | end - 1); |
1182 | if (retval) { | 1176 | if (retval) { |
1177 | mutex_unlock(&inode->i_mutex); | ||
1183 | kfree(dio); | 1178 | kfree(dio); |
1184 | goto out; | 1179 | goto out; |
1185 | } | 1180 | } |
1186 | |||
1187 | if (dio_lock_type == DIO_OWN_LOCKING) { | ||
1188 | mutex_unlock(&inode->i_mutex); | ||
1189 | acquire_i_mutex = 1; | ||
1190 | } | ||
1191 | } | 1181 | } |
1192 | 1182 | ||
1193 | if (dio_lock_type == DIO_LOCKING) | 1183 | /* |
1194 | /* lockdep: not the owner will release it */ | 1184 | * Will be released at I/O completion, possibly in a |
1195 | down_read_non_owner(&inode->i_alloc_sem); | 1185 | * different thread. |
1186 | */ | ||
1187 | down_read_non_owner(&inode->i_alloc_sem); | ||
1196 | } | 1188 | } |
1197 | 1189 | ||
1198 | /* | 1190 | /* |
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1210 | /* | 1202 | /* |
1211 | * In case of error extending write may have instantiated a few | 1203 | * In case of error extending write may have instantiated a few |
1212 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | 1204 | * blocks outside i_size. Trim these off again for DIO_LOCKING. |
1213 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by | 1205 | * |
1214 | * it's own meaner. | 1206 | * NOTE: filesystems with their own locking have to handle this |
1207 | * on their own. | ||
1215 | */ | 1208 | */ |
1216 | if (unlikely(retval < 0 && (rw & WRITE))) { | 1209 | if (dio->flags & DIO_LOCKING) { |
1217 | loff_t isize = i_size_read(inode); | 1210 | if (unlikely((rw & WRITE) && retval < 0)) { |
1218 | 1211 | loff_t isize = i_size_read(inode); | |
1219 | if (end > isize && dio_lock_type == DIO_LOCKING) | 1212 | if (end > isize) |
1220 | vmtruncate(inode, isize); | 1213 | vmtruncate(inode, isize); |
1214 | } | ||
1221 | } | 1215 | } |
1222 | 1216 | ||
1223 | if (rw == READ && dio_lock_type == DIO_LOCKING) | ||
1224 | release_i_mutex = 0; | ||
1225 | |||
1226 | out: | 1217 | out: |
1227 | if (release_i_mutex) | ||
1228 | mutex_unlock(&inode->i_mutex); | ||
1229 | else if (acquire_i_mutex) | ||
1230 | mutex_lock(&inode->i_mutex); | ||
1231 | return retval; | 1218 | return retval; |
1232 | } | 1219 | } |
1233 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1220 | EXPORT_SYMBOL(__blockdev_direct_IO); |