diff options
author | Christoph Hellwig <hch@infradead.org> | 2011-06-24 14:29:43 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2011-07-20 20:47:46 -0400 |
commit | bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 (patch) | |
tree | ef5341c7747f809aec7ae233f6e3ef90af39be5f /fs | |
parent | f9b5570d7fdedff32a2e78102bfb54cd1b12b289 (diff) |
fs: kill i_alloc_sem
i_alloc_sem is a rather special rw_semaphore. It's the last one that may
be released by a non-owner, and its write side is always mirrored by
real exclusion. Its intended use is to wait for all pending direct I/O
requests to finish before starting a truncate.
Replace it with a hand-grown construct:
- exclusion for truncates is already guaranteed by i_mutex, so it can
simply fall away
- the reader side is replaced by an i_dio_count member in struct inode
that counts the number of pending direct I/O requests. Truncate can't
proceed as long as it's non-zero
- when i_dio_count reaches zero we wake up a pending truncate using
wake_up_bit on a new bit (__I_DIO_WAKEUP) in i_state
- new references to i_dio_count can't appear while we are waiting for
it to read zero because the direct I/O count always needs i_mutex
(or an equivalent like XFS's i_iolock) for starting a new operation.
This scheme is much simpler, and saves the space of a spinlock_t and a
struct list_head in struct inode (typically 160 bits on a non-debug 64-bit
system).
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/attr.c | 5 | ||||
-rw-r--r-- | fs/direct-io.c | 65 | ||||
-rw-r--r-- | fs/inode.c | 3 | ||||
-rw-r--r-- | fs/ntfs/file.c | 3 | ||||
-rw-r--r-- | fs/ntfs/inode.c | 10 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 15 | ||||
-rw-r--r-- | fs/reiserfs/xattr.c | 3 |
8 files changed, 67 insertions, 44 deletions
@@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) | |||
233 | return error; | 233 | return error; |
234 | 234 | ||
235 | if (ia_valid & ATTR_SIZE) | 235 | if (ia_valid & ATTR_SIZE) |
236 | down_write(&dentry->d_inode->i_alloc_sem); | 236 | inode_dio_wait(inode); |
237 | 237 | ||
238 | if (inode->i_op->setattr) | 238 | if (inode->i_op->setattr) |
239 | error = inode->i_op->setattr(dentry, attr); | 239 | error = inode->i_op->setattr(dentry, attr); |
240 | else | 240 | else |
241 | error = simple_setattr(dentry, attr); | 241 | error = simple_setattr(dentry, attr); |
242 | 242 | ||
243 | if (ia_valid & ATTR_SIZE) | ||
244 | up_write(&dentry->d_inode->i_alloc_sem); | ||
245 | |||
246 | if (!error) | 243 | if (!error) |
247 | fsnotify_change(dentry, ia_valid); | 244 | fsnotify_change(dentry, ia_valid); |
248 | 245 | ||
diff --git a/fs/direct-io.c b/fs/direct-io.c index 98ce3ac0d94b..354cbdbc14bd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -135,6 +135,50 @@ struct dio { | |||
135 | struct page *pages[DIO_PAGES]; /* page buffer */ | 135 | struct page *pages[DIO_PAGES]; /* page buffer */ |
136 | }; | 136 | }; |
137 | 137 | ||
138 | static void __inode_dio_wait(struct inode *inode) | ||
139 | { | ||
140 | wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); | ||
141 | DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); | ||
142 | |||
143 | do { | ||
144 | prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); | ||
145 | if (atomic_read(&inode->i_dio_count)) | ||
146 | schedule(); | ||
147 | } while (atomic_read(&inode->i_dio_count)); | ||
148 | finish_wait(wq, &q.wait); | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * inode_dio_wait - wait for outstanding DIO requests to finish | ||
153 | * @inode: inode to wait for | ||
154 | * | ||
155 | * Waits for all pending direct I/O requests to finish so that we can | ||
156 | * proceed with a truncate or equivalent operation. | ||
157 | * | ||
158 | * Must be called under a lock that serializes taking new references | ||
159 | * to i_dio_count, usually by inode->i_mutex. | ||
160 | */ | ||
161 | void inode_dio_wait(struct inode *inode) | ||
162 | { | ||
163 | if (atomic_read(&inode->i_dio_count)) | ||
164 | __inode_dio_wait(inode); | ||
165 | } | ||
166 | EXPORT_SYMBOL_GPL(inode_dio_wait); | ||
167 | |||
168 | /* | ||
169 | * inode_dio_done - signal finish of a direct I/O requests | ||
170 | * @inode: inode the direct I/O happens on | ||
171 | * | ||
172 | * This is called once we've finished processing a direct I/O request, | ||
173 | * and is used to wake up callers waiting for direct I/O to be quiesced. | ||
174 | */ | ||
175 | void inode_dio_done(struct inode *inode) | ||
176 | { | ||
177 | if (atomic_dec_and_test(&inode->i_dio_count)) | ||
178 | wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(inode_dio_done); | ||
181 | |||
138 | /* | 182 | /* |
139 | * How many pages are in the queue? | 183 | * How many pages are in the queue? |
140 | */ | 184 | */ |
@@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is | |||
254 | } | 298 | } |
255 | 299 | ||
256 | if (dio->flags & DIO_LOCKING) | 300 | if (dio->flags & DIO_LOCKING) |
257 | /* lockdep: non-owner release */ | 301 | inode_dio_done(dio->inode); |
258 | up_read_non_owner(&dio->inode->i_alloc_sem); | ||
259 | |||
260 | return ret; | 302 | return ret; |
261 | } | 303 | } |
262 | 304 | ||
@@ -980,9 +1022,6 @@ out: | |||
980 | return ret; | 1022 | return ret; |
981 | } | 1023 | } |
982 | 1024 | ||
983 | /* | ||
984 | * Releases both i_mutex and i_alloc_sem | ||
985 | */ | ||
986 | static ssize_t | 1025 | static ssize_t |
987 | direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | 1026 | direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, |
988 | const struct iovec *iov, loff_t offset, unsigned long nr_segs, | 1027 | const struct iovec *iov, loff_t offset, unsigned long nr_segs, |
@@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1146 | * For writes this function is called under i_mutex and returns with | 1185 | * For writes this function is called under i_mutex and returns with |
1147 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | 1186 | * i_mutex held, for reads, i_mutex is not held on entry, but it is |
1148 | * taken and dropped again before returning. | 1187 | * taken and dropped again before returning. |
1149 | * For reads and writes i_alloc_sem is taken in shared mode and released | 1188 | * The i_dio_count counter keeps track of the number of outstanding |
1150 | * on I/O completion (which may happen asynchronously after returning to | 1189 | * direct I/O requests, and truncate waits for it to reach zero. |
1151 | * the caller). | 1190 | * New references to i_dio_count must only be grabbed with i_mutex |
1191 | * held. | ||
1152 | * | 1192 | * |
1153 | * - if the flags value does NOT contain DIO_LOCKING we don't use any | 1193 | * - if the flags value does NOT contain DIO_LOCKING we don't use any |
1154 | * internal locking but rather rely on the filesystem to synchronize | 1194 | * internal locking but rather rely on the filesystem to synchronize |
1155 | * direct I/O reads/writes versus each other and truncate. | 1195 | * direct I/O reads/writes versus each other and truncate. |
1156 | * For reads and writes both i_mutex and i_alloc_sem are not held on | ||
1157 | * entry and are never taken. | ||
1158 | */ | 1196 | */ |
1159 | ssize_t | 1197 | ssize_t |
1160 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1198 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
@@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1234 | } | 1272 | } |
1235 | 1273 | ||
1236 | /* | 1274 | /* |
1237 | * Will be released at I/O completion, possibly in a | 1275 | * Will be decremented at I/O completion time. |
1238 | * different thread. | ||
1239 | */ | 1276 | */ |
1240 | down_read_non_owner(&inode->i_alloc_sem); | 1277 | atomic_inc(&inode->i_dio_count); |
1241 | } | 1278 | } |
1242 | 1279 | ||
1243 | /* | 1280 | /* |
diff --git a/fs/inode.c b/fs/inode.c index cf81baf1898a..96c77b81167c 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
168 | mutex_init(&inode->i_mutex); | 168 | mutex_init(&inode->i_mutex); |
169 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); | 169 | lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); |
170 | 170 | ||
171 | init_rwsem(&inode->i_alloc_sem); | 171 | atomic_set(&inode->i_dio_count, 0); |
172 | lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); | ||
173 | 172 | ||
174 | mapping->a_ops = &empty_aops; | 173 | mapping->a_ops = &empty_aops; |
175 | mapping->host = inode; | 174 | mapping->host = inode; |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f4b1057abdd2..b59f5ac26bef 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, | |||
1832 | * fails again. | 1832 | * fails again. |
1833 | */ | 1833 | */ |
1834 | if (unlikely(NInoTruncateFailed(ni))) { | 1834 | if (unlikely(NInoTruncateFailed(ni))) { |
1835 | down_write(&vi->i_alloc_sem); | 1835 | inode_dio_wait(vi); |
1836 | err = ntfs_truncate(vi); | 1836 | err = ntfs_truncate(vi); |
1837 | up_write(&vi->i_alloc_sem); | ||
1838 | if (err || NInoTruncateFailed(ni)) { | 1837 | if (err || NInoTruncateFailed(ni)) { |
1839 | if (!err) | 1838 | if (!err) |
1840 | err = -EIO; | 1839 | err = -EIO; |
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index c05d6dcf77a4..1371487da955 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c | |||
@@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run " | |||
2357 | * | 2357 | * |
2358 | * Returns 0 on success or -errno on error. | 2358 | * Returns 0 on success or -errno on error. |
2359 | * | 2359 | * |
2360 | * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for | 2360 | * Called with ->i_mutex held. |
2361 | * writing. The only case in the kernel where ->i_alloc_sem is not held is | ||
2362 | * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called | ||
2363 | * with the current i_size as the offset. The analogous place in NTFS is in | ||
2364 | * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again | ||
2365 | * without holding ->i_alloc_sem. | ||
2366 | */ | 2361 | */ |
2367 | int ntfs_truncate(struct inode *vi) | 2362 | int ntfs_truncate(struct inode *vi) |
2368 | { | 2363 | { |
@@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) { | |||
2887 | * We also abort all changes of user, group, and mode as we do not implement | 2882 | * We also abort all changes of user, group, and mode as we do not implement |
2888 | * the NTFS ACLs yet. | 2883 | * the NTFS ACLs yet. |
2889 | * | 2884 | * |
2890 | * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also | 2885 | * Called with ->i_mutex held. |
2891 | * called with ->i_alloc_sem held for writing. | ||
2892 | */ | 2886 | */ |
2893 | int ntfs_setattr(struct dentry *dentry, struct iattr *attr) | 2887 | int ntfs_setattr(struct dentry *dentry, struct iattr *attr) |
2894 | { | 2888 | { |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ac97bca282d2..de1d3953599d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -551,9 +551,8 @@ bail: | |||
551 | 551 | ||
552 | /* | 552 | /* |
553 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | 553 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're |
554 | * particularly interested in the aio/dio case. Like the core uses | 554 | * particularly interested in the aio/dio case. We use the rw_lock DLM lock |
555 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | 555 | * to protect io on one node from truncation on another. |
556 | * truncation on another. | ||
557 | */ | 556 | */ |
558 | static void ocfs2_dio_end_io(struct kiocb *iocb, | 557 | static void ocfs2_dio_end_io(struct kiocb *iocb, |
559 | loff_t offset, | 558 | loff_t offset, |
@@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
569 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 568 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
570 | 569 | ||
571 | if (ocfs2_iocb_is_sem_locked(iocb)) { | 570 | if (ocfs2_iocb_is_sem_locked(iocb)) { |
572 | up_read(&inode->i_alloc_sem); | 571 | inode_dio_done(inode); |
573 | ocfs2_iocb_clear_sem_locked(iocb); | 572 | ocfs2_iocb_clear_sem_locked(iocb); |
574 | } | 573 | } |
575 | 574 | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1406c37a5722..2c3a465514a2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
2236 | ocfs2_iocb_clear_sem_locked(iocb); | 2236 | ocfs2_iocb_clear_sem_locked(iocb); |
2237 | 2237 | ||
2238 | relock: | 2238 | relock: |
2239 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 2239 | /* to match setattr's i_mutex -> rw_lock ordering */ |
2240 | if (direct_io) { | 2240 | if (direct_io) { |
2241 | down_read(&inode->i_alloc_sem); | 2241 | atomic_inc(&inode->i_dio_count); |
2242 | have_alloc_sem = 1; | 2242 | have_alloc_sem = 1; |
2243 | /* communicate with ocfs2_dio_end_io */ | 2243 | /* communicate with ocfs2_dio_end_io */ |
2244 | ocfs2_iocb_set_sem_locked(iocb); | 2244 | ocfs2_iocb_set_sem_locked(iocb); |
@@ -2290,7 +2290,7 @@ relock: | |||
2290 | */ | 2290 | */ |
2291 | if (direct_io && !can_do_direct) { | 2291 | if (direct_io && !can_do_direct) { |
2292 | ocfs2_rw_unlock(inode, rw_level); | 2292 | ocfs2_rw_unlock(inode, rw_level); |
2293 | up_read(&inode->i_alloc_sem); | 2293 | inode_dio_done(inode); |
2294 | 2294 | ||
2295 | have_alloc_sem = 0; | 2295 | have_alloc_sem = 0; |
2296 | rw_level = -1; | 2296 | rw_level = -1; |
@@ -2361,8 +2361,7 @@ out_dio: | |||
2361 | /* | 2361 | /* |
2362 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 2362 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
2363 | * function pointer which is called when o_direct io completes so that | 2363 | * function pointer which is called when o_direct io completes so that |
2364 | * it can unlock our rw lock. (it's the clustered equivalent of | 2364 | * it can unlock our rw lock. |
2365 | * i_alloc_sem; protects truncate from racing with pending ios). | ||
2366 | * Unfortunately there are error cases which call end_io and others | 2365 | * Unfortunately there are error cases which call end_io and others |
2367 | * that don't. so we don't have to unlock the rw_lock if either an | 2366 | * that don't. so we don't have to unlock the rw_lock if either an |
2368 | * async dio is going to do it in the future or an end_io after an | 2367 | * async dio is going to do it in the future or an end_io after an |
@@ -2379,7 +2378,7 @@ out: | |||
2379 | 2378 | ||
2380 | out_sems: | 2379 | out_sems: |
2381 | if (have_alloc_sem) { | 2380 | if (have_alloc_sem) { |
2382 | up_read(&inode->i_alloc_sem); | 2381 | inode_dio_done(inode); |
2383 | ocfs2_iocb_clear_sem_locked(iocb); | 2382 | ocfs2_iocb_clear_sem_locked(iocb); |
2384 | } | 2383 | } |
2385 | 2384 | ||
@@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | |||
2531 | * need locks to protect pending reads from racing with truncate. | 2530 | * need locks to protect pending reads from racing with truncate. |
2532 | */ | 2531 | */ |
2533 | if (filp->f_flags & O_DIRECT) { | 2532 | if (filp->f_flags & O_DIRECT) { |
2534 | down_read(&inode->i_alloc_sem); | ||
2535 | have_alloc_sem = 1; | 2533 | have_alloc_sem = 1; |
2534 | atomic_inc(&inode->i_dio_count); | ||
2536 | ocfs2_iocb_set_sem_locked(iocb); | 2535 | ocfs2_iocb_set_sem_locked(iocb); |
2537 | 2536 | ||
2538 | ret = ocfs2_rw_lock(inode, 0); | 2537 | ret = ocfs2_rw_lock(inode, 0); |
@@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | |||
2575 | 2574 | ||
2576 | bail: | 2575 | bail: |
2577 | if (have_alloc_sem) { | 2576 | if (have_alloc_sem) { |
2578 | up_read(&inode->i_alloc_sem); | 2577 | inode_dio_done(inode); |
2579 | ocfs2_iocb_clear_sem_locked(iocb); | 2578 | ocfs2_iocb_clear_sem_locked(iocb); |
2580 | } | 2579 | } |
2581 | if (rw_level != -1) | 2580 | if (rw_level != -1) |
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4ea2ab41fdee..6938d8c68d6e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c | |||
@@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, | |||
555 | 555 | ||
556 | reiserfs_write_unlock(inode->i_sb); | 556 | reiserfs_write_unlock(inode->i_sb); |
557 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); | 557 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); |
558 | down_write(&dentry->d_inode->i_alloc_sem); | 558 | inode_dio_wait(dentry->d_inode); |
559 | reiserfs_write_lock(inode->i_sb); | 559 | reiserfs_write_lock(inode->i_sb); |
560 | 560 | ||
561 | err = reiserfs_setattr(dentry, &newattrs); | 561 | err = reiserfs_setattr(dentry, &newattrs); |
562 | up_write(&dentry->d_inode->i_alloc_sem); | ||
563 | mutex_unlock(&dentry->d_inode->i_mutex); | 562 | mutex_unlock(&dentry->d_inode->i_mutex); |
564 | } else | 563 | } else |
565 | update_ctime(inode); | 564 | update_ctime(inode); |