aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@infradead.org>2011-06-24 14:29:43 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2011-07-20 20:47:46 -0400
commitbd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 (patch)
treeef5341c7747f809aec7ae233f6e3ef90af39be5f /fs
parentf9b5570d7fdedff32a2e78102bfb54cd1b12b289 (diff)
fs: kill i_alloc_sem
i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and its write side is always mirrored by real exclusion. Its intended use is to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall away - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches zero we wake up a pending truncate using wake_up_bit on a new bit in i_state - new references to i_dio_count can't appear while we are waiting for it to read zero because starting a new direct I/O operation always requires i_mutex (or an equivalent like XFS's i_iolock). This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r--fs/attr.c5
-rw-r--r--fs/direct-io.c65
-rw-r--r--fs/inode.c3
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/inode.c10
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/reiserfs/xattr.c3
8 files changed, 67 insertions, 44 deletions
diff --git a/fs/attr.c b/fs/attr.c
index caf2aa521e2b..f177ac86fa48 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -233,16 +233,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
233 return error; 233 return error;
234 234
235 if (ia_valid & ATTR_SIZE) 235 if (ia_valid & ATTR_SIZE)
236 down_write(&dentry->d_inode->i_alloc_sem); 236 inode_dio_wait(inode);
237 237
238 if (inode->i_op->setattr) 238 if (inode->i_op->setattr)
239 error = inode->i_op->setattr(dentry, attr); 239 error = inode->i_op->setattr(dentry, attr);
240 else 240 else
241 error = simple_setattr(dentry, attr); 241 error = simple_setattr(dentry, attr);
242 242
243 if (ia_valid & ATTR_SIZE)
244 up_write(&dentry->d_inode->i_alloc_sem);
245
246 if (!error) 243 if (!error)
247 fsnotify_change(dentry, ia_valid); 244 fsnotify_change(dentry, ia_valid);
248 245
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98ce3ac0d94b..354cbdbc14bd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -135,6 +135,50 @@ struct dio {
135 struct page *pages[DIO_PAGES]; /* page buffer */ 135 struct page *pages[DIO_PAGES]; /* page buffer */
136}; 136};
137 137
138static void __inode_dio_wait(struct inode *inode)
139{
140 wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
141 DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
142
143 do {
144 prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
145 if (atomic_read(&inode->i_dio_count))
146 schedule();
147 } while (atomic_read(&inode->i_dio_count));
148 finish_wait(wq, &q.wait);
149}
150
151/**
152 * inode_dio_wait - wait for outstanding DIO requests to finish
153 * @inode: inode to wait for
154 *
155 * Waits for all pending direct I/O requests to finish so that we can
156 * proceed with a truncate or equivalent operation.
157 *
158 * Must be called under a lock that serializes taking new references
159 * to i_dio_count, usually by inode->i_mutex.
160 */
161void inode_dio_wait(struct inode *inode)
162{
163 if (atomic_read(&inode->i_dio_count))
164 __inode_dio_wait(inode);
165}
166EXPORT_SYMBOL_GPL(inode_dio_wait);
167
168/*
169 * inode_dio_done - signal finish of a direct I/O request
170 * @inode: inode the direct I/O happens on
171 *
172 * This is called once we've finished processing a direct I/O request,
173 * and is used to wake up callers waiting for direct I/O to be quiesced.
174 */
175void inode_dio_done(struct inode *inode)
176{
177 if (atomic_dec_and_test(&inode->i_dio_count))
178 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
179}
180EXPORT_SYMBOL_GPL(inode_dio_done);
181
138/* 182/*
139 * How many pages are in the queue? 183 * How many pages are in the queue?
140 */ 184 */
@@ -254,9 +298,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
254 } 298 }
255 299
256 if (dio->flags & DIO_LOCKING) 300 if (dio->flags & DIO_LOCKING)
257 /* lockdep: non-owner release */ 301 inode_dio_done(dio->inode);
258 up_read_non_owner(&dio->inode->i_alloc_sem);
259
260 return ret; 302 return ret;
261} 303}
262 304
@@ -980,9 +1022,6 @@ out:
980 return ret; 1022 return ret;
981} 1023}
982 1024
983/*
984 * Releases both i_mutex and i_alloc_sem
985 */
986static ssize_t 1025static ssize_t
987direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 1026direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
988 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 1027 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1146,15 +1185,14 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1146 * For writes this function is called under i_mutex and returns with 1185 * For writes this function is called under i_mutex and returns with
1147 * i_mutex held, for reads, i_mutex is not held on entry, but it is 1186 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1148 * taken and dropped again before returning. 1187 * taken and dropped again before returning.
1149 * For reads and writes i_alloc_sem is taken in shared mode and released 1188 * The i_dio_count counter keeps track of the number of outstanding
1150 * on I/O completion (which may happen asynchronously after returning to 1189 * direct I/O requests, and truncate waits for it to reach zero.
1151 * the caller). 1190 * New references to i_dio_count must only be grabbed with i_mutex
1191 * held.
1152 * 1192 *
1153 * - if the flags value does NOT contain DIO_LOCKING we don't use any 1193 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1154 * internal locking but rather rely on the filesystem to synchronize 1194 * internal locking but rather rely on the filesystem to synchronize
1155 * direct I/O reads/writes versus each other and truncate. 1195 * direct I/O reads/writes versus each other and truncate.
1156 * For reads and writes both i_mutex and i_alloc_sem are not held on
1157 * entry and are never taken.
1158 */ 1196 */
1159ssize_t 1197ssize_t
1160__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1198__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1234,10 +1272,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1234 } 1272 }
1235 1273
1236 /* 1274 /*
1237 * Will be released at I/O completion, possibly in a 1275 * Will be decremented at I/O completion time.
1238 * different thread.
1239 */ 1276 */
1240 down_read_non_owner(&inode->i_alloc_sem); 1277 atomic_inc(&inode->i_dio_count);
1241 } 1278 }
1242 1279
1243 /* 1280 /*
diff --git a/fs/inode.c b/fs/inode.c
index cf81baf1898a..96c77b81167c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -168,8 +168,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
168 mutex_init(&inode->i_mutex); 168 mutex_init(&inode->i_mutex);
169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 169 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
170 170
171 init_rwsem(&inode->i_alloc_sem); 171 atomic_set(&inode->i_dio_count, 0);
172 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
173 172
174 mapping->a_ops = &empty_aops; 173 mapping->a_ops = &empty_aops;
175 mapping->host = inode; 174 mapping->host = inode;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f4b1057abdd2..b59f5ac26bef 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1832,9 +1832,8 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1832 * fails again. 1832 * fails again.
1833 */ 1833 */
1834 if (unlikely(NInoTruncateFailed(ni))) { 1834 if (unlikely(NInoTruncateFailed(ni))) {
1835 down_write(&vi->i_alloc_sem); 1835 inode_dio_wait(vi);
1836 err = ntfs_truncate(vi); 1836 err = ntfs_truncate(vi);
1837 up_write(&vi->i_alloc_sem);
1838 if (err || NInoTruncateFailed(ni)) { 1837 if (err || NInoTruncateFailed(ni)) {
1839 if (!err) 1838 if (!err)
1840 err = -EIO; 1839 err = -EIO;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index c05d6dcf77a4..1371487da955 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2357,12 +2357,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run "
2357 * 2357 *
2358 * Returns 0 on success or -errno on error. 2358 * Returns 0 on success or -errno on error.
2359 * 2359 *
2360 * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for 2360 * Called with ->i_mutex held.
2361 * writing. The only case in the kernel where ->i_alloc_sem is not held is
2362 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
2363 * with the current i_size as the offset. The analogous place in NTFS is in
2364 * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
2365 * without holding ->i_alloc_sem.
2366 */ 2361 */
2367int ntfs_truncate(struct inode *vi) 2362int ntfs_truncate(struct inode *vi)
2368{ 2363{
@@ -2887,8 +2882,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
2887 * We also abort all changes of user, group, and mode as we do not implement 2882 * We also abort all changes of user, group, and mode as we do not implement
2888 * the NTFS ACLs yet. 2883 * the NTFS ACLs yet.
2889 * 2884 *
2890 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also 2885 * Called with ->i_mutex held.
2891 * called with ->i_alloc_sem held for writing.
2892 */ 2886 */
2893int ntfs_setattr(struct dentry *dentry, struct iattr *attr) 2887int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2894{ 2888{
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac97bca282d2..de1d3953599d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -551,9 +551,8 @@ bail:
551 551
552/* 552/*
553 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 553 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
554 * particularly interested in the aio/dio case. Like the core uses 554 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
555 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 555 * to protect io on one node from truncation on another.
556 * truncation on another.
557 */ 556 */
558static void ocfs2_dio_end_io(struct kiocb *iocb, 557static void ocfs2_dio_end_io(struct kiocb *iocb,
559 loff_t offset, 558 loff_t offset,
@@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
569 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 568 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
570 569
571 if (ocfs2_iocb_is_sem_locked(iocb)) { 570 if (ocfs2_iocb_is_sem_locked(iocb)) {
572 up_read(&inode->i_alloc_sem); 571 inode_dio_done(inode);
573 ocfs2_iocb_clear_sem_locked(iocb); 572 ocfs2_iocb_clear_sem_locked(iocb);
574 } 573 }
575 574
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1406c37a5722..2c3a465514a2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2236,9 +2236,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2236 ocfs2_iocb_clear_sem_locked(iocb); 2236 ocfs2_iocb_clear_sem_locked(iocb);
2237 2237
2238relock: 2238relock:
2239 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2239 /* to match setattr's i_mutex -> rw_lock ordering */
2240 if (direct_io) { 2240 if (direct_io) {
2241 down_read(&inode->i_alloc_sem); 2241 atomic_inc(&inode->i_dio_count);
2242 have_alloc_sem = 1; 2242 have_alloc_sem = 1;
2243 /* communicate with ocfs2_dio_end_io */ 2243 /* communicate with ocfs2_dio_end_io */
2244 ocfs2_iocb_set_sem_locked(iocb); 2244 ocfs2_iocb_set_sem_locked(iocb);
@@ -2290,7 +2290,7 @@ relock:
2290 */ 2290 */
2291 if (direct_io && !can_do_direct) { 2291 if (direct_io && !can_do_direct) {
2292 ocfs2_rw_unlock(inode, rw_level); 2292 ocfs2_rw_unlock(inode, rw_level);
2293 up_read(&inode->i_alloc_sem); 2293 inode_dio_done(inode);
2294 2294
2295 have_alloc_sem = 0; 2295 have_alloc_sem = 0;
2296 rw_level = -1; 2296 rw_level = -1;
@@ -2361,8 +2361,7 @@ out_dio:
2361 /* 2361 /*
2362 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2362 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2363 * function pointer which is called when o_direct io completes so that 2363 * function pointer which is called when o_direct io completes so that
2364 * it can unlock our rw lock. (it's the clustered equivalent of 2364 * it can unlock our rw lock.
2365 * i_alloc_sem; protects truncate from racing with pending ios).
2366 * Unfortunately there are error cases which call end_io and others 2365 * Unfortunately there are error cases which call end_io and others
2367 * that don't. so we don't have to unlock the rw_lock if either an 2366 * that don't. so we don't have to unlock the rw_lock if either an
2368 * async dio is going to do it in the future or an end_io after an 2367 * async dio is going to do it in the future or an end_io after an
@@ -2379,7 +2378,7 @@ out:
2379 2378
2380out_sems: 2379out_sems:
2381 if (have_alloc_sem) { 2380 if (have_alloc_sem) {
2382 up_read(&inode->i_alloc_sem); 2381 inode_dio_done(inode);
2383 ocfs2_iocb_clear_sem_locked(iocb); 2382 ocfs2_iocb_clear_sem_locked(iocb);
2384 } 2383 }
2385 2384
@@ -2531,8 +2530,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2531 * need locks to protect pending reads from racing with truncate. 2530 * need locks to protect pending reads from racing with truncate.
2532 */ 2531 */
2533 if (filp->f_flags & O_DIRECT) { 2532 if (filp->f_flags & O_DIRECT) {
2534 down_read(&inode->i_alloc_sem);
2535 have_alloc_sem = 1; 2533 have_alloc_sem = 1;
2534 atomic_inc(&inode->i_dio_count);
2536 ocfs2_iocb_set_sem_locked(iocb); 2535 ocfs2_iocb_set_sem_locked(iocb);
2537 2536
2538 ret = ocfs2_rw_lock(inode, 0); 2537 ret = ocfs2_rw_lock(inode, 0);
@@ -2575,7 +2574,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2575 2574
2576bail: 2575bail:
2577 if (have_alloc_sem) { 2576 if (have_alloc_sem) {
2578 up_read(&inode->i_alloc_sem); 2577 inode_dio_done(inode);
2579 ocfs2_iocb_clear_sem_locked(iocb); 2578 ocfs2_iocb_clear_sem_locked(iocb);
2580 } 2579 }
2581 if (rw_level != -1) 2580 if (rw_level != -1)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4ea2ab41fdee..6938d8c68d6e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -555,11 +555,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
555 555
556 reiserfs_write_unlock(inode->i_sb); 556 reiserfs_write_unlock(inode->i_sb);
557 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); 557 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
558 down_write(&dentry->d_inode->i_alloc_sem); 558 inode_dio_wait(dentry->d_inode);
559 reiserfs_write_lock(inode->i_sb); 559 reiserfs_write_lock(inode->i_sb);
560 560
561 err = reiserfs_setattr(dentry, &newattrs); 561 err = reiserfs_setattr(dentry, &newattrs);
562 up_write(&dentry->d_inode->i_alloc_sem);
563 mutex_unlock(&dentry->d_inode->i_mutex); 562 mutex_unlock(&dentry->d_inode->i_mutex);
564 } else 563 } else
565 update_ctime(inode); 564 update_ctime(inode);