aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2015-02-23 06:38:08 -0500
committerDave Chinner <david@fromorbit.com>2015-02-23 06:38:08 -0500
commit58c904734cd0917cd0953067dd68003572407c7b (patch)
treef23f98ce11dcf36c1b582b5a04357ab9eb97df19 /fs
parentc517d838eb7d07bbe9507871fab3931deccff539 (diff)
xfs: inodes are new until the dentry cache is set up
Al Viro noticed a generic set of issues to do with filehandle lookup racing with dentry cache setup. They involve a filehandle lookup occurring while an inode is being created and the filehandle lookup racing with the dentry creation for the real file. This can lead to multiple dentries for the one path being instantiated. There are a host of other issues around this same set of paths. The underlying cause is that file handle lookup only waits on inode cache instantiation rather than full dentry cache instantiation. XFS is mostly immune to the problems discovered due to it's own internal inode cache, but there are a couple of corner cases where races can happen. We currently clear the XFS_INEW flag when the inode is fully set up after insertion into the cache. Newly allocated inodes are inserted locked and so aren't usable until the allocation transaction commits. This, however, occurs before the dentry and security information is fully initialised and hence the inode is unlocked and available for lookups to find too early. To solve the problem, only clear the XFS_INEW flag for newly created inodes once the dentry is fully instantiated. This means lookups will retry until the XFS_INEW flag is removed from the inode and hence avoids the race conditions in questions. THis also means that xfs_create(), xfs_create_tmpfile() and xfs_symlink() need to finish the setup of the inode in their error paths if we had allocated the inode but failed later in the creation process. xfs_symlink(), in particular, needed a lot of help to make it's error handling match that of xfs_create(). Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_inode.c22
-rw-r--r--fs/xfs/xfs_inode.h22
-rw-r--r--fs/xfs/xfs_iops.c24
-rw-r--r--fs/xfs/xfs_iops.h2
-rw-r--r--fs/xfs/xfs_qm.c13
-rw-r--r--fs/xfs/xfs_symlink.c58
7 files changed, 90 insertions, 55 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9771b7ef62ed..76a9f2783282 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -439,11 +439,11 @@ again:
439 *ipp = ip; 439 *ipp = ip;
440 440
441 /* 441 /*
442 * If we have a real type for an on-disk inode, we can set ops(&unlock) 442 * If we have a real type for an on-disk inode, we can setup the inode
443 * now. If it's a new inode being created, xfs_ialloc will handle it. 443 * now. If it's a new inode being created, xfs_ialloc will handle it.
444 */ 444 */
445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
446 xfs_setup_inode(ip); 446 xfs_setup_existing_inode(ip);
447 return 0; 447 return 0;
448 448
449out_error_or_again: 449out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index daafa1f6d260..d0414f305967 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -818,7 +818,7 @@ xfs_ialloc(
818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
819 xfs_trans_log_inode(tp, ip, flags); 819 xfs_trans_log_inode(tp, ip, flags);
820 820
821 /* now that we have an i_mode we can setup inode ops and unlock */ 821 /* now that we have an i_mode we can setup the inode structure */
822 xfs_setup_inode(ip); 822 xfs_setup_inode(ip);
823 823
824 *ipp = ip; 824 *ipp = ip;
@@ -1235,12 +1235,14 @@ xfs_create(
1235 xfs_trans_cancel(tp, cancel_flags); 1235 xfs_trans_cancel(tp, cancel_flags);
1236 out_release_inode: 1236 out_release_inode:
1237 /* 1237 /*
1238 * Wait until after the current transaction is aborted to 1238 * Wait until after the current transaction is aborted to finish the
1239 * release the inode. This prevents recursive transactions 1239 * setup of the inode and release the inode. This prevents recursive
1240 * and deadlocks from xfs_inactive. 1240 * transactions and deadlocks from xfs_inactive.
1241 */ 1241 */
1242 if (ip) 1242 if (ip) {
1243 xfs_finish_inode_setup(ip);
1243 IRELE(ip); 1244 IRELE(ip);
1245 }
1244 1246
1245 xfs_qm_dqrele(udqp); 1247 xfs_qm_dqrele(udqp);
1246 xfs_qm_dqrele(gdqp); 1248 xfs_qm_dqrele(gdqp);
@@ -1345,12 +1347,14 @@ xfs_create_tmpfile(
1345 xfs_trans_cancel(tp, cancel_flags); 1347 xfs_trans_cancel(tp, cancel_flags);
1346 out_release_inode: 1348 out_release_inode:
1347 /* 1349 /*
1348 * Wait until after the current transaction is aborted to 1350 * Wait until after the current transaction is aborted to finish the
1349 * release the inode. This prevents recursive transactions 1351 * setup of the inode and release the inode. This prevents recursive
1350 * and deadlocks from xfs_inactive. 1352 * transactions and deadlocks from xfs_inactive.
1351 */ 1353 */
1352 if (ip) 1354 if (ip) {
1355 xfs_finish_inode_setup(ip);
1353 IRELE(ip); 1356 IRELE(ip);
1357 }
1354 1358
1355 xfs_qm_dqrele(udqp); 1359 xfs_qm_dqrele(udqp);
1356 xfs_qm_dqrele(gdqp); 1360 xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 86cd6b39bed7..8e82b41d2050 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -390,6 +390,28 @@ int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
390int xfs_iozero(struct xfs_inode *, loff_t, size_t); 390int xfs_iozero(struct xfs_inode *, loff_t, size_t);
391 391
392 392
393/* from xfs_iops.c */
394/*
395 * When setting up a newly allocated inode, we need to call
396 * xfs_finish_inode_setup() once the inode is fully instantiated at
397 * the VFS level to prevent the rest of the world seeing the inode
398 * before we've completed instantiation. Otherwise we can do it
399 * the moment the inode lookup is complete.
400 */
401extern void xfs_setup_inode(struct xfs_inode *ip);
402static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
403{
404 xfs_iflags_clear(ip, XFS_INEW);
405 barrier();
406 unlock_new_inode(VFS_I(ip));
407}
408
409static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
410{
411 xfs_setup_inode(ip);
412 xfs_finish_inode_setup(ip);
413}
414
393#define IHOLD(ip) \ 415#define IHOLD(ip) \
394do { \ 416do { \
395 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 417 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d919ad7b16bf..d7782ae1af3c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -187,6 +187,8 @@ xfs_generic_create(
187 else 187 else
188 d_instantiate(dentry, inode); 188 d_instantiate(dentry, inode);
189 189
190 xfs_finish_inode_setup(ip);
191
190 out_free_acl: 192 out_free_acl:
191 if (default_acl) 193 if (default_acl)
192 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
195 return error; 197 return error;
196 198
197 out_cleanup_inode: 199 out_cleanup_inode:
200 xfs_finish_inode_setup(ip);
198 if (!tmpfile) 201 if (!tmpfile)
199 xfs_cleanup_inode(dir, inode, dentry); 202 xfs_cleanup_inode(dir, inode, dentry);
200 iput(inode); 203 iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
367 goto out_cleanup_inode; 370 goto out_cleanup_inode;
368 371
369 d_instantiate(dentry, inode); 372 d_instantiate(dentry, inode);
373 xfs_finish_inode_setup(cip);
370 return 0; 374 return 0;
371 375
372 out_cleanup_inode: 376 out_cleanup_inode:
377 xfs_finish_inode_setup(cip);
373 xfs_cleanup_inode(dir, inode, dentry); 378 xfs_cleanup_inode(dir, inode, dentry);
374 iput(inode); 379 iput(inode);
375 out: 380 out:
@@ -1236,16 +1241,12 @@ xfs_diflags_to_iflags(
1236} 1241}
1237 1242
1238/* 1243/*
1239 * Initialize the Linux inode, set up the operation vectors and 1244 * Initialize the Linux inode and set up the operation vectors.
1240 * unlock the inode.
1241 *
1242 * When reading existing inodes from disk this is called directly
1243 * from xfs_iget, when creating a new inode it is called from
1244 * xfs_ialloc after setting up the inode.
1245 * 1245 *
1246 * We are always called with an uninitialised linux inode here. 1246 * When reading existing inodes from disk this is called directly from xfs_iget,
1247 * We need to initialise the necessary fields and take a reference 1247 * when creating a new inode it is called from xfs_ialloc after setting up the
1248 * on it. 1248 * inode. These callers have different criteria for clearing XFS_INEW, so leave
1249 * it up to the caller to deal with unlocking the inode appropriately.
1249 */ 1250 */
1250void 1251void
1251xfs_setup_inode( 1252xfs_setup_inode(
@@ -1332,9 +1333,4 @@ xfs_setup_inode(
1332 inode_has_no_xattr(inode); 1333 inode_has_no_xattr(inode);
1333 cache_no_acl(inode); 1334 cache_no_acl(inode);
1334 } 1335 }
1335
1336 xfs_iflags_clear(ip, XFS_INEW);
1337 barrier();
1338
1339 unlock_new_inode(inode);
1340} 1336}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ea7a98e9cb70..a0f84abb0d09 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
25 25
26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *);
29
30/* 28/*
31 * Internal setattr interfaces. 29 * Internal setattr interfaces.
32 */ 30 */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 53cc2aaf8d2b..c6b22e1e77ed 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
719 xfs_trans_t *tp; 719 xfs_trans_t *tp;
720 int error; 720 int error;
721 int committed; 721 int committed;
722 bool need_alloc = true;
722 723
723 *ip = NULL; 724 *ip = NULL;
724 /* 725 /*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
747 return error; 748 return error;
748 mp->m_sb.sb_gquotino = NULLFSINO; 749 mp->m_sb.sb_gquotino = NULLFSINO;
749 mp->m_sb.sb_pquotino = NULLFSINO; 750 mp->m_sb.sb_pquotino = NULLFSINO;
751 need_alloc = false;
750 } 752 }
751 } 753 }
752 754
@@ -758,7 +760,7 @@ xfs_qm_qino_alloc(
758 return error; 760 return error;
759 } 761 }
760 762
761 if (!*ip) { 763 if (need_alloc) {
762 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
763 &committed); 765 &committed);
764 if (error) { 766 if (error) {
@@ -794,11 +796,14 @@ xfs_qm_qino_alloc(
794 spin_unlock(&mp->m_sb_lock); 796 spin_unlock(&mp->m_sb_lock);
795 xfs_log_sb(tp); 797 xfs_log_sb(tp);
796 798
797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 799 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
800 if (error) {
801 ASSERT(XFS_FORCED_SHUTDOWN(mp));
798 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 802 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
799 return error;
800 } 803 }
801 return 0; 804 if (need_alloc)
805 xfs_finish_inode_setup(*ip);
806 return error;
802} 807}
803 808
804 809
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 25791df6f638..3df411eadb86 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -177,7 +177,7 @@ xfs_symlink(
177 int pathlen; 177 int pathlen;
178 struct xfs_bmap_free free_list; 178 struct xfs_bmap_free free_list;
179 xfs_fsblock_t first_block; 179 xfs_fsblock_t first_block;
180 bool unlock_dp_on_error = false; 180 bool unlock_dp_on_error = false;
181 uint cancel_flags; 181 uint cancel_flags;
182 int committed; 182 int committed;
183 xfs_fileoff_t first_fsb; 183 xfs_fileoff_t first_fsb;
@@ -221,7 +221,7 @@ xfs_symlink(
221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
222 &udqp, &gdqp, &pdqp); 222 &udqp, &gdqp, &pdqp);
223 if (error) 223 if (error)
224 goto std_return; 224 return error;
225 225
226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
@@ -241,7 +241,7 @@ xfs_symlink(
241 } 241 }
242 if (error) { 242 if (error) {
243 cancel_flags = 0; 243 cancel_flags = 0;
244 goto error_return; 244 goto out_trans_cancel;
245 } 245 }
246 246
247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -252,7 +252,7 @@ xfs_symlink(
252 */ 252 */
253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
254 error = -EPERM; 254 error = -EPERM;
255 goto error_return; 255 goto out_trans_cancel;
256 } 256 }
257 257
258 /* 258 /*
@@ -261,7 +261,7 @@ xfs_symlink(
261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
262 pdqp, resblks, 1, 0); 262 pdqp, resblks, 1, 0);
263 if (error) 263 if (error)
264 goto error_return; 264 goto out_trans_cancel;
265 265
266 /* 266 /*
267 * Check for ability to enter directory entry, if no space reserved. 267 * Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +269,7 @@ xfs_symlink(
269 if (!resblks) { 269 if (!resblks) {
270 error = xfs_dir_canenter(tp, dp, link_name); 270 error = xfs_dir_canenter(tp, dp, link_name);
271 if (error) 271 if (error)
272 goto error_return; 272 goto out_trans_cancel;
273 } 273 }
274 /* 274 /*
275 * Initialize the bmap freelist prior to calling either 275 * Initialize the bmap freelist prior to calling either
@@ -282,15 +282,14 @@ xfs_symlink(
282 */ 282 */
283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
284 prid, resblks > 0, &ip, NULL); 284 prid, resblks > 0, &ip, NULL);
285 if (error) { 285 if (error)
286 if (error == -ENOSPC) 286 goto out_trans_cancel;
287 goto error_return;
288 goto error1;
289 }
290 287
291 /* 288 /*
292 * An error after we've joined dp to the transaction will result in the 289 * Now we join the directory inode to the transaction. We do not do it
293 * transaction cancel unlocking dp so don't do it explicitly in the 290 * earlier because xfs_dir_ialloc might commit the previous transaction
291 * (and release all the locks). An error from here on will result in
292 * the transaction cancel unlocking dp so don't do it explicitly in the
294 * error path. 293 * error path.
295 */ 294 */
296 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 295 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +329,7 @@ xfs_symlink(
330 XFS_BMAPI_METADATA, &first_block, resblks, 329 XFS_BMAPI_METADATA, &first_block, resblks,
331 mval, &nmaps, &free_list); 330 mval, &nmaps, &free_list);
332 if (error) 331 if (error)
333 goto error2; 332 goto out_bmap_cancel;
334 333
335 if (resblks) 334 if (resblks)
336 resblks -= fs_blocks; 335 resblks -= fs_blocks;
@@ -348,7 +347,7 @@ xfs_symlink(
348 BTOBB(byte_cnt), 0); 347 BTOBB(byte_cnt), 0);
349 if (!bp) { 348 if (!bp) {
350 error = -ENOMEM; 349 error = -ENOMEM;
351 goto error2; 350 goto out_bmap_cancel;
352 } 351 }
353 bp->b_ops = &xfs_symlink_buf_ops; 352 bp->b_ops = &xfs_symlink_buf_ops;
354 353
@@ -378,7 +377,7 @@ xfs_symlink(
378 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 377 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
379 &first_block, &free_list, resblks); 378 &first_block, &free_list, resblks);
380 if (error) 379 if (error)
381 goto error2; 380 goto out_bmap_cancel;
382 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 381 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
383 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 382 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
384 383
@@ -392,10 +391,13 @@ xfs_symlink(
392 } 391 }
393 392
394 error = xfs_bmap_finish(&tp, &free_list, &committed); 393 error = xfs_bmap_finish(&tp, &free_list, &committed);
395 if (error) { 394 if (error)
396 goto error2; 395 goto out_bmap_cancel;
397 } 396
398 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 397 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
398 if (error)
399 goto out_release_inode;
400
399 xfs_qm_dqrele(udqp); 401 xfs_qm_dqrele(udqp);
400 xfs_qm_dqrele(gdqp); 402 xfs_qm_dqrele(gdqp);
401 xfs_qm_dqrele(pdqp); 403 xfs_qm_dqrele(pdqp);
@@ -403,20 +405,28 @@ xfs_symlink(
403 *ipp = ip; 405 *ipp = ip;
404 return 0; 406 return 0;
405 407
406 error2: 408out_bmap_cancel:
407 IRELE(ip);
408 error1:
409 xfs_bmap_cancel(&free_list); 409 xfs_bmap_cancel(&free_list);
410 cancel_flags |= XFS_TRANS_ABORT; 410 cancel_flags |= XFS_TRANS_ABORT;
411 error_return: 411out_trans_cancel:
412 xfs_trans_cancel(tp, cancel_flags); 412 xfs_trans_cancel(tp, cancel_flags);
413out_release_inode:
414 /*
415 * Wait until after the current transaction is aborted to finish the
416 * setup of the inode and release the inode. This prevents recursive
417 * transactions and deadlocks from xfs_inactive.
418 */
419 if (ip) {
420 xfs_finish_inode_setup(ip);
421 IRELE(ip);
422 }
423
413 xfs_qm_dqrele(udqp); 424 xfs_qm_dqrele(udqp);
414 xfs_qm_dqrele(gdqp); 425 xfs_qm_dqrele(gdqp);
415 xfs_qm_dqrele(pdqp); 426 xfs_qm_dqrele(pdqp);
416 427
417 if (unlock_dp_on_error) 428 if (unlock_dp_on_error)
418 xfs_iunlock(dp, XFS_ILOCK_EXCL); 429 xfs_iunlock(dp, XFS_ILOCK_EXCL);
419 std_return:
420 return error; 430 return error;
421} 431}
422 432