path: root/fs/namei.c
author    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree    ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/namei.c
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'fs/namei.c')
-rw-r--r--    fs/namei.c    2055
1 files changed, 1255 insertions, 800 deletions
diff --git a/fs/namei.c b/fs/namei.c
index 24896e833565..14ab8d3f2f0c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -70,7 +70,7 @@
70 * name indicated by the symlink. The old code always complained that the 70 * name indicated by the symlink. The old code always complained that the
71 * name already exists, due to not following the symlink even if its target 71 * name already exists, due to not following the symlink even if its target
72 * is nonexistent. The new semantics affects also mknod() and link() when 72 * is nonexistent. The new semantics affects also mknod() and link() when
73 * the name is a symlink pointing to a non-existant name. 73 * the name is a symlink pointing to a non-existent name.
74 * 74 *
75 * I don't know which semantics is the right one, since I have no access 75 * I don't know which semantics is the right one, since I have no access
76 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 76 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -169,18 +176,21 @@ EXPORT_SYMBOL(putname);
169/* 176/*
170 * This does basic POSIX ACL permission checking 177 * This does basic POSIX ACL permission checking
171 */ 178 */
172static int acl_permission_check(struct inode *inode, int mask, 179static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
173 int (*check_acl)(struct inode *inode, int mask)) 180 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
174{ 181{
175 umode_t mode = inode->i_mode; 182 unsigned int mode = inode->i_mode;
176 183
177 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 184 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178 185
186 if (current_user_ns() != inode_userns(inode))
187 goto other_perms;
188
179 if (current_fsuid() == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
180 mode >>= 6; 190 mode >>= 6;
181 else { 191 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 192 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 193 int error = check_acl(inode, mask, flags);
184 if (error != -EAGAIN) 194 if (error != -EAGAIN)
185 return error; 195 return error;
186 } 196 }
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask,
189 mode >>= 3; 199 mode >>= 3;
190 } 200 }
191 201
202other_perms:
192 /* 203 /*
193 * If the DACs are ok we don't need any capability check. 204 * If the DACs are ok we don't need any capability check.
194 */ 205 */
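
As background for the owner/group/other shifts in acl_permission_check() above, here is a minimal, self-contained userspace sketch of the classic POSIX mode-bit check. All names are illustrative, not the kernel's; it relies on MAY_READ/MAY_WRITE/MAY_EXEC lining up with the low rwx bits, the same trick the kernel uses.

#include <errno.h>
#include <sys/types.h>

#define MAY_EXEC  1
#define MAY_WRITE 2
#define MAY_READ  4

/* Illustrative only: pick the rwx triplet that applies to the caller and
 * test the requested mask against it, the same shift-by-6 / shift-by-3
 * scheme acl_permission_check() applies before ACLs or capabilities. */
static int basic_perm_check(unsigned int mode, uid_t fsuid, gid_t fsgid,
                            uid_t owner, gid_t group, int mask)
{
    if (fsuid == owner)
        mode >>= 6;                 /* owner bits: rwx------ */
    else if (fsgid == group)
        mode >>= 3;                 /* group bits: ---rwx--- */
                                    /* otherwise the "other" bits apply */
    mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
    if ((mask & ~mode) == 0)
        return 0;                   /* plain DACs are sufficient */
    return -EACCES;                 /* would need an override to proceed */
}
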
@@ -198,34 +209,40 @@ static int acl_permission_check(struct inode *inode, int mask,
198} 209}
199 210
200/** 211/**
201 * generic_permission - check for access rights on a Posix-like filesystem 212 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 213 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 214 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 215 * @check_acl: optional callback to check for Posix ACLs
216 * @flags: IPERM_FLAG_ flags.
205 * 217 *
206 * Used to check for read/write/execute permissions on a file. 218 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions 219 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which 220 * for filesystem access without changing the "normal" uids which
209 * are used for other things.. 221 * are used for other things.
222 *
223 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
224 * request cannot be satisfied (eg. requires blocking or too much complexity).
225 * It would then be called again in ref-walk mode.
210 */ 226 */
211int generic_permission(struct inode *inode, int mask, 227int generic_permission(struct inode *inode, int mask, unsigned int flags,
212 int (*check_acl)(struct inode *inode, int mask)) 228 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
213{ 229{
214 int ret; 230 int ret;
215 231
216 /* 232 /*
217 * Do the basic POSIX ACL permission checks. 233 * Do the basic POSIX ACL permission checks.
218 */ 234 */
219 ret = acl_permission_check(inode, mask, check_acl); 235 ret = acl_permission_check(inode, mask, flags, check_acl);
220 if (ret != -EACCES) 236 if (ret != -EACCES)
221 return ret; 237 return ret;
222 238
223 /* 239 /*
224 * Read/write DACs are always overridable. 240 * Read/write DACs are always overridable.
225 * Executable DACs are overridable if at least one exec bit is set. 241 * Executable DACs are overridable for all directories and
242 * for non-directories that have least one exec bit set.
226 */ 243 */
227 if (!(mask & MAY_EXEC) || execute_ok(inode)) 244 if (!(mask & MAY_EXEC) || execute_ok(inode))
228 if (capable(CAP_DAC_OVERRIDE)) 245 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
229 return 0; 246 return 0;
230 247
231 /* 248 /*
@@ -233,7 +250,7 @@ int generic_permission(struct inode *inode, int mask,
233 */ 250 */
234 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 251 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
235 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 252 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
236 if (capable(CAP_DAC_READ_SEARCH)) 253 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
237 return 0; 254 return 0;
238 255
239 return -EACCES; 256 return -EACCES;
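
The override rules in generic_permission() above reduce to two capability checks: CAP_DAC_OVERRIDE may bypass read/write denials always, and exec denials only when exec is plausible (a directory, or at least one exec bit set); CAP_DAC_READ_SEARCH may bypass pure reads and directory searches. A hedged restatement in code form, with hypothetical boolean inputs standing in for the inode and capability queries:

#include <errno.h>
#include <stdbool.h>

#define MAY_EXEC  1
#define MAY_WRITE 2
#define MAY_READ  4

/* Sketch only: the predicates are invented stand-ins for S_ISDIR(),
 * execute_ok() and ns_capable(), not the kernel interfaces. */
static int dac_override(int mask, bool is_dir, bool any_exec_bit,
                        bool cap_dac_override, bool cap_dac_read_search)
{
    /* Read/write denials are always overridable; exec denials only for
     * directories or files with at least one exec bit set. */
    if (!(mask & MAY_EXEC) || is_dir || any_exec_bit)
        if (cap_dac_override)
            return 0;

    /* Pure reads and directory searches fall under DAC_READ_SEARCH. */
    mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
    if (mask == MAY_READ || (is_dir && !(mask & MAY_WRITE)))
        if (cap_dac_read_search)
            return 0;

    return -EACCES;
}
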
@@ -271,9 +288,10 @@ int inode_permission(struct inode *inode, int mask)
271 } 288 }
272 289
273 if (inode->i_op->permission) 290 if (inode->i_op->permission)
274 retval = inode->i_op->permission(inode, mask); 291 retval = inode->i_op->permission(inode, mask, 0);
275 else 292 else
276 retval = generic_permission(inode, mask, inode->i_op->check_acl); 293 retval = generic_permission(inode, mask, 0,
294 inode->i_op->check_acl);
277 295
278 if (retval) 296 if (retval)
279 return retval; 297 return retval;
@@ -374,22 +392,110 @@ void path_put(struct path *path)
374} 392}
375EXPORT_SYMBOL(path_put); 393EXPORT_SYMBOL(path_put);
376 394
395/*
396 * Path walking has 2 modes, rcu-walk and ref-walk (see
397 * Documentation/filesystems/path-lookup.txt). In situations when we can't
398 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
399 * normal reference counts on dentries and vfsmounts to transition to rcu-walk
400 * mode. Refcounts are grabbed at the last known good point before rcu-walk
401 * got stuck, so ref-walk may continue from there. If this is not successful
402 * (eg. a seqcount has changed), then failure is returned and it's up to caller
403 * to restart the path walk from the beginning in ref-walk mode.
404 */
405
406/**
407 * unlazy_walk - try to switch to ref-walk mode.
408 * @nd: nameidata pathwalk data
409 * @dentry: child of nd->path.dentry or NULL
410 * Returns: 0 on success, -ECHILD on failure
411 *
412 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
413 * for ref-walk mode. @dentry must be a path found by a do_lookup call on
414 * @nd or NULL. Must be called from rcu-walk context.
415 */
416static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
417{
418 struct fs_struct *fs = current->fs;
419 struct dentry *parent = nd->path.dentry;
420 int want_root = 0;
421
422 BUG_ON(!(nd->flags & LOOKUP_RCU));
423 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
424 want_root = 1;
425 spin_lock(&fs->lock);
426 if (nd->root.mnt != fs->root.mnt ||
427 nd->root.dentry != fs->root.dentry)
428 goto err_root;
429 }
430 spin_lock(&parent->d_lock);
431 if (!dentry) {
432 if (!__d_rcu_to_refcount(parent, nd->seq))
433 goto err_parent;
434 BUG_ON(nd->inode != parent->d_inode);
435 } else {
436 if (dentry->d_parent != parent)
437 goto err_parent;
438 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
439 if (!__d_rcu_to_refcount(dentry, nd->seq))
440 goto err_child;
441 /*
442 * If the sequence check on the child dentry passed, then
443 * the child has not been removed from its parent. This
444 * means the parent dentry must be valid and able to take
445 * a reference at this point.
446 */
447 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
448 BUG_ON(!parent->d_count);
449 parent->d_count++;
450 spin_unlock(&dentry->d_lock);
451 }
452 spin_unlock(&parent->d_lock);
453 if (want_root) {
454 path_get(&nd->root);
455 spin_unlock(&fs->lock);
456 }
457 mntget(nd->path.mnt);
458
459 rcu_read_unlock();
460 br_read_unlock(vfsmount_lock);
461 nd->flags &= ~LOOKUP_RCU;
462 return 0;
463
464err_child:
465 spin_unlock(&dentry->d_lock);
466err_parent:
467 spin_unlock(&parent->d_lock);
468err_root:
469 if (want_root)
470 spin_unlock(&fs->lock);
471 return -ECHILD;
472}
473
377/** 474/**
378 * release_open_intent - free up open intent resources 475 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 476 * @nd: pointer to nameidata
380 */ 477 */
381void release_open_intent(struct nameidata *nd) 478void release_open_intent(struct nameidata *nd)
382{ 479{
383 if (nd->intent.open.file->f_path.dentry == NULL) 480 struct file *file = nd->intent.open.file;
384 put_filp(nd->intent.open.file); 481
385 else 482 if (file && !IS_ERR(file)) {
386 fput(nd->intent.open.file); 483 if (file->f_path.dentry == NULL)
484 put_filp(file);
485 else
486 fput(file);
487 }
488}
489
490static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
491{
492 return dentry->d_op->d_revalidate(dentry, nd);
387} 493}
388 494
389static inline struct dentry * 495static struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd) 496do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 497{
392 int status = dentry->d_op->d_revalidate(dentry, nd); 498 int status = d_revalidate(dentry, nd);
393 if (unlikely(status <= 0)) { 499 if (unlikely(status <= 0)) {
394 /* 500 /*
395 * The dentry failed validation. 501 * The dentry failed validation.
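
The rcu-walk to ref-walk hand-off described above follows a general "validate a sequence count, take a real reference, otherwise tell the caller to restart" pattern. A minimal userspace analogy follows; every name here is hypothetical, and a plain mutex stands in for d_lock.

#include <errno.h>
#include <pthread.h>

struct node {
    pthread_mutex_t lock;   /* stands in for d_lock */
    unsigned seq;           /* bumped by writers while holding the lock */
    int refcount;           /* used by the reference-counted walk mode */
};

/* Analogy for unlazy_walk(): we inspected @n locklessly and recorded
 * @seen_seq; now try to turn that view into a counted reference.  If a
 * writer intervened, return -ECHILD so the caller restarts the whole
 * walk in the slow, reference-counted mode. */
static int legitimize(struct node *n, unsigned seen_seq)
{
    int err = -ECHILD;

    pthread_mutex_lock(&n->lock);
    if (n->seq == seen_seq) {   /* nothing changed since the lockless look */
        n->refcount++;          /* pin it for ref-walk */
        err = 0;
    }
    pthread_mutex_unlock(&n->lock);
    return err;
}
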
@@ -397,56 +503,68 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
397 * the dentry otherwise d_revalidate is asking us 503 * the dentry otherwise d_revalidate is asking us
398 * to return a fail status. 504 * to return a fail status.
399 */ 505 */
400 if (!status) { 506 if (status < 0) {
401 if (!d_invalidate(dentry)) {
402 dput(dentry);
403 dentry = NULL;
404 }
405 } else {
406 dput(dentry); 507 dput(dentry);
407 dentry = ERR_PTR(status); 508 dentry = ERR_PTR(status);
509 } else if (!d_invalidate(dentry)) {
510 dput(dentry);
511 dentry = NULL;
408 } 512 }
409 } 513 }
410 return dentry; 514 return dentry;
411} 515}
412 516
413/* 517/**
414 * force_reval_path - force revalidation of a dentry 518 * complete_walk - successful completion of path walk
415 * 519 * @nd: pointer nameidata
416 * In some situations the path walking code will trust dentries without
417 * revalidating them. This causes problems for filesystems that depend on
418 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
419 * (which indicates that it's possible for the dentry to go stale), force
420 * a d_revalidate call before proceeding.
421 * 520 *
422 * Returns 0 if the revalidation was successful. If the revalidation fails, 521 * If we had been in RCU mode, drop out of it and legitimize nd->path.
423 * either return the error returned by d_revalidate or -ESTALE if the 522 * Revalidate the final result, unless we'd already done that during
424 * revalidation it just returned 0. If d_revalidate returns 0, we attempt to 523 * the path walk or the filesystem doesn't ask for it. Return 0 on
425 * invalidate the dentry. It's up to the caller to handle putting references 524 * success, -error on failure. In case of failure caller does not
426 * to the path if necessary. 525 * need to drop nd->path.
427 */ 526 */
428static int 527static int complete_walk(struct nameidata *nd)
429force_reval_path(struct path *path, struct nameidata *nd)
430{ 528{
529 struct dentry *dentry = nd->path.dentry;
431 int status; 530 int status;
432 struct dentry *dentry = path->dentry;
433 531
434 /* 532 if (nd->flags & LOOKUP_RCU) {
435 * only check on filesystems where it's possible for the dentry to 533 nd->flags &= ~LOOKUP_RCU;
436 * become stale. It's assumed that if this flag is set then the 534 if (!(nd->flags & LOOKUP_ROOT))
437 * d_revalidate op will also be defined. 535 nd->root.mnt = NULL;
438 */ 536 spin_lock(&dentry->d_lock);
439 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) 537 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
538 spin_unlock(&dentry->d_lock);
539 rcu_read_unlock();
540 br_read_unlock(vfsmount_lock);
541 return -ECHILD;
542 }
543 BUG_ON(nd->inode != dentry->d_inode);
544 spin_unlock(&dentry->d_lock);
545 mntget(nd->path.mnt);
546 rcu_read_unlock();
547 br_read_unlock(vfsmount_lock);
548 }
549
550 if (likely(!(nd->flags & LOOKUP_JUMPED)))
440 return 0; 551 return 0;
441 552
442 status = dentry->d_op->d_revalidate(dentry, nd); 553 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
554 return 0;
555
556 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
557 return 0;
558
559 /* Note: we do not d_invalidate() */
560 status = d_revalidate(dentry, nd);
443 if (status > 0) 561 if (status > 0)
444 return 0; 562 return 0;
445 563
446 if (!status) { 564 if (!status)
447 d_invalidate(dentry);
448 status = -ESTALE; 565 status = -ESTALE;
449 } 566
567 path_put(&nd->path);
450 return status; 568 return status;
451} 569}
452 570
@@ -459,26 +577,29 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 577 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 578 * complete permission check.
461 */ 579 */
462static int exec_permission(struct inode *inode) 580static inline int exec_permission(struct inode *inode, unsigned int flags)
463{ 581{
464 int ret; 582 int ret;
583 struct user_namespace *ns = inode_userns(inode);
465 584
466 if (inode->i_op->permission) { 585 if (inode->i_op->permission) {
467 ret = inode->i_op->permission(inode, MAY_EXEC); 586 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
468 if (!ret) 587 } else {
469 goto ok; 588 ret = acl_permission_check(inode, MAY_EXEC, flags,
470 return ret; 589 inode->i_op->check_acl);
471 } 590 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 591 if (likely(!ret))
473 if (!ret)
474 goto ok; 592 goto ok;
593 if (ret == -ECHILD)
594 return ret;
475 595
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 596 if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
597 ns_capable(ns, CAP_DAC_READ_SEARCH))
477 goto ok; 598 goto ok;
478 599
479 return ret; 600 return ret;
480ok: 601ok:
481 return security_inode_permission(inode, MAY_EXEC); 602 return security_inode_exec_permission(inode, flags);
482} 603}
483 604
484static __always_inline void set_root(struct nameidata *nd) 605static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +610,24 @@ static __always_inline void set_root(struct nameidata *nd)
489 610
490static int link_path_walk(const char *, struct nameidata *); 611static int link_path_walk(const char *, struct nameidata *);
491 612
613static __always_inline void set_root_rcu(struct nameidata *nd)
614{
615 if (!nd->root.mnt) {
616 struct fs_struct *fs = current->fs;
617 unsigned seq;
618
619 do {
620 seq = read_seqcount_begin(&fs->seq);
621 nd->root = fs->root;
622 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
623 } while (read_seqcount_retry(&fs->seq, seq));
624 }
625}
626
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 627static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 628{
629 int ret;
630
494 if (IS_ERR(link)) 631 if (IS_ERR(link))
495 goto fail; 632 goto fail;
496 633
@@ -499,9 +636,12 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
499 path_put(&nd->path); 636 path_put(&nd->path);
500 nd->path = nd->root; 637 nd->path = nd->root;
501 path_get(&nd->root); 638 path_get(&nd->root);
639 nd->flags |= LOOKUP_JUMPED;
502 } 640 }
641 nd->inode = nd->path.dentry->d_inode;
503 642
504 return link_path_walk(link, nd); 643 ret = link_path_walk(link, nd);
644 return ret;
505fail: 645fail:
506 path_put(&nd->path); 646 path_put(&nd->path);
507 return PTR_ERR(link); 647 return PTR_ERR(link);
@@ -514,30 +654,55 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
514 mntput(path->mnt); 654 mntput(path->mnt);
515} 655}
516 656
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 657static inline void path_to_nameidata(const struct path *path,
658 struct nameidata *nd)
518{ 659{
519 dput(nd->path.dentry); 660 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 661 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 662 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 663 mntput(nd->path.mnt);
523 } 664 }
665 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 666 nd->path.dentry = path->dentry;
525} 667}
526 668
669static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
670{
671 struct inode *inode = link->dentry->d_inode;
672 if (!IS_ERR(cookie) && inode->i_op->put_link)
673 inode->i_op->put_link(link->dentry, nd, cookie);
674 path_put(link);
675}
676
527static __always_inline int 677static __always_inline int
528__do_follow_link(struct path *path, struct nameidata *nd, void **p) 678follow_link(struct path *link, struct nameidata *nd, void **p)
529{ 679{
530 int error; 680 int error;
531 struct dentry *dentry = path->dentry; 681 struct dentry *dentry = link->dentry;
682
683 BUG_ON(nd->flags & LOOKUP_RCU);
684
685 if (link->mnt == nd->path.mnt)
686 mntget(link->mnt);
687
688 if (unlikely(current->total_link_count >= 40)) {
689 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
690 path_put(&nd->path);
691 return -ELOOP;
692 }
693 cond_resched();
694 current->total_link_count++;
532 695
533 touch_atime(path->mnt, dentry); 696 touch_atime(link->mnt, dentry);
534 nd_set_link(nd, NULL); 697 nd_set_link(nd, NULL);
535 698
536 if (path->mnt != nd->path.mnt) { 699 error = security_inode_follow_link(link->dentry, nd);
537 path_to_nameidata(path, nd); 700 if (error) {
538 dget(dentry); 701 *p = ERR_PTR(error); /* no ->put_link(), please */
702 path_put(&nd->path);
703 return error;
539 } 704 }
540 mntget(path->mnt); 705
541 nd->last_type = LAST_BIND; 706 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 707 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 708 error = PTR_ERR(*p);
@@ -547,48 +712,30 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
547 if (s) 712 if (s)
548 error = __vfs_follow_link(nd, s); 713 error = __vfs_follow_link(nd, s);
549 else if (nd->last_type == LAST_BIND) { 714 else if (nd->last_type == LAST_BIND) {
550 error = force_reval_path(&nd->path, nd); 715 nd->flags |= LOOKUP_JUMPED;
551 if (error) 716 nd->inode = nd->path.dentry->d_inode;
717 if (nd->inode->i_op->follow_link) {
718 /* stepped on a _really_ weird one */
552 path_put(&nd->path); 719 path_put(&nd->path);
720 error = -ELOOP;
721 }
553 } 722 }
554 } 723 }
555 return error; 724 return error;
556} 725}
557 726
558/* 727static int follow_up_rcu(struct path *path)
559 * This limits recursive symlink follows to 8, while
560 * limiting consecutive symlinks to 40.
561 *
562 * Without that kind of total limit, nasty chains of consecutive
563 * symlinks can cause almost arbitrarily long lookups.
564 */
565static inline int do_follow_link(struct path *path, struct nameidata *nd)
566{ 728{
567 void *cookie; 729 struct vfsmount *parent;
568 int err = -ELOOP; 730 struct dentry *mountpoint;
569 if (current->link_count >= MAX_NESTED_LINKS) 731
570 goto loop; 732 parent = path->mnt->mnt_parent;
571 if (current->total_link_count >= 40) 733 if (parent == path->mnt)
572 goto loop; 734 return 0;
573 BUG_ON(nd->depth >= MAX_NESTED_LINKS); 735 mountpoint = path->mnt->mnt_mountpoint;
574 cond_resched(); 736 path->dentry = mountpoint;
575 err = security_inode_follow_link(path->dentry, nd); 737 path->mnt = parent;
576 if (err) 738 return 1;
577 goto loop;
578 current->link_count++;
579 current->total_link_count++;
580 nd->depth++;
581 err = __do_follow_link(path, nd, &cookie);
582 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
583 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
584 path_put(path);
585 current->link_count--;
586 nd->depth--;
587 return err;
588loop:
589 path_put_conditional(path, nd);
590 path_put(&nd->path);
591 return err;
592} 739}
593 740
594int follow_up(struct path *path) 741int follow_up(struct path *path)
@@ -612,58 +759,328 @@ int follow_up(struct path *path)
612 return 1; 759 return 1;
613} 760}
614 761
615/* no need for dcache_lock, as serialization is taken care in 762/*
616 * namespace.c 763 * Perform an automount
764 * - return -EISDIR to tell follow_managed() to stop and return the path we
765 * were called with.
617 */ 766 */
618static int __follow_mount(struct path *path) 767static int follow_automount(struct path *path, unsigned flags,
768 bool *need_mntput)
619{ 769{
620 int res = 0; 770 struct vfsmount *mnt;
621 while (d_mountpoint(path->dentry)) { 771 int err;
622 struct vfsmount *mounted = lookup_mnt(path); 772
623 if (!mounted) 773 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
624 break; 774 return -EREMOTE;
775
776 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
777 * and this is the terminal part of the path.
778 */
779 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
780 return -EISDIR; /* we actually want to stop here */
781
782 /* We want to mount if someone is trying to open/create a file of any
783 * type under the mountpoint, wants to traverse through the mountpoint
784 * or wants to open the mounted directory.
785 *
786 * We don't want to mount if someone's just doing a stat and they've
787 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
788 * appended a '/' to the name.
789 */
790 if (!(flags & LOOKUP_FOLLOW) &&
791 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
792 LOOKUP_OPEN | LOOKUP_CREATE)))
793 return -EISDIR;
794
795 current->total_link_count++;
796 if (current->total_link_count >= 40)
797 return -ELOOP;
798
799 mnt = path->dentry->d_op->d_automount(path);
800 if (IS_ERR(mnt)) {
801 /*
802 * The filesystem is allowed to return -EISDIR here to indicate
803 * it doesn't want to automount. For instance, autofs would do
804 * this so that its userspace daemon can mount on this dentry.
805 *
806 * However, we can only permit this if it's a terminal point in
807 * the path being looked up; if it wasn't then the remainder of
808 * the path is inaccessible and we should say so.
809 */
810 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
811 return -EREMOTE;
812 return PTR_ERR(mnt);
813 }
814
815 if (!mnt) /* mount collision */
816 return 0;
817
818 if (!*need_mntput) {
819 /* lock_mount() may release path->mnt on error */
820 mntget(path->mnt);
821 *need_mntput = true;
822 }
823 err = finish_automount(mnt, path);
824
825 switch (err) {
826 case -EBUSY:
827 /* Someone else made a mount here whilst we were busy */
828 return 0;
829 case 0:
830 path_put(path);
831 path->mnt = mnt;
832 path->dentry = dget(mnt->mnt_root);
833 return 0;
834 default:
835 return err;
836 }
837
838}
839
840/*
841 * Handle a dentry that is managed in some way.
842 * - Flagged for transit management (autofs)
843 * - Flagged as mountpoint
844 * - Flagged as automount point
845 *
846 * This may only be called in refwalk mode.
847 *
848 * Serialization is taken care of in namespace.c
849 */
850static int follow_managed(struct path *path, unsigned flags)
851{
852 struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
853 unsigned managed;
854 bool need_mntput = false;
855 int ret = 0;
856
857 /* Given that we're not holding a lock here, we retain the value in a
858 * local variable for each dentry as we look at it so that we don't see
859 * the components of that value change under us */
860 while (managed = ACCESS_ONCE(path->dentry->d_flags),
861 managed &= DCACHE_MANAGED_DENTRY,
862 unlikely(managed != 0)) {
863 /* Allow the filesystem to manage the transit without i_mutex
864 * being held. */
865 if (managed & DCACHE_MANAGE_TRANSIT) {
866 BUG_ON(!path->dentry->d_op);
867 BUG_ON(!path->dentry->d_op->d_manage);
868 ret = path->dentry->d_op->d_manage(path->dentry, false);
869 if (ret < 0)
870 break;
871 }
872
873 /* Transit to a mounted filesystem. */
874 if (managed & DCACHE_MOUNTED) {
875 struct vfsmount *mounted = lookup_mnt(path);
876 if (mounted) {
877 dput(path->dentry);
878 if (need_mntput)
879 mntput(path->mnt);
880 path->mnt = mounted;
881 path->dentry = dget(mounted->mnt_root);
882 need_mntput = true;
883 continue;
884 }
885
886 /* Something is mounted on this dentry in another
887 * namespace and/or whatever was mounted there in this
888 * namespace got unmounted before we managed to get the
889 * vfsmount_lock */
890 }
891
892 /* Handle an automount point */
893 if (managed & DCACHE_NEED_AUTOMOUNT) {
894 ret = follow_automount(path, flags, &need_mntput);
895 if (ret < 0)
896 break;
897 continue;
898 }
899
900 /* We didn't change the current path point */
901 break;
902 }
903
904 if (need_mntput && path->mnt == mnt)
905 mntput(path->mnt);
906 if (ret == -EISDIR)
907 ret = 0;
908 return ret;
909}
910
911int follow_down_one(struct path *path)
912{
913 struct vfsmount *mounted;
914
915 mounted = lookup_mnt(path);
916 if (mounted) {
625 dput(path->dentry); 917 dput(path->dentry);
626 if (res) 918 mntput(path->mnt);
627 mntput(path->mnt);
628 path->mnt = mounted; 919 path->mnt = mounted;
629 path->dentry = dget(mounted->mnt_root); 920 path->dentry = dget(mounted->mnt_root);
630 res = 1; 921 return 1;
631 } 922 }
632 return res; 923 return 0;
633} 924}
634 925
635static void follow_mount(struct path *path) 926static inline bool managed_dentry_might_block(struct dentry *dentry)
636{ 927{
637 while (d_mountpoint(path->dentry)) { 928 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
638 struct vfsmount *mounted = lookup_mnt(path); 929 dentry->d_op->d_manage(dentry, true) < 0);
930}
931
932/*
933 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
934 * we meet a managed dentry that would need blocking.
935 */
936static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
937 struct inode **inode)
938{
939 for (;;) {
940 struct vfsmount *mounted;
941 /*
942 * Don't forget we might have a non-mountpoint managed dentry
943 * that wants to block transit.
944 */
945 if (unlikely(managed_dentry_might_block(path->dentry)))
946 return false;
947
948 if (!d_mountpoint(path->dentry))
949 break;
950
951 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
639 if (!mounted) 952 if (!mounted)
640 break; 953 break;
641 dput(path->dentry);
642 mntput(path->mnt);
643 path->mnt = mounted; 954 path->mnt = mounted;
644 path->dentry = dget(mounted->mnt_root); 955 path->dentry = mounted->mnt_root;
956 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
957 /*
958 * Update the inode too. We don't need to re-check the
959 * dentry sequence number here after this d_inode read,
960 * because a mount-point is always pinned.
961 */
962 *inode = path->dentry->d_inode;
963 }
964 return true;
965}
966
967static void follow_mount_rcu(struct nameidata *nd)
968{
969 while (d_mountpoint(nd->path.dentry)) {
970 struct vfsmount *mounted;
971 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
972 if (!mounted)
973 break;
974 nd->path.mnt = mounted;
975 nd->path.dentry = mounted->mnt_root;
976 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
645 } 977 }
646} 978}
647 979
648/* no need for dcache_lock, as serialization is taken care in 980static int follow_dotdot_rcu(struct nameidata *nd)
649 * namespace.c 981{
982 set_root_rcu(nd);
983
984 while (1) {
985 if (nd->path.dentry == nd->root.dentry &&
986 nd->path.mnt == nd->root.mnt) {
987 break;
988 }
989 if (nd->path.dentry != nd->path.mnt->mnt_root) {
990 struct dentry *old = nd->path.dentry;
991 struct dentry *parent = old->d_parent;
992 unsigned seq;
993
994 seq = read_seqcount_begin(&parent->d_seq);
995 if (read_seqcount_retry(&old->d_seq, nd->seq))
996 goto failed;
997 nd->path.dentry = parent;
998 nd->seq = seq;
999 break;
1000 }
1001 if (!follow_up_rcu(&nd->path))
1002 break;
1003 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1004 }
1005 follow_mount_rcu(nd);
1006 nd->inode = nd->path.dentry->d_inode;
1007 return 0;
1008
1009failed:
1010 nd->flags &= ~LOOKUP_RCU;
1011 if (!(nd->flags & LOOKUP_ROOT))
1012 nd->root.mnt = NULL;
1013 rcu_read_unlock();
1014 br_read_unlock(vfsmount_lock);
1015 return -ECHILD;
1016}
1017
1018/*
1019 * Follow down to the covering mount currently visible to userspace. At each
1020 * point, the filesystem owning that dentry may be queried as to whether the
1021 * caller is permitted to proceed or not.
650 */ 1022 */
651int follow_down(struct path *path) 1023int follow_down(struct path *path)
652{ 1024{
653 struct vfsmount *mounted; 1025 unsigned managed;
1026 int ret;
654 1027
655 mounted = lookup_mnt(path); 1028 while (managed = ACCESS_ONCE(path->dentry->d_flags),
656 if (mounted) { 1029 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1030 /* Allow the filesystem to manage the transit without i_mutex
1031 * being held.
1032 *
1033 * We indicate to the filesystem if someone is trying to mount
1034 * something here. This gives autofs the chance to deny anyone
1035 * other than its daemon the right to mount on its
1036 * superstructure.
1037 *
1038 * The filesystem may sleep at this point.
1039 */
1040 if (managed & DCACHE_MANAGE_TRANSIT) {
1041 BUG_ON(!path->dentry->d_op);
1042 BUG_ON(!path->dentry->d_op->d_manage);
1043 ret = path->dentry->d_op->d_manage(
1044 path->dentry, false);
1045 if (ret < 0)
1046 return ret == -EISDIR ? 0 : ret;
1047 }
1048
1049 /* Transit to a mounted filesystem. */
1050 if (managed & DCACHE_MOUNTED) {
1051 struct vfsmount *mounted = lookup_mnt(path);
1052 if (!mounted)
1053 break;
1054 dput(path->dentry);
1055 mntput(path->mnt);
1056 path->mnt = mounted;
1057 path->dentry = dget(mounted->mnt_root);
1058 continue;
1059 }
1060
1061 /* Don't handle automount points here */
1062 break;
1063 }
1064 return 0;
1065}
1066
1067/*
1068 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1069 */
1070static void follow_mount(struct path *path)
1071{
1072 while (d_mountpoint(path->dentry)) {
1073 struct vfsmount *mounted = lookup_mnt(path);
1074 if (!mounted)
1075 break;
657 dput(path->dentry); 1076 dput(path->dentry);
658 mntput(path->mnt); 1077 mntput(path->mnt);
659 path->mnt = mounted; 1078 path->mnt = mounted;
660 path->dentry = dget(mounted->mnt_root); 1079 path->dentry = dget(mounted->mnt_root);
661 return 1;
662 } 1080 }
663 return 0;
664} 1081}
665 1082
666static __always_inline void follow_dotdot(struct nameidata *nd) 1083static void follow_dotdot(struct nameidata *nd)
667{ 1084{
668 set_root(nd); 1085 set_root(nd);
669 1086
@@ -684,6 +1101,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
684 break; 1101 break;
685 } 1102 }
686 follow_mount(&nd->path); 1103 follow_mount(&nd->path);
1104 nd->inode = nd->path.dentry->d_inode;
687} 1105}
688 1106
689/* 1107/*
@@ -721,89 +1139,207 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
721 * It _is_ time-critical. 1139 * It _is_ time-critical.
722 */ 1140 */
723static int do_lookup(struct nameidata *nd, struct qstr *name, 1141static int do_lookup(struct nameidata *nd, struct qstr *name,
724 struct path *path) 1142 struct path *path, struct inode **inode)
725{ 1143{
726 struct vfsmount *mnt = nd->path.mnt; 1144 struct vfsmount *mnt = nd->path.mnt;
727 struct dentry *dentry, *parent; 1145 struct dentry *dentry, *parent = nd->path.dentry;
728 struct inode *dir; 1146 int need_reval = 1;
729 /* 1147 int status = 1;
730 * See if the low-level filesystem might want 1148 int err;
731 * to use its own hash..
732 */
733 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
734 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
735 if (err < 0)
736 return err;
737 }
738 1149
739 /* 1150 /*
740 * Rename seqlock is not required here because in the off chance 1151 * Rename seqlock is not required here because in the off chance
741 * of a false negative due to a concurrent rename, we're going to 1152 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below. 1153 * do the non-racy lookup, below.
743 */ 1154 */
744 dentry = __d_lookup(nd->path.dentry, name); 1155 if (nd->flags & LOOKUP_RCU) {
745 if (!dentry) 1156 unsigned seq;
746 goto need_lookup; 1157 *inode = nd->inode;
747found: 1158 dentry = __d_lookup_rcu(parent, name, &seq, inode);
748 if (dentry->d_op && dentry->d_op->d_revalidate) 1159 if (!dentry)
749 goto need_revalidate; 1160 goto unlazy;
750done: 1161
1162 /* Memory barrier in read_seqcount_begin of child is enough */
1163 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1164 return -ECHILD;
1165 nd->seq = seq;
1166
1167 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1168 status = d_revalidate(dentry, nd);
1169 if (unlikely(status <= 0)) {
1170 if (status != -ECHILD)
1171 need_reval = 0;
1172 goto unlazy;
1173 }
1174 }
1175 path->mnt = mnt;
1176 path->dentry = dentry;
1177 if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1178 goto unlazy;
1179 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1180 goto unlazy;
1181 return 0;
1182unlazy:
1183 if (unlazy_walk(nd, dentry))
1184 return -ECHILD;
1185 } else {
1186 dentry = __d_lookup(parent, name);
1187 }
1188
1189retry:
1190 if (unlikely(!dentry)) {
1191 struct inode *dir = parent->d_inode;
1192 BUG_ON(nd->inode != dir);
1193
1194 mutex_lock(&dir->i_mutex);
1195 dentry = d_lookup(parent, name);
1196 if (likely(!dentry)) {
1197 dentry = d_alloc_and_lookup(parent, name, nd);
1198 if (IS_ERR(dentry)) {
1199 mutex_unlock(&dir->i_mutex);
1200 return PTR_ERR(dentry);
1201 }
1202 /* known good */
1203 need_reval = 0;
1204 status = 1;
1205 }
1206 mutex_unlock(&dir->i_mutex);
1207 }
1208 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1209 status = d_revalidate(dentry, nd);
1210 if (unlikely(status <= 0)) {
1211 if (status < 0) {
1212 dput(dentry);
1213 return status;
1214 }
1215 if (!d_invalidate(dentry)) {
1216 dput(dentry);
1217 dentry = NULL;
1218 need_reval = 1;
1219 goto retry;
1220 }
1221 }
1222
751 path->mnt = mnt; 1223 path->mnt = mnt;
752 path->dentry = dentry; 1224 path->dentry = dentry;
753 __follow_mount(path); 1225 err = follow_managed(path, nd->flags);
1226 if (unlikely(err < 0)) {
1227 path_put_conditional(path, nd);
1228 return err;
1229 }
1230 *inode = path->dentry->d_inode;
754 return 0; 1231 return 0;
1232}
755 1233
756need_lookup: 1234static inline int may_lookup(struct nameidata *nd)
757 parent = nd->path.dentry; 1235{
758 dir = parent->d_inode; 1236 if (nd->flags & LOOKUP_RCU) {
1237 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1238 if (err != -ECHILD)
1239 return err;
1240 if (unlazy_walk(nd, NULL))
1241 return -ECHILD;
1242 }
1243 return exec_permission(nd->inode, 0);
1244}
759 1245
760 mutex_lock(&dir->i_mutex); 1246static inline int handle_dots(struct nameidata *nd, int type)
761 /* 1247{
762 * First re-do the cached lookup just in case it was created 1248 if (type == LAST_DOTDOT) {
763 * while we waited for the directory semaphore, or the first 1249 if (nd->flags & LOOKUP_RCU) {
764 * lookup failed due to an unrelated rename. 1250 if (follow_dotdot_rcu(nd))
765 * 1251 return -ECHILD;
766 * This could use version numbering or similar to avoid unnecessary 1252 } else
767 * cache lookups, but then we'd have to do the first lookup in the 1253 follow_dotdot(nd);
768 * non-racy way. However in the common case here, everything should
769 * be hot in cache, so would it be a big win?
770 */
771 dentry = d_lookup(parent, name);
772 if (likely(!dentry)) {
773 dentry = d_alloc_and_lookup(parent, name, nd);
774 mutex_unlock(&dir->i_mutex);
775 if (IS_ERR(dentry))
776 goto fail;
777 goto done;
778 } 1254 }
779 /* 1255 return 0;
780 * Uhhuh! Nasty case: the cache was re-populated while 1256}
781 * we waited on the semaphore. Need to revalidate.
782 */
783 mutex_unlock(&dir->i_mutex);
784 goto found;
785 1257
786need_revalidate: 1258static void terminate_walk(struct nameidata *nd)
787 dentry = do_revalidate(dentry, nd); 1259{
788 if (!dentry) 1260 if (!(nd->flags & LOOKUP_RCU)) {
789 goto need_lookup; 1261 path_put(&nd->path);
790 if (IS_ERR(dentry)) 1262 } else {
791 goto fail; 1263 nd->flags &= ~LOOKUP_RCU;
792 goto done; 1264 if (!(nd->flags & LOOKUP_ROOT))
1265 nd->root.mnt = NULL;
1266 rcu_read_unlock();
1267 br_read_unlock(vfsmount_lock);
1268 }
1269}
793 1270
794fail: 1271static inline int walk_component(struct nameidata *nd, struct path *path,
795 return PTR_ERR(dentry); 1272 struct qstr *name, int type, int follow)
1273{
1274 struct inode *inode;
1275 int err;
1276 /*
1277 * "." and ".." are special - ".." especially so because it has
1278 * to be able to know about the current root directory and
1279 * parent relationships.
1280 */
1281 if (unlikely(type != LAST_NORM))
1282 return handle_dots(nd, type);
1283 err = do_lookup(nd, name, path, &inode);
1284 if (unlikely(err)) {
1285 terminate_walk(nd);
1286 return err;
1287 }
1288 if (!inode) {
1289 path_to_nameidata(path, nd);
1290 terminate_walk(nd);
1291 return -ENOENT;
1292 }
1293 if (unlikely(inode->i_op->follow_link) && follow) {
1294 if (nd->flags & LOOKUP_RCU) {
1295 if (unlikely(unlazy_walk(nd, path->dentry))) {
1296 terminate_walk(nd);
1297 return -ECHILD;
1298 }
1299 }
1300 BUG_ON(inode != path->dentry->d_inode);
1301 return 1;
1302 }
1303 path_to_nameidata(path, nd);
1304 nd->inode = inode;
1305 return 0;
796} 1306}
797 1307
798/* 1308/*
799 * This is a temporary kludge to deal with "automount" symlinks; proper 1309 * This limits recursive symlink follows to 8, while
800 * solution is to trigger them on follow_mount(), so that do_lookup() 1310 * limiting consecutive symlinks to 40.
801 * would DTRT. To be killed before 2.6.34-final. 1311 *
1312 * Without that kind of total limit, nasty chains of consecutive
1313 * symlinks can cause almost arbitrarily long lookups.
802 */ 1314 */
803static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) 1315static inline int nested_symlink(struct path *path, struct nameidata *nd)
804{ 1316{
805 return inode && unlikely(inode->i_op->follow_link) && 1317 int res;
806 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); 1318
1319 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1320 path_put_conditional(path, nd);
1321 path_put(&nd->path);
1322 return -ELOOP;
1323 }
1324 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1325
1326 nd->depth++;
1327 current->link_count++;
1328
1329 do {
1330 struct path link = *path;
1331 void *cookie;
1332
1333 res = follow_link(&link, nd, &cookie);
1334 if (!res)
1335 res = walk_component(nd, path, &nd->last,
1336 nd->last_type, LOOKUP_FOLLOW);
1337 put_link(nd, &link, cookie);
1338 } while (res > 0);
1339
1340 current->link_count--;
1341 nd->depth--;
1342 return res;
807} 1343}
808 1344
809/* 1345/*
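
nested_symlink() above enforces the two limits the comment describes: at most MAX_NESTED_LINKS (8) levels of link-inside-link recursion, and at most 40 symlinks in total for one lookup. A self-contained sketch of that double budget, with invented names:

#include <errno.h>

#define MAX_NESTED  8    /* recursion depth cap, like MAX_NESTED_LINKS */
#define MAX_TOTAL   40   /* total symlinks allowed per lookup */

/* Hypothetical walk state: how deep we are in nested links and how many
 * links this whole lookup has followed so far. */
struct walk_budget {
    int depth;
    int total;
};

static int enter_symlink(struct walk_budget *b)
{
    if (b->depth >= MAX_NESTED || b->total >= MAX_TOTAL)
        return -ELOOP;     /* either budget exhausted: give up */
    b->depth++;
    b->total++;            /* the total is never given back */
    return 0;
}

static void leave_symlink(struct walk_budget *b)
{
    b->depth--;            /* only the nesting depth is restored */
}
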
@@ -817,27 +1353,24 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
817static int link_path_walk(const char *name, struct nameidata *nd) 1353static int link_path_walk(const char *name, struct nameidata *nd)
818{ 1354{
819 struct path next; 1355 struct path next;
820 struct inode *inode;
821 int err; 1356 int err;
822 unsigned int lookup_flags = nd->flags; 1357 unsigned int lookup_flags = nd->flags;
823 1358
824 while (*name=='/') 1359 while (*name=='/')
825 name++; 1360 name++;
826 if (!*name) 1361 if (!*name)
827 goto return_reval; 1362 return 0;
828
829 inode = nd->path.dentry->d_inode;
830 if (nd->depth)
831 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832 1363
833 /* At this point we know we have a real path component. */ 1364 /* At this point we know we have a real path component. */
834 for(;;) { 1365 for(;;) {
835 unsigned long hash; 1366 unsigned long hash;
836 struct qstr this; 1367 struct qstr this;
837 unsigned int c; 1368 unsigned int c;
1369 int type;
838 1370
839 nd->flags |= LOOKUP_CONTINUE; 1371 nd->flags |= LOOKUP_CONTINUE;
840 err = exec_permission(inode); 1372
1373 err = may_lookup(nd);
841 if (err) 1374 if (err)
842 break; 1375 break;
843 1376
@@ -853,195 +1386,154 @@ static int link_path_walk(const char *name, struct nameidata *nd)
853 this.len = name - (const char *) this.name; 1386 this.len = name - (const char *) this.name;
854 this.hash = end_name_hash(hash); 1387 this.hash = end_name_hash(hash);
855 1388
1389 type = LAST_NORM;
1390 if (this.name[0] == '.') switch (this.len) {
1391 case 2:
1392 if (this.name[1] == '.') {
1393 type = LAST_DOTDOT;
1394 nd->flags |= LOOKUP_JUMPED;
1395 }
1396 break;
1397 case 1:
1398 type = LAST_DOT;
1399 }
1400 if (likely(type == LAST_NORM)) {
1401 struct dentry *parent = nd->path.dentry;
1402 nd->flags &= ~LOOKUP_JUMPED;
1403 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1404 err = parent->d_op->d_hash(parent, nd->inode,
1405 &this);
1406 if (err < 0)
1407 break;
1408 }
1409 }
1410
856 /* remove trailing slashes? */ 1411 /* remove trailing slashes? */
857 if (!c) 1412 if (!c)
858 goto last_component; 1413 goto last_component;
859 while (*++name == '/'); 1414 while (*++name == '/');
860 if (!*name) 1415 if (!*name)
861 goto last_with_slashes; 1416 goto last_component;
862
863 /*
864 * "." and ".." are special - ".." especially so because it has
865 * to be able to know about the current root directory and
866 * parent relationships.
867 */
868 if (this.name[0] == '.') switch (this.len) {
869 default:
870 break;
871 case 2:
872 if (this.name[1] != '.')
873 break;
874 follow_dotdot(nd);
875 inode = nd->path.dentry->d_inode;
876 /* fallthrough */
877 case 1:
878 continue;
879 }
880 /* This does the actual lookups.. */
881 err = do_lookup(nd, &this, &next);
882 if (err)
883 break;
884 1417
885 err = -ENOENT; 1418 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
886 inode = next.dentry->d_inode; 1419 if (err < 0)
887 if (!inode) 1420 return err;
888 goto out_dput;
889 1421
890 if (inode->i_op->follow_link) { 1422 if (err) {
891 err = do_follow_link(&next, nd); 1423 err = nested_symlink(&next, nd);
892 if (err) 1424 if (err)
893 goto return_err; 1425 return err;
894 err = -ENOENT; 1426 }
895 inode = nd->path.dentry->d_inode;
896 if (!inode)
897 break;
898 } else
899 path_to_nameidata(&next, nd);
900 err = -ENOTDIR; 1427 err = -ENOTDIR;
901 if (!inode->i_op->lookup) 1428 if (!nd->inode->i_op->lookup)
902 break; 1429 break;
903 continue; 1430 continue;
904 /* here ends the main loop */ 1431 /* here ends the main loop */
905 1432
906last_with_slashes:
907 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
908last_component: 1433last_component:
909 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1434 /* Clear LOOKUP_CONTINUE iff it was previously unset */
910 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1435 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
911 if (lookup_flags & LOOKUP_PARENT)
912 goto lookup_parent;
913 if (this.name[0] == '.') switch (this.len) {
914 default:
915 break;
916 case 2:
917 if (this.name[1] != '.')
918 break;
919 follow_dotdot(nd);
920 inode = nd->path.dentry->d_inode;
921 /* fallthrough */
922 case 1:
923 goto return_reval;
924 }
925 err = do_lookup(nd, &this, &next);
926 if (err)
927 break;
928 inode = next.dentry->d_inode;
929 if (follow_on_final(inode, lookup_flags)) {
930 err = do_follow_link(&next, nd);
931 if (err)
932 goto return_err;
933 inode = nd->path.dentry->d_inode;
934 } else
935 path_to_nameidata(&next, nd);
936 err = -ENOENT;
937 if (!inode)
938 break;
939 if (lookup_flags & LOOKUP_DIRECTORY) {
940 err = -ENOTDIR;
941 if (!inode->i_op->lookup)
942 break;
943 }
944 goto return_base;
945lookup_parent:
946 nd->last = this; 1436 nd->last = this;
947 nd->last_type = LAST_NORM; 1437 nd->last_type = type;
948 if (this.name[0] != '.')
949 goto return_base;
950 if (this.len == 1)
951 nd->last_type = LAST_DOT;
952 else if (this.len == 2 && this.name[1] == '.')
953 nd->last_type = LAST_DOTDOT;
954 else
955 goto return_base;
956return_reval:
957 /*
958 * We bypassed the ordinary revalidation routines.
959 * We may need to check the cached dentry for staleness.
960 */
961 if (nd->path.dentry && nd->path.dentry->d_sb &&
962 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963 err = -ESTALE;
964 /* Note: we do not d_invalidate() */
965 if (!nd->path.dentry->d_op->d_revalidate(
966 nd->path.dentry, nd))
967 break;
968 }
969return_base:
970 return 0; 1438 return 0;
971out_dput:
972 path_put_conditional(&next, nd);
973 break;
974 } 1439 }
975 path_put(&nd->path); 1440 terminate_walk(nd);
976return_err:
977 return err; 1441 return err;
978} 1442}
979 1443
980static int path_walk(const char *name, struct nameidata *nd) 1444static int path_init(int dfd, const char *name, unsigned int flags,
981{ 1445 struct nameidata *nd, struct file **fp)
982 struct path save = nd->path;
983 int result;
984
985 current->total_link_count = 0;
986
987 /* make sure the stuff we saved doesn't go away */
988 path_get(&save);
989
990 result = link_path_walk(name, nd);
991 if (result == -ESTALE) {
992 /* nd->path had been dropped */
993 current->total_link_count = 0;
994 nd->path = save;
995 path_get(&nd->path);
996 nd->flags |= LOOKUP_REVAL;
997 result = link_path_walk(name, nd);
998 }
999
1000 path_put(&save);
1001
1002 return result;
1003}
1004
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{ 1446{
1007 int retval = 0; 1447 int retval = 0;
1008 int fput_needed; 1448 int fput_needed;
1009 struct file *file; 1449 struct file *file;
1010 1450
1011 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1451 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1012 nd->flags = flags; 1452 nd->flags = flags | LOOKUP_JUMPED;
1013 nd->depth = 0; 1453 nd->depth = 0;
1454 if (flags & LOOKUP_ROOT) {
1455 struct inode *inode = nd->root.dentry->d_inode;
1456 if (*name) {
1457 if (!inode->i_op->lookup)
1458 return -ENOTDIR;
1459 retval = inode_permission(inode, MAY_EXEC);
1460 if (retval)
1461 return retval;
1462 }
1463 nd->path = nd->root;
1464 nd->inode = inode;
1465 if (flags & LOOKUP_RCU) {
1466 br_read_lock(vfsmount_lock);
1467 rcu_read_lock();
1468 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1469 } else {
1470 path_get(&nd->path);
1471 }
1472 return 0;
1473 }
1474
1014 nd->root.mnt = NULL; 1475 nd->root.mnt = NULL;
1015 1476
1016 if (*name=='/') { 1477 if (*name=='/') {
1017 set_root(nd); 1478 if (flags & LOOKUP_RCU) {
1479 br_read_lock(vfsmount_lock);
1480 rcu_read_lock();
1481 set_root_rcu(nd);
1482 } else {
1483 set_root(nd);
1484 path_get(&nd->root);
1485 }
1018 nd->path = nd->root; 1486 nd->path = nd->root;
1019 path_get(&nd->root);
1020 } else if (dfd == AT_FDCWD) { 1487 } else if (dfd == AT_FDCWD) {
1021 get_fs_pwd(current->fs, &nd->path); 1488 if (flags & LOOKUP_RCU) {
1489 struct fs_struct *fs = current->fs;
1490 unsigned seq;
1491
1492 br_read_lock(vfsmount_lock);
1493 rcu_read_lock();
1494
1495 do {
1496 seq = read_seqcount_begin(&fs->seq);
1497 nd->path = fs->pwd;
1498 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1499 } while (read_seqcount_retry(&fs->seq, seq));
1500 } else {
1501 get_fs_pwd(current->fs, &nd->path);
1502 }
1022 } else { 1503 } else {
1023 struct dentry *dentry; 1504 struct dentry *dentry;
1024 1505
1025 file = fget_light(dfd, &fput_needed); 1506 file = fget_raw_light(dfd, &fput_needed);
1026 retval = -EBADF; 1507 retval = -EBADF;
1027 if (!file) 1508 if (!file)
1028 goto out_fail; 1509 goto out_fail;
1029 1510
1030 dentry = file->f_path.dentry; 1511 dentry = file->f_path.dentry;
1031 1512
1032 retval = -ENOTDIR; 1513 if (*name) {
1033 if (!S_ISDIR(dentry->d_inode->i_mode)) 1514 retval = -ENOTDIR;
1034 goto fput_fail; 1515 if (!S_ISDIR(dentry->d_inode->i_mode))
1516 goto fput_fail;
1035 1517
1036 retval = file_permission(file, MAY_EXEC); 1518 retval = file_permission(file, MAY_EXEC);
1037 if (retval) 1519 if (retval)
1038 goto fput_fail; 1520 goto fput_fail;
1521 }
1039 1522
1040 nd->path = file->f_path; 1523 nd->path = file->f_path;
1041 path_get(&file->f_path); 1524 if (flags & LOOKUP_RCU) {
1042 1525 if (fput_needed)
1043 fput_light(file, fput_needed); 1526 *fp = file;
1527 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1528 br_read_lock(vfsmount_lock);
1529 rcu_read_lock();
1530 } else {
1531 path_get(&file->f_path);
1532 fput_light(file, fput_needed);
1533 }
1044 } 1534 }
1535
1536 nd->inode = nd->path.dentry->d_inode;
1045 return 0; 1537 return 0;
1046 1538
1047fput_fail: 1539fput_fail:
@@ -1050,27 +1542,100 @@ out_fail:
1050 return retval; 1542 return retval;
1051} 1543}
1052 1544
1545static inline int lookup_last(struct nameidata *nd, struct path *path)
1546{
1547 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1548 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1549
1550 nd->flags &= ~LOOKUP_PARENT;
1551 return walk_component(nd, path, &nd->last, nd->last_type,
1552 nd->flags & LOOKUP_FOLLOW);
1553}
1554
1053/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1555/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1054static int do_path_lookup(int dfd, const char *name, 1556static int path_lookupat(int dfd, const char *name,
1055 unsigned int flags, struct nameidata *nd) 1557 unsigned int flags, struct nameidata *nd)
1056{ 1558{
1057 int retval = path_init(dfd, name, flags, nd); 1559 struct file *base = NULL;
1058 if (!retval) 1560 struct path path;
1059 retval = path_walk(name, nd); 1561 int err;
1060 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1562
1061 nd->path.dentry->d_inode)) 1563 /*
1062 audit_inode(name, nd->path.dentry); 1564 * Path walking is largely split up into 2 different synchronisation
1063 if (nd->root.mnt) { 1565 * schemes, rcu-walk and ref-walk (explained in
1566 * Documentation/filesystems/path-lookup.txt). These share much of the
1567 * path walk code, but some things particularly setup, cleanup, and
1568 * following mounts are sufficiently divergent that functions are
1569 * duplicated. Typically there is a function foo(), and its RCU
1570 * analogue, foo_rcu().
1571 *
1572 * -ECHILD is the error number of choice (just to avoid clashes) that
1573 * is returned if some aspect of an rcu-walk fails. Such an error must
1574 * be handled by restarting a traditional ref-walk (which will always
1575 * be able to complete).
1576 */
1577 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1578
1579 if (unlikely(err))
1580 return err;
1581
1582 current->total_link_count = 0;
1583 err = link_path_walk(name, nd);
1584
1585 if (!err && !(flags & LOOKUP_PARENT)) {
1586 err = lookup_last(nd, &path);
1587 while (err > 0) {
1588 void *cookie;
1589 struct path link = path;
1590 nd->flags |= LOOKUP_PARENT;
1591 err = follow_link(&link, nd, &cookie);
1592 if (!err)
1593 err = lookup_last(nd, &path);
1594 put_link(nd, &link, cookie);
1595 }
1596 }
1597
1598 if (!err)
1599 err = complete_walk(nd);
1600
1601 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1602 if (!nd->inode->i_op->lookup) {
1603 path_put(&nd->path);
1604 err = -ENOTDIR;
1605 }
1606 }
1607
1608 if (base)
1609 fput(base);
1610
1611 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1064 path_put(&nd->root); 1612 path_put(&nd->root);
1065 nd->root.mnt = NULL; 1613 nd->root.mnt = NULL;
1066 } 1614 }
1615 return err;
1616}
1617
1618static int do_path_lookup(int dfd, const char *name,
1619 unsigned int flags, struct nameidata *nd)
1620{
1621 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1622 if (unlikely(retval == -ECHILD))
1623 retval = path_lookupat(dfd, name, flags, nd);
1624 if (unlikely(retval == -ESTALE))
1625 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1626
1627 if (likely(!retval)) {
1628 if (unlikely(!audit_dummy_context())) {
1629 if (nd->path.dentry && nd->inode)
1630 audit_inode(name, nd->path.dentry);
1631 }
1632 }
1067 return retval; 1633 return retval;
1068} 1634}
1069 1635
1070int path_lookup(const char *name, unsigned int flags, 1636int kern_path_parent(const char *name, struct nameidata *nd)
1071 struct nameidata *nd)
1072{ 1637{
1073 return do_path_lookup(AT_FDCWD, name, flags, nd); 1638 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1074} 1639}
1075 1640
1076int kern_path(const char *name, unsigned int flags, struct path *path) 1641int kern_path(const char *name, unsigned int flags, struct path *path)
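
do_path_lookup() above retries the same lookup up to three times: first in rcu-walk (LOOKUP_RCU), then in plain ref-walk if that returns -ECHILD, and once more with LOOKUP_REVAL if the ref-walk hit -ESTALE. A compact sketch of that fallback ladder; the flag names and callback are invented for illustration.

#include <errno.h>

#define TRY_RCU    0x1   /* stand-in for LOOKUP_RCU */
#define TRY_REVAL  0x2   /* stand-in for LOOKUP_REVAL */

typedef int (*lookup_fn)(const char *name, unsigned flags);

/* Mirror of the do_path_lookup() retry ladder: fast optimistic pass
 * first, a reference-counted pass if that could not complete (-ECHILD),
 * and a final forced-revalidation pass if the result looked stale. */
static int lookup_with_fallback(lookup_fn one_pass, const char *name,
                                unsigned flags)
{
    int err = one_pass(name, flags | TRY_RCU);
    if (err == -ECHILD)
        err = one_pass(name, flags);
    if (err == -ESTALE)
        err = one_pass(name, flags | TRY_REVAL);
    return err;
}
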
@@ -1094,49 +1659,22 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1094 const char *name, unsigned int flags, 1659 const char *name, unsigned int flags,
1095 struct nameidata *nd) 1660 struct nameidata *nd)
1096{ 1661{
1097 int retval; 1662 nd->root.dentry = dentry;
1098 1663 nd->root.mnt = mnt;
1099 /* same as do_path_lookup */ 1664 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1100 nd->last_type = LAST_ROOT; 1665 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1101 nd->flags = flags;
1102 nd->depth = 0;
1103
1104 nd->path.dentry = dentry;
1105 nd->path.mnt = mnt;
1106 path_get(&nd->path);
1107 nd->root = nd->path;
1108 path_get(&nd->root);
1109
1110 retval = path_walk(name, nd);
1111 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112 nd->path.dentry->d_inode))
1113 audit_inode(name, nd->path.dentry);
1114
1115 path_put(&nd->root);
1116 nd->root.mnt = NULL;
1117
1118 return retval;
1119} 1666}
1120 1667
1121static struct dentry *__lookup_hash(struct qstr *name, 1668static struct dentry *__lookup_hash(struct qstr *name,
1122 struct dentry *base, struct nameidata *nd) 1669 struct dentry *base, struct nameidata *nd)
1123{ 1670{
1671 struct inode *inode = base->d_inode;
1124 struct dentry *dentry; 1672 struct dentry *dentry;
1125 struct inode *inode;
1126 int err; 1673 int err;
1127 1674
1128 inode = base->d_inode; 1675 err = exec_permission(inode, 0);
1129 1676 if (err)
1130 /* 1677 return ERR_PTR(err);
1131 * See if the low-level filesystem might want
1132 * to use its own hash..
1133 */
1134 if (base->d_op && base->d_op->d_hash) {
1135 err = base->d_op->d_hash(base, name);
1136 dentry = ERR_PTR(err);
1137 if (err < 0)
1138 goto out;
1139 }
1140 1678
1141 /* 1679 /*
1142 * Don't bother with __d_lookup: callers are for creat as 1680 * Don't bother with __d_lookup: callers are for creat as
@@ -1145,12 +1683,12 @@ static struct dentry *__lookup_hash(struct qstr *name,
1145 */ 1683 */
1146 dentry = d_lookup(base, name); 1684 dentry = d_lookup(base, name);
1147 1685
1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1686 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
1149 dentry = do_revalidate(dentry, nd); 1687 dentry = do_revalidate(dentry, nd);
1150 1688
1151 if (!dentry) 1689 if (!dentry)
1152 dentry = d_alloc_and_lookup(base, name, nd); 1690 dentry = d_alloc_and_lookup(base, name, nd);
1153out: 1691
1154 return dentry; 1692 return dentry;
1155} 1693}
1156 1694
@@ -1161,36 +1699,9 @@ out:
1161 */ 1699 */
1162static struct dentry *lookup_hash(struct nameidata *nd) 1700static struct dentry *lookup_hash(struct nameidata *nd)
1163{ 1701{
1164 int err;
1165
1166 err = exec_permission(nd->path.dentry->d_inode);
1167 if (err)
1168 return ERR_PTR(err);
1169 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1702 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1170} 1703}
1171 1704
1172static int __lookup_one_len(const char *name, struct qstr *this,
1173 struct dentry *base, int len)
1174{
1175 unsigned long hash;
1176 unsigned int c;
1177
1178 this->name = name;
1179 this->len = len;
1180 if (!len)
1181 return -EACCES;
1182
1183 hash = init_name_hash();
1184 while (len--) {
1185 c = *(const unsigned char *)name++;
1186 if (c == '/' || c == '\0')
1187 return -EACCES;
1188 hash = partial_name_hash(c, hash);
1189 }
1190 this->hash = end_name_hash(hash);
1191 return 0;
1192}
1193
1194/** 1705/**
1195 * lookup_one_len - filesystem helper to lookup single pathname component 1706 * lookup_one_len - filesystem helper to lookup single pathname component
1196 * @name: pathname component to lookup 1707 * @name: pathname component to lookup
@@ -1204,18 +1715,35 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1204 */ 1715 */
1205struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1716struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1206{ 1717{
1207 int err;
1208 struct qstr this; 1718 struct qstr this;
1719 unsigned long hash;
1720 unsigned int c;
1209 1721
1210 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1722 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1211 1723
1212 err = __lookup_one_len(name, &this, base, len); 1724 this.name = name;
1213 if (err) 1725 this.len = len;
1214 return ERR_PTR(err); 1726 if (!len)
1727 return ERR_PTR(-EACCES);
1728
1729 hash = init_name_hash();
1730 while (len--) {
1731 c = *(const unsigned char *)name++;
1732 if (c == '/' || c == '\0')
1733 return ERR_PTR(-EACCES);
1734 hash = partial_name_hash(c, hash);
1735 }
1736 this.hash = end_name_hash(hash);
1737 /*
1738 * See if the low-level filesystem might want
1739 * to use its own hash..
1740 */
1741 if (base->d_flags & DCACHE_OP_HASH) {
1742 int err = base->d_op->d_hash(base, base->d_inode, &this);
1743 if (err < 0)
1744 return ERR_PTR(err);
1745 }
1215 1746
1216 err = exec_permission(base->d_inode);
1217 if (err)
1218 return ERR_PTR(err);
1219 return __lookup_hash(&this, base, NULL); 1747 return __lookup_hash(&this, base, NULL);
1220} 1748}
1221 1749
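A hedged sketch (not from this patch; the component name and wrapper function are illustrative) of how a filesystem might call the lookup_one_len() helper above on a directory it owns. The caller must hold the parent's i_mutex, as the WARN_ON_ONCE above enforces, and must dput() any dentry returned:

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/string.h>

static struct dentry *example_child(struct dentry *dir)
{
	struct dentry *child;

	mutex_lock(&dir->d_inode->i_mutex);
	/* one component only: empty names and embedded '/' yield -EACCES */
	child = lookup_one_len("state", dir, strlen("state"));
	mutex_unlock(&dir->d_inode->i_mutex);

	return child;	/* ERR_PTR() on failure, otherwise caller dput()s */
}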
@@ -1223,7 +1751,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1223 struct path *path) 1751 struct path *path)
1224{ 1752{
1225 struct nameidata nd; 1753 struct nameidata nd;
1226 char *tmp = getname(name); 1754 char *tmp = getname_flags(name, flags);
1227 int err = PTR_ERR(tmp); 1755 int err = PTR_ERR(tmp);
1228 if (!IS_ERR(tmp)) { 1756 if (!IS_ERR(tmp)) {
1229 1757
@@ -1265,11 +1793,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
1265 1793
1266 if (!(dir->i_mode & S_ISVTX)) 1794 if (!(dir->i_mode & S_ISVTX))
1267 return 0; 1795 return 0;
1796 if (current_user_ns() != inode_userns(inode))
1797 goto other_userns;
1268 if (inode->i_uid == fsuid) 1798 if (inode->i_uid == fsuid)
1269 return 0; 1799 return 0;
1270 if (dir->i_uid == fsuid) 1800 if (dir->i_uid == fsuid)
1271 return 0; 1801 return 0;
1272 return !capable(CAP_FOWNER); 1802
1803other_userns:
1804 return !ns_capable(inode_userns(inode), CAP_FOWNER);
1273} 1805}
1274 1806
1275/* 1807/*
@@ -1403,12 +1935,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1403 return error; 1935 return error;
1404} 1936}
1405 1937
1406int may_open(struct path *path, int acc_mode, int flag) 1938static int may_open(struct path *path, int acc_mode, int flag)
1407{ 1939{
1408 struct dentry *dentry = path->dentry; 1940 struct dentry *dentry = path->dentry;
1409 struct inode *inode = dentry->d_inode; 1941 struct inode *inode = dentry->d_inode;
1410 int error; 1942 int error;
1411 1943
1944 /* O_PATH? */
1945 if (!acc_mode)
1946 return 0;
1947
1412 if (!inode) 1948 if (!inode)
1413 return -ENOENT; 1949 return -ENOENT;
1414 1950
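The new early return in may_open() above (acc_mode == 0) corresponds to O_PATH opens, which grant no read or write access through the descriptor. A userspace sketch, with an illustrative path, of how such a descriptor is used purely as an anchor for *at() lookups on kernels that support O_PATH:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int dfd, fd;

	/* O_PATH: no permission to read or write through this descriptor */
	dfd = open("/etc", O_PATH | O_DIRECTORY);
	if (dfd < 0) {
		perror("open O_PATH");
		return 1;
	}
	/* it can still anchor relative lookups */
	fd = openat(dfd, "hostname", O_RDONLY);
	if (fd >= 0)
		close(fd);
	close(dfd);
	return 0;
}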
@@ -1445,7 +1981,7 @@ int may_open(struct path *path, int acc_mode, int flag)
1445 } 1981 }
1446 1982
1447 /* O_NOATIME can only be set by the owner or superuser */ 1983 /* O_NOATIME can only be set by the owner or superuser */
1448 if (flag & O_NOATIME && !is_owner_or_cap(inode)) 1984 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
1449 return -EPERM; 1985 return -EPERM;
1450 1986
1451 /* 1987 /*
@@ -1454,8 +1990,9 @@ int may_open(struct path *path, int acc_mode, int flag)
1454 return break_lease(inode, flag); 1990 return break_lease(inode, flag);
1455} 1991}
1456 1992
1457static int handle_truncate(struct path *path) 1993static int handle_truncate(struct file *filp)
1458{ 1994{
1995 struct path *path = &filp->f_path;
1459 struct inode *inode = path->dentry->d_inode; 1996 struct inode *inode = path->dentry->d_inode;
1460 int error = get_write_access(inode); 1997 int error = get_write_access(inode);
1461 if (error) 1998 if (error)
@@ -1469,40 +2006,13 @@ static int handle_truncate(struct path *path)
1469 if (!error) { 2006 if (!error) {
1470 error = do_truncate(path->dentry, 0, 2007 error = do_truncate(path->dentry, 0,
1471 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2008 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1472 NULL); 2009 filp);
1473 } 2010 }
1474 put_write_access(inode); 2011 put_write_access(inode);
1475 return error; 2012 return error;
1476} 2013}
1477 2014
1478/* 2015/*
1479 * Be careful about ever adding any more callers of this
1480 * function. Its flags must be in the namei format, not
1481 * what gets passed to sys_open().
1482 */
1483static int __open_namei_create(struct nameidata *nd, struct path *path,
1484 int open_flag, int mode)
1485{
1486 int error;
1487 struct dentry *dir = nd->path.dentry;
1488
1489 if (!IS_POSIXACL(dir->d_inode))
1490 mode &= ~current_umask();
1491 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1492 if (error)
1493 goto out_unlock;
1494 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1495out_unlock:
1496 mutex_unlock(&dir->d_inode->i_mutex);
1497 dput(nd->path.dentry);
1498 nd->path.dentry = path->dentry;
1499 if (error)
1500 return error;
1501 /* Don't check for write permission, don't truncate */
1502 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1503}
1504
1505/*
1506 * Note that while the flag value (low two bits) for sys_open means: 2016 * Note that while the flag value (low two bits) for sys_open means:
1507 * 00 - read-only 2017 * 00 - read-only
1508 * 01 - write-only 2018 * 01 - write-only
@@ -1526,147 +2036,107 @@ static inline int open_to_namei_flags(int flag)
1526 return flag; 2036 return flag;
1527} 2037}
1528 2038
1529static int open_will_truncate(int flag, struct inode *inode) 2039/*
1530{ 2040 * Handle the last step of open()
1531 /* 2041 */
1532 * We'll never write to the fs underlying
1533 * a device file.
1534 */
1535 if (special_file(inode->i_mode))
1536 return 0;
1537 return (flag & O_TRUNC);
1538}
1539
1540static struct file *finish_open(struct nameidata *nd,
1541 int open_flag, int acc_mode)
1542{
1543 struct file *filp;
1544 int will_truncate;
1545 int error;
1546
1547 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1548 if (will_truncate) {
1549 error = mnt_want_write(nd->path.mnt);
1550 if (error)
1551 goto exit;
1552 }
1553 error = may_open(&nd->path, acc_mode, open_flag);
1554 if (error) {
1555 if (will_truncate)
1556 mnt_drop_write(nd->path.mnt);
1557 goto exit;
1558 }
1559 filp = nameidata_to_filp(nd);
1560 if (!IS_ERR(filp)) {
1561 error = ima_file_check(filp, acc_mode);
1562 if (error) {
1563 fput(filp);
1564 filp = ERR_PTR(error);
1565 }
1566 }
1567 if (!IS_ERR(filp)) {
1568 if (will_truncate) {
1569 error = handle_truncate(&nd->path);
1570 if (error) {
1571 fput(filp);
1572 filp = ERR_PTR(error);
1573 }
1574 }
1575 }
1576 /*
1577 * It is now safe to drop the mnt write
1578 * because the filp has had a write taken
1579 * on its behalf.
1580 */
1581 if (will_truncate)
1582 mnt_drop_write(nd->path.mnt);
1583 return filp;
1584
1585exit:
1586 if (!IS_ERR(nd->intent.open.file))
1587 release_open_intent(nd);
1588 path_put(&nd->path);
1589 return ERR_PTR(error);
1590}
1591
1592static struct file *do_last(struct nameidata *nd, struct path *path, 2042static struct file *do_last(struct nameidata *nd, struct path *path,
1593 int open_flag, int acc_mode, 2043 const struct open_flags *op, const char *pathname)
1594 int mode, const char *pathname)
1595{ 2044{
1596 struct dentry *dir = nd->path.dentry; 2045 struct dentry *dir = nd->path.dentry;
2046 struct dentry *dentry;
2047 int open_flag = op->open_flag;
2048 int will_truncate = open_flag & O_TRUNC;
2049 int want_write = 0;
2050 int acc_mode = op->acc_mode;
1597 struct file *filp; 2051 struct file *filp;
1598 int error = -EISDIR; 2052 int error;
2053
2054 nd->flags &= ~LOOKUP_PARENT;
2055 nd->flags |= op->intent;
1599 2056
1600 switch (nd->last_type) { 2057 switch (nd->last_type) {
1601 case LAST_DOTDOT: 2058 case LAST_DOTDOT:
1602 follow_dotdot(nd);
1603 dir = nd->path.dentry;
1604 case LAST_DOT: 2059 case LAST_DOT:
1605 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 2060 error = handle_dots(nd, nd->last_type);
1606 if (!dir->d_op->d_revalidate(dir, nd)) { 2061 if (error)
1607 error = -ESTALE; 2062 return ERR_PTR(error);
1608 goto exit;
1609 }
1610 }
1611 /* fallthrough */ 2063 /* fallthrough */
1612 case LAST_ROOT: 2064 case LAST_ROOT:
1613 if (open_flag & O_CREAT) 2065 error = complete_walk(nd);
2066 if (error)
2067 return ERR_PTR(error);
2068 audit_inode(pathname, nd->path.dentry);
2069 if (open_flag & O_CREAT) {
2070 error = -EISDIR;
1614 goto exit; 2071 goto exit;
1615 /* fallthrough */ 2072 }
2073 goto ok;
1616 case LAST_BIND: 2074 case LAST_BIND:
2075 error = complete_walk(nd);
2076 if (error)
2077 return ERR_PTR(error);
1617 audit_inode(pathname, dir); 2078 audit_inode(pathname, dir);
1618 goto ok; 2079 goto ok;
1619 } 2080 }
1620 2081
1621 /* trailing slashes? */
1622 if (nd->last.name[nd->last.len]) {
1623 if (open_flag & O_CREAT)
1624 goto exit;
1625 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1626 }
1627
1628 /* just plain open? */
1629 if (!(open_flag & O_CREAT)) { 2082 if (!(open_flag & O_CREAT)) {
1630 error = do_lookup(nd, &nd->last, path); 2083 int symlink_ok = 0;
1631 if (error) 2084 if (nd->last.name[nd->last.len])
1632 goto exit; 2085 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1633 error = -ENOENT; 2086 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
1634 if (!path->dentry->d_inode) 2087 symlink_ok = 1;
1635 goto exit_dput; 2088 /* we _can_ be in RCU mode here */
1636 if (path->dentry->d_inode->i_op->follow_link) 2089 error = walk_component(nd, path, &nd->last, LAST_NORM,
2090 !symlink_ok);
2091 if (error < 0)
2092 return ERR_PTR(error);
2093 if (error) /* symlink */
1637 return NULL; 2094 return NULL;
2095 /* sayonara */
2096 error = complete_walk(nd);
2097 if (error)
2098 return ERR_PTR(-ECHILD);
2099
1638 error = -ENOTDIR; 2100 error = -ENOTDIR;
1639 if (nd->flags & LOOKUP_DIRECTORY) { 2101 if (nd->flags & LOOKUP_DIRECTORY) {
1640 if (!path->dentry->d_inode->i_op->lookup) 2102 if (!nd->inode->i_op->lookup)
1641 goto exit_dput; 2103 goto exit;
1642 } 2104 }
1643 path_to_nameidata(path, nd);
1644 audit_inode(pathname, nd->path.dentry); 2105 audit_inode(pathname, nd->path.dentry);
1645 goto ok; 2106 goto ok;
1646 } 2107 }
1647 2108
1648 /* OK, it's O_CREAT */ 2109 /* create side of things */
1649 mutex_lock(&dir->d_inode->i_mutex); 2110 error = complete_walk(nd);
2111 if (error)
2112 return ERR_PTR(error);
1650 2113
1651 path->dentry = lookup_hash(nd); 2114 audit_inode(pathname, dir);
1652 path->mnt = nd->path.mnt; 2115 error = -EISDIR;
2116 /* trailing slashes? */
2117 if (nd->last.name[nd->last.len])
2118 goto exit;
2119
2120 mutex_lock(&dir->d_inode->i_mutex);
1653 2121
1654 error = PTR_ERR(path->dentry); 2122 dentry = lookup_hash(nd);
1655 if (IS_ERR(path->dentry)) { 2123 error = PTR_ERR(dentry);
2124 if (IS_ERR(dentry)) {
1656 mutex_unlock(&dir->d_inode->i_mutex); 2125 mutex_unlock(&dir->d_inode->i_mutex);
1657 goto exit; 2126 goto exit;
1658 } 2127 }
1659 2128
1660 if (IS_ERR(nd->intent.open.file)) { 2129 path->dentry = dentry;
1661 error = PTR_ERR(nd->intent.open.file); 2130 path->mnt = nd->path.mnt;
1662 goto exit_mutex_unlock;
1663 }
1664 2131
1665 /* Negative dentry, just create the file */ 2132 /* Negative dentry, just create the file */
1666 if (!path->dentry->d_inode) { 2133 if (!dentry->d_inode) {
2134 int mode = op->mode;
2135 if (!IS_POSIXACL(dir->d_inode))
2136 mode &= ~current_umask();
1667 /* 2137 /*
1668 * This write is needed to ensure that a 2138 * This write is needed to ensure that a
1669 * ro->rw transition does not occur between 2139 * rw->ro transition does not occur between
1670 * the time when the file is created and when 2140 * the time when the file is created and when
1671 * a permanent write count is taken through 2141 * a permanent write count is taken through
1672 * the 'struct file' in nameidata_to_filp(). 2142 * the 'struct file' in nameidata_to_filp().
@@ -1674,21 +2144,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1674 error = mnt_want_write(nd->path.mnt); 2144 error = mnt_want_write(nd->path.mnt);
1675 if (error) 2145 if (error)
1676 goto exit_mutex_unlock; 2146 goto exit_mutex_unlock;
1677 error = __open_namei_create(nd, path, open_flag, mode); 2147 want_write = 1;
1678 if (error) { 2148 /* Don't check for write permission, don't truncate */
1679 mnt_drop_write(nd->path.mnt); 2149 open_flag &= ~O_TRUNC;
1680 goto exit; 2150 will_truncate = 0;
1681 } 2151 acc_mode = MAY_OPEN;
1682 filp = nameidata_to_filp(nd); 2152 error = security_path_mknod(&nd->path, dentry, mode, 0);
1683 mnt_drop_write(nd->path.mnt); 2153 if (error)
1684 if (!IS_ERR(filp)) { 2154 goto exit_mutex_unlock;
1685 error = ima_file_check(filp, acc_mode); 2155 error = vfs_create(dir->d_inode, dentry, mode, nd);
1686 if (error) { 2156 if (error)
1687 fput(filp); 2157 goto exit_mutex_unlock;
1688 filp = ERR_PTR(error); 2158 mutex_unlock(&dir->d_inode->i_mutex);
1689 } 2159 dput(nd->path.dentry);
1690 } 2160 nd->path.dentry = dentry;
1691 return filp; 2161 goto common;
1692 } 2162 }
1693 2163
1694 /* 2164 /*
@@ -1701,11 +2171,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1701 if (open_flag & O_EXCL) 2171 if (open_flag & O_EXCL)
1702 goto exit_dput; 2172 goto exit_dput;
1703 2173
1704 if (__follow_mount(path)) { 2174 error = follow_managed(path, nd->flags);
1705 error = -ELOOP; 2175 if (error < 0)
1706 if (open_flag & O_NOFOLLOW) 2176 goto exit_dput;
1707 goto exit_dput;
1708 }
1709 2177
1710 error = -ENOENT; 2178 error = -ENOENT;
1711 if (!path->dentry->d_inode) 2179 if (!path->dentry->d_inode)
@@ -1715,11 +2183,45 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1715 return NULL; 2183 return NULL;
1716 2184
1717 path_to_nameidata(path, nd); 2185 path_to_nameidata(path, nd);
2186 nd->inode = path->dentry->d_inode;
1718 error = -EISDIR; 2187 error = -EISDIR;
1719 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2188 if (S_ISDIR(nd->inode->i_mode))
1720 goto exit; 2189 goto exit;
1721ok: 2190ok:
1722 filp = finish_open(nd, open_flag, acc_mode); 2191 if (!S_ISREG(nd->inode->i_mode))
2192 will_truncate = 0;
2193
2194 if (will_truncate) {
2195 error = mnt_want_write(nd->path.mnt);
2196 if (error)
2197 goto exit;
2198 want_write = 1;
2199 }
2200common:
2201 error = may_open(&nd->path, acc_mode, open_flag);
2202 if (error)
2203 goto exit;
2204 filp = nameidata_to_filp(nd);
2205 if (!IS_ERR(filp)) {
2206 error = ima_file_check(filp, op->acc_mode);
2207 if (error) {
2208 fput(filp);
2209 filp = ERR_PTR(error);
2210 }
2211 }
2212 if (!IS_ERR(filp)) {
2213 if (will_truncate) {
2214 error = handle_truncate(filp);
2215 if (error) {
2216 fput(filp);
2217 filp = ERR_PTR(error);
2218 }
2219 }
2220 }
2221out:
2222 if (want_write)
2223 mnt_drop_write(nd->path.mnt);
2224 path_put(&nd->path);
1723 return filp; 2225 return filp;
1724 2226
1725exit_mutex_unlock: 2227exit_mutex_unlock:
@@ -1727,170 +2229,103 @@ exit_mutex_unlock:
1727exit_dput: 2229exit_dput:
1728 path_put_conditional(path, nd); 2230 path_put_conditional(path, nd);
1729exit: 2231exit:
1730 if (!IS_ERR(nd->intent.open.file)) 2232 filp = ERR_PTR(error);
1731 release_open_intent(nd); 2233 goto out;
1732 path_put(&nd->path);
1733 return ERR_PTR(error);
1734} 2234}
1735 2235
1736/* 2236static struct file *path_openat(int dfd, const char *pathname,
1737 * Note that the low bits of the passed in "open_flag" 2237 struct nameidata *nd, const struct open_flags *op, int flags)
1738 * are not the same as in the local variable "flag". See
1739 * open_to_namei_flags() for more details.
1740 */
1741struct file *do_filp_open(int dfd, const char *pathname,
1742 int open_flag, int mode, int acc_mode)
1743{ 2238{
2239 struct file *base = NULL;
1744 struct file *filp; 2240 struct file *filp;
1745 struct nameidata nd;
1746 int error;
1747 struct path path; 2241 struct path path;
1748 int count = 0; 2242 int error;
1749 int flag = open_to_namei_flags(open_flag);
1750 int force_reval = 0;
1751
1752 if (!(open_flag & O_CREAT))
1753 mode = 0;
1754
1755 /*
1756 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1757 * check for O_DSYNC if they need any syncing at all we enforce it's
1758 * always set instead of having to deal with possibly weird behaviour
1759 * for malicious applications setting only __O_SYNC.
1760 */
1761 if (open_flag & __O_SYNC)
1762 open_flag |= O_DSYNC;
1763
1764 if (!acc_mode)
1765 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
1766 2243
1767 /* O_TRUNC implies we need access checks for write permissions */ 2244 filp = get_empty_filp();
1768 if (open_flag & O_TRUNC) 2245 if (!filp)
1769 acc_mode |= MAY_WRITE; 2246 return ERR_PTR(-ENFILE);
1770 2247
1771 /* Allow the LSM permission hook to distinguish append 2248 filp->f_flags = op->open_flag;
1772 access from general write access. */ 2249 nd->intent.open.file = filp;
1773 if (open_flag & O_APPEND) 2250 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
1774 acc_mode |= MAY_APPEND; 2251 nd->intent.open.create_mode = op->mode;
1775 2252
1776 /* find the parent */ 2253 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
1777reval: 2254 if (unlikely(error))
1778 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2255 goto out_filp;
1779 if (error)
1780 return ERR_PTR(error);
1781 if (force_reval)
1782 nd.flags |= LOOKUP_REVAL;
1783 2256
1784 current->total_link_count = 0; 2257 current->total_link_count = 0;
1785 error = link_path_walk(pathname, &nd); 2258 error = link_path_walk(pathname, nd);
1786 if (error) { 2259 if (unlikely(error))
1787 filp = ERR_PTR(error); 2260 goto out_filp;
1788 goto out;
1789 }
1790 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
1791 audit_inode(pathname, nd.path.dentry);
1792
1793 /*
1794 * We have the parent and last component.
1795 */
1796 2261
1797 error = -ENFILE; 2262 filp = do_last(nd, &path, op, pathname);
1798 filp = get_empty_filp();
1799 if (filp == NULL)
1800 goto exit_parent;
1801 nd.intent.open.file = filp;
1802 filp->f_flags = open_flag;
1803 nd.intent.open.flags = flag;
1804 nd.intent.open.create_mode = mode;
1805 nd.flags &= ~LOOKUP_PARENT;
1806 nd.flags |= LOOKUP_OPEN;
1807 if (open_flag & O_CREAT) {
1808 nd.flags |= LOOKUP_CREATE;
1809 if (open_flag & O_EXCL)
1810 nd.flags |= LOOKUP_EXCL;
1811 }
1812 if (open_flag & O_DIRECTORY)
1813 nd.flags |= LOOKUP_DIRECTORY;
1814 if (!(open_flag & O_NOFOLLOW))
1815 nd.flags |= LOOKUP_FOLLOW;
1816 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1817 while (unlikely(!filp)) { /* trailing symlink */ 2263 while (unlikely(!filp)) { /* trailing symlink */
1818 struct path holder; 2264 struct path link = path;
1819 struct inode *inode = path.dentry->d_inode;
1820 void *cookie; 2265 void *cookie;
1821 error = -ELOOP; 2266 if (!(nd->flags & LOOKUP_FOLLOW)) {
1822 /* S_ISDIR part is a temporary automount kludge */ 2267 path_put_conditional(&path, nd);
1823 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) 2268 path_put(&nd->path);
1824 goto exit_dput; 2269 filp = ERR_PTR(-ELOOP);
1825 if (count++ == 32) 2270 break;
1826 goto exit_dput;
1827 /*
1828 * This is subtle. Instead of calling do_follow_link() we do
1829 * the thing by hand. The reason is that this way we have zero
1830 * link_count and path_walk() (called from ->follow_link)
1831 * honoring LOOKUP_PARENT. After that we have the parent and
1832 * last component, i.e. we are in the same situation as after
1833 * the first path_walk(). Well, almost - if the last component
1834 * is normal we get its copy stored in nd->last.name and we will
1835 * have to putname() it when we are done. Procfs-like symlinks
1836 * just set LAST_BIND.
1837 */
1838 nd.flags |= LOOKUP_PARENT;
1839 error = security_inode_follow_link(path.dentry, &nd);
1840 if (error)
1841 goto exit_dput;
1842 error = __do_follow_link(&path, &nd, &cookie);
1843 if (unlikely(error)) {
1844 /* nd.path had been dropped */
1845 if (!IS_ERR(cookie) && inode->i_op->put_link)
1846 inode->i_op->put_link(path.dentry, &nd, cookie);
1847 path_put(&path);
1848 release_open_intent(&nd);
1849 filp = ERR_PTR(error);
1850 goto out;
1851 } 2271 }
1852 holder = path; 2272 nd->flags |= LOOKUP_PARENT;
1853 nd.flags &= ~LOOKUP_PARENT; 2273 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
1854 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2274 error = follow_link(&link, nd, &cookie);
1855 if (inode->i_op->put_link) 2275 if (unlikely(error))
1856 inode->i_op->put_link(holder.dentry, &nd, cookie); 2276 filp = ERR_PTR(error);
1857 path_put(&holder); 2277 else
2278 filp = do_last(nd, &path, op, pathname);
2279 put_link(nd, &link, cookie);
1858 } 2280 }
1859out: 2281out:
1860 if (nd.root.mnt) 2282 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
1861 path_put(&nd.root); 2283 path_put(&nd->root);
1862 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2284 if (base)
1863 force_reval = 1; 2285 fput(base);
1864 goto reval; 2286 release_open_intent(nd);
1865 }
1866 return filp; 2287 return filp;
1867 2288
1868exit_dput: 2289out_filp:
1869 path_put_conditional(&path, &nd);
1870 if (!IS_ERR(nd.intent.open.file))
1871 release_open_intent(&nd);
1872exit_parent:
1873 path_put(&nd.path);
1874 filp = ERR_PTR(error); 2290 filp = ERR_PTR(error);
1875 goto out; 2291 goto out;
1876} 2292}
1877 2293
1878/** 2294struct file *do_filp_open(int dfd, const char *pathname,
1879 * filp_open - open file and return file pointer 2295 const struct open_flags *op, int flags)
1880 *
1881 * @filename: path to open
1882 * @flags: open flags as per the open(2) second argument
1883 * @mode: mode for the new file if O_CREAT is set, else ignored
1884 *
1885 * This is the helper to open a file from kernelspace if you really
1886 * have to. But in generally you should not do this, so please move
1887 * along, nothing to see here..
1888 */
1889struct file *filp_open(const char *filename, int flags, int mode)
1890{ 2296{
1891 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2297 struct nameidata nd;
2298 struct file *filp;
2299
2300 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2301 if (unlikely(filp == ERR_PTR(-ECHILD)))
2302 filp = path_openat(dfd, pathname, &nd, op, flags);
2303 if (unlikely(filp == ERR_PTR(-ESTALE)))
2304 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2305 return filp;
2306}
2307
2308struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2309 const char *name, const struct open_flags *op, int flags)
2310{
2311 struct nameidata nd;
2312 struct file *file;
2313
2314 nd.root.mnt = mnt;
2315 nd.root.dentry = dentry;
2316
2317 flags |= LOOKUP_ROOT;
2318
2319 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2320 return ERR_PTR(-ELOOP);
2321
2322 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2323 if (unlikely(file == ERR_PTR(-ECHILD)))
2324 file = path_openat(-1, name, &nd, op, flags);
2325 if (unlikely(file == ERR_PTR(-ESTALE)))
2326 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2327 return file;
1892} 2328}
1893EXPORT_SYMBOL(filp_open);
1894 2329
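do_filp_open() above is the internal workhorse; most kernel code reaches it through filp_open(), whose definition this change drops from this file. A sketch, with an illustrative path, of the usual in-kernel open/close pairing, assuming filp_open() remains available from elsewhere in the tree:

#include <linux/fs.h>
#include <linux/err.h>

static int example_open(void)
{
	struct file *filp;

	filp = filp_open("/etc/myapp.conf", O_RDONLY, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	/* ... read via vfs_read()/kernel_read() ... */

	filp_close(filp, NULL);
	return 0;
}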
1895/** 2330/**
1896 * lookup_create - lookup a dentry, creating it if it doesn't exist 2331 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -1952,7 +2387,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1952 if (error) 2387 if (error)
1953 return error; 2388 return error;
1954 2389
1955 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2390 if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2391 !ns_capable(inode_userns(dir), CAP_MKNOD))
1956 return -EPERM; 2392 return -EPERM;
1957 2393
1958 if (!dir->i_op->mknod) 2394 if (!dir->i_op->mknod)
@@ -2113,10 +2549,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2113} 2549}
2114 2550
2115/* 2551/*
2116 * We try to drop the dentry early: we should have 2552 * The dentry_unhash() helper will try to drop the dentry early: we
2117 * a usage count of 2 if we're the only user of this 2553 * should have a usage count of 2 if we're the only user of this
2118 * dentry, and if that is true (possibly after pruning 2554 * dentry, and if that is true (possibly after pruning the dcache),
2119 * the dcache), then we drop the dentry now. 2555 * then we drop the dentry now.
2120 * 2556 *
2121 * A low-level filesystem can, if it chooses, legally 2557 * A low-level filesystem can, if it chooses, legally
2122 * do a 2558 * do a
@@ -2129,14 +2565,11 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2129 */ 2565 */
2130void dentry_unhash(struct dentry *dentry) 2566void dentry_unhash(struct dentry *dentry)
2131{ 2567{
2132 dget(dentry);
2133 shrink_dcache_parent(dentry); 2568 shrink_dcache_parent(dentry);
2134 spin_lock(&dcache_lock);
2135 spin_lock(&dentry->d_lock); 2569 spin_lock(&dentry->d_lock);
2136 if (atomic_read(&dentry->d_count) == 2) 2570 if (dentry->d_count == 1)
2137 __d_drop(dentry); 2571 __d_drop(dentry);
2138 spin_unlock(&dentry->d_lock); 2572 spin_unlock(&dentry->d_lock);
2139 spin_unlock(&dcache_lock);
2140} 2573}
2141 2574
2142int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2575int vfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2150,25 +2583,27 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2150 return -EPERM; 2583 return -EPERM;
2151 2584
2152 mutex_lock(&dentry->d_inode->i_mutex); 2585 mutex_lock(&dentry->d_inode->i_mutex);
2153 dentry_unhash(dentry); 2586
2587 error = -EBUSY;
2154 if (d_mountpoint(dentry)) 2588 if (d_mountpoint(dentry))
2155 error = -EBUSY; 2589 goto out;
2156 else { 2590
2157 error = security_inode_rmdir(dir, dentry); 2591 error = security_inode_rmdir(dir, dentry);
2158 if (!error) { 2592 if (error)
2159 error = dir->i_op->rmdir(dir, dentry); 2593 goto out;
2160 if (!error) { 2594
2161 dentry->d_inode->i_flags |= S_DEAD; 2595 shrink_dcache_parent(dentry);
2162 dont_mount(dentry); 2596 error = dir->i_op->rmdir(dir, dentry);
2163 } 2597 if (error)
2164 } 2598 goto out;
2165 } 2599
2600 dentry->d_inode->i_flags |= S_DEAD;
2601 dont_mount(dentry);
2602
2603out:
2166 mutex_unlock(&dentry->d_inode->i_mutex); 2604 mutex_unlock(&dentry->d_inode->i_mutex);
2167 if (!error) { 2605 if (!error)
2168 d_delete(dentry); 2606 d_delete(dentry);
2169 }
2170 dput(dentry);
2171
2172 return error; 2607 return error;
2173} 2608}
2174 2609
@@ -2202,6 +2637,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
2202 error = PTR_ERR(dentry); 2637 error = PTR_ERR(dentry);
2203 if (IS_ERR(dentry)) 2638 if (IS_ERR(dentry))
2204 goto exit2; 2639 goto exit2;
2640 if (!dentry->d_inode) {
2641 error = -ENOENT;
2642 goto exit3;
2643 }
2205 error = mnt_want_write(nd.path.mnt); 2644 error = mnt_want_write(nd.path.mnt);
2206 if (error) 2645 if (error)
2207 goto exit3; 2646 goto exit3;
@@ -2290,8 +2729,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2290 if (nd.last.name[nd.last.len]) 2729 if (nd.last.name[nd.last.len])
2291 goto slashes; 2730 goto slashes;
2292 inode = dentry->d_inode; 2731 inode = dentry->d_inode;
2293 if (inode) 2732 if (!inode)
2294 atomic_inc(&inode->i_count); 2733 goto slashes;
2734 ihold(inode);
2295 error = mnt_want_write(nd.path.mnt); 2735 error = mnt_want_write(nd.path.mnt);
2296 if (error) 2736 if (error)
2297 goto exit2; 2737 goto exit2;
@@ -2431,7 +2871,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2431 return error; 2871 return error;
2432 2872
2433 mutex_lock(&inode->i_mutex); 2873 mutex_lock(&inode->i_mutex);
2434 error = dir->i_op->link(old_dentry, dir, new_dentry); 2874 /* Make sure we don't allow creating hardlink to an unlinked file */
2875 if (inode->i_nlink == 0)
2876 error = -ENOENT;
2877 else
2878 error = dir->i_op->link(old_dentry, dir, new_dentry);
2435 mutex_unlock(&inode->i_mutex); 2879 mutex_unlock(&inode->i_mutex);
2436 if (!error) 2880 if (!error)
2437 fsnotify_link(dir, inode, new_dentry); 2881 fsnotify_link(dir, inode, new_dentry);
@@ -2453,15 +2897,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
2453 struct dentry *new_dentry; 2897 struct dentry *new_dentry;
2454 struct nameidata nd; 2898 struct nameidata nd;
2455 struct path old_path; 2899 struct path old_path;
2900 int how = 0;
2456 int error; 2901 int error;
2457 char *to; 2902 char *to;
2458 2903
2459 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2904 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
2460 return -EINVAL; 2905 return -EINVAL;
2906 /*
2907 * To use null names we require CAP_DAC_READ_SEARCH.
2908 * This ensures that not everyone will be able to create
2909 * a hard link using the passed file descriptor.
2910 */
2911 if (flags & AT_EMPTY_PATH) {
2912 if (!capable(CAP_DAC_READ_SEARCH))
2913 return -ENOENT;
2914 how = LOOKUP_EMPTY;
2915 }
2461 2916
2462 error = user_path_at(olddfd, oldname, 2917 if (flags & AT_SYMLINK_FOLLOW)
2463 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, 2918 how |= LOOKUP_FOLLOW;
2464 &old_path); 2919
2920 error = user_path_at(olddfd, oldname, how, &old_path);
2465 if (error) 2921 if (error)
2466 return error; 2922 return error;
2467 2923
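The AT_EMPTY_PATH handling added to linkat() above accepts an empty oldname and links the object behind the descriptor itself, gated on CAP_DAC_READ_SEARCH. A userspace sketch with purely illustrative paths:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/tmp/data", O_RDWR | O_CREAT, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* needs CAP_DAC_READ_SEARCH, per the check above */
	if (linkat(fd, "", AT_FDCWD, "/tmp/data-link", AT_EMPTY_PATH) != 0)
		perror("linkat AT_EMPTY_PATH");
	close(fd);
	return 0;
}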
@@ -2523,12 +2979,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
2523 * HOWEVER, it relies on the assumption that any object with ->lookup() 2979 * HOWEVER, it relies on the assumption that any object with ->lookup()
2524 * has no more than 1 dentry. If "hybrid" objects will ever appear, 2980 * has no more than 1 dentry. If "hybrid" objects will ever appear,
2525 * we'd better make sure that there's no link(2) for them. 2981 * we'd better make sure that there's no link(2) for them.
2526 * d) some filesystems don't support opened-but-unlinked directories, 2982 * d) conversion from fhandle to dentry may come in the wrong moment - when
2527 * either because of layout or because they are not ready to deal with
2528 * all cases correctly. The latter will be fixed (taking this sort of
2529 * stuff into VFS), but the former is not going away. Solution: the same
2530 * trick as in rmdir().
2531 * e) conversion from fhandle to dentry may come in the wrong moment - when
2532 * we are removing the target. Solution: we will have to grab ->i_mutex 2983 * we are removing the target. Solution: we will have to grab ->i_mutex
2533 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2984 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2534 * ->i_mutex on parents, which works but leads to some truly excessive 2985 * ->i_mutex on parents, which works but leads to some truly excessive
@@ -2538,7 +2989,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2989 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2990{
2540 int error = 0; 2991 int error = 0;
2541 struct inode *target; 2992 struct inode *target = new_dentry->d_inode;
2542 2993
2543 /* 2994 /*
2544 * If we are going to change the parent - check write permissions, 2995 * If we are going to change the parent - check write permissions,
@@ -2554,26 +3005,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2554 if (error) 3005 if (error)
2555 return error; 3006 return error;
2556 3007
2557 target = new_dentry->d_inode;
2558 if (target) 3008 if (target)
2559 mutex_lock(&target->i_mutex); 3009 mutex_lock(&target->i_mutex);
2560 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3010
2561 error = -EBUSY; 3011 error = -EBUSY;
2562 else { 3012 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
2563 if (target) 3013 goto out;
2564 dentry_unhash(new_dentry); 3014
2565 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3015 if (target)
2566 } 3016 shrink_dcache_parent(new_dentry);
3017 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3018 if (error)
3019 goto out;
3020
2567 if (target) { 3021 if (target) {
2568 if (!error) { 3022 target->i_flags |= S_DEAD;
2569 target->i_flags |= S_DEAD; 3023 dont_mount(new_dentry);
2570 dont_mount(new_dentry);
2571 }
2572 mutex_unlock(&target->i_mutex);
2573 if (d_unhashed(new_dentry))
2574 d_rehash(new_dentry);
2575 dput(new_dentry);
2576 } 3024 }
3025out:
3026 if (target)
3027 mutex_unlock(&target->i_mutex);
2577 if (!error) 3028 if (!error)
2578 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3029 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2579 d_move(old_dentry,new_dentry); 3030 d_move(old_dentry,new_dentry);
@@ -2583,7 +3034,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2583static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3034static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2584 struct inode *new_dir, struct dentry *new_dentry) 3035 struct inode *new_dir, struct dentry *new_dentry)
2585{ 3036{
2586 struct inode *target; 3037 struct inode *target = new_dentry->d_inode;
2587 int error; 3038 int error;
2588 3039
2589 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3040 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -2591,19 +3042,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2591 return error; 3042 return error;
2592 3043
2593 dget(new_dentry); 3044 dget(new_dentry);
2594 target = new_dentry->d_inode;
2595 if (target) 3045 if (target)
2596 mutex_lock(&target->i_mutex); 3046 mutex_lock(&target->i_mutex);
3047
3048 error = -EBUSY;
2597 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3049 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2598 error = -EBUSY; 3050 goto out;
2599 else 3051
2600 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3052 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2601 if (!error) { 3053 if (error)
2602 if (target) 3054 goto out;
2603 dont_mount(new_dentry); 3055
2604 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3056 if (target)
2605 d_move(old_dentry, new_dentry); 3057 dont_mount(new_dentry);
2606 } 3058 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3059 d_move(old_dentry, new_dentry);
3060out:
2607 if (target) 3061 if (target)
2608 mutex_unlock(&target->i_mutex); 3062 mutex_unlock(&target->i_mutex);
2609 dput(new_dentry); 3063 dput(new_dentry);
@@ -2885,6 +3339,7 @@ const struct inode_operations page_symlink_inode_operations = {
2885}; 3339};
2886 3340
2887EXPORT_SYMBOL(user_path_at); 3341EXPORT_SYMBOL(user_path_at);
3342EXPORT_SYMBOL(follow_down_one);
2888EXPORT_SYMBOL(follow_down); 3343EXPORT_SYMBOL(follow_down);
2889EXPORT_SYMBOL(follow_up); 3344EXPORT_SYMBOL(follow_up);
2890EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3345EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
@@ -2897,7 +3352,7 @@ EXPORT_SYMBOL(page_readlink);
2897EXPORT_SYMBOL(__page_symlink); 3352EXPORT_SYMBOL(__page_symlink);
2898EXPORT_SYMBOL(page_symlink); 3353EXPORT_SYMBOL(page_symlink);
2899EXPORT_SYMBOL(page_symlink_inode_operations); 3354EXPORT_SYMBOL(page_symlink_inode_operations);
2900EXPORT_SYMBOL(path_lookup); 3355EXPORT_SYMBOL(kern_path_parent);
2901EXPORT_SYMBOL(kern_path); 3356EXPORT_SYMBOL(kern_path);
2902EXPORT_SYMBOL(vfs_path_lookup); 3357EXPORT_SYMBOL(vfs_path_lookup);
2903EXPORT_SYMBOL(inode_permission); 3358EXPORT_SYMBOL(inode_permission);