/*
* linux/fs/namei.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* Some corrections by tytso.
*/
/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
* lookup logic.
*/
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
*/
#include <linux/init.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <asm/uaccess.h>
#include "internal.h"
#include "mount.h"
/* [Feb-1997 T. Schoebel-Theuer]
* Fundamental changes in the pathname lookup mechanisms (namei)
* were necessary because of omirr. The reason is that omirr needs
* to know the _real_ pathname, not the user-supplied one, in case
* of symlinks (and also when transname replacements occur).
*
* The new code replaces the old recursive symlink resolution with
* an iterative one (in case of non-nested symlink chains). It does
* this with calls to <fs>_follow_link().
* As a side effect, dir_namei(), _namei() and follow_link() are now
* replaced with a single function lookup_dentry() that can handle all
* the special cases of the former code.
*
* With the new dcache, the pathname is stored at each inode, at least as
* long as the refcount of the inode is positive. As a side effect, the
* size of the dcache depends on the inode cache and thus is dynamic.
*
* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
* resolution to correspond with current state of the code.
*
* Note that the symlink resolution is not *completely* iterative.
* There is still a significant amount of tail- and mid- recursion in
* the algorithm. Also, note that <fs>_readlink() is not used in
* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
* may return different results than <fs>_follow_link(). Many virtual
* filesystems (including /proc) exhibit this behavior.
*/
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
* New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
* and the name already exists in form of a symlink, try to create the new
* name indicated by the symlink. The old code always complained that the
* name already exists, due to not following the symlink even if its target
* is nonexistent. The new semantics affects also mknod() and link() when
* the name is a symlink pointing to a non-existent name.
*
* I don't know which semantics is the right one, since I have no access
* to standards. But I found by trial that HP-UX 9.0 has the full "new"
* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
* "old" one. Personally, I think the new semantics is much more logical.
* Note that "ln old new" where "new" is a symlink pointing to a non-existing
* file does succeed in both HP-UX and SunOs, but not in Solaris
* and in the old Linux semantics.
*/
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
* semantics. See the comments in "open_namei" and "do_link" below.
*
* [10-Sep-98 Alan Modra] Another symlink change.
*/
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
* inside the path - always follow.
* in the last component in creation/removal/renaming - never follow.
* if LOOKUP_FOLLOW passed - follow.
* if the pathname has trailing slashes - follow.
* otherwise - don't follow.
* (applied in that order).
*
* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
* During the 2.4 we need to fix the userland stuff depending on it -
* hopefully we will be able to get rid of that wart in 2.5. So far only
* XEmacs seems to be relying on it...
*/
/*
* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
* any extra contention...
*/
/* In order to reduce some races, while at the same time doing additional
* checking and hopefully speeding things up, we copy filenames to the
* kernel data space before using them..
*
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
void final_putname(struct filename *name)
{
if (name->separate) {
__putname(name->name);
kfree(name);
} else {
__putname(name);
}
}
#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
static struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
struct filename *result, *err;
int len;
long max;
char *kname;
result = audit_reusename(filename);
if (result)
return result;
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
kname = (char *)result + sizeof(*result);
result->name = kname;
result->separate = false;
max = EMBEDDED_NAME_MAX;
recopy:
len = strncpy_from_user(kname, filename, max);
if (unlikely(len < 0)) {
err = ERR_PTR(len);
goto error;
}
/*
* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
* separate struct filename so we can dedicate the entire
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
kname = (char *)result;
result = kzalloc(sizeof(*result), GFP_KERNEL);
if (!result) {
err = ERR_PTR(-ENOMEM);
result = (struct filename *)kname;
goto error;
}
result->name = kname;
result->separate = true;
max = PATH_MAX;
goto recopy;
}
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
*empty = 1;
err = ERR_PTR(-ENOENT);
if (!(flags & LOOKUP_EMPTY))
goto error;
}
err = ERR_PTR(-ENAMETOOLONG);
if (unlikely(len >= PATH_MAX))
goto error;
result->uptr = filename;
audit_getname(result);
return result;
error:
final_putname(result);
return err;
}
struct filename *
getname(const char __user * filename)
{
return getname_flags(filename, 0, NULL);
}
EXPORT_SYMBOL(getname);
#ifdef CONFIG_AUDITSYSCALL
void putname(struct filename *name)
{
if (unlikely(!audit_dummy_context()))
return audit_putname(name);
final_putname(name);
}
#endif
static int check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *acl;
if (mask & MAY_NOT_BLOCK) {
acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
if (!acl)
return -EAGAIN;
/* no ->get_acl() calls in RCU mode... */
if (acl == ACL_NOT_CACHED)
return -ECHILD;
return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
}
acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
/*
* A filesystem can force a ACL callback by just never filling the
* ACL cache. But normally you'd fill the cache either at inode
* instantiation time, or on the first ->get_acl call.
*
* If the filesystem doesn't have a get_acl() function at all, we'll
* just create the negative cache entry.
*/
if (acl == ACL_NOT_CACHED) {
if (inode->i_op->get_acl) {
acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
} else {
set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
return -EAGAIN;
}
}
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
return error;
}
#endif
return -EAGAIN;
}
/*
* This does the basic permission checking
*/
static int acl_permission_check(struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
if (likely(uid_eq(current_fsuid(), inode->i_uid)))
mode >>= 6;
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
int error = check_acl(inode, mask);
if (error != -EAGAIN)
return error;
}
if (in_group_p(inode->i_gid))
mode >>= 3;
}
/*
* If the DACs are ok we don't need any capability check.
*/
if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
return 0;
return -EACCES;
}
/**
* generic_permission - check for access rights on a Posix-like filesystem
* @inode: inode to check access rights for
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
*
* Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
* for filesystem access without changing the "normal" uids which
* are used for other things.
*
* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
* request cannot be satisfied (eg. requires blocking or too much complexity).
* It would then be called again in ref-walk mode.
*/
int generic_permission(struct inode *inode, int mask)
{
int ret;
/*
* Do the basic permission checks.
*/
ret = acl_permission_check(inode, mask);
if (ret != -EACCES)
return ret;
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
if (inode_capable(inode, CAP_DAC_OVERRIDE))
return 0;
if (!(mask & MAY_WRITE))
if (inode_capable(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable when there is
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
if (inode_capable(inode, CAP_DAC_OVERRIDE))
return 0;
/*
* Searching includes executable on directories, else just read.
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
if (inode_capable(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
/*
* We _really_ want to just do "generic_permission()" without
* even looking at the inode->i_op values. So we keep a cache
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
static inline int do_inode_permission(struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
if (likely(inode->i_op->permission))
return inode->i_op->permission(inode, mask);
/* This gets set once for the inode lifetime */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_FASTPERM;
spin_unlock(&inode->i_lock);
}
return generic_permission(inode, mask);
}
/**
* __inode_permission - Check for access rights to a given inode
* @inode: Inode to check permission on
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Check for read/write/execute permissions on an inode.
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*
* This does not check for a read-only file system. You probably want
* inode_permission().
*/
int __inode_permission(struct inode *inode, int mask)
{
int retval;
if (unlikely(mask & MAY_WRITE)) {
/*
* Nobody gets write access to an immutable file.
*/
if (IS_IMMUTABLE(inode))
return -EACCES;
}
retval = do_inode_permission(inode, mask);
if (retval)
return retval;
retval = devcgroup_inode_permission(inode, mask);
if (retval)
return retval;
return security_inode_permission(inode, mask);
}
/**
* sb_permission - Check superblock-level permissions
* @sb: Superblock of inode to check permission on
* @inode: Inode to check permission on
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Separate out file-system wide checks from inode-specific permission checks.
*/
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
if (unlikely(mask & MAY_WRITE)) {
umode_t mode = inode->i_mode;
/* Nobody gets write access to a read-only fs. */
if ((sb->s_flags & MS_RDONLY) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
return -EROFS;
}
return 0;
}
/**
* inode_permission - Check for access rights to a given inode
* @inode: Inode to check permission on
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Check for read/write/execute permissions on an inode. We use fs[ug]id for
* this, letting us set arbitrary permissions for filesystem access without
* changing the "normal" UIDs which are used for other things.
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
int inode_permission(struct inode *inode, int mask)
{
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
return retval;
return __inode_permission(inode, mask);
}
/**
* path_get - get a reference to a path
* @path: path to get the reference to
*
* Given a path increment the reference count to the dentry and the vfsmount.
*/
void path_get(struct path *path)
{
mntget(path->mnt);
dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
/**
* path_put - put a reference to a path
* @path: path to put the reference to
*
* Given a path decrement the reference count to the dentry and the vfsmount.
*/
void path_put(struct path *path)
{
dput(path->dentry);
mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
* normal reference counts on dentries and vfsmounts to transition to rcu-walk
* mode. Refcounts are grabbed at the last known good point before rcu-walk
* got stuck, so ref-walk may continue from there. If this is not successful
* (eg. a seqcount has changed), then failure is returned and it's up to caller
* to restart the path walk from the beginning in ref-walk mode.
*/
static inline void lock_rcu_walk(void)
{
br_read_lock(&vfsmount_lock);
rcu_read_lock();
}
static inline void unlock_rcu_walk(void)
{
rcu_read_unlock();
br_read_unlock(&vfsmount_lock);
}
/**
* unlazy_walk - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
* @dentry: child of nd->path.dentry or NULL
* Returns: 0 on success, -ECHILD on failure
*
* unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
* for ref-walk mode. @dentry must be a path found by a do_lookup call on
* @nd or NULL. Must be called from rcu-walk context.
*/
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
int want_root = 0;
BUG_ON(!(nd->flags & LOOKUP_RCU));
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
want_root = 1;
spin_lock(&fs->lock);
if (nd->root.mnt != fs->root.mnt ||
nd->root.dentry != fs->root.dentry)
goto err_root;
}
spin_lock(&parent->d_lock);
if (!dentry) {
if (!__d_rcu_to_refcount(parent, nd->seq))
goto err_parent;
BUG_ON(nd->inode != parent->d_inode);
} else {
if (dentry->d_parent != parent)
goto err_parent;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (!__d_rcu_to_refcount(dentry, nd->seq))
goto err_child;
/*
* If the sequence check on the child dentry passed, then
* the child has not been removed from its parent. This
* means the parent dentry must be valid and able to take
* a reference at this point.
*/
BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
BUG_ON(!parent->d_count);
parent->d_count++;
spin_unlock(&dentry->d_lock);
}
spin_unlock(&parent->d_lock);
if (want_root) {
path_get(&nd->root);
spin_unlock(&fs->lock);
}
mntget(nd->path.mnt);
unlock_rcu_walk();
nd->flags &= ~LOOKUP_RCU;
return 0;
err_child:
spin_unlock(&dentry->d_lock);
err_parent:
spin_unlock(&parent->d_lock);
err_root:
if (want_root)
spin_unlock(&fs->lock);
return -ECHILD;
}
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
return dentry->d_op->d_revalidate(dentry, flags);
}
/**
* complete_walk - successful completion of path walk
* @nd: pointer nameidata
*
* If we had been in RCU mode, drop out of it and legitimize nd->path.
* Revalidate the final result, unless we'd already done that during
* the path walk or the filesystem doesn't ask for it. Return 0 on
* success, -error on failure. In case of failure caller does not
* need to drop nd->path.
*/
static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
if (nd->flags & LOOKUP_RCU) {
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
spin_lock(&dentry->d_lock);
if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
spin_unlock(&dentry->d_lock);
unlock_rcu_walk();
return -ECHILD;
}
BUG_ON(nd->inode != dentry->d_inode);
spin_unlock(&dentry->d_lock);
mntget(nd->path.mnt);
unlock_rcu_walk();
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
return 0;
if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
return 0;
/* Note: we do not d_invalidate() */
status = d_revalidate(dentry, nd->flags);
if (status > 0)
return 0;
if (!status)
status = -ESTALE;
path_put(&nd->path);
return status;
}
static __always_inline void set_root(struct nameidata *nd)
{
if (!nd->root.mnt)
get_fs_root(current->fs, &nd->root);
}
static int link_path_walk(const char *, struct nameidata *);
static __always_inline void set_root_rcu(struct nameidata *nd)
{
if (!nd->root.mnt) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root;
nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
}
}
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
int ret;
if (IS_ERR(link))
goto fail;
if (*link == '/') {
set_root(nd);
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->root);
nd->flags |= LOOKUP_JUMPED;
}
nd->inode = nd->path.dentry->d_inode;
ret = link_path_walk(link, nd);
return ret;
fail:
path_put(&nd->path);
return PTR_ERR(link);
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
{
dput(path->dentry);
if (path->mnt != nd->path.mnt)
mntput(path->mnt);
}
static inline void path_to_nameidata(const struct path *path,
struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
if (nd->path.mnt != path->mnt)
mntput(nd->path.mnt);
}
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
}
/*
* Helper to directly jump to a known parsed path from ->follow_link,
* caller must have taken a reference to path beforehand.
*/
void nd_jump_link(struct nameidata *nd, struct path *path)
{
path_put(&nd->path);
nd->path = *path;
nd->inode = nd->path.dentry->d_inode;
nd->flags |= LOOKUP_JUMPED;
BUG_ON(nd->inode->i_op->follow_link);
}
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{