Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull namespace updates from Eric Biederman: "There is a lot here. A lot of these changes result in subtle user visible differences in kernel behavior. I don't expect anything will care but I will revert/fix things immediately if any regressions show up. From Seth Forshee there is a continuation of the work to make the vfs ready for unprivileged mounts. We had thought the previous changes prevented the creation of files outside of s_user_ns of a filesystem, but it turns out we missed the O_CREAT path. Ooops. Pavel Tikhomirov and Oleg Nesterov worked together to fix a long standing bug in the implementation of PR_SET_CHILD_SUBREAPER where only children that are forked after the prctl are considered and not children forked before the prctl. The only known user of this prctl, systemd, forks all children after the prctl. So no userspace regressions will occur. Holding earlier forked children to the same rules as later forked children creates a semantic that is sane enough to allow checkpointing of processes that use this feature. There is a long delayed change by Nikolay Borisov to limit inotify instances inside a user namespace. Michael Kerrisk extends the API for files used to manipulate namespaces with two new trivial ioctls to allow discovery of the hierarchy and properties of namespaces. Konstantin Khlebnikov with the help of Al Viro adds code that when a network namespace exits purges its sysctl entries from the dcache. As in some circumstances this could use a lot of memory. Vivek Goyal fixed a bug with stacked filesystems where the permissions on the wrong inode were being checked. I continue previous work on ptracing across exec. Allowing a file to be setuid across exec while being ptraced if the tracer has enough credentials in the user namespace, and if the process has CAP_SETUID in its own namespace. Proc files for setuid or otherwise undumpable executables are now owned by the root in the user namespace of their mm.
Allowing debugging of setuid applications in containers to work better. A bug I introduced with permission checking and automount is now fixed. The big change is to mark the mounts that the kernel initiates as a result of an automount. This allows the permission checks in sget to be safely suppressed for this kind of mount. As the permission check happened when the original filesystem was mounted. Finally a special case in the mount namespace is removed preventing unbounded chains in the mount hash table, and making the semantics simpler which benefits CRIU. The vfs fix along with related work in ima and evm I believe makes us ready to finish developing and merge fully unprivileged mounts of the fuse filesystem. The cleanups of the mount namespace make it easier to discuss how to fix the worst case complexity of umount. The stacked filesystem fixes pave the way for adding multiple mappings for the filesystem uids so that efficient and safer containers can be implemented" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: proc/sysctl: Don't grab i_lock under sysctl_lock. vfs: Use upper filesystem inode in bprm_fill_uid() proc/sysctl: prune stale dentries during unregistering mnt: Tuck mounts under others instead of creating shadow/side mounts. prctl: propagate has_child_subreaper flag to every descendant introduce the walk_process_tree() helper nsfs: Add an ioctl() to return owner UID of a userns fs: Better permission checking for submounts exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction vfs: open() with O_CREAT should not create inodes with unknown ids nsfs: Add an ioctl() to return the namespace type proc: Better ownership of files for non-dumpable tasks in user namespaces exec: Remove LSM_UNSAFE_PTRACE_CAP exec: Test the ptracer's saved cred to see if the tracee can gain caps exec: Don't reset euid and egid when the tracee has CAP_SETUID inotify: Convert to using per-namespace limits
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-23 23:33:51 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-23 23:33:51 -0500
commit: f1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree: 0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/proc
parent: ef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parent: ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)
5 files changed, 117 insertions, 89 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d773eb9e144..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
 /* building an inode */
+void task_dump_owner(struct task_struct *task, mode_t mode,
+                     kuid_t *ruid, kgid_t *rgid)
+{
+        /* Depending on the state of dumpable compute who should own a
+         * proc file for a task.
+         */
+        const struct cred *cred;
+        kuid_t uid;
+        kgid_t gid;
+        /* Default to the tasks effective ownership */
+        rcu_read_lock();
+        cred = __task_cred(task);
+        uid = cred->euid;
+        gid = cred->egid;
+        rcu_read_unlock();
+        /*
+         * Before the /proc/pid/status file was created the only way to read
+         * the effective uid of a /process was to stat /proc/pid.  Reading
+         * /proc/pid/status is slow enough that procps and other packages
+         * kept stating /proc/pid.  To keep the rules in /proc simple I have
+         * made this apply to all per process world readable and executable
+         * directories.
+         */
+        if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+                struct mm_struct *mm;
+                task_lock(task);
+                mm = task->mm;
+                /* Make non-dumpable tasks owned by some root */
+                if (mm) {
+                        if (get_dumpable(mm) != SUID_DUMP_USER) {
+                                struct user_namespace *user_ns = mm->user_ns;
+                                uid = make_kuid(user_ns, 0);
+                                if (!uid_valid(uid))
+                                        uid = GLOBAL_ROOT_UID;
+                                gid = make_kgid(user_ns, 0);
+                                if (!gid_valid(gid))
+                                        gid = GLOBAL_ROOT_GID;
+                        }
+                } else {
+                        uid = GLOBAL_ROOT_UID;
+                        gid = GLOBAL_ROOT_GID;
+                }
+                task_unlock(task);
+        }
+        *ruid = uid;
+        *rgid = gid;
+}
 struct inode *proc_pid_make_inode(struct super_block * sb,
                                  struct task_struct *task, umode_t mode)
 {
        struct inode * inode;
        struct proc_inode *ei;
-        const struct cred *cred;
        /* We need a new inode */
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
        if (!ei->pid)
                goto out_unlock;
-        if (task_dumpable(task)) {
+        task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-                rcu_read_lock();
-                cred = __task_cred(task);
-                inode->i_uid = cred->euid;
-                inode->i_gid = cred->egid;
-                rcu_read_unlock();
-        }
        security_task_to_inode(task, inode);
 out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
        struct inode *inode = d_inode(dentry);
        struct task_struct *task;
-        const struct cred *cred;
        struct pid_namespace *pid = dentry->d_sb->s_fs_info;
        generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
                         */
                        return -ENOENT;
                }
-                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+                task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
-                    task_dumpable(task)) {
-                        cred = __task_cred(task);
-                        stat->uid = cred->euid;
-                        stat->gid = cred->egid;
-                }
        }
        rcu_read_unlock();
        return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 * Rewrite the inode's ownerships here because the owning task may have
 * performed a setuid(), etc.
 *
- * Before the /proc/pid/status file was created the only way to read
- * the effective uid of a /process was to stat /proc/pid.  Reading
- * /proc/pid/status is slow enough that procps and other packages
- * kept stating /proc/pid.  To keep the rules in /proc simple I have
- * made this apply to all per process world readable and executable
- * directories.
 */
 int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
        struct inode *inode;
        struct task_struct *task;
-        const struct cred *cred;
        if (flags & LOOKUP_RCU)
                return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
        task = get_proc_task(inode);
        if (task) {
-                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+                task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
-                    task_dumpable(task)) {
-                        rcu_read_lock();
-                        cred = __task_cred(task);
-                        inode->i_uid = cred->euid;
-                        inode->i_gid = cred->egid;
-                        rcu_read_unlock();
-                } else {
-                        inode->i_uid = GLOBAL_ROOT_UID;
-                        inode->i_gid = GLOBAL_ROOT_GID;
-                }
                inode->i_mode &= ~(S_ISUID | S_ISGID);
                security_task_to_inode(task, inode);
                put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        bool exact_vma_exists = false;
        struct mm_struct *mm = NULL;
        struct task_struct *task;
-        const struct cred *cred;
        struct inode *inode;
        int status = 0;
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        mmput(mm);
        if (exact_vma_exists) {
-                if (task_dumpable(task)) {
+                task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-                        rcu_read_lock();
-                        cred = __task_cred(task);
-                        inode->i_uid = cred->euid;
-                        inode->i_gid = cred->egid;
-                        rcu_read_unlock();
-                } else {
-                        inode->i_uid = GLOBAL_ROOT_UID;
-                        inode->i_gid = GLOBAL_ROOT_GID;
-                }
                security_task_to_inode(task, inode);
                status = 1;
        }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
        struct files_struct *files;
        struct task_struct *task;
-        const struct cred *cred;
        struct inode *inode;
        unsigned int fd;
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
                                rcu_read_unlock();
                                put_files_struct(files);
-                                if (task_dumpable(task)) {
+                                task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
-                                        rcu_read_lock();
-                                        cred = __task_cred(task);
-                                        inode->i_uid = cred->euid;
-                                        inode->i_gid = cred->egid;
-                                        rcu_read_unlock();
-                                } else {
-                                        inode->i_uid = GLOBAL_ROOT_UID;
-                                        inode->i_gid = GLOBAL_ROOT_GID;
-                                }
                                if (S_ISLNK(inode->i_mode)) {
                                        unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
        de = PDE(inode);
        if (de)
                pde_put(de);
        head = PROC_I(inode)->sysctl;
        if (head) {
                RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
-                sysctl_head_put(head);
+                proc_sys_evict_inode(inode, head);
        }
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
        struct proc_dir_entry *pde;
        struct ctl_table_header *sysctl;
        struct ctl_table *sysctl_entry;
+        struct list_head sysctl_inodes;
        const struct proc_ns_operations *ns_ops;
        struct inode vfs_inode;
 };
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
        return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
-static inline int task_dumpable(struct task_struct *task)
+void task_dump_owner(struct task_struct *task, mode_t mode,
-{
+                     kuid_t *ruid, kgid_t *rgid);
-        int dumpable = 0;
-        struct mm_struct *mm;
-        task_lock(task);
-        mm = task->mm;
-        if (mm)
-                dumpable = get_dumpable(mm);
-        task_unlock(task);
-        if (dumpable == SUID_DUMP_USER)
-                return 1;
-        return 0;
-}
 static inline unsigned name_to_int(const struct qstr *qstr)
 {
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
 */
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+                                 struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct  inode *inode,
+                                        struct ctl_table_header *head) { }
 #endif
 /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
        head->set = set;
        head->parent = NULL;
        head->node = node;
+        INIT_LIST_HEAD(&head->inodes);
        if (node) {
                struct ctl_table *entry;
                for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
                        complete(p->unregistering);
 }
+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+        struct inode *inode, *prev = NULL;
+        struct proc_inode *ei;
+        rcu_read_lock();
+        list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+                inode = igrab(&ei->vfs_inode);
+                if (inode) {
+                        rcu_read_unlock();
+                        iput(prev);
+                        prev = inode;
+                        d_prune_aliases(inode);
+                        rcu_read_lock();
+                }
+        }
+        rcu_read_unlock();
+        iput(prev);
+}
 /* called under sysctl_lock, will reacquire if has to wait */
 static void start_unregistering(struct ctl_table_header *p)
 {
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
                p->unregistering = &wait;
                spin_unlock(&sysctl_lock);
                wait_for_completion(&wait);
-                spin_lock(&sysctl_lock);
        } else {
                /* anything non-NULL; we'll never dereference it */
                p->unregistering = ERR_PTR(-EINVAL);
+                spin_unlock(&sysctl_lock);
        }
        /*
+         * Prune dentries for unregistered sysctls: namespaced sysctls
+         * can have duplicate names and contaminate dcache very badly.
+         */
+        proc_sys_prune_dcache(p);
+        /*
         * do not remove from the list until nobody holds it; walking the
         * list in do_sysctl() relies on that.
         */
-        erase_header(p);
-}
-static void sysctl_head_get(struct ctl_table_header *head)
-{
        spin_lock(&sysctl_lock);
-        head->count++;
+        erase_header(p);
-        spin_unlock(&sysctl_lock);
-}
-void sysctl_head_put(struct ctl_table_header *head)
-{
-        spin_lock(&sysctl_lock);
-        if (!--head->count)
-                kfree_rcu(head, rcu);
-        spin_unlock(&sysctl_lock);
 }
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        inode->i_ino = get_next_ino();
-        sysctl_head_get(head);
        ei = PROC_I(inode);
+        spin_lock(&sysctl_lock);
+        if (unlikely(head->unregistering)) {
+                spin_unlock(&sysctl_lock);
+                iput(inode);
+                inode = NULL;
+                goto out;
+        }
        ei->sysctl = head;
        ei->sysctl_entry = table;
+        list_add_rcu(&ei->sysctl_inodes, &head->inodes);
+        head->count++;
+        spin_unlock(&sysctl_lock);
        inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
        inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
        return inode;
 }
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+        spin_lock(&sysctl_lock);
+        list_del_rcu(&PROC_I(inode)->sysctl_inodes);
+        if (!--head->count)
+                kfree_rcu(head, rcu);
+        spin_unlock(&sysctl_lock);
+}
 static struct ctl_table_header *grab_header(struct inode *inode)
 {
        struct ctl_table_header *head = PROC_I(inode)->sysctl;
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-23 23:33:51 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-23 23:33:51 -0500
commit	f1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree	0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/proc
parent	ef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parent	ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)

diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d773eb9e144..b73b4de8fb36 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
1667		1667
1668	/* building an inode */	1668	/* building an inode */
1669		1669
		1670	void task_dump_owner(struct task_struct *task, mode_t mode,
		1671	kuid_t *ruid, kgid_t *rgid)
		1672	{
		1673	/* Depending on the state of dumpable compute who should own a
		1674	* proc file for a task.
		1675	*/
		1676	const struct cred *cred;
		1677	kuid_t uid;
		1678	kgid_t gid;
		1679
		1680	/* Default to the tasks effective ownership */
		1681	rcu_read_lock();
		1682	cred = __task_cred(task);
		1683	uid = cred->euid;
		1684	gid = cred->egid;
		1685	rcu_read_unlock();
		1686
		1687	/*
		1688	* Before the /proc/pid/status file was created the only way to read
		1689	* the effective uid of a /process was to stat /proc/pid. Reading
		1690	* /proc/pid/status is slow enough that procps and other packages
		1691	* kept stating /proc/pid. To keep the rules in /proc simple I have
		1692	* made this apply to all per process world readable and executable
		1693	* directories.
		1694	*/
		1695	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
		1696	struct mm_struct *mm;
		1697	task_lock(task);
		1698	mm = task->mm;
		1699	/* Make non-dumpable tasks owned by some root */
		1700	if (mm) {
		1701	if (get_dumpable(mm) != SUID_DUMP_USER) {
		1702	struct user_namespace *user_ns = mm->user_ns;
		1703
		1704	uid = make_kuid(user_ns, 0);
		1705	if (!uid_valid(uid))
		1706	uid = GLOBAL_ROOT_UID;
		1707
		1708	gid = make_kgid(user_ns, 0);
		1709	if (!gid_valid(gid))
		1710	gid = GLOBAL_ROOT_GID;
		1711	}
		1712	} else {
		1713	uid = GLOBAL_ROOT_UID;
		1714	gid = GLOBAL_ROOT_GID;
		1715	}
		1716	task_unlock(task);
		1717	}
		1718	*ruid = uid;
		1719	*rgid = gid;
		1720	}
		1721
1670	struct inode *proc_pid_make_inode(struct super_block * sb,	1722	struct inode *proc_pid_make_inode(struct super_block * sb,
1671	struct task_struct *task, umode_t mode)	1723	struct task_struct *task, umode_t mode)
1672	{	1724	{
1673	struct inode * inode;	1725	struct inode * inode;
1674	struct proc_inode *ei;	1726	struct proc_inode *ei;
1675	const struct cred *cred;
1676		1727
1677	/* We need a new inode */	1728	/* We need a new inode */
1678		1729
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
1694	if (!ei->pid)	1745	if (!ei->pid)
1695	goto out_unlock;	1746	goto out_unlock;
1696		1747
1697	if (task_dumpable(task)) {	1748	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1698	rcu_read_lock();
1699	cred = __task_cred(task);
1700	inode->i_uid = cred->euid;
1701	inode->i_gid = cred->egid;
1702	rcu_read_unlock();
1703	}
1704	security_task_to_inode(task, inode);	1749	security_task_to_inode(task, inode);
1705		1750
1706	out:	1751	out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1715	{	1760	{
1716	struct inode *inode = d_inode(dentry);	1761	struct inode *inode = d_inode(dentry);
1717	struct task_struct *task;	1762	struct task_struct *task;
1718	const struct cred *cred;
1719	struct pid_namespace *pid = dentry->d_sb->s_fs_info;	1763	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1720		1764
1721	generic_fillattr(inode, stat);	1765	generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1733	*/	1777	*/
1734	return -ENOENT;	1778	return -ENOENT;
1735	}	1779	}
1736	if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||	1780	task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1737	task_dumpable(task)) {
1738	cred = __task_cred(task);
1739	stat->uid = cred->euid;
1740	stat->gid = cred->egid;
1741	}
1742	}	1781	}
1743	rcu_read_unlock();	1782	rcu_read_unlock();
1744	return 0;	1783	return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1754	* Rewrite the inode's ownerships here because the owning task may have	1793	* Rewrite the inode's ownerships here because the owning task may have
1755	* performed a setuid(), etc.	1794	* performed a setuid(), etc.
1756	*	1795	*
1757	* Before the /proc/pid/status file was created the only way to read
1758	* the effective uid of a /process was to stat /proc/pid. Reading
1759	* /proc/pid/status is slow enough that procps and other packages
1760	* kept stating /proc/pid. To keep the rules in /proc simple I have
1761	* made this apply to all per process world readable and executable
1762	* directories.
1763	*/	1796	*/
1764	int pid_revalidate(struct dentry *dentry, unsigned int flags)	1797	int pid_revalidate(struct dentry *dentry, unsigned int flags)
1765	{	1798	{
1766	struct inode *inode;	1799	struct inode *inode;
1767	struct task_struct *task;	1800	struct task_struct *task;
1768	const struct cred *cred;
1769		1801
1770	if (flags & LOOKUP_RCU)	1802	if (flags & LOOKUP_RCU)
1771	return -ECHILD;	1803	return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1774	task = get_proc_task(inode);	1806	task = get_proc_task(inode);
1775		1807
1776	if (task) {	1808	if (task) {
1777	if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||	1809	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1778	task_dumpable(task)) {	1810
1779	rcu_read_lock();
1780	cred = __task_cred(task);
1781	inode->i_uid = cred->euid;
1782	inode->i_gid = cred->egid;
1783	rcu_read_unlock();
1784	} else {
1785	inode->i_uid = GLOBAL_ROOT_UID;
1786	inode->i_gid = GLOBAL_ROOT_GID;
1787	}
1788	inode->i_mode &= ~(S_ISUID | S_ISGID);	1811	inode->i_mode &= ~(S_ISUID | S_ISGID);
1789	security_task_to_inode(task, inode);	1812	security_task_to_inode(task, inode);
1790	put_task_struct(task);	1813	put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1881	bool exact_vma_exists = false;	1904	bool exact_vma_exists = false;
1882	struct mm_struct *mm = NULL;	1905	struct mm_struct *mm = NULL;
1883	struct task_struct *task;	1906	struct task_struct *task;
1884	const struct cred *cred;
1885	struct inode *inode;	1907	struct inode *inode;
1886	int status = 0;	1908	int status = 0;
1887		1909
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1906	mmput(mm);	1928	mmput(mm);
1907		1929
1908	if (exact_vma_exists) {	1930	if (exact_vma_exists) {
1909	if (task_dumpable(task)) {	1931	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1910	rcu_read_lock();	1932
1911	cred = __task_cred(task);
1912	inode->i_uid = cred->euid;
1913	inode->i_gid = cred->egid;
1914	rcu_read_unlock();
1915	} else {
1916	inode->i_uid = GLOBAL_ROOT_UID;
1917	inode->i_gid = GLOBAL_ROOT_GID;
1918	}
1919	security_task_to_inode(task, inode);	1933	security_task_to_inode(task, inode);
1920	status = 1;	1934	status = 1;
1921	}	1935	}


diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 4274f83bf100..00ce1531b2f5 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
84	{	84	{
85	struct files_struct *files;	85	struct files_struct *files;
86	struct task_struct *task;	86	struct task_struct *task;
87	const struct cred *cred;
88	struct inode *inode;	87	struct inode *inode;
89	unsigned int fd;	88	unsigned int fd;
90		89
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
108	rcu_read_unlock();	107	rcu_read_unlock();
109	put_files_struct(files);	108	put_files_struct(files);
110		109
111	if (task_dumpable(task)) {	110	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
112	rcu_read_lock();
113	cred = __task_cred(task);
114	inode->i_uid = cred->euid;
115	inode->i_gid = cred->egid;
116	rcu_read_unlock();
117	} else {
118	inode->i_uid = GLOBAL_ROOT_UID;
119	inode->i_gid = GLOBAL_ROOT_GID;
120	}
121		111
122	if (S_ISLNK(inode->i_mode)) {	112	if (S_ISLNK(inode->i_mode)) {
123	unsigned i_mode = S_IFLNK;	113	unsigned i_mode = S_IFLNK;


diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 842a5ff5b85c..7ad9ed7958af 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
43	de = PDE(inode);	43	de = PDE(inode);
44	if (de)	44	if (de)
45	pde_put(de);	45	pde_put(de);
		46
46	head = PROC_I(inode)->sysctl;	47	head = PROC_I(inode)->sysctl;
47	if (head) {	48	if (head) {
48	RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);	49	RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
49	sysctl_head_put(head);	50	proc_sys_evict_inode(inode, head);
50	}	51	}
51	}	52	}
52		53


diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 2de5194ba378..5d6960f5f1c0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
65	struct proc_dir_entry *pde;	65	struct proc_dir_entry *pde;
66	struct ctl_table_header *sysctl;	66	struct ctl_table_header *sysctl;
67	struct ctl_table *sysctl_entry;	67	struct ctl_table *sysctl_entry;
		68	struct list_head sysctl_inodes;
68	const struct proc_ns_operations *ns_ops;	69	const struct proc_ns_operations *ns_ops;
69	struct inode vfs_inode;	70	struct inode vfs_inode;
70	};	71	};
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
97	return get_pid_task(proc_pid(inode), PIDTYPE_PID);	98	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
98	}	99	}
99		100
100	static inline int task_dumpable(struct task_struct *task)	101	void task_dump_owner(struct task_struct *task, mode_t mode,
101	{	102	kuid_t *ruid, kgid_t *rgid);
102	int dumpable = 0;
103	struct mm_struct *mm;
104
105	task_lock(task);
106	mm = task->mm;
107	if (mm)
108	dumpable = get_dumpable(mm);
109	task_unlock(task);
110	if (dumpable == SUID_DUMP_USER)
111	return 1;
112	return 0;
113	}
114		103
115	static inline unsigned name_to_int(const struct qstr *qstr)	104	static inline unsigned name_to_int(const struct qstr *qstr)
116	{	105	{
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
249	*/	238	*/
250	#ifdef CONFIG_PROC_SYSCTL	239	#ifdef CONFIG_PROC_SYSCTL
251	extern int proc_sys_init(void);	240	extern int proc_sys_init(void);
252	extern void sysctl_head_put(struct ctl_table_header *);	241	extern void proc_sys_evict_inode(struct inode *inode,
		242	struct ctl_table_header *head);
253	#else	243	#else
254	static inline void proc_sys_init(void) { }	244	static inline void proc_sys_init(void) { }
255	static inline void sysctl_head_put(struct ctl_table_header *head) { }	245	static inline void proc_sys_evict_inode(struct inode *inode,
		246	struct ctl_table_header *head) { }
256	#endif	247	#endif
257		248
258	/*	249	/*


diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d4e37acd4821..3e64c6502dc8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
190	head->set = set;	190	head->set = set;
191	head->parent = NULL;	191	head->parent = NULL;
192	head->node = node;	192	head->node = node;
		193	INIT_LIST_HEAD(&head->inodes);
193	if (node) {	194	if (node) {
194	struct ctl_table *entry;	195	struct ctl_table *entry;
195	for (entry = table; entry->procname; entry++, node++)	196	for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
259	complete(p->unregistering);	260	complete(p->unregistering);
260	}	261	}
261		262
		263	/* called under sysctl_lock */
		264	static void proc_sys_prune_dcache(struct ctl_table_header *head)
		265	{
		266	struct inode *inode, *prev = NULL;
		267	struct proc_inode *ei;
		268
		269	rcu_read_lock();
		270	list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
		271	inode = igrab(&ei->vfs_inode);
		272	if (inode) {
		273	rcu_read_unlock();
		274	iput(prev);
		275	prev = inode;
		276	d_prune_aliases(inode);
		277	rcu_read_lock();
		278	}
		279	}
		280	rcu_read_unlock();
		281	iput(prev);
		282	}
		283
262	/* called under sysctl_lock, will reacquire if has to wait */	284	/* called under sysctl_lock, will reacquire if has to wait */
263	static void start_unregistering(struct ctl_table_header *p)	285	static void start_unregistering(struct ctl_table_header *p)
264	{	286	{
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
272	p->unregistering = &wait;	294	p->unregistering = &wait;
273	spin_unlock(&sysctl_lock);	295	spin_unlock(&sysctl_lock);
274	wait_for_completion(&wait);	296	wait_for_completion(&wait);
275	spin_lock(&sysctl_lock);
276	} else {	297	} else {
277	/* anything non-NULL; we'll never dereference it */	298	/* anything non-NULL; we'll never dereference it */
278	p->unregistering = ERR_PTR(-EINVAL);	299	p->unregistering = ERR_PTR(-EINVAL);
		300	spin_unlock(&sysctl_lock);
279	}	301	}
280	/*	302	/*
		303	* Prune dentries for unregistered sysctls: namespaced sysctls
		304	* can have duplicate names and contaminate dcache very badly.
		305	*/
		306	proc_sys_prune_dcache(p);
		307	/*
281	* do not remove from the list until nobody holds it; walking the	308	* do not remove from the list until nobody holds it; walking the
282	* list in do_sysctl() relies on that.	309	* list in do_sysctl() relies on that.
283	*/	310	*/
284	erase_header(p);
285	}
286
287	static void sysctl_head_get(struct ctl_table_header *head)
288	{
289	spin_lock(&sysctl_lock);	311	spin_lock(&sysctl_lock);
290	head->count++;	312	erase_header(p);
291	spin_unlock(&sysctl_lock);
292	}
293
294	void sysctl_head_put(struct ctl_table_header *head)
295	{
296	spin_lock(&sysctl_lock);
297	if (!--head->count)
298	kfree_rcu(head, rcu);
299	spin_unlock(&sysctl_lock);
300	}	313	}
301		314
302	static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)	315	static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
440		453
441	inode->i_ino = get_next_ino();	454	inode->i_ino = get_next_ino();
442		455
443	sysctl_head_get(head);
444	ei = PROC_I(inode);	456	ei = PROC_I(inode);
		457
		458	spin_lock(&sysctl_lock);
		459	if (unlikely(head->unregistering)) {
		460	spin_unlock(&sysctl_lock);
		461	iput(inode);
		462	inode = NULL;
		463	goto out;
		464	}
445	ei->sysctl = head;	465	ei->sysctl = head;
446	ei->sysctl_entry = table;	466	ei->sysctl_entry = table;
		467	list_add_rcu(&ei->sysctl_inodes, &head->inodes);
		468	head->count++;
		469	spin_unlock(&sysctl_lock);
447		470
448	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);	471	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
449	inode->i_mode = table->mode;	472	inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
466	return inode;	489	return inode;
467	}	490	}
468		491
		492	void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
		493	{
		494	spin_lock(&sysctl_lock);
		495	list_del_rcu(&PROC_I(inode)->sysctl_inodes);
		496	if (!--head->count)
		497	kfree_rcu(head, rcu);
		498	spin_unlock(&sysctl_lock);
		499	}
		500
469	static struct ctl_table_header *grab_header(struct inode *inode)	501	static struct ctl_table_header *grab_header(struct inode *inode)
470	{	502	{
471	struct ctl_table_header *head = PROC_I(inode)->sysctl;	503	struct ctl_table_header *head = PROC_I(inode)->sysctl;