summaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 23:33:51 -0500
commitf1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch)
tree0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/proc
parentef96152e6a36e0510387cb174178b7982c1ae879 (diff)
parentace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman: "There is a lot here. A lot of these changes result in subtle user-visible differences in kernel behavior. I don't expect anything will care but I will revert/fix things immediately if any regressions show up. From Seth Forshee there is a continuation of the work to make the vfs ready for unprivileged mounts. We had thought the previous changes prevented the creation of files outside of s_user_ns of a filesystem, but it turns out we missed the O_CREAT path. Oops. Pavel Tikhomirov and Oleg Nesterov worked together to fix a long-standing bug in the implementation of PR_SET_CHILD_SUBREAPER where only children that are forked after the prctl are considered and not children forked before the prctl. The only known user of this prctl, systemd, forks all children after the prctl, so no userspace regressions will occur. Holding earlier forked children to the same rules as later forked children creates a semantic that is sane enough to allow checkpointing of processes that use this feature. There is a long-delayed change by Nikolay Borisov to limit inotify instances inside a user namespace. Michael Kerrisk extends the API for files used to manipulate namespaces with two new trivial ioctls to allow discovery of the hierarchy and properties of namespaces. Konstantin Khlebnikov, with the help of Al Viro, adds code that, when a network namespace exits, purges its sysctl entries from the dcache, as in some circumstances these could use a lot of memory. Vivek Goyal fixed a bug with stacked filesystems where the permissions on the wrong inode were being checked. I continue previous work on ptracing across exec, allowing a file to be setuid across exec while being ptraced if the tracer has enough credentials in the user namespace, and if the process has CAP_SETUID in its own namespace. Proc files for setuid or otherwise undumpable executables are now owned by the root in the user namespace of their mm.
This allows debugging of setuid applications in containers to work better. A bug I introduced with permission checking and automount is now fixed. The big change is to mark the mounts that the kernel initiates as a result of an automount. This allows the permission checks in sget to be safely suppressed for this kind of mount, as the permission check happened when the original filesystem was mounted. Finally, a special case in the mount namespace is removed, preventing unbounded chains in the mount hash table and making the semantics simpler, which benefits CRIU. The vfs fix, along with related work in ima and evm, I believe makes us ready to finish developing and merge fully unprivileged mounts of the fuse filesystem. The cleanups of the mount namespace make it easier to discuss how to fix the worst-case complexity of umount. The stacked filesystem fixes pave the way for adding multiple mappings for the filesystem uids so that efficient and safer containers can be implemented" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: proc/sysctl: Don't grab i_lock under sysctl_lock. vfs: Use upper filesystem inode in bprm_fill_uid() proc/sysctl: prune stale dentries during unregistering mnt: Tuck mounts under others instead of creating shadow/side mounts. prctl: propagate has_child_subreaper flag to every descendant introduce the walk_process_tree() helper nsfs: Add an ioctl() to return owner UID of a userns fs: Better permission checking for submounts exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction vfs: open() with O_CREAT should not create inodes with unknown ids nsfs: Add an ioctl() to return the namespace type proc: Better ownership of files for non-dumpable tasks in user namespaces exec: Remove LSM_UNSAFE_PTRACE_CAP exec: Test the ptracer's saved cred to see if the tracee can gain caps exec: Don't reset euid and egid when the tracee has CAP_SETUID inotify: Convert to using per-namespace limits
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/base.c102
-rw-r--r--fs/proc/fd.c12
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h23
-rw-r--r--fs/proc/proc_sysctl.c66
5 files changed, 117 insertions, 89 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3d773eb9e144..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
1667 1667
1668/* building an inode */ 1668/* building an inode */
1669 1669
1670void task_dump_owner(struct task_struct *task, mode_t mode,
1671 kuid_t *ruid, kgid_t *rgid)
1672{
1673 /* Depending on the state of dumpable compute who should own a
1674 * proc file for a task.
1675 */
1676 const struct cred *cred;
1677 kuid_t uid;
1678 kgid_t gid;
1679
1680 /* Default to the tasks effective ownership */
1681 rcu_read_lock();
1682 cred = __task_cred(task);
1683 uid = cred->euid;
1684 gid = cred->egid;
1685 rcu_read_unlock();
1686
1687 /*
1688 * Before the /proc/pid/status file was created the only way to read
1689 * the effective uid of a /process was to stat /proc/pid. Reading
1690 * /proc/pid/status is slow enough that procps and other packages
1691 * kept stating /proc/pid. To keep the rules in /proc simple I have
1692 * made this apply to all per process world readable and executable
1693 * directories.
1694 */
1695 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1696 struct mm_struct *mm;
1697 task_lock(task);
1698 mm = task->mm;
1699 /* Make non-dumpable tasks owned by some root */
1700 if (mm) {
1701 if (get_dumpable(mm) != SUID_DUMP_USER) {
1702 struct user_namespace *user_ns = mm->user_ns;
1703
1704 uid = make_kuid(user_ns, 0);
1705 if (!uid_valid(uid))
1706 uid = GLOBAL_ROOT_UID;
1707
1708 gid = make_kgid(user_ns, 0);
1709 if (!gid_valid(gid))
1710 gid = GLOBAL_ROOT_GID;
1711 }
1712 } else {
1713 uid = GLOBAL_ROOT_UID;
1714 gid = GLOBAL_ROOT_GID;
1715 }
1716 task_unlock(task);
1717 }
1718 *ruid = uid;
1719 *rgid = gid;
1720}
1721
1670struct inode *proc_pid_make_inode(struct super_block * sb, 1722struct inode *proc_pid_make_inode(struct super_block * sb,
1671 struct task_struct *task, umode_t mode) 1723 struct task_struct *task, umode_t mode)
1672{ 1724{
1673 struct inode * inode; 1725 struct inode * inode;
1674 struct proc_inode *ei; 1726 struct proc_inode *ei;
1675 const struct cred *cred;
1676 1727
1677 /* We need a new inode */ 1728 /* We need a new inode */
1678 1729
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
1694 if (!ei->pid) 1745 if (!ei->pid)
1695 goto out_unlock; 1746 goto out_unlock;
1696 1747
1697 if (task_dumpable(task)) { 1748 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1698 rcu_read_lock();
1699 cred = __task_cred(task);
1700 inode->i_uid = cred->euid;
1701 inode->i_gid = cred->egid;
1702 rcu_read_unlock();
1703 }
1704 security_task_to_inode(task, inode); 1749 security_task_to_inode(task, inode);
1705 1750
1706out: 1751out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1715{ 1760{
1716 struct inode *inode = d_inode(dentry); 1761 struct inode *inode = d_inode(dentry);
1717 struct task_struct *task; 1762 struct task_struct *task;
1718 const struct cred *cred;
1719 struct pid_namespace *pid = dentry->d_sb->s_fs_info; 1763 struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1720 1764
1721 generic_fillattr(inode, stat); 1765 generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1733 */ 1777 */
1734 return -ENOENT; 1778 return -ENOENT;
1735 } 1779 }
1736 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1780 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1737 task_dumpable(task)) {
1738 cred = __task_cred(task);
1739 stat->uid = cred->euid;
1740 stat->gid = cred->egid;
1741 }
1742 } 1781 }
1743 rcu_read_unlock(); 1782 rcu_read_unlock();
1744 return 0; 1783 return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1754 * Rewrite the inode's ownerships here because the owning task may have 1793 * Rewrite the inode's ownerships here because the owning task may have
1755 * performed a setuid(), etc. 1794 * performed a setuid(), etc.
1756 * 1795 *
1757 * Before the /proc/pid/status file was created the only way to read
1758 * the effective uid of a /process was to stat /proc/pid. Reading
1759 * /proc/pid/status is slow enough that procps and other packages
1760 * kept stating /proc/pid. To keep the rules in /proc simple I have
1761 * made this apply to all per process world readable and executable
1762 * directories.
1763 */ 1796 */
1764int pid_revalidate(struct dentry *dentry, unsigned int flags) 1797int pid_revalidate(struct dentry *dentry, unsigned int flags)
1765{ 1798{
1766 struct inode *inode; 1799 struct inode *inode;
1767 struct task_struct *task; 1800 struct task_struct *task;
1768 const struct cred *cred;
1769 1801
1770 if (flags & LOOKUP_RCU) 1802 if (flags & LOOKUP_RCU)
1771 return -ECHILD; 1803 return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1774 task = get_proc_task(inode); 1806 task = get_proc_task(inode);
1775 1807
1776 if (task) { 1808 if (task) {
1777 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1809 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1778 task_dumpable(task)) { 1810
1779 rcu_read_lock();
1780 cred = __task_cred(task);
1781 inode->i_uid = cred->euid;
1782 inode->i_gid = cred->egid;
1783 rcu_read_unlock();
1784 } else {
1785 inode->i_uid = GLOBAL_ROOT_UID;
1786 inode->i_gid = GLOBAL_ROOT_GID;
1787 }
1788 inode->i_mode &= ~(S_ISUID | S_ISGID); 1811 inode->i_mode &= ~(S_ISUID | S_ISGID);
1789 security_task_to_inode(task, inode); 1812 security_task_to_inode(task, inode);
1790 put_task_struct(task); 1813 put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1881 bool exact_vma_exists = false; 1904 bool exact_vma_exists = false;
1882 struct mm_struct *mm = NULL; 1905 struct mm_struct *mm = NULL;
1883 struct task_struct *task; 1906 struct task_struct *task;
1884 const struct cred *cred;
1885 struct inode *inode; 1907 struct inode *inode;
1886 int status = 0; 1908 int status = 0;
1887 1909
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1906 mmput(mm); 1928 mmput(mm);
1907 1929
1908 if (exact_vma_exists) { 1930 if (exact_vma_exists) {
1909 if (task_dumpable(task)) { 1931 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1910 rcu_read_lock(); 1932
1911 cred = __task_cred(task);
1912 inode->i_uid = cred->euid;
1913 inode->i_gid = cred->egid;
1914 rcu_read_unlock();
1915 } else {
1916 inode->i_uid = GLOBAL_ROOT_UID;
1917 inode->i_gid = GLOBAL_ROOT_GID;
1918 }
1919 security_task_to_inode(task, inode); 1933 security_task_to_inode(task, inode);
1920 status = 1; 1934 status = 1;
1921 } 1935 }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
84{ 84{
85 struct files_struct *files; 85 struct files_struct *files;
86 struct task_struct *task; 86 struct task_struct *task;
87 const struct cred *cred;
88 struct inode *inode; 87 struct inode *inode;
89 unsigned int fd; 88 unsigned int fd;
90 89
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
108 rcu_read_unlock(); 107 rcu_read_unlock();
109 put_files_struct(files); 108 put_files_struct(files);
110 109
111 if (task_dumpable(task)) { 110 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
112 rcu_read_lock();
113 cred = __task_cred(task);
114 inode->i_uid = cred->euid;
115 inode->i_gid = cred->egid;
116 rcu_read_unlock();
117 } else {
118 inode->i_uid = GLOBAL_ROOT_UID;
119 inode->i_gid = GLOBAL_ROOT_GID;
120 }
121 111
122 if (S_ISLNK(inode->i_mode)) { 112 if (S_ISLNK(inode->i_mode)) {
123 unsigned i_mode = S_IFLNK; 113 unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
43 de = PDE(inode); 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46
46 head = PROC_I(inode)->sysctl; 47 head = PROC_I(inode)->sysctl;
47 if (head) { 48 if (head) {
48 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); 49 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
49 sysctl_head_put(head); 50 proc_sys_evict_inode(inode, head);
50 } 51 }
51} 52}
52 53
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
65 struct proc_dir_entry *pde; 65 struct proc_dir_entry *pde;
66 struct ctl_table_header *sysctl; 66 struct ctl_table_header *sysctl;
67 struct ctl_table *sysctl_entry; 67 struct ctl_table *sysctl_entry;
68 struct list_head sysctl_inodes;
68 const struct proc_ns_operations *ns_ops; 69 const struct proc_ns_operations *ns_ops;
69 struct inode vfs_inode; 70 struct inode vfs_inode;
70}; 71};
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
97 return get_pid_task(proc_pid(inode), PIDTYPE_PID); 98 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
98} 99}
99 100
100static inline int task_dumpable(struct task_struct *task) 101void task_dump_owner(struct task_struct *task, mode_t mode,
101{ 102 kuid_t *ruid, kgid_t *rgid);
102 int dumpable = 0;
103 struct mm_struct *mm;
104
105 task_lock(task);
106 mm = task->mm;
107 if (mm)
108 dumpable = get_dumpable(mm);
109 task_unlock(task);
110 if (dumpable == SUID_DUMP_USER)
111 return 1;
112 return 0;
113}
114 103
115static inline unsigned name_to_int(const struct qstr *qstr) 104static inline unsigned name_to_int(const struct qstr *qstr)
116{ 105{
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
249 */ 238 */
250#ifdef CONFIG_PROC_SYSCTL 239#ifdef CONFIG_PROC_SYSCTL
251extern int proc_sys_init(void); 240extern int proc_sys_init(void);
252extern void sysctl_head_put(struct ctl_table_header *); 241extern void proc_sys_evict_inode(struct inode *inode,
242 struct ctl_table_header *head);
253#else 243#else
254static inline void proc_sys_init(void) { } 244static inline void proc_sys_init(void) { }
255static inline void sysctl_head_put(struct ctl_table_header *head) { } 245static inline void proc_sys_evict_inode(struct inode *inode,
246 struct ctl_table_header *head) { }
256#endif 247#endif
257 248
258/* 249/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
190 head->set = set; 190 head->set = set;
191 head->parent = NULL; 191 head->parent = NULL;
192 head->node = node; 192 head->node = node;
193 INIT_LIST_HEAD(&head->inodes);
193 if (node) { 194 if (node) {
194 struct ctl_table *entry; 195 struct ctl_table *entry;
195 for (entry = table; entry->procname; entry++, node++) 196 for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
259 complete(p->unregistering); 260 complete(p->unregistering);
260} 261}
261 262
263/* called under sysctl_lock */
264static void proc_sys_prune_dcache(struct ctl_table_header *head)
265{
266 struct inode *inode, *prev = NULL;
267 struct proc_inode *ei;
268
269 rcu_read_lock();
270 list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
271 inode = igrab(&ei->vfs_inode);
272 if (inode) {
273 rcu_read_unlock();
274 iput(prev);
275 prev = inode;
276 d_prune_aliases(inode);
277 rcu_read_lock();
278 }
279 }
280 rcu_read_unlock();
281 iput(prev);
282}
283
262/* called under sysctl_lock, will reacquire if has to wait */ 284/* called under sysctl_lock, will reacquire if has to wait */
263static void start_unregistering(struct ctl_table_header *p) 285static void start_unregistering(struct ctl_table_header *p)
264{ 286{
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
272 p->unregistering = &wait; 294 p->unregistering = &wait;
273 spin_unlock(&sysctl_lock); 295 spin_unlock(&sysctl_lock);
274 wait_for_completion(&wait); 296 wait_for_completion(&wait);
275 spin_lock(&sysctl_lock);
276 } else { 297 } else {
277 /* anything non-NULL; we'll never dereference it */ 298 /* anything non-NULL; we'll never dereference it */
278 p->unregistering = ERR_PTR(-EINVAL); 299 p->unregistering = ERR_PTR(-EINVAL);
300 spin_unlock(&sysctl_lock);
279 } 301 }
280 /* 302 /*
303 * Prune dentries for unregistered sysctls: namespaced sysctls
304 * can have duplicate names and contaminate dcache very badly.
305 */
306 proc_sys_prune_dcache(p);
307 /*
281 * do not remove from the list until nobody holds it; walking the 308 * do not remove from the list until nobody holds it; walking the
282 * list in do_sysctl() relies on that. 309 * list in do_sysctl() relies on that.
283 */ 310 */
284 erase_header(p);
285}
286
287static void sysctl_head_get(struct ctl_table_header *head)
288{
289 spin_lock(&sysctl_lock); 311 spin_lock(&sysctl_lock);
290 head->count++; 312 erase_header(p);
291 spin_unlock(&sysctl_lock);
292}
293
294void sysctl_head_put(struct ctl_table_header *head)
295{
296 spin_lock(&sysctl_lock);
297 if (!--head->count)
298 kfree_rcu(head, rcu);
299 spin_unlock(&sysctl_lock);
300} 313}
301 314
302static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) 315static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
440 453
441 inode->i_ino = get_next_ino(); 454 inode->i_ino = get_next_ino();
442 455
443 sysctl_head_get(head);
444 ei = PROC_I(inode); 456 ei = PROC_I(inode);
457
458 spin_lock(&sysctl_lock);
459 if (unlikely(head->unregistering)) {
460 spin_unlock(&sysctl_lock);
461 iput(inode);
462 inode = NULL;
463 goto out;
464 }
445 ei->sysctl = head; 465 ei->sysctl = head;
446 ei->sysctl_entry = table; 466 ei->sysctl_entry = table;
467 list_add_rcu(&ei->sysctl_inodes, &head->inodes);
468 head->count++;
469 spin_unlock(&sysctl_lock);
447 470
448 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 471 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
449 inode->i_mode = table->mode; 472 inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
466 return inode; 489 return inode;
467} 490}
468 491
492void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
493{
494 spin_lock(&sysctl_lock);
495 list_del_rcu(&PROC_I(inode)->sysctl_inodes);
496 if (!--head->count)
497 kfree_rcu(head, rcu);
498 spin_unlock(&sysctl_lock);
499}
500
469static struct ctl_table_header *grab_header(struct inode *inode) 501static struct ctl_table_header *grab_header(struct inode *inode)
470{ 502{
471 struct ctl_table_header *head = PROC_I(inode)->sysctl; 503 struct ctl_table_header *head = PROC_I(inode)->sysctl;