aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sys.c
diff options
context:
space:
mode:
authorCyrill Gorcunov <gorcunov@openvz.org>2014-10-09 18:27:37 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-10-09 22:25:55 -0400
commitf606b77f1a9e362451aca8f81d8f36a3a112139e (patch)
treec21d63af5b7fafa9c01cf93ce2b6a4232d087857 /kernel/sys.c
parent71fe97e185040c5dac3216cd54e186dfa534efa0 (diff)
prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation
During development of c/r we've noticed that if we need to support user namespaces we face a problem with capabilities in the prctl(PR_SET_MM, ...) call: in particular, once a new user namespace is created, capable(CAP_SYS_RESOURCE) no longer passes. One approach is to eliminate the CAP_SYS_RESOURCE check but pass all new values in one bundle, which allows the kernel to make more intensive sanity tests of the values and at the same time allows us to support checkpoint/restore of user namespaces. Thus a new command PR_SET_MM_MAP is introduced. It takes a pointer to a prctl_mm_map structure which carries all the members to be updated. prctl(PR_SET_MM, PR_SET_MM_MAP, struct prctl_mm_map *, size) struct prctl_mm_map { __u64 start_code; __u64 end_code; __u64 start_data; __u64 end_data; __u64 start_brk; __u64 brk; __u64 start_stack; __u64 arg_start; __u64 arg_end; __u64 env_start; __u64 env_end; __u64 *auxv; __u32 auxv_size; __u32 exe_fd; }; All members except @exe_fd correspond to members of struct mm_struct. To figure out which values these members may take, here are the meanings of the members. - start_code, end_code: represent bounds of executable code area - start_data, end_data: represent bounds of data area - start_brk, brk: used to calculate bounds for brk() syscall - start_stack: used when accounting space needed for command line arguments, environment and shmat() syscall - arg_start, arg_end, env_start, env_end: represent memory area supplied for command line arguments and environment variables - auxv, auxv_size: carries auxiliary vector, Elf format specifics - exe_fd: file descriptor number for executable link (/proc/self/exe) Thus we apply the following requirements to the values: 1) Any member except @auxv, @auxv_size, @exe_fd is an address in user space, thus it must lie inside the [mmap_min_addr, mmap_max_addr) interval.
2) While @[start|end]_code and @[start|end]_data may point to nonexistent VMAs (say a program maps its own new .text and .data segments during execution), the rest of the members should belong to a VMA which must exist. 3) Addresses must be ordered, i.e. a @start_ member must not be greater than or equal to the appropriate @end_ member. 4) As in the regular Elf loading procedure we require that @start_brk and @brk be greater than @end_data. 5) If the RLIMIT_DATA rlimit is set to non-infinity, new values should not exceed the existing limit. Same applies to RLIMIT_STACK. 6) Auxiliary vector size must not exceed the existing one (which is predefined as AT_VECTOR_SIZE and depends on architecture). 7) File descriptor passed in @exe_fd should be pointing to an executable file (because we use the existing prctl_set_mm_exe_file_locked helper, it ensures that the file we are going to use as exe link has all required permissions granted). Now about where these members are involved inside kernel code: - @start_code and @end_code are used in /proc/$pid/[stat|statm] output; - @start_data and @end_data are used in /proc/$pid/[stat|statm] output, also they are considered when checking if there is enough space for the brk() syscall result if RLIMIT_DATA is set; - @start_brk is shown in /proc/$pid/stat output and accounted in the brk() syscall if RLIMIT_DATA is set; also this member is tested to find a symbolic name of the mmap event for the perf system (we choose if the event is generated for the "heap" area); one more application is selinux -- we test if a process has PROCESS__EXECHEAP permission if trying to make the heap area executable with the mprotect() syscall; - @brk is the current value for the brk() syscall which lies inside the heap area, it's shown in /proc/$pid/stat.
When the brk() syscall successfully provides a new memory area to user space, upon brk() completion mm::brk is updated to carry the new value; both @start_brk and @brk are actively used in /proc/$pid/maps and /proc/$pid/smaps output to find the symbolic name "heap" for the VMA being scanned; - @start_stack is printed out in /proc/$pid/stat and used to find the symbolic name "stack" for tasks and threads in /proc/$pid/maps and /proc/$pid/smaps output, and the same as with @start_brk -- the perf system uses it for event naming. Also the kernel treats this member as a start address of where to map vDSO pages and to check if there is enough space for the shmat() syscall; - @arg_start, @arg_end, @env_start and @env_end are printed out in /proc/$pid/stat. Another access to the data these members represent is to read /proc/$pid/environ or /proc/$pid/cmdline. The kernel checks any attempt to read these areas with the access_process_vm helper, so a user must have enough rights for this action; - @auxv and @auxv_size may be read from /proc/$pid/auxv. Strictly speaking the kernel doesn't care much about exactly which data is sitting there because it is solely for userspace; - @exe_fd is referred to from /proc/$pid/exe and when generating a coredump. We use the prctl_set_mm_exe_file_locked helper to update this member, so exe-file link modification remains a one-shot action. Still note that updating the exe-file link now doesn't require the sys-resource capability anymore; after all there is not much benefit in preventing setup of one's own file link (there are a number of ways to execute own code -- ptrace, ld-preload, so that the only reliable way to find which exactly code is executed is to inspect running program memory). Still we require the caller to be at least user-namespace root user. I believe the old interface should be deprecated and ripped out in a couple of kernel releases if no one objects.
To test if new interface is implemented in the kernel one can pass PR_SET_MM_MAP_SIZE opcode and the kernel returns the size of currently supported struct prctl_mm_map. [akpm@linux-foundation.org: fix 80-col wordwrap in macro definitions] Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> Cc: Kees Cook <keescook@chromium.org> Cc: Tejun Heo <tj@kernel.org> Acked-by: Andrew Vagin <avagin@openvz.org> Tested-by: Andrew Vagin <avagin@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: H. Peter Anvin <hpa@zytor.com> Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Cc: Pavel Emelyanov <xemul@parallels.com> Cc: Vasiliy Kulikov <segoon@openwall.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Julien Tinnes <jln@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/sys.c')
-rw-r--r--kernel/sys.c190
1 files changed, 189 insertions, 1 deletions
diff --git a/kernel/sys.c b/kernel/sys.c
index 14222a1699c0..f7030b060018 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1687,6 +1687,187 @@ exit:
1687 return err; 1687 return err;
1688} 1688}
1689 1689
1690#ifdef CONFIG_CHECKPOINT_RESTORE
1691/*
1692 * WARNING: we don't require any capability here so be very careful
1693 * in what is allowed for modification from userspace.
 *
 * validate_prctl_map() sanity-checks a struct prctl_mm_map before it
 * is applied. It only reads @prctl_map; nothing is written to
 * current->mm here (mm is used solely for sizeof(mm->saved_auxv)).
 *
 * Returns 0 if every check below passes, -EINVAL otherwise.
1694 */
1695static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1696{
1697 unsigned long mmap_max_addr = TASK_SIZE;
1698 struct mm_struct *mm = current->mm;
1699 int error = -EINVAL, i;
1700
 /*
  * Byte offsets of every address-valued member of the map. They are
  * stored as unsigned char; prctl_set_mm_map() carries a BUILD_BUG_ON
  * that the structure is no larger than 256 bytes.
  */
1701 static const unsigned char offsets[] = {
1702 offsetof(struct prctl_mm_map, start_code),
1703 offsetof(struct prctl_mm_map, end_code),
1704 offsetof(struct prctl_mm_map, start_data),
1705 offsetof(struct prctl_mm_map, end_data),
1706 offsetof(struct prctl_mm_map, start_brk),
1707 offsetof(struct prctl_mm_map, brk),
1708 offsetof(struct prctl_mm_map, start_stack),
1709 offsetof(struct prctl_mm_map, arg_start),
1710 offsetof(struct prctl_mm_map, arg_end),
1711 offsetof(struct prctl_mm_map, env_start),
1712 offsetof(struct prctl_mm_map, env_end),
1713 };
1714
1715 /*
1716 * Make sure none of the address members lies outside the allowed
1717 * user address range [mmap_min_addr, TASK_SIZE).
 *
 * NOTE(review): each u64 member is cast to unsigned long before the
 * comparison, so where unsigned long is narrower than u64 the upper
 * bits are dropped rather than rejected -- confirm this is intended.
1718 */
1719 for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1720 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1721
1722 if ((unsigned long)val >= mmap_max_addr ||
1723 (unsigned long)val < mmap_min_addr)
1724 goto out;
1725 }
1726
1727 /*
1728 * Make sure the pairs are ordered: code and data ranges must be
 * non-empty (strict <), while the brk, arg and env ranges may be
 * empty (<=). Each check yields 0 or -EINVAL, so OR-ing the results
 * together leaves either 0 or -EINVAL in @error.
1729 */
1730#define __prctl_check_order(__m1, __op, __m2) \
1731 ((unsigned long)prctl_map->__m1 __op \
1732 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1733 error = __prctl_check_order(start_code, <, end_code);
1734 error |= __prctl_check_order(start_data, <, end_data);
1735 error |= __prctl_check_order(start_brk, <=, brk);
1736 error |= __prctl_check_order(arg_start, <=, arg_end);
1737 error |= __prctl_check_order(env_start, <=, env_end);
1738 if (error)
1739 goto out;
1740#undef __prctl_check_order
1741
 /* Re-arm the failure code: @error is 0 after the ordering checks. */
1742 error = -EINVAL;
1743
1744 /*
1745 * @start_brk and @brk should both be strictly after @end_data, as in
 * traditional maps.
1746 */
1747 if (prctl_map->start_brk <= prctl_map->end_data ||
1748 prctl_map->brk <= prctl_map->end_data)
1749 goto out;
1750
1751 /*
1752 * Nor should we allow the limits to be overridden if they are set.
 * check_data_rlimit() is defined elsewhere; any non-zero result is
 * treated here as a violation of RLIMIT_DATA.
1753 */
1754 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1755 prctl_map->start_brk, prctl_map->end_data,
1756 prctl_map->start_data))
1757 goto out;
1758
1759 /*
1760 * Someone is trying to cheat the auxv vector: a non-zero @auxv_size
 * needs a non-NULL @auxv and must fit in mm->saved_auxv.
1761 */
1762 if (prctl_map->auxv_size) {
1763 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1764 goto out;
1765 }
1766
1767 /*
1768 * Finally, make sure the caller has the rights to
1769 * change /proc/pid/exe link: only local root should
1770 * be allowed to. An @exe_fd of (u32)-1 skips this check; otherwise
 * both the caller's uid and gid must equal id 0 as mapped in the
 * caller's user namespace.
1771 */
1772 if (prctl_map->exe_fd != (u32)-1) {
1773 struct user_namespace *ns = current_user_ns();
1774 const struct cred *cred = current_cred();
1775
1776 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1777 !gid_eq(cred->gid, make_kgid(ns, 0)))
1778 goto out;
1779 }
1780
1781 error = 0;
1782out:
1783 return error;
1784}
1785
/*
 * prctl_set_mm_map() - handle the PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE
 * sub-commands of prctl(PR_SET_MM, ...).
 *
 * @opt:       PR_SET_MM_MAP_SIZE to query the structure size; any other
 *             value is handled as a request to apply a map.
 * @addr:      user pointer. For PR_SET_MM_MAP_SIZE it receives the size
 *             as an unsigned int; otherwise it is the struct prctl_mm_map
 *             to read.
 * @data_size: size of the structure as passed by the caller; must equal
 *             sizeof(struct prctl_mm_map) when applying a map.
 *
 * Returns 0 on success, -EINVAL on a size mismatch or failed validation,
 * -EFAULT when a copy from userspace fails, or otherwise the value
 * returned by put_user() / prctl_set_mm_exe_file_locked().
 */
1786static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1787{
 /* (u32)-1 in @exe_fd is the "leave the exe link alone" value tested below. */
1788 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1789 unsigned long user_auxv[AT_VECTOR_SIZE];
1790 struct mm_struct *mm = current->mm;
1791 int error;
1792
 /*
  * Compile-time guards: the staging buffer must match mm->saved_auxv
  * exactly (it is memcpy'd over it below), and the structure must stay
  * within 256 bytes (validate_prctl_map() keeps member offsets in
  * unsigned char).
  */
1793 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1794 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1795
 /* Size query: report the structure size and do nothing else. */
1796 if (opt == PR_SET_MM_MAP_SIZE)
1797 return put_user((unsigned int)sizeof(prctl_map),
1798 (unsigned int __user *)addr);
1799
 /* Only an exact size match is accepted. */
1800 if (data_size != sizeof(prctl_map))
1801 return -EINVAL;
1802
1803 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1804 return -EFAULT;
1805
1806 error = validate_prctl_map(&prctl_map);
1807 if (error)
1808 return error;
1809
 /*
  * Stage the new auxv in a zeroed kernel buffer before any lock is
  * taken; validate_prctl_map() has already bounded @auxv_size by
  * sizeof(mm->saved_auxv).
  */
1810 if (prctl_map.auxv_size) {
1811 memset(user_auxv, 0, sizeof(user_auxv));
1812 if (copy_from_user(user_auxv,
1813 (const void __user *)prctl_map.auxv,
1814 prctl_map.auxv_size))
1815 return -EFAULT;
1816
1817 /* Last entry must be AT_NULL as specification requires */
1818 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1819 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1820 }
1821
 /*
  * The exe link is updated with mmap_sem held for write; the semaphore
  * is then downgraded to read for the remaining updates and released
  * with up_read() at the out label. @error is still 0 from the
  * validation above when no exe_fd was supplied.
  */
1822 down_write(&mm->mmap_sem);
1823 if (prctl_map.exe_fd != (u32)-1)
1824 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1825 downgrade_write(&mm->mmap_sem);
1826 if (error)
1827 goto out;
1828
1829 /*
1830 * We don't validate if these members are pointing to
1831 * real present VMAs because the application may have the corresponding
1832 * VMAs already unmapped and the kernel uses these members for statistics
1833 * output in procfs mostly, except
1834 *
1835 * - @start_brk/@brk which are used in do_brk but the kernel looks up
1836 * the VMAs when updating these members so anything wrong written
1837 * here causes the kernel to swear at the userspace program but won't lead
1838 * to any problem in the kernel itself
1839 */
1840
1841 mm->start_code = prctl_map.start_code;
1842 mm->end_code = prctl_map.end_code;
1843 mm->start_data = prctl_map.start_data;
1844 mm->end_data = prctl_map.end_data;
1845 mm->start_brk = prctl_map.start_brk;
1846 mm->brk = prctl_map.brk;
1847 mm->start_stack = prctl_map.start_stack;
1848 mm->arg_start = prctl_map.arg_start;
1849 mm->arg_end = prctl_map.arg_end;
1850 mm->env_start = prctl_map.env_start;
1851 mm->env_end = prctl_map.env_end;
1852
1853 /*
1854 * Note this update of @saved_auxv is lockless thus
1855 * if someone reads this member in procfs while we're
1856 * updating -- they may get partly updated results. It's a
1857 * known and acceptable trade-off: we leave it as is to
1858 * not introduce additional locks here making the kernel
1859 * more complex.
1860 */
1861 if (prctl_map.auxv_size)
1862 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1863
1864 error = 0;
1865out:
1866 up_read(&mm->mmap_sem);
1867 return error;
1868}
1869#endif /* CONFIG_CHECKPOINT_RESTORE */
1870
1690static int prctl_set_mm(int opt, unsigned long addr, 1871static int prctl_set_mm(int opt, unsigned long addr,
1691 unsigned long arg4, unsigned long arg5) 1872 unsigned long arg4, unsigned long arg5)
1692{ 1873{
@@ -1694,9 +1875,16 @@ static int prctl_set_mm(int opt, unsigned long addr,
1694 struct vm_area_struct *vma; 1875 struct vm_area_struct *vma;
1695 int error; 1876 int error;
1696 1877
1697 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1878 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1879 opt != PR_SET_MM_MAP &&
1880 opt != PR_SET_MM_MAP_SIZE)))
1698 return -EINVAL; 1881 return -EINVAL;
1699 1882
1883#ifdef CONFIG_CHECKPOINT_RESTORE
1884 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1885 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1886#endif
1887
1700 if (!capable(CAP_SYS_RESOURCE)) 1888 if (!capable(CAP_SYS_RESOURCE))
1701 return -EPERM; 1889 return -EPERM;
1702 1890