aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/cgroup.c3
-rw-r--r--kernel/compat.c17
-rw-r--r--kernel/cred.c27
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c73
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/modsign_certificate.S19
-rw-r--r--kernel/modsign_pubkey.c6
-rw-r--r--kernel/module.c444
-rw-r--r--kernel/nsproxy.c36
-rw-r--r--kernel/pid.c62
-rw-r--r--kernel/pid_namespace.c113
-rw-r--r--kernel/posix-cpu-timers.c3
-rw-r--r--kernel/printk.c40
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/res_counter.c20
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/sched/fair.c5
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/trace.c60
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_uprobe.c8
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c147
-rw-r--r--kernel/utsname.c34
-rw-r--r--kernel/watchdog.c24
31 files changed, 806 insertions, 399 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac0d533eb7de..6c072b6da239 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 55obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 56obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o 57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 58obj-$(CONFIG_KALLSYMS) += kallsyms.o
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 60obj-$(CONFIG_KEXEC) += kexec.o
@@ -137,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y)
137# 137#
138# Pull the signing certificate and any extra certificates into the kernel 138# Pull the signing certificate and any extra certificates into the kernel
139# 139#
140
141quiet_cmd_touch = TOUCH $@
142 cmd_touch = touch $@
143
140extra_certificates: 144extra_certificates:
141 touch $@ 145 $(call cmd,touch)
142 146
143kernel/modsign_pubkey.o: signing_key.x509 extra_certificates 147kernel/modsign_certificate.o: signing_key.x509 extra_certificates
144 148
145############################################################################### 149###############################################################################
146# 150#
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f34c41bfaa37..4855892798fd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1333,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1333 if (ret) 1333 if (ret)
1334 goto out_unlock; 1334 goto out_unlock;
1335 1335
1336 /* See feature-removal-schedule.txt */
1337 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1336 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1338 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1337 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1339 task_tgid_nr(current), current->comm); 1338 task_tgid_nr(current), current->comm);
@@ -3409,7 +3408,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3409{ 3408{
3410 struct cgroup_pidlist *l; 3409 struct cgroup_pidlist *l;
3411 /* don't need task_nsproxy() if we're looking at ourself */ 3410 /* don't need task_nsproxy() if we're looking at ourself */
3412 struct pid_namespace *ns = current->nsproxy->pid_ns; 3411 struct pid_namespace *ns = task_active_pid_ns(current);
3413 3412
3414 /* 3413 /*
3415 * We can't drop the pidlist_mutex before taking the l->mutex in case 3414 * We can't drop the pidlist_mutex before taking the l->mutex in case
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..f6150e92dfc9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1215 return 0;
1216} 1216}
1217 1217
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
1218/* 1235/*
1219 * Allocate user-space memory for the duration of a single system call, 1236 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1237 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/cred.c b/kernel/cred.c
index 8888afb846e9..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,6 +372,31 @@ error_put:
372 return ret; 372 return ret;
373} 373}
374 374
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in a different user namespaces
387 * therefore one is a subset of the other only if a set is an
388 * ancestor of subset and set->euid is owner of subset or one
389 * of subsets ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
399
375/** 400/**
376 * commit_creds - Install new credentials upon the current task 401 * commit_creds - Install new credentials upon the current task
377 * @new: The credentials to be assigned 402 * @new: The credentials to be assigned
@@ -410,7 +435,7 @@ int commit_creds(struct cred *new)
410 !gid_eq(old->egid, new->egid) || 435 !gid_eq(old->egid, new->egid) ||
411 !uid_eq(old->fsuid, new->fsuid) || 436 !uid_eq(old->fsuid, new->fsuid) ||
412 !gid_eq(old->fsgid, new->fsgid) || 437 !gid_eq(old->fsgid, new->fsgid) ||
413 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 438 !cred_cap_issubset(old, new)) {
414 if (task->mm) 439 if (task->mm)
415 set_dumpable(task->mm, suid_dumpable); 440 set_dumpable(task->mm, suid_dumpable);
416 task->pdeath_signal = 0; 441 task->pdeath_signal = 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9ff5493171d..301079d06f24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 6155
6156 event->parent = parent_event; 6156 event->parent = parent_event;
6157 6157
6158 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 event->ns = get_pid_ns(task_active_pid_ns(current));
6159 event->id = atomic64_inc_return(&perf_event_id); 6159 event->id = atomic64_inc_return(&perf_event_id);
6160 6160
6161 event->state = PERF_EVENT_STATE_INACTIVE; 6161 event->state = PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/exit.c b/kernel/exit.c
index 50d2e93c36ea..b4df21937216 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
diff --git a/kernel/fork.c b/kernel/fork.c
index 115d6c2e4cca..85f6d536608d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
@@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1044 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1045 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1046 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1047 if (clone_flags & CLONE_NEWPID)
1048 sig->flags |= SIGNAL_UNKILLABLE;
1049 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1050 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1051 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1438 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1436 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1439 1437
1440 if (thread_group_leader(p)) { 1438 if (thread_group_leader(p)) {
1441 if (is_child_reaper(pid)) 1439 if (is_child_reaper(pid)) {
1442 p->nsproxy->pid_ns->child_reaper = p; 1440 ns_of_pid(pid)->child_reaper = p;
1441 p->signal->flags |= SIGNAL_UNKILLABLE;
1442 }
1443 1443
1444 p->signal->leader_pid = pid; 1444 p->signal->leader_pid = pid;
1445 p->signal->tty = tty_kref_get(current->signal->tty); 1445 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1473,8 +1473,6 @@ bad_fork_cleanup_io:
1473 if (p->io_context) 1473 if (p->io_context)
1474 exit_io_context(p); 1474 exit_io_context(p);
1475bad_fork_cleanup_namespaces: 1475bad_fork_cleanup_namespaces:
1476 if (unlikely(clone_flags & CLONE_NEWPID))
1477 pid_ns_release_proc(p->nsproxy->pid_ns);
1478 exit_task_namespaces(p); 1476 exit_task_namespaces(p);
1479bad_fork_cleanup_mm: 1477bad_fork_cleanup_mm:
1480 if (p->mm) 1478 if (p->mm)
@@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
1554 * Do some preliminary argument and permissions checking before we 1552 * Do some preliminary argument and permissions checking before we
1555 * actually start allocating stuff 1553 * actually start allocating stuff
1556 */ 1554 */
1557 if (clone_flags & CLONE_NEWUSER) { 1555 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1558 if (clone_flags & CLONE_THREAD) 1556 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1559 return -EINVAL; 1557 return -EINVAL;
1560 /* hopefully this check will go away when userns support is
1561 * complete
1562 */
1563 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1564 !capable(CAP_SETGID))
1565 return -EPERM;
1566 } 1558 }
1567 1559
1568 /* 1560 /*
@@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1724{ 1716{
1725 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1717 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1726 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1718 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1727 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1719 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1720 CLONE_NEWUSER|CLONE_NEWPID))
1728 return -EINVAL; 1721 return -EINVAL;
1729 /* 1722 /*
1730 * Not implemented, but pretend it works if there is nothing to 1723 * Not implemented, but pretend it works if there is nothing to
@@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1791{ 1784{
1792 struct fs_struct *fs, *new_fs = NULL; 1785 struct fs_struct *fs, *new_fs = NULL;
1793 struct files_struct *fd, *new_fd = NULL; 1786 struct files_struct *fd, *new_fd = NULL;
1787 struct cred *new_cred = NULL;
1794 struct nsproxy *new_nsproxy = NULL; 1788 struct nsproxy *new_nsproxy = NULL;
1795 int do_sysvsem = 0; 1789 int do_sysvsem = 0;
1796 int err; 1790 int err;
1797 1791
1798 err = check_unshare_flags(unshare_flags); 1792 /*
1799 if (err) 1793 * If unsharing a user namespace must also unshare the thread.
1800 goto bad_unshare_out; 1794 */
1801 1795 if (unshare_flags & CLONE_NEWUSER)
1796 unshare_flags |= CLONE_THREAD;
1797 /*
1798 * If unsharing a pid namespace must also unshare the thread.
1799 */
1800 if (unshare_flags & CLONE_NEWPID)
1801 unshare_flags |= CLONE_THREAD;
1802 /*
1803 * If unsharing a thread from a thread group, must also unshare vm.
1804 */
1805 if (unshare_flags & CLONE_THREAD)
1806 unshare_flags |= CLONE_VM;
1807 /*
1808 * If unsharing vm, must also unshare signal handlers.
1809 */
1810 if (unshare_flags & CLONE_VM)
1811 unshare_flags |= CLONE_SIGHAND;
1802 /* 1812 /*
1803 * If unsharing namespace, must also unshare filesystem information. 1813 * If unsharing namespace, must also unshare filesystem information.
1804 */ 1814 */
1805 if (unshare_flags & CLONE_NEWNS) 1815 if (unshare_flags & CLONE_NEWNS)
1806 unshare_flags |= CLONE_FS; 1816 unshare_flags |= CLONE_FS;
1817
1818 err = check_unshare_flags(unshare_flags);
1819 if (err)
1820 goto bad_unshare_out;
1807 /* 1821 /*
1808 * CLONE_NEWIPC must also detach from the undolist: after switching 1822 * CLONE_NEWIPC must also detach from the undolist: after switching
1809 * to a new ipc namespace, the semaphore arrays from the old 1823 * to a new ipc namespace, the semaphore arrays from the old
@@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1817 err = unshare_fd(unshare_flags, &new_fd); 1831 err = unshare_fd(unshare_flags, &new_fd);
1818 if (err) 1832 if (err)
1819 goto bad_unshare_cleanup_fs; 1833 goto bad_unshare_cleanup_fs;
1820 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1834 err = unshare_userns(unshare_flags, &new_cred);
1821 if (err) 1835 if (err)
1822 goto bad_unshare_cleanup_fd; 1836 goto bad_unshare_cleanup_fd;
1837 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1838 new_cred, new_fs);
1839 if (err)
1840 goto bad_unshare_cleanup_cred;
1823 1841
1824 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1842 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1825 if (do_sysvsem) { 1843 if (do_sysvsem) {
1826 /* 1844 /*
1827 * CLONE_SYSVSEM is equivalent to sys_exit(). 1845 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1854 } 1872 }
1855 1873
1856 task_unlock(current); 1874 task_unlock(current);
1875
1876 if (new_cred) {
1877 /* Install the new user namespace */
1878 commit_creds(new_cred);
1879 new_cred = NULL;
1880 }
1857 } 1881 }
1858 1882
1859 if (new_nsproxy) 1883 if (new_nsproxy)
1860 put_nsproxy(new_nsproxy); 1884 put_nsproxy(new_nsproxy);
1861 1885
1886bad_unshare_cleanup_cred:
1887 if (new_cred)
1888 put_cred(new_cred);
1862bad_unshare_cleanup_fd: 1889bad_unshare_cleanup_fd:
1863 if (new_fd) 1890 if (new_fd)
1864 put_files_struct(new_fd); 1891 put_files_struct(new_fd);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 35c70c9e24d8..e49a288fa479 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused)
818 action = kthread_data(tsk); 818 action = kthread_data(tsk);
819 819
820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
821 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 821 tsk->comm, tsk->pid, action->irq);
822 822
823 823
824 desc = irq_to_desc(action->irq); 824 desc = irq_to_desc(action->irq);
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 767e559dfb10..045504fffbb2 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 SYMBOL_PREFIX "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 SYMBOL_PREFIX "modsign_certificate_list_end:"
28 );
29 23
30/* 24/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
diff --git a/kernel/module.c b/kernel/module.c
index 6e48c3a43599..250092c1d57d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/sysfs.h> 26#include <linux/sysfs.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
@@ -28,6 +29,7 @@
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/elf.h> 30#include <linux/elf.h>
30#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
32#include <linux/security.h>
31#include <linux/seq_file.h> 33#include <linux/seq_file.h>
32#include <linux/syscalls.h> 34#include <linux/syscalls.h>
33#include <linux/fcntl.h> 35#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
59#include <linux/pfn.h> 61#include <linux/pfn.h>
60#include <linux/bsearch.h> 62#include <linux/bsearch.h>
61#include <linux/fips.h> 63#include <linux/fips.h>
64#include <uapi/linux/module.h>
62#include "module-internal.h" 65#include "module-internal.h"
63 66
64#define CREATE_TRACE_POINTS 67#define CREATE_TRACE_POINTS
@@ -372,9 +375,6 @@ static bool check_symbol(const struct symsearch *syms,
372 printk(KERN_WARNING "Symbol %s is being used " 375 printk(KERN_WARNING "Symbol %s is being used "
373 "by a non-GPL module, which will not " 376 "by a non-GPL module, which will not "
374 "be allowed in the future\n", fsa->name); 377 "be allowed in the future\n", fsa->name);
375 printk(KERN_WARNING "Please see the file "
376 "Documentation/feature-removal-schedule.txt "
377 "in the kernel source tree for more details.\n");
378 } 378 }
379 } 379 }
380 380
@@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2283 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2284 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size; 2285 unsigned int i, nsrc, ndst, strtab_size = 0;
2286 2286
2287 /* Put symbol section at end of init part of module. */ 2287 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2288 symsect->sh_flags |= SHF_ALLOC;
@@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2293 src = (void *)info->hdr + symsect->sh_offset; 2293 src = (void *)info->hdr + symsect->sh_offset;
2294 nsrc = symsect->sh_size / sizeof(*src); 2294 nsrc = symsect->sh_size / sizeof(*src);
2295 2295
2296 /* strtab always starts with a nul, so offset 0 is the empty string. */
2297 strtab_size = 1;
2298
2299 /* Compute total space required for the core symbols' strtab. */ 2296 /* Compute total space required for the core symbols' strtab. */
2300 for (ndst = i = 0; i < nsrc; i++) { 2297 for (ndst = i = 0; i < nsrc; i++) {
2301 if (i == 0 || 2298 if (i == 0 ||
@@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2337 mod->core_symtab = dst = mod->module_core + info->symoffs; 2334 mod->core_symtab = dst = mod->module_core + info->symoffs;
2338 mod->core_strtab = s = mod->module_core + info->stroffs; 2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2339 src = mod->symtab; 2336 src = mod->symtab;
2340 *s++ = 0;
2341 for (ndst = i = 0; i < mod->num_symtab; i++) { 2337 for (ndst = i = 0; i < mod->num_symtab; i++) {
2342 if (i == 0 || 2338 if (i == 0 ||
2343 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2339 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
@@ -2378,7 +2374,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2378 2374
2379void * __weak module_alloc(unsigned long size) 2375void * __weak module_alloc(unsigned long size)
2380{ 2376{
2381 return size == 0 ? NULL : vmalloc_exec(size); 2377 return vmalloc_exec(size);
2382} 2378}
2383 2379
2384static void *module_alloc_update_bounds(unsigned long size) 2380static void *module_alloc_update_bounds(unsigned long size)
@@ -2425,18 +2421,17 @@ static inline void kmemleak_load_module(const struct module *mod,
2425#endif 2421#endif
2426 2422
2427#ifdef CONFIG_MODULE_SIG 2423#ifdef CONFIG_MODULE_SIG
2428static int module_sig_check(struct load_info *info, 2424static int module_sig_check(struct load_info *info)
2429 const void *mod, unsigned long *_len)
2430{ 2425{
2431 int err = -ENOKEY; 2426 int err = -ENOKEY;
2432 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; 2427 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2433 unsigned long len = *_len; 2428 const void *mod = info->hdr;
2434 2429
2435 if (len > markerlen && 2430 if (info->len > markerlen &&
2436 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { 2431 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2437 /* We truncate the module to discard the signature */ 2432 /* We truncate the module to discard the signature */
2438 *_len -= markerlen; 2433 info->len -= markerlen;
2439 err = mod_verify_sig(mod, _len); 2434 err = mod_verify_sig(mod, &info->len);
2440 } 2435 }
2441 2436
2442 if (!err) { 2437 if (!err) {
@@ -2454,59 +2449,107 @@ static int module_sig_check(struct load_info *info,
2454 return err; 2449 return err;
2455} 2450}
2456#else /* !CONFIG_MODULE_SIG */ 2451#else /* !CONFIG_MODULE_SIG */
2457static int module_sig_check(struct load_info *info, 2452static int module_sig_check(struct load_info *info)
2458 void *mod, unsigned long *len)
2459{ 2453{
2460 return 0; 2454 return 0;
2461} 2455}
2462#endif /* !CONFIG_MODULE_SIG */ 2456#endif /* !CONFIG_MODULE_SIG */
2463 2457
2464/* Sets info->hdr, info->len and info->sig_ok. */ 2458/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2465static int copy_and_check(struct load_info *info, 2459static int elf_header_check(struct load_info *info)
2466 const void __user *umod, unsigned long len, 2460{
2467 const char __user *uargs) 2461 if (info->len < sizeof(*(info->hdr)))
2462 return -ENOEXEC;
2463
2464 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2465 || info->hdr->e_type != ET_REL
2466 || !elf_check_arch(info->hdr)
2467 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2468 return -ENOEXEC;
2469
2470 if (info->hdr->e_shoff >= info->len
2471 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2472 info->len - info->hdr->e_shoff))
2473 return -ENOEXEC;
2474
2475 return 0;
2476}
2477
2478/* Sets info->hdr and info->len. */
2479static int copy_module_from_user(const void __user *umod, unsigned long len,
2480 struct load_info *info)
2468{ 2481{
2469 int err; 2482 int err;
2470 Elf_Ehdr *hdr;
2471 2483
2472 if (len < sizeof(*hdr)) 2484 info->len = len;
2485 if (info->len < sizeof(*(info->hdr)))
2473 return -ENOEXEC; 2486 return -ENOEXEC;
2474 2487
2488 err = security_kernel_module_from_file(NULL);
2489 if (err)
2490 return err;
2491
2475 /* Suck in entire file: we'll want most of it. */ 2492 /* Suck in entire file: we'll want most of it. */
2476 if ((hdr = vmalloc(len)) == NULL) 2493 info->hdr = vmalloc(info->len);
2494 if (!info->hdr)
2477 return -ENOMEM; 2495 return -ENOMEM;
2478 2496
2479 if (copy_from_user(hdr, umod, len) != 0) { 2497 if (copy_from_user(info->hdr, umod, info->len) != 0) {
2480 err = -EFAULT; 2498 vfree(info->hdr);
2481 goto free_hdr; 2499 return -EFAULT;
2482 } 2500 }
2483 2501
2484 err = module_sig_check(info, hdr, &len); 2502 return 0;
2503}
2504
2505/* Sets info->hdr and info->len. */
2506static int copy_module_from_fd(int fd, struct load_info *info)
2507{
2508 struct file *file;
2509 int err;
2510 struct kstat stat;
2511 loff_t pos;
2512 ssize_t bytes = 0;
2513
2514 file = fget(fd);
2515 if (!file)
2516 return -ENOEXEC;
2517
2518 err = security_kernel_module_from_file(file);
2485 if (err) 2519 if (err)
2486 goto free_hdr; 2520 goto out;
2487 2521
2488 /* Sanity checks against insmoding binaries or wrong arch, 2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
2489 weird elf version */ 2523 if (err)
2490 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2524 goto out;
2491 || hdr->e_type != ET_REL
2492 || !elf_check_arch(hdr)
2493 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2494 err = -ENOEXEC;
2495 goto free_hdr;
2496 }
2497 2525
2498 if (hdr->e_shoff >= len || 2526 if (stat.size > INT_MAX) {
2499 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { 2527 err = -EFBIG;
2500 err = -ENOEXEC; 2528 goto out;
2501 goto free_hdr; 2529 }
2530 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) {
2532 err = -ENOMEM;
2533 goto out;
2502 } 2534 }
2503 2535
2504 info->hdr = hdr; 2536 pos = 0;
2505 info->len = len; 2537 while (pos < stat.size) {
2506 return 0; 2538 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
2539 stat.size - pos);
2540 if (bytes < 0) {
2541 vfree(info->hdr);
2542 err = bytes;
2543 goto out;
2544 }
2545 if (bytes == 0)
2546 break;
2547 pos += bytes;
2548 }
2549 info->len = pos;
2507 2550
2508free_hdr: 2551out:
2509 vfree(hdr); 2552 fput(file);
2510 return err; 2553 return err;
2511} 2554}
2512 2555
@@ -2515,7 +2558,7 @@ static void free_copy(struct load_info *info)
2515 vfree(info->hdr); 2558 vfree(info->hdr);
2516} 2559}
2517 2560
2518static int rewrite_section_headers(struct load_info *info) 2561static int rewrite_section_headers(struct load_info *info, int flags)
2519{ 2562{
2520 unsigned int i; 2563 unsigned int i;
2521 2564
@@ -2543,7 +2586,10 @@ static int rewrite_section_headers(struct load_info *info)
2543 } 2586 }
2544 2587
2545 /* Track but don't keep modinfo and version sections. */ 2588 /* Track but don't keep modinfo and version sections. */
2546 info->index.vers = find_sec(info, "__versions"); 2589 if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
2590 info->index.vers = 0; /* Pretend no __versions section! */
2591 else
2592 info->index.vers = find_sec(info, "__versions");
2547 info->index.info = find_sec(info, ".modinfo"); 2593 info->index.info = find_sec(info, ".modinfo");
2548 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2594 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2549 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2595 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2558,7 +2604,7 @@ static int rewrite_section_headers(struct load_info *info)
2558 * Return the temporary module pointer (we'll replace it with the final 2604 * Return the temporary module pointer (we'll replace it with the final
2559 * one when we move the module sections around). 2605 * one when we move the module sections around).
2560 */ 2606 */
2561static struct module *setup_load_info(struct load_info *info) 2607static struct module *setup_load_info(struct load_info *info, int flags)
2562{ 2608{
2563 unsigned int i; 2609 unsigned int i;
2564 int err; 2610 int err;
@@ -2569,7 +2615,7 @@ static struct module *setup_load_info(struct load_info *info)
2569 info->secstrings = (void *)info->hdr 2615 info->secstrings = (void *)info->hdr
2570 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2616 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2571 2617
2572 err = rewrite_section_headers(info); 2618 err = rewrite_section_headers(info, flags);
2573 if (err) 2619 if (err)
2574 return ERR_PTR(err); 2620 return ERR_PTR(err);
2575 2621
@@ -2607,11 +2653,14 @@ static struct module *setup_load_info(struct load_info *info)
2607 return mod; 2653 return mod;
2608} 2654}
2609 2655
2610static int check_modinfo(struct module *mod, struct load_info *info) 2656static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2611{ 2657{
2612 const char *modmagic = get_modinfo(info, "vermagic"); 2658 const char *modmagic = get_modinfo(info, "vermagic");
2613 int err; 2659 int err;
2614 2660
2661 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2662 modmagic = NULL;
2663
2615 /* This is allowed: modprobe --force will invalidate it. */ 2664 /* This is allowed: modprobe --force will invalidate it. */
2616 if (!modmagic) { 2665 if (!modmagic) {
2617 err = try_to_force_load(mod, "bad vermagic"); 2666 err = try_to_force_load(mod, "bad vermagic");
@@ -2741,20 +2790,23 @@ static int move_module(struct module *mod, struct load_info *info)
2741 memset(ptr, 0, mod->core_size); 2790 memset(ptr, 0, mod->core_size);
2742 mod->module_core = ptr; 2791 mod->module_core = ptr;
2743 2792
2744 ptr = module_alloc_update_bounds(mod->init_size); 2793 if (mod->init_size) {
2745 /* 2794 ptr = module_alloc_update_bounds(mod->init_size);
2746 * The pointer to this block is stored in the module structure 2795 /*
2747 * which is inside the block. This block doesn't need to be 2796 * The pointer to this block is stored in the module structure
2748 * scanned as it contains data and code that will be freed 2797 * which is inside the block. This block doesn't need to be
2749 * after the module is initialized. 2798 * scanned as it contains data and code that will be freed
2750 */ 2799 * after the module is initialized.
2751 kmemleak_ignore(ptr); 2800 */
2752 if (!ptr && mod->init_size) { 2801 kmemleak_ignore(ptr);
2753 module_free(mod, mod->module_core); 2802 if (!ptr) {
2754 return -ENOMEM; 2803 module_free(mod, mod->module_core);
2755 } 2804 return -ENOMEM;
2756 memset(ptr, 0, mod->init_size); 2805 }
2757 mod->module_init = ptr; 2806 memset(ptr, 0, mod->init_size);
2807 mod->module_init = ptr;
2808 } else
2809 mod->module_init = NULL;
2758 2810
2759 /* Transfer each section which specifies SHF_ALLOC */ 2811 /* Transfer each section which specifies SHF_ALLOC */
2760 pr_debug("final section addresses:\n"); 2812 pr_debug("final section addresses:\n");
@@ -2847,18 +2899,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2847 return 0; 2899 return 0;
2848} 2900}
2849 2901
2850static struct module *layout_and_allocate(struct load_info *info) 2902static struct module *layout_and_allocate(struct load_info *info, int flags)
2851{ 2903{
2852 /* Module within temporary copy. */ 2904 /* Module within temporary copy. */
2853 struct module *mod; 2905 struct module *mod;
2854 Elf_Shdr *pcpusec; 2906 Elf_Shdr *pcpusec;
2855 int err; 2907 int err;
2856 2908
2857 mod = setup_load_info(info); 2909 mod = setup_load_info(info, flags);
2858 if (IS_ERR(mod)) 2910 if (IS_ERR(mod))
2859 return mod; 2911 return mod;
2860 2912
2861 err = check_modinfo(mod, info); 2913 err = check_modinfo(mod, info, flags);
2862 if (err) 2914 if (err)
2863 return ERR_PTR(err); 2915 return ERR_PTR(err);
2864 2916
@@ -2945,33 +2997,124 @@ static bool finished_loading(const char *name)
2945 return ret; 2997 return ret;
2946} 2998}
2947 2999
3000/* Call module constructors. */
3001static void do_mod_ctors(struct module *mod)
3002{
3003#ifdef CONFIG_CONSTRUCTORS
3004 unsigned long i;
3005
3006 for (i = 0; i < mod->num_ctors; i++)
3007 mod->ctors[i]();
3008#endif
3009}
3010
3011/* This is where the real work happens */
3012static int do_init_module(struct module *mod)
3013{
3014 int ret = 0;
3015
3016 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod);
3018
3019 /* Set RO and NX regions for core */
3020 set_section_ro_nx(mod->module_core,
3021 mod->core_text_size,
3022 mod->core_ro_size,
3023 mod->core_size);
3024
3025 /* Set RO and NX regions for init */
3026 set_section_ro_nx(mod->module_init,
3027 mod->init_text_size,
3028 mod->init_ro_size,
3029 mod->init_size);
3030
3031 do_mod_ctors(mod);
3032 /* Start the module */
3033 if (mod->init != NULL)
3034 ret = do_one_initcall(mod->init);
3035 if (ret < 0) {
3036 /* Init routine failed: abort. Try to protect us from
3037 buggy refcounters. */
3038 mod->state = MODULE_STATE_GOING;
3039 synchronize_sched();
3040 module_put(mod);
3041 blocking_notifier_call_chain(&module_notify_list,
3042 MODULE_STATE_GOING, mod);
3043 free_module(mod);
3044 wake_up_all(&module_wq);
3045 return ret;
3046 }
3047 if (ret > 0) {
3048 printk(KERN_WARNING
3049"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3050"%s: loading module anyway...\n",
3051 __func__, mod->name, ret,
3052 __func__);
3053 dump_stack();
3054 }
3055
3056 /* Now it's a first class citizen! */
3057 mod->state = MODULE_STATE_LIVE;
3058 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod);
3060
3061 /* We need to finish all async code before the module init sequence is done */
3062 async_synchronize_full();
3063
3064 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */
3066 module_put(mod);
3067 trim_init_extable(mod);
3068#ifdef CONFIG_KALLSYMS
3069 mod->num_symtab = mod->core_num_syms;
3070 mod->symtab = mod->core_symtab;
3071 mod->strtab = mod->core_strtab;
3072#endif
3073 unset_module_init_ro_nx(mod);
3074 module_free(mod, mod->module_init);
3075 mod->module_init = NULL;
3076 mod->init_size = 0;
3077 mod->init_ro_size = 0;
3078 mod->init_text_size = 0;
3079 mutex_unlock(&module_mutex);
3080 wake_up_all(&module_wq);
3081
3082 return 0;
3083}
3084
3085static int may_init_module(void)
3086{
3087 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3088 return -EPERM;
3089
3090 return 0;
3091}
3092
2948/* Allocate and load the module: note that size of section 0 is always 3093/* Allocate and load the module: note that size of section 0 is always
2949 zero, and we rely on this for optional sections. */ 3094 zero, and we rely on this for optional sections. */
2950static struct module *load_module(void __user *umod, 3095static int load_module(struct load_info *info, const char __user *uargs,
2951 unsigned long len, 3096 int flags)
2952 const char __user *uargs)
2953{ 3097{
2954 struct load_info info = { NULL, };
2955 struct module *mod, *old; 3098 struct module *mod, *old;
2956 long err; 3099 long err;
2957 3100
2958 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 3101 err = module_sig_check(info);
2959 umod, len, uargs); 3102 if (err)
3103 goto free_copy;
2960 3104
2961 /* Copy in the blobs from userspace, check they are vaguely sane. */ 3105 err = elf_header_check(info);
2962 err = copy_and_check(&info, umod, len, uargs);
2963 if (err) 3106 if (err)
2964 return ERR_PTR(err); 3107 goto free_copy;
2965 3108
2966 /* Figure out module layout, and allocate all the memory. */ 3109 /* Figure out module layout, and allocate all the memory. */
2967 mod = layout_and_allocate(&info); 3110 mod = layout_and_allocate(info, flags);
2968 if (IS_ERR(mod)) { 3111 if (IS_ERR(mod)) {
2969 err = PTR_ERR(mod); 3112 err = PTR_ERR(mod);
2970 goto free_copy; 3113 goto free_copy;
2971 } 3114 }
2972 3115
2973#ifdef CONFIG_MODULE_SIG 3116#ifdef CONFIG_MODULE_SIG
2974 mod->sig_ok = info.sig_ok; 3117 mod->sig_ok = info->sig_ok;
2975 if (!mod->sig_ok) 3118 if (!mod->sig_ok)
2976 add_taint_module(mod, TAINT_FORCED_MODULE); 3119 add_taint_module(mod, TAINT_FORCED_MODULE);
2977#endif 3120#endif
@@ -2983,25 +3126,25 @@ static struct module *load_module(void __user *umod,
2983 3126
2984 /* Now we've got everything in the final locations, we can 3127 /* Now we've got everything in the final locations, we can
2985 * find optional sections. */ 3128 * find optional sections. */
2986 find_module_sections(mod, &info); 3129 find_module_sections(mod, info);
2987 3130
2988 err = check_module_license_and_versions(mod); 3131 err = check_module_license_and_versions(mod);
2989 if (err) 3132 if (err)
2990 goto free_unload; 3133 goto free_unload;
2991 3134
2992 /* Set up MODINFO_ATTR fields */ 3135 /* Set up MODINFO_ATTR fields */
2993 setup_modinfo(mod, &info); 3136 setup_modinfo(mod, info);
2994 3137
2995 /* Fix up syms, so that st_value is a pointer to location. */ 3138 /* Fix up syms, so that st_value is a pointer to location. */
2996 err = simplify_symbols(mod, &info); 3139 err = simplify_symbols(mod, info);
2997 if (err < 0) 3140 if (err < 0)
2998 goto free_modinfo; 3141 goto free_modinfo;
2999 3142
3000 err = apply_relocations(mod, &info); 3143 err = apply_relocations(mod, info);
3001 if (err < 0) 3144 if (err < 0)
3002 goto free_modinfo; 3145 goto free_modinfo;
3003 3146
3004 err = post_relocation(mod, &info); 3147 err = post_relocation(mod, info);
3005 if (err < 0) 3148 if (err < 0)
3006 goto free_modinfo; 3149 goto free_modinfo;
3007 3150
@@ -3041,14 +3184,14 @@ again:
3041 } 3184 }
3042 3185
3043 /* This has to be done once we're sure module name is unique. */ 3186 /* This has to be done once we're sure module name is unique. */
3044 dynamic_debug_setup(info.debug, info.num_debug); 3187 dynamic_debug_setup(info->debug, info->num_debug);
3045 3188
3046 /* Find duplicate symbols */ 3189 /* Find duplicate symbols */
3047 err = verify_export_symbols(mod); 3190 err = verify_export_symbols(mod);
3048 if (err < 0) 3191 if (err < 0)
3049 goto ddebug; 3192 goto ddebug;
3050 3193
3051 module_bug_finalize(info.hdr, info.sechdrs, mod); 3194 module_bug_finalize(info->hdr, info->sechdrs, mod);
3052 list_add_rcu(&mod->list, &modules); 3195 list_add_rcu(&mod->list, &modules);
3053 mutex_unlock(&module_mutex); 3196 mutex_unlock(&module_mutex);
3054 3197
@@ -3059,16 +3202,17 @@ again:
3059 goto unlink; 3202 goto unlink;
3060 3203
3061 /* Link in to syfs. */ 3204 /* Link in to syfs. */
3062 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); 3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
3063 if (err < 0) 3206 if (err < 0)
3064 goto unlink; 3207 goto unlink;
3065 3208
3066 /* Get rid of temporary copy. */ 3209 /* Get rid of temporary copy. */
3067 free_copy(&info); 3210 free_copy(info);
3068 3211
3069 /* Done! */ 3212 /* Done! */
3070 trace_module_load(mod); 3213 trace_module_load(mod);
3071 return mod; 3214
3215 return do_init_module(mod);
3072 3216
3073 unlink: 3217 unlink:
3074 mutex_lock(&module_mutex); 3218 mutex_lock(&module_mutex);
@@ -3077,7 +3221,7 @@ again:
3077 module_bug_cleanup(mod); 3221 module_bug_cleanup(mod);
3078 wake_up_all(&module_wq); 3222 wake_up_all(&module_wq);
3079 ddebug: 3223 ddebug:
3080 dynamic_debug_remove(info.debug); 3224 dynamic_debug_remove(info->debug);
3081 unlock: 3225 unlock:
3082 mutex_unlock(&module_mutex); 3226 mutex_unlock(&module_mutex);
3083 synchronize_sched(); 3227 synchronize_sched();
@@ -3089,106 +3233,52 @@ again:
3089 free_unload: 3233 free_unload:
3090 module_unload_free(mod); 3234 module_unload_free(mod);
3091 free_module: 3235 free_module:
3092 module_deallocate(mod, &info); 3236 module_deallocate(mod, info);
3093 free_copy: 3237 free_copy:
3094 free_copy(&info); 3238 free_copy(info);
3095 return ERR_PTR(err); 3239 return err;
3096}
3097
3098/* Call module constructors. */
3099static void do_mod_ctors(struct module *mod)
3100{
3101#ifdef CONFIG_CONSTRUCTORS
3102 unsigned long i;
3103
3104 for (i = 0; i < mod->num_ctors; i++)
3105 mod->ctors[i]();
3106#endif
3107} 3240}
3108 3241
3109/* This is where the real work happens */
3110SYSCALL_DEFINE3(init_module, void __user *, umod, 3242SYSCALL_DEFINE3(init_module, void __user *, umod,
3111 unsigned long, len, const char __user *, uargs) 3243 unsigned long, len, const char __user *, uargs)
3112{ 3244{
3113 struct module *mod; 3245 int err;
3114 int ret = 0; 3246 struct load_info info = { };
3115 3247
3116 /* Must have permission */ 3248 err = may_init_module();
3117 if (!capable(CAP_SYS_MODULE) || modules_disabled) 3249 if (err)
3118 return -EPERM; 3250 return err;
3119 3251
3120 /* Do all the hard work */ 3252 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
3121 mod = load_module(umod, len, uargs); 3253 umod, len, uargs);
3122 if (IS_ERR(mod))
3123 return PTR_ERR(mod);
3124 3254
3125 blocking_notifier_call_chain(&module_notify_list, 3255 err = copy_module_from_user(umod, len, &info);
3126 MODULE_STATE_COMING, mod); 3256 if (err)
3257 return err;
3127 3258
3128 /* Set RO and NX regions for core */ 3259 return load_module(&info, uargs, 0);
3129 set_section_ro_nx(mod->module_core, 3260}
3130 mod->core_text_size,
3131 mod->core_ro_size,
3132 mod->core_size);
3133 3261
3134 /* Set RO and NX regions for init */ 3262SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
3135 set_section_ro_nx(mod->module_init, 3263{
3136 mod->init_text_size, 3264 int err;
3137 mod->init_ro_size, 3265 struct load_info info = { };
3138 mod->init_size);
3139 3266
3140 do_mod_ctors(mod); 3267 err = may_init_module();
3141 /* Start the module */ 3268 if (err)
3142 if (mod->init != NULL) 3269 return err;
3143 ret = do_one_initcall(mod->init);
3144 if (ret < 0) {
3145 /* Init routine failed: abort. Try to protect us from
3146 buggy refcounters. */
3147 mod->state = MODULE_STATE_GOING;
3148 synchronize_sched();
3149 module_put(mod);
3150 blocking_notifier_call_chain(&module_notify_list,
3151 MODULE_STATE_GOING, mod);
3152 free_module(mod);
3153 wake_up_all(&module_wq);
3154 return ret;
3155 }
3156 if (ret > 0) {
3157 printk(KERN_WARNING
3158"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3159"%s: loading module anyway...\n",
3160 __func__, mod->name, ret,
3161 __func__);
3162 dump_stack();
3163 }
3164 3270
3165 /* Now it's a first class citizen! */ 3271 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
3166 mod->state = MODULE_STATE_LIVE;
3167 blocking_notifier_call_chain(&module_notify_list,
3168 MODULE_STATE_LIVE, mod);
3169 3272
3170 /* We need to finish all async code before the module init sequence is done */ 3273 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
3171 async_synchronize_full(); 3274 |MODULE_INIT_IGNORE_VERMAGIC))
3275 return -EINVAL;
3172 3276
3173 mutex_lock(&module_mutex); 3277 err = copy_module_from_fd(fd, &info);
3174 /* Drop initial reference. */ 3278 if (err)
3175 module_put(mod); 3279 return err;
3176 trim_init_extable(mod);
3177#ifdef CONFIG_KALLSYMS
3178 mod->num_symtab = mod->core_num_syms;
3179 mod->symtab = mod->core_symtab;
3180 mod->strtab = mod->core_strtab;
3181#endif
3182 unset_module_init_ro_nx(mod);
3183 module_free(mod, mod->module_init);
3184 mod->module_init = NULL;
3185 mod->init_size = 0;
3186 mod->init_ro_size = 0;
3187 mod->init_text_size = 0;
3188 mutex_unlock(&module_mutex);
3189 wake_up_all(&module_wq);
3190 3280
3191 return 0; 3281 return load_module(&info, uargs, flags);
3192} 3282}
3193 3283
3194static inline int within(unsigned long addr, void *start, unsigned long size) 3284static inline int within(unsigned long addr, void *start, unsigned long size)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 7e1c3de1ce45..78e2ecb20165 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk,
157 task_cred_xxx(tsk, user_ns), tsk->fs);
155 if (IS_ERR(new_ns)) { 158 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 159 err = PTR_ERR(new_ns);
157 goto out; 160 goto out;
@@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 186 * On success, returns the new nsproxy.
184 */ 187 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 188int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 190{
191 struct user_namespace *user_ns;
188 int err = 0; 192 int err = 0;
189 193
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 195 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 196 return 0;
193 197
194 if (!capable(CAP_SYS_ADMIN)) 198 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 200 return -EPERM;
196 201
197 *new_nsp = create_new_namespaces(unshare_flags, current, 202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 203 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 204 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 205 err = PTR_ERR(*new_nsp);
201 goto out; 206 goto out;
@@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 246 struct file *file;
242 int err; 247 int err;
243 248
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 249 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 250 if (IS_ERR(file))
249 return PTR_ERR(file); 251 return PTR_ERR(file);
@@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
254 if (nstype && (ops->type != nstype)) 256 if (nstype && (ops->type != nstype))
255 goto out; 257 goto out;
256 258
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 260 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 261 err = PTR_ERR(new_nsproxy);
260 goto out; 262 goto out;
diff --git a/kernel/pid.c b/kernel/pid.c
index fd996c1ed9f8..36aa02ff17d6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
99/* 87/*
100 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,24 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 257 unsigned long flags;
270 258
271 spin_lock_irqsave(&pidmap_lock, flags); 259 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 260 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 261 struct upid *upid = pid->numbers + i;
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 ns->nr_hashed = -1;
274 schedule_work(&ns->proc_work);
275 break;
276 }
277 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 278 spin_unlock_irqrestore(&pidmap_lock, flags);
275 279
276 for (i = 0; i <= pid->level; i++) 280 for (i = 0; i <= pid->level; i++)
@@ -292,6 +296,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 296 goto out;
293 297
294 tmp = ns; 298 tmp = ns;
299 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 300 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 301 nr = alloc_pidmap(tmp);
297 if (nr < 0) 302 if (nr < 0)
@@ -302,22 +307,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 307 tmp = tmp->parent;
303 } 308 }
304 309
310 if (unlikely(is_child_reaper(pid))) {
311 if (pid_ns_prepare_proc(ns))
312 goto out_free;
313 }
314
305 get_pid_ns(ns); 315 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 316 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 317 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 318 INIT_HLIST_HEAD(&pid->tasks[type]);
310 319
311 upid = pid->numbers + ns->level; 320 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 321 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 322 if (ns->nr_hashed < 0)
323 goto out_unlock;
324 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 325 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 326 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
327 upid->ns->nr_hashed++;
328 }
316 spin_unlock_irq(&pidmap_lock); 329 spin_unlock_irq(&pidmap_lock);
317 330
318out: 331out:
319 return pid; 332 return pid;
320 333
334out_unlock:
335 spin_unlock(&pidmap_lock);
321out_free: 336out_free:
322 while (++i <= ns->level) 337 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 338 free_pidmap(pid->numbers + i);
@@ -344,7 +359,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 359
345struct pid *find_vpid(int nr) 360struct pid *find_vpid(int nr)
346{ 361{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 362 return find_pid_ns(nr, task_active_pid_ns(current));
348} 363}
349EXPORT_SYMBOL_GPL(find_vpid); 364EXPORT_SYMBOL_GPL(find_vpid);
350 365
@@ -428,7 +443,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 443
429struct task_struct *find_task_by_vpid(pid_t vnr) 444struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 445{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 446 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 447}
433 448
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 449struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +498,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 498
484pid_t pid_vnr(struct pid *pid) 499pid_t pid_vnr(struct pid *pid)
485{ 500{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 501 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 502}
488EXPORT_SYMBOL_GPL(pid_vnr); 503EXPORT_SYMBOL_GPL(pid_vnr);
489 504
@@ -494,7 +509,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 509
495 rcu_read_lock(); 510 rcu_read_lock();
496 if (!ns) 511 if (!ns)
497 ns = current->nsproxy->pid_ns; 512 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 513 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 514 if (type != PIDTYPE_PID)
500 task = task->group_leader; 515 task = task->group_leader;
@@ -569,6 +584,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 584 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 585 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 586 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
587 init_pid_ns.nr_hashed = 1;
572 588
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 589 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 590 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..fdbd0cdf271a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/err.h> 15#include <linux/err.h>
15#include <linux/acct.h> 16#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
71 return NULL; 72 return NULL;
72} 73}
73 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
75#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
76 83
77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
78{ 86{
79 struct pid_namespace *ns; 87 struct pid_namespace *ns;
80 unsigned int level = parent_pid_ns->level + 1; 88 unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
99 if (ns->pid_cachep == NULL) 107 if (ns->pid_cachep == NULL)
100 goto out_free_map; 108 goto out_free_map;
101 109
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
102 kref_init(&ns->kref); 114 kref_init(&ns->kref);
103 ns->level = level; 115 ns->level = level;
104 ns->parent = get_pid_ns(parent_pid_ns); 116 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 INIT_WORK(&ns->proc_work, proc_cleanup_work);
105 119
106 set_bit(0, ns->pidmap[0].page); 120 set_bit(0, ns->pidmap[0].page);
107 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 121 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
109 for (i = 1; i < PIDMAP_ENTRIES; i++) 123 for (i = 1; i < PIDMAP_ENTRIES; i++)
110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 124 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
111 125
112 err = pid_ns_prepare_proc(ns);
113 if (err)
114 goto out_put_parent_pid_ns;
115
116 return ns; 126 return ns;
117 127
118out_put_parent_pid_ns:
119 put_pid_ns(parent_pid_ns);
120out_free_map: 128out_free_map:
121 kfree(ns->pidmap[0].page); 129 kfree(ns->pidmap[0].page);
122out_free: 130out_free:
@@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
129{ 137{
130 int i; 138 int i;
131 139
140 proc_free_inum(ns->proc_inum);
132 for (i = 0; i < PIDMAP_ENTRIES; i++) 141 for (i = 0; i < PIDMAP_ENTRIES; i++)
133 kfree(ns->pidmap[i].page); 142 kfree(ns->pidmap[i].page);
143 put_user_ns(ns->user_ns);
134 kmem_cache_free(pid_ns_cachep, ns); 144 kmem_cache_free(pid_ns_cachep, ns);
135} 145}
136 146
137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 147struct pid_namespace *copy_pid_ns(unsigned long flags,
148 struct user_namespace *user_ns, struct pid_namespace *old_ns)
138{ 149{
139 if (!(flags & CLONE_NEWPID)) 150 if (!(flags & CLONE_NEWPID))
140 return get_pid_ns(old_ns); 151 return get_pid_ns(old_ns);
141 if (flags & (CLONE_THREAD|CLONE_PARENT)) 152 if (task_active_pid_ns(current) != old_ns)
142 return ERR_PTR(-EINVAL); 153 return ERR_PTR(-EINVAL);
143 return create_pid_namespace(old_ns); 154 return create_pid_namespace(user_ns, old_ns);
144} 155}
145 156
146static void free_pid_ns(struct kref *kref) 157static void free_pid_ns(struct kref *kref)
@@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
211 222
212 /* 223 /*
213 * sys_wait4() above can't reap the TASK_DEAD children. 224 * sys_wait4() above can't reap the TASK_DEAD children.
214 * Make sure they all go away, see __unhash_process(). 225 * Make sure they all go away, see free_pid().
215 */ 226 */
216 for (;;) { 227 for (;;) {
217 bool need_wait = false; 228 set_current_state(TASK_UNINTERRUPTIBLE);
218 229 if (pid_ns->nr_hashed == 1)
219 read_lock(&tasklist_lock);
220 if (!list_empty(&current->children)) {
221 __set_current_state(TASK_UNINTERRUPTIBLE);
222 need_wait = true;
223 }
224 read_unlock(&tasklist_lock);
225
226 if (!need_wait)
227 break; 230 break;
228 schedule(); 231 schedule();
229 } 232 }
233 __set_current_state(TASK_RUNNING);
230 234
231 if (pid_ns->reboot) 235 if (pid_ns->reboot)
232 current->signal->group_exit_code = pid_ns->reboot; 236 current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
239static int pid_ns_ctl_handler(struct ctl_table *table, int write, 243static int pid_ns_ctl_handler(struct ctl_table *table, int write,
240 void __user *buffer, size_t *lenp, loff_t *ppos) 244 void __user *buffer, size_t *lenp, loff_t *ppos)
241{ 245{
246 struct pid_namespace *pid_ns = task_active_pid_ns(current);
242 struct ctl_table tmp = *table; 247 struct ctl_table tmp = *table;
243 248
244 if (write && !capable(CAP_SYS_ADMIN)) 249 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
245 return -EPERM; 250 return -EPERM;
246 251
247 /* 252 /*
@@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
250 * it should synchronize its usage with external means. 255 * it should synchronize its usage with external means.
251 */ 256 */
252 257
253 tmp.data = &current->nsproxy->pid_ns->last_pid; 258 tmp.data = &pid_ns->last_pid;
254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 259 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
255} 260}
256 261
@@ -299,6 +304,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
299 return 0; 304 return 0;
300} 305}
301 306
307static void *pidns_get(struct task_struct *task)
308{
309 struct pid_namespace *ns;
310
311 rcu_read_lock();
312 ns = get_pid_ns(task_active_pid_ns(task));
313 rcu_read_unlock();
314
315 return ns;
316}
317
318static void pidns_put(void *ns)
319{
320 put_pid_ns(ns);
321}
322
323static int pidns_install(struct nsproxy *nsproxy, void *ns)
324{
325 struct pid_namespace *active = task_active_pid_ns(current);
326 struct pid_namespace *ancestor, *new = ns;
327
328 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
329 !nsown_capable(CAP_SYS_ADMIN))
330 return -EPERM;
331
332 /*
333 * Only allow entering the current active pid namespace
334 * or a child of the current active pid namespace.
335 *
336 * This is required for fork to return a usable pid value and
337 * this maintains the property that processes and their
338 * children can not escape their current pid namespace.
339 */
340 if (new->level < active->level)
341 return -EINVAL;
342
343 ancestor = new;
344 while (ancestor->level > active->level)
345 ancestor = ancestor->parent;
346 if (ancestor != active)
347 return -EINVAL;
348
349 put_pid_ns(nsproxy->pid_ns);
350 nsproxy->pid_ns = get_pid_ns(new);
351 return 0;
352}
353
354static unsigned int pidns_inum(void *ns)
355{
356 struct pid_namespace *pid_ns = ns;
357 return pid_ns->proc_inum;
358}
359
360const struct proc_ns_operations pidns_operations = {
361 .name = "pid",
362 .type = CLONE_NEWPID,
363 .get = pidns_get,
364 .put = pidns_put,
365 .install = pidns_install,
366 .inum = pidns_inum,
367};
368
302static __init int pid_namespaces_init(void) 369static __init int pid_namespaces_init(void)
303{ 370{
304 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 371 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d73840271dce..a278cad1d5d6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
12 13
13/* 14/*
14 * Called after updating RLIMIT_CPU to run cpu timer and update 15 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -470,6 +471,8 @@ static void cleanup_timers(struct list_head *head,
470 */ 471 */
471void posix_cpu_timers_exit(struct task_struct *tsk) 472void posix_cpu_timers_exit(struct task_struct *tsk)
472{ 473{
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long));
473 cleanup_timers(tsk->cpu_timers, 476 cleanup_timers(tsk->cpu_timers,
474 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
475 478
diff --git a/kernel/printk.c b/kernel/printk.c
index 22e070f3470a..19c0d7bcf24a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -747,6 +747,21 @@ void __init setup_log_buf(int early)
747 free, (free * 100) / __LOG_BUF_LEN); 747 free, (free * 100) / __LOG_BUF_LEN);
748} 748}
749 749
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
763 "print all kernel messages to the console.");
764
750#ifdef CONFIG_BOOT_PRINTK_DELAY 765#ifdef CONFIG_BOOT_PRINTK_DELAY
751 766
752static int boot_delay; /* msecs delay after each printk during bootup */ 767static int boot_delay; /* msecs delay after each printk during bootup */
@@ -770,13 +785,15 @@ static int __init boot_delay_setup(char *str)
770} 785}
771__setup("boot_delay=", boot_delay_setup); 786__setup("boot_delay=", boot_delay_setup);
772 787
773static void boot_delay_msec(void) 788static void boot_delay_msec(int level)
774{ 789{
775 unsigned long long k; 790 unsigned long long k;
776 unsigned long timeout; 791 unsigned long timeout;
777 792
778 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
779 return; 795 return;
796 }
780 797
781 k = (unsigned long long)loops_per_msec * boot_delay; 798 k = (unsigned long long)loops_per_msec * boot_delay;
782 799
@@ -795,7 +812,7 @@ static void boot_delay_msec(void)
795 } 812 }
796} 813}
797#else 814#else
798static inline void boot_delay_msec(void) 815static inline void boot_delay_msec(int level)
799{ 816{
800} 817}
801#endif 818#endif
@@ -1238,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1238 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1255 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1239} 1256}
1240 1257
1241static bool __read_mostly ignore_loglevel;
1242
1243static int __init ignore_loglevel_setup(char *str)
1244{
1245 ignore_loglevel = 1;
1246 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
1247
1248 return 0;
1249}
1250
1251early_param("ignore_loglevel", ignore_loglevel_setup);
1252module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
1253MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
1254 "print all kernel messages to the console.");
1255
1256/* 1258/*
1257 * Call the console drivers, asking them to write out 1259 * Call the console drivers, asking them to write out
1258 * log_buf[start] to log_buf[end - 1]. 1260 * log_buf[start] to log_buf[end - 1].
@@ -1498,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1498 int this_cpu; 1500 int this_cpu;
1499 int printed_len = 0; 1501 int printed_len = 0;
1500 1502
1501 boot_delay_msec(); 1503 boot_delay_msec(level);
1502 printk_delay(); 1504 printk_delay();
1503 1505
1504 /* This stops the holder of console_sem just where we want him */ 1506 /* This stops the holder of console_sem just where we want him */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..1599157336a6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -215,8 +215,12 @@ ok:
215 smp_rmb(); 215 smp_rmb();
216 if (task->mm) 216 if (task->mm)
217 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 218 rcu_read_lock();
219 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
220 rcu_read_unlock();
219 return -EPERM; 221 return -EPERM;
222 }
223 rcu_read_unlock();
220 224
221 return security_ptrace_access_check(task, mode); 225 return security_ptrace_access_check(task, mode);
222} 226}
@@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,
280 284
281 if (seize) 285 if (seize)
282 flags |= PT_SEIZED; 286 flags |= PT_SEIZED;
283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 287 rcu_read_lock();
288 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
284 flags |= PT_PTRACE_CAP; 289 flags |= PT_PTRACE_CAP;
290 rcu_read_unlock();
285 task->ptrace = flags; 291 task->ptrace = flags;
286 292
287 __ptrace_link(task, current); 293 __ptrace_link(task, current);
@@ -457,6 +463,9 @@ void exit_ptrace(struct task_struct *tracer)
457 return; 463 return;
458 464
459 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
466 if (unlikely(p->ptrace & PT_EXITKILL))
467 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
468
460 if (__ptrace_detach(tracer, p)) 469 if (__ptrace_detach(tracer, p))
461 list_add(&p->ptrace_entry, &ptrace_dead); 470 list_add(&p->ptrace_entry, &ptrace_dead);
462 } 471 }
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 3920d593e63c..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1fb82104bfb..257002c13bb0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4097 goto out_free_cpus_allowed; 4097 goto out_free_cpus_allowed;
4098 } 4098 }
4099 retval = -EPERM; 4099 retval = -EPERM;
4100 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4100 if (!check_same_owner(p)) {
4101 goto out_unlock; 4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4102 4108
4103 retval = security_task_setscheduler(p); 4109 retval = security_task_setscheduler(p);
4104 if (retval) 4110 if (retval)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4603d6cb9e25..5eea8707234a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -793,8 +793,11 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
793 793
794static void task_numa_placement(struct task_struct *p) 794static void task_numa_placement(struct task_struct *p)
795{ 795{
796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq); 796 int seq;
797 797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
798 if (p->numa_scan_seq == seq) 801 if (p->numa_scan_seq == seq)
799 return; 802 return;
800 p->numa_scan_seq = seq; 803 p->numa_scan_seq = seq;
diff --git a/kernel/signal.c b/kernel/signal.c
index a49c7f36ceb3..580a91e63471 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1753 * see comment in do_notify_parent() about the following 4 lines 1753 * see comment in do_notify_parent() about the following 4 lines
1754 */ 1754 */
1755 rcu_read_lock(); 1755 rcu_read_lock();
1756 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1756 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1758 rcu_read_unlock(); 1758 rcu_read_unlock();
1759 1759
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
28cond_syscall(sys_delete_module); 29cond_syscall(sys_delete_module);
29cond_syscall(sys_socketpair); 30cond_syscall(sys_socketpair);
30cond_syscall(sys_bind); 31cond_syscall(sys_bind);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..5a6384450501 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index afd092de45b7..3ffe4c5ad3f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2675} 2675}
2676 2676
2677loff_t 2677loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2679{ 2679{
2680 loff_t ret; 2680 loff_t ret;
2681 2681
2682 if (file->f_mode & FMODE_READ) 2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin); 2683 ret = seq_lseek(file, offset, whence);
2684 else 2684 else
2685 file->f_pos = ret = 1; 2685 file->f_pos = ret = 1;
2686 2686
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 61e081b4ba11..e5125677efa0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3034,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
3034 tr->data[cpu]->entries = val; 3034 tr->data[cpu]->entries = val;
3035} 3035}
3036 3036
3037/* resize @tr's buffer to the size of @size_tr's entries */
3038static int resize_buffer_duplicate_size(struct trace_array *tr,
3039 struct trace_array *size_tr, int cpu_id)
3040{
3041 int cpu, ret = 0;
3042
3043 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3044 for_each_tracing_cpu(cpu) {
3045 ret = ring_buffer_resize(tr->buffer,
3046 size_tr->data[cpu]->entries, cpu);
3047 if (ret < 0)
3048 break;
3049 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3050 }
3051 } else {
3052 ret = ring_buffer_resize(tr->buffer,
3053 size_tr->data[cpu_id]->entries, cpu_id);
3054 if (ret == 0)
3055 tr->data[cpu_id]->entries =
3056 size_tr->data[cpu_id]->entries;
3057 }
3058
3059 return ret;
3060}
3061
3037static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3062static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3038{ 3063{
3039 int ret; 3064 int ret;
@@ -3058,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3058 3083
3059 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3084 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
3060 if (ret < 0) { 3085 if (ret < 0) {
3061 int r = 0; 3086 int r = resize_buffer_duplicate_size(&global_trace,
3062 3087 &global_trace, cpu);
3063 if (cpu == RING_BUFFER_ALL_CPUS) {
3064 int i;
3065 for_each_tracing_cpu(i) {
3066 r = ring_buffer_resize(global_trace.buffer,
3067 global_trace.data[i]->entries,
3068 i);
3069 if (r < 0)
3070 break;
3071 }
3072 } else {
3073 r = ring_buffer_resize(global_trace.buffer,
3074 global_trace.data[cpu]->entries,
3075 cpu);
3076 }
3077
3078 if (r < 0) { 3088 if (r < 0) {
3079 /* 3089 /*
3080 * AARGH! We are left with different 3090 * AARGH! We are left with different
@@ -3212,17 +3222,11 @@ static int tracing_set_tracer(const char *buf)
3212 3222
3213 topts = create_trace_option_files(t); 3223 topts = create_trace_option_files(t);
3214 if (t->use_max_tr) { 3224 if (t->use_max_tr) {
3215 int cpu;
3216 /* we need to make per cpu buffer sizes equivalent */ 3225 /* we need to make per cpu buffer sizes equivalent */
3217 for_each_tracing_cpu(cpu) { 3226 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3218 ret = ring_buffer_resize(max_tr.buffer, 3227 RING_BUFFER_ALL_CPUS);
3219 global_trace.data[cpu]->entries, 3228 if (ret < 0)
3220 cpu); 3229 goto out;
3221 if (ret < 0)
3222 goto out;
3223 max_tr.data[cpu]->entries =
3224 global_trace.data[cpu]->entries;
3225 }
3226 } 3230 }
3227 3231
3228 if (t->init) { 3232 if (t->init) {
@@ -4271,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4271 return -ENOMEM; 4275 return -ENOMEM;
4272 4276
4273 if (*ppos & (PAGE_SIZE - 1)) { 4277 if (*ppos & (PAGE_SIZE - 1)) {
4274 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4275 ret = -EINVAL; 4278 ret = -EINVAL;
4276 goto out; 4279 goto out;
4277 } 4280 }
4278 4281
4279 if (len & (PAGE_SIZE - 1)) { 4282 if (len & (PAGE_SIZE - 1)) {
4280 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4281 if (len < PAGE_SIZE) { 4283 if (len < PAGE_SIZE) {
4282 ret = -EINVAL; 4284 ret = -EINVAL;
4283 goto out; 4285 goto out;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..42ca822fc701 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 33static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 35
36static int stack_trace_disabled __read_mostly;
37static DEFINE_PER_CPU(int, trace_active); 36static DEFINE_PER_CPU(int, trace_active);
38static DEFINE_MUTEX(stack_sysctl_mutex); 37static DEFINE_MUTEX(stack_sysctl_mutex);
39 38
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
116{ 115{
117 int cpu; 116 int cpu;
118 117
119 if (unlikely(!ftrace_enabled || stack_trace_disabled))
120 return;
121
122 preempt_disable_notrace(); 118 preempt_disable_notrace();
123 119
124 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9614db8b0f8c..c86e6d4f67fb 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,6 +22,7 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
@@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv)
263 264
264 /* setup a probe */ 265 /* setup a probe */
265 if (!event) { 266 if (!event) {
266 char *tail = strrchr(filename, '/'); 267 char *tail;
267 char *ptr; 268 char *ptr;
268 269
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 271 if (!tail) {
271 ret = -ENOMEM; 272 ret = -ENOMEM;
272 goto fail_address_parse; 273 goto fail_address_parse;
273 } 274 }
274 275
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 276 ptr = strpbrk(tail, ".-_");
277 if (ptr) 277 if (ptr)
278 *ptr = '\0'; 278 *ptr = '\0';
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..33acb5e53a5f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
51 }, 52 },
52 .owner = GLOBAL_ROOT_UID, 53 .owner = GLOBAL_ROOT_UID,
53 .group = GLOBAL_ROOT_GID, 54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..2b042c42fbc4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
26static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
27 struct uid_gid_map *map); 28 struct uid_gid_map *map);
28 29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
29/* 48/*
30 * Create a new user namespace, deriving the creator from the user in the 49 * Create a new user namespace, deriving the creator from the user in the
31 * passed credentials, and replacing that user with the new root user for the 50 * passed credentials, and replacing that user with the new root user for the
@@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
39 struct user_namespace *ns, *parent_ns = new->user_ns; 58 struct user_namespace *ns, *parent_ns = new->user_ns;
40 kuid_t owner = new->euid; 59 kuid_t owner = new->euid;
41 kgid_t group = new->egid; 60 kgid_t group = new->egid;
61 int ret;
42 62
43 /* The creator needs a mapping in the parent user namespace 63 /* The creator needs a mapping in the parent user namespace
44 * or else we won't be able to reasonably tell userspace who 64 * or else we won't be able to reasonably tell userspace who
@@ -52,38 +72,45 @@ int create_user_ns(struct cred *new)
52 if (!ns) 72 if (!ns)
53 return -ENOMEM; 73 return -ENOMEM;
54 74
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
55 kref_init(&ns->kref); 81 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
56 ns->parent = parent_ns; 83 ns->parent = parent_ns;
57 ns->owner = owner; 84 ns->owner = owner;
58 ns->group = group; 85 ns->group = group;
59 86
60 /* Start with the same capabilities as init but useless for doing 87 set_cred_user_ns(new, ns);
61 * anything as the capabilities are bound to the new user namespace.
62 */
63 new->securebits = SECUREBITS_DEFAULT;
64 new->cap_inheritable = CAP_EMPTY_SET;
65 new->cap_permitted = CAP_FULL_SET;
66 new->cap_effective = CAP_FULL_SET;
67 new->cap_bset = CAP_FULL_SET;
68#ifdef CONFIG_KEYS
69 key_put(new->request_key_auth);
70 new->request_key_auth = NULL;
71#endif
72 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
73
74 /* Leave the new->user_ns reference with the new user namespace. */
75 /* Leave the reference to our user_ns with the new cred. */
76 new->user_ns = ns;
77 88
78 return 0; 89 return 0;
79} 90}
80 91
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
81void free_user_ns(struct kref *kref) 107void free_user_ns(struct kref *kref)
82{ 108{
83 struct user_namespace *parent, *ns = 109 struct user_namespace *parent, *ns =
84 container_of(kref, struct user_namespace, kref); 110 container_of(kref, struct user_namespace, kref);
85 111
86 parent = ns->parent; 112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
87 kmem_cache_free(user_ns_cachep, ns); 114 kmem_cache_free(user_ns_cachep, ns);
88 put_user_ns(parent); 115 put_user_ns(parent);
89} 116}
@@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
372 struct user_namespace *lower_ns; 399 struct user_namespace *lower_ns;
373 uid_t lower; 400 uid_t lower;
374 401
375 lower_ns = current_user_ns(); 402 lower_ns = seq_user_ns(seq);
376 if ((lower_ns == ns) && lower_ns->parent) 403 if ((lower_ns == ns) && lower_ns->parent)
377 lower_ns = lower_ns->parent; 404 lower_ns = lower_ns->parent;
378 405
@@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
393 struct user_namespace *lower_ns; 420 struct user_namespace *lower_ns;
394 gid_t lower; 421 gid_t lower;
395 422
396 lower_ns = current_user_ns(); 423 lower_ns = seq_user_ns(seq);
397 if ((lower_ns == ns) && lower_ns->parent) 424 if ((lower_ns == ns) && lower_ns->parent)
398 lower_ns = lower_ns->parent; 425 lower_ns = lower_ns->parent;
399 426
@@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
669{ 696{
670 struct seq_file *seq = file->private_data; 697 struct seq_file *seq = file->private_data;
671 struct user_namespace *ns = seq->private; 698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
672 700
673 if (!ns->parent) 701 if (!ns->parent)
674 return -EPERM; 702 return -EPERM;
675 703
704 if ((seq_ns != ns) && (seq_ns != ns->parent))
705 return -EPERM;
706
676 return map_write(file, buf, size, ppos, CAP_SETUID, 707 return map_write(file, buf, size, ppos, CAP_SETUID,
677 &ns->uid_map, &ns->parent->uid_map); 708 &ns->uid_map, &ns->parent->uid_map);
678} 709}
@@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
681{ 712{
682 struct seq_file *seq = file->private_data; 713 struct seq_file *seq = file->private_data;
683 struct user_namespace *ns = seq->private; 714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
684 716
685 if (!ns->parent) 717 if (!ns->parent)
686 return -EPERM; 718 return -EPERM;
687 719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
688 return map_write(file, buf, size, ppos, CAP_SETGID, 723 return map_write(file, buf, size, ppos, CAP_SETGID,
689 &ns->gid_map, &ns->parent->gid_map); 724 &ns->gid_map, &ns->parent->gid_map);
690} 725}
@@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
710 struct uid_gid_map *new_map) 745 struct uid_gid_map *new_map)
711{ 746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 }
760 }
761
712 /* Allow anyone to set a mapping that doesn't require privilege */ 762 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid)) 763 if (!cap_valid(cap_setid))
714 return true; 764 return true;
@@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
722 return false; 772 return false;
723} 773}
724 774
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
/* Drop a user namespace reference obtained through userns_get(). */
static void userns_put(void *ns)
{
	struct user_namespace *user_ns = ns;

	put_user_ns(user_ns);
}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
833
725static __init int user_namespaces_init(void) 834static __init int user_namespaces_init(void)
726{ 835{
727 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..08b197e8c485 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
107 get_uts_ns(ns); 120 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 121 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 122 nsproxy->uts_ns = ns;
110 return 0; 123 return 0;
111} 124}
112 125
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
113const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 134 .name = "uts",
115 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
116 .get = utsns_get, 136 .get = utsns_get,
117 .put = utsns_put, 137 .put = utsns_put,
118 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
119}; 140};
120
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c8c21be11ab4..75a2ab3d0b02 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,7 @@
31int watchdog_enabled = 1; 31int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 32int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled; 33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
34 35
35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu)
116 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
117} 118}
118 119
119static u64 get_sample_period(void) 120static void set_sample_period(void)
120{ 121{
121 /* 122 /*
122 * convert watchdog_thresh from seconds to ns 123 * convert watchdog_thresh from seconds to ns
@@ -125,7 +126,7 @@ static u64 get_sample_period(void)
125 * and hard thresholds) to increment before the 126 * and hard thresholds) to increment before the
126 * hardlockup detector generates a warning 127 * hardlockup detector generates a warning
127 */ 128 */
128 return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); 129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
129} 130}
130 131
131/* Commands for resetting the watchdog */ 132/* Commands for resetting the watchdog */
@@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
275 wake_up_process(__this_cpu_read(softlockup_watchdog)); 276 wake_up_process(__this_cpu_read(softlockup_watchdog));
276 277
277 /* .. and repeat */ 278 /* .. and repeat */
278 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 280
280 if (touch_ts == 0) { 281 if (touch_ts == 0) {
281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -343,6 +344,10 @@ static void watchdog_enable(unsigned int cpu)
343{ 344{
344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 345 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
345 346
347 /* kick off the timer for the hardlockup detector */
348 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
349 hrtimer->function = watchdog_timer_fn;
350
346 if (!watchdog_enabled) { 351 if (!watchdog_enabled) {
347 kthread_park(current); 352 kthread_park(current);
348 return; 353 return;
@@ -351,12 +356,8 @@ static void watchdog_enable(unsigned int cpu)
351 /* Enable the perf event */ 356 /* Enable the perf event */
352 watchdog_nmi_enable(cpu); 357 watchdog_nmi_enable(cpu);
353 358
354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
358 /* done here because hrtimer_start can only pin to smp_processor_id() */ 359 /* done here because hrtimer_start can only pin to smp_processor_id() */
359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 360 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
360 HRTIMER_MODE_REL_PINNED); 361 HRTIMER_MODE_REL_PINNED);
361 362
362 /* initialize timestamp */ 363 /* initialize timestamp */
@@ -368,9 +369,6 @@ static void watchdog_disable(unsigned int cpu)
368{ 369{
369 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
370 371
371 if (!watchdog_enabled)
372 return;
373
374 watchdog_set_prio(SCHED_NORMAL, 0); 372 watchdog_set_prio(SCHED_NORMAL, 0);
375 hrtimer_cancel(hrtimer); 373 hrtimer_cancel(hrtimer);
376 /* disable the perf event */ 374 /* disable the perf event */
@@ -386,7 +384,7 @@ static int watchdog_should_run(unsigned int cpu)
386/* 384/*
387 * The watchdog thread function - touches the timestamp. 385 * The watchdog thread function - touches the timestamp.
388 * 386 *
389 * It only runs once every get_sample_period() seconds (4 seconds by 387 * It only runs once every sample_period seconds (4 seconds by
390 * default) to reset the softlockup timestamp. If this gets delayed 388 * default) to reset the softlockup timestamp. If this gets delayed
391 * for more than 2*watchdog_thresh seconds then the debug-printout 389 * for more than 2*watchdog_thresh seconds then the debug-printout
392 * triggers in watchdog_timer_fn(). 390 * triggers in watchdog_timer_fn().
@@ -519,6 +517,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
519 if (ret || !write) 517 if (ret || !write)
520 return ret; 518 return ret;
521 519
520 set_sample_period();
522 if (watchdog_enabled && watchdog_thresh) 521 if (watchdog_enabled && watchdog_thresh)
523 watchdog_enable_all_cpus(); 522 watchdog_enable_all_cpus();
524 else 523 else
@@ -540,6 +539,7 @@ static struct smp_hotplug_thread watchdog_threads = {
540 539
541void __init lockup_detector_init(void) 540void __init lockup_detector_init(void)
542{ 541{
542 set_sample_period();
543 if (smpboot_register_percpu_thread(&watchdog_threads)) { 543 if (smpboot_register_percpu_thread(&watchdog_threads)) {
544 pr_err("Failed to create watchdog threads, disabled\n"); 544 pr_err("Failed to create watchdog threads, disabled\n");
545 watchdog_disabled = -ENODEV; 545 watchdog_disabled = -ENODEV;