Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                     |    1
-rw-r--r--  kernel/audit.c                      |    8
-rw-r--r--  kernel/auditfilter.c                |   10
-rw-r--r--  kernel/bounds.c                     |    2
-rw-r--r--  kernel/capability.c                 |   96
-rw-r--r--  kernel/cgroup.c                     |   14
-rw-r--r--  kernel/cpu.c                        |   11
-rw-r--r--  kernel/cpuset.c                     |   80
-rw-r--r--  kernel/crash_dump.c                 |   34
-rw-r--r--  kernel/cred.c                       |    6
-rw-r--r--  kernel/debug/gdbstub.c              |   30
-rw-r--r--  kernel/exit.c                       |    1
-rw-r--r--  kernel/fork.c                       |  155
-rw-r--r--  kernel/futex.c                      |   15
-rw-r--r--  kernel/futex_compat.c               |   11
-rw-r--r--  kernel/gcov/Kconfig                 |    2
-rw-r--r--  kernel/gcov/Makefile                |    2
-rw-r--r--  kernel/groups.c                     |    2
-rw-r--r--  kernel/irq/irqdesc.c                |   14
-rw-r--r--  kernel/irq/manage.c                 |    2
-rw-r--r--  kernel/irq/proc.c                   |    3
-rw-r--r--  kernel/kallsyms.c                   |   58
-rw-r--r--  kernel/kthread.c                    |   31
-rw-r--r--  kernel/lockdep_proc.c               |    9
-rw-r--r--  kernel/module.c                     |    4
-rw-r--r--  kernel/nsproxy.c                    |    4
-rw-r--r--  kernel/panic.c                      |   10
-rw-r--r--  kernel/perf_event.c                 |   32
-rw-r--r--  kernel/pid.c                        |    2
-rw-r--r--  kernel/pid_namespace.c              |   11
-rw-r--r--  kernel/pm_qos_params.c              |   24
-rw-r--r--  kernel/power/Kconfig                |  237
-rw-r--r--  kernel/power/Makefile               |    3
-rw-r--r--  kernel/power/block_io.c             |    2
-rw-r--r--  kernel/power/hibernate.c            |    9
-rw-r--r--  kernel/power/main.c                 |    3
-rw-r--r--  kernel/power/snapshot.c             |    8
-rw-r--r--  kernel/power/suspend.c              |    4
-rw-r--r--  kernel/printk.c                     |  174
-rw-r--r--  kernel/ptrace.c                     |   27
-rw-r--r--  kernel/rcupdate.c                   |   10
-rw-r--r--  kernel/rcutiny_plugin.h             |    2
-rw-r--r--  kernel/rcutorture.c                 |    1
-rw-r--r--  kernel/res_counter.c                |   14
-rw-r--r--  kernel/sched.c                      |   30
-rw-r--r--  kernel/signal.c                     |   46
-rw-r--r--  kernel/smp.c                        |  152
-rw-r--r--  kernel/softirq.c                    |    5
-rw-r--r--  kernel/stop_machine.c               |    6
-rw-r--r--  kernel/sys.c                        |   81
-rw-r--r--  kernel/sysctl.c                     |   40
-rw-r--r--  kernel/sysctl_check.c               |   10
-rw-r--r--  kernel/taskstats.c                  |    2
-rw-r--r--  kernel/trace/Kconfig                |    4
-rw-r--r--  kernel/trace/blktrace.c             |   15
-rw-r--r--  kernel/trace/ftrace.c               |    3
-rw-r--r--  kernel/trace/ring_buffer.c          |    2
-rw-r--r--  kernel/trace/trace_events_filter.c  |    2
-rw-r--r--  kernel/uid16.c                      |    2
-rw-r--r--  kernel/user.c                       |    8
-rw-r--r--  kernel/utsname.c                    |   12
-rw-r--r--  kernel/watchdog.c                   |   27
-rw-r--r--  kernel/workqueue.c                  |   12
63 files changed, 1061 insertions(+), 576 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
 int audit_enabled;
 int audit_ever_enabled;
 
+EXPORT_SYMBOL_GPL(audit_enabled);
+
 /* Default state when kernel boots without any parameters. */
 static int audit_default;
 
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
        pid = NETLINK_CREDS(skb)->pid;
        uid = NETLINK_CREDS(skb)->uid;
-       loginuid = NETLINK_CB(skb).loginuid;
-       sessionid = NETLINK_CB(skb).sessionid;
-       sid = NETLINK_CB(skb).sid;
+       loginuid = audit_get_loginuid(current);
+       sessionid = audit_get_sessionid(current);
+       security_task_getsecid(current, &sid);
        seq = nlh->nlmsg_seq;
        data = NLMSG_DATA(nlh);
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
        for (i = 0; i < rule->field_count; i++) {
                struct audit_field *f = &rule->fields[i];
                int result = 0;
+               u32 sid;
 
                switch (f->type) {
                case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
                        result = audit_comparator(cb->creds.gid, f->op, f->val);
                        break;
                case AUDIT_LOGINUID:
-                       result = audit_comparator(cb->loginuid, f->op, f->val);
+                       result = audit_comparator(audit_get_loginuid(current),
+                                                 f->op, f->val);
                        break;
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
-                       if (f->lsm_rule)
-                               result = security_audit_rule_match(cb->sid,
+                       if (f->lsm_rule) {
+                               security_task_getsecid(current, &sid);
+                               result = security_audit_rule_match(sid,
                                                                   f->type,
                                                                   f->op,
                                                                   f->lsm_rule,
                                                                   NULL);
+                       }
                        break;
                }
 
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
 
 void foo(void)
 {
        /* The enum constants to put into include/generated/bounds.h */
        DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
        DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+       DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
        /* End of constants */
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -290,6 +291,60 @@ error:
 }
 
 /**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+       int ret = security_real_capable(t, &init_user_ns, cap);
+
+       return (ret == 0);
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+                      struct user_namespace *ns, int cap)
+{
+       int ret = security_real_capable(t, ns, cap);
+
+       return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+       int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+       return (ret == 0);
+}
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
@@ -299,17 +354,48 @@ error:
  * This sets PF_SUPERPRIV on the task if the capability is available on the
  * assumption that it's about to be used.
  */
-int capable(int cap)
+bool capable(int cap)
+{
+       return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
 {
        if (unlikely(!cap_valid(cap))) {
                printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
                BUG();
        }
 
-       if (security_capable(current_cred(), cap) == 0) {
+       if (security_capable(ns, current_cred(), cap) == 0) {
                current->flags |= PF_SUPERPRIV;
-               return 1;
+               return true;
        }
-       return 0;
+       return false;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+       return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95362d15128c..e31b220a743d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
        /* Update the css_set linked lists if we're using them */
        write_lock(&css_set_lock);
-       if (!list_empty(&tsk->cg_list)) {
-               list_del(&tsk->cg_list);
-               list_add(&tsk->cg_list, &newcg->tasks);
-       }
+       if (!list_empty(&tsk->cg_list))
+               list_move(&tsk->cg_list, &newcg->tasks);
        write_unlock(&css_set_lock);
 
        for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
        spin_lock(&release_list_lock);
        set_bit(CGRP_REMOVED, &cgrp->flags);
        if (!list_empty(&cgrp->release_list))
-               list_del(&cgrp->release_list);
+               list_del_init(&cgrp->release_list);
        spin_unlock(&release_list_lock);
 
        cgroup_lock_hierarchy(cgrp->root);
        /* delete this cgroup from parent->children */
-       list_del(&cgrp->sibling);
+       list_del_init(&cgrp->sibling);
        cgroup_unlock_hierarchy(cgrp->root);
 
        d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        subsys[ss->subsys_id] = NULL;
 
        /* remove subsystem from rootnode's list of subsystems */
-       list_del(&ss->sibling);
+       list_del_init(&ss->sibling);
 
        /*
         * disentangle the css from all css_sets attached to the dummytop. as
@@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        if (!list_empty(&tsk->cg_list)) {
                write_lock(&css_set_lock);
                if (!list_empty(&tsk->cg_list))
-                       list_del(&tsk->cg_list);
+                       list_del_init(&tsk->cg_list);
                write_unlock(&css_set_lock);
        }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..c95fc4df0faa 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
        BUG_ON(cpu_notify(val, v));
 }
-
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
                return err;
 
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
        return 0;
 }
 
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                return -EINVAL;
 
        cpu_hotplug_begin();
+
        err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
        if (err) {
                nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
        ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
        if (ret) {
                nr_calls--;
-               printk("%s: attempt to bring up CPU %u failed\n",
+               printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
                        __func__, cpu);
                goto out_notify;
        }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
        if (cpumask_empty(frozen_cpus))
                goto out;
 
-       printk("Enabling non-boot CPUs ...\n");
+       printk(KERN_INFO "Enabling non-boot CPUs ...\n");
 
        arch_enable_nonboot_cpus_begin();
 
        for_each_cpu(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
-                       printk("CPU%d is up\n", cpu);
+                       printk(KERN_INFO "CPU%d is up\n", cpu);
                        continue;
                }
                printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
  */
 
 /* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x)      [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x)      [x+1][0] = (1UL << (x))
 #define MASK_DECLARE_2(x)      MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
 #define MASK_DECLARE_4(x)      MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
 #define MASK_DECLARE_8(x)      MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
-       NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-       if (!newmems)
-               return;
+       static nodemask_t newmems;      /* protected by cgroup_mutex */
 
        cs = cgroup_cs(scan->cg);
-       guarantee_online_mems(cs, newmems);
-
-       cpuset_change_task_nodemask(p, newmems);
-
-       NODEMASK_FREE(newmems);
+       guarantee_online_mems(cs, &newmems);
+
+       cpuset_change_task_nodemask(p, &newmems);
 
        mm = get_task_mm(p);
        if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
-       NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
-       NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-       if (from == NULL || to == NULL)
-               goto alloc_fail;
+       static nodemask_t to;   /* protected by cgroup_mutex */
 
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
        } else {
                guarantee_online_cpus(cs, cpus_attach);
        }
-       guarantee_online_mems(cs, to);
+       guarantee_online_mems(cs, &to);
 
        /* do per-task migration stuff possibly for each in the threadgroup */
-       cpuset_attach_task(tsk, to, cs);
+       cpuset_attach_task(tsk, &to, cs);
        if (threadgroup) {
                struct task_struct *c;
                rcu_read_lock();
                list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       cpuset_attach_task(c, to, cs);
+                       cpuset_attach_task(c, &to, cs);
                }
                rcu_read_unlock();
        }
 
        /* change mm; only needs to be done once even if threadgroup */
-       *from = oldcs->mems_allowed;
-       *to = cs->mems_allowed;
+       to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
-               mpol_rebind_mm(mm, to);
+               mpol_rebind_mm(mm, &to);
                if (is_memory_migrate(cs))
-                       cpuset_migrate_mm(mm, from, to);
+                       cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
                mmput(mm);
        }
-
-alloc_fail:
-       NODEMASK_FREE(from);
-       NODEMASK_FREE(to);
 }
1480 1466
1481/* The various types of files and directories in a cpuset file system */ 1467/* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
  * across a page fault.
  */
 
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-       int ret;
+       size_t count;
 
        mutex_lock(&callback_mutex);
-       ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+       count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
        mutex_unlock(&callback_mutex);
 
-       return ret;
+       return count;
 }
 
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-       NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
-       int retval;
-
-       if (mask == NULL)
-               return -ENOMEM;
+       size_t count;
 
        mutex_lock(&callback_mutex);
-       *mask = cs->mems_allowed;
+       count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
        mutex_unlock(&callback_mutex);
 
-       retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-       NODEMASK_FREE(mask);
-
-       return retval;
+       return count;
 }
 
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
        cs = cgroup_cs(cgroup);
        parent_cs = cgroup_cs(parent);
 
+       mutex_lock(&callback_mutex);
        cs->mems_allowed = parent_cs->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+       mutex_unlock(&callback_mutex);
        return;
 }
 
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
        struct cgroup *cont;
-       NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-       if (oldmems == NULL)
-               return;
+       static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
        list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
 
-               *oldmems = cp->mems_allowed;
+               oldmems = cp->mems_allowed;
 
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
@@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                        remove_tasks_in_empty_cpuset(cp);
                else {
                        update_tasks_cpumask(cp, NULL);
-                       update_tasks_nodemask(cp, oldmems, NULL);
+                       update_tasks_nodemask(cp, &oldmems, NULL);
                }
        }
-       NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
-       NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-       if (oldmems == NULL)
-               return NOTIFY_DONE;
+       static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
        cgroup_lock();
        switch (action) {
        case MEM_ONLINE:
-               *oldmems = top_cpuset.mems_allowed;
+               oldmems = top_cpuset.mems_allowed;
                mutex_lock(&callback_mutex);
                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
                mutex_unlock(&callback_mutex);
-               update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+               update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
                break;
        case MEM_OFFLINE:
                /*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
        }
        cgroup_unlock();
 
-       NODEMASK_FREE(oldmems);
        return NOTIFY_OK;
 }
 #endif
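
The recurring change in this file swaps a heap-allocated nodemask for a function-local static one. NODEMASK_ALLOC() exists because a nodemask_t can be too large for the stack with a big CONFIG_NODES_SHIFT, but the allocation can fail, and several of these callers had no sane way to report that. A static mask cannot fail; the cost is that every caller must be serialized, which cgroup_mutex already guarantees here, hence the repeated /* protected by cgroup_mutex */ annotations. Condensed sketch of the transformation:

    /* before: may return NULL, must be freed */
    NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);

    /* after: always available, but only valid under the outer lock */
    static nodemask_t mask;             /* protected by cgroup_mutex */
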
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+       char *end;
+       if (!arg)
+               return -EINVAL;
+       elfcorehdr_addr = memparse(arg, &end);
+       return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
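
Since setup_elfcorehdr() parses its argument with memparse(), the parameter accepts the usual K/M/G size suffixes. A capture kernel loaded by kexec is typically booted with something like (hypothetical address):

    elfcorehdr=769468K

which leaves elfcorehdr_addr holding 769468 * 1024, the physical address of the ELF core header the crashed kernel left behind; is_kdump_kernel() then reports true because the address is no longer ELFCORE_ADDR_MAX.
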
diff --git a/kernel/cred.c b/kernel/cred.c
index 2343c132c5a7..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 }
 EXPORT_SYMBOL(set_create_files_as);
 
+struct user_namespace *current_user_ns(void)
+{
+       return _current_user_ns();
+}
+EXPORT_SYMBOL(current_user_ns);
+
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
 bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
        put_packet(remcom_out_buffer);
        return 0;
 }
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+       unsigned char checksum, ch, buffer[3];
+       int loop;
+
+       buffer[0] = 'W';
+       buffer[1] = hex_asc_hi(status);
+       buffer[2] = hex_asc_lo(status);
+
+       dbg_io_ops->write_char('$');
+       checksum = 0;
+
+       for (loop = 0; loop < 3; loop++) {
+               ch = buffer[loop];
+               checksum += ch;
+               dbg_io_ops->write_char(ch);
+       }
+
+       dbg_io_ops->write_char('#');
+       dbg_io_ops->write_char(hex_asc_hi(checksum));
+       dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+       /* make sure the output is flushed, lest the bootloader clobber it */
+       dbg_io_ops->flush();
+}
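
For reference, this emits a standard GDB remote-serial-protocol exit packet, $W<xx>#<cc>, where <xx> is the exit status in hex and <cc> is the modulo-256 sum of the payload bytes in hex. Worked example for status 0: the payload is 'W' (0x57), '0' (0x30), '0' (0x30), which sums to 0xb7, so the bytes on the wire are:

    $W00#b7
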
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
        profile_task_exit(tsk);
 
        WARN_ON(atomic_read(&tsk->fs_excl));
+       WARN_ON(blk_needs_flush_plug(tsk));
 
        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()   kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)          \
+               kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)                 \
+               kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+                                                 int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
        gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
        gfp_t mask = GFP_KERNEL;
 #endif
-       return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+       struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+       return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
        if (!profile_handoff_task(tsk))
                free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);
 
 /*
  * macro override instead of weak attribute alias, to workaround
@@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        struct task_struct *tsk;
        struct thread_info *ti;
        unsigned long *stackend;
-
+       int node = tsk_fork_get_node(orig);
        int err;
 
        prepare_to_copy(orig);
 
-       tsk = alloc_task_struct();
+       tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
-       ti = alloc_thread_info(tsk);
+       ti = alloc_thread_info_node(tsk, node);
        if (!ti) {
                free_task_struct(tsk);
                return NULL;
@@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
-
-               if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-                       if (retval < 0)
-                               goto bad_fork_free_pid;
-               }
        }
 
        p->pid = pid_nr(pid);
1190 1191
1191 p->pid = pid_nr(pid); 1192 p->pid = pid_nr(pid);
@@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+       p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
        p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
@@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        tracehook_finish_clone(p, clone_flags, trace);
 
        if (thread_group_leader(p)) {
-               if (clone_flags & CLONE_NEWPID)
+               if (is_child_reaper(pid))
                        p->nsproxy->pid_ns->child_reaper = p;
 
                p->signal->leader_pid = pid;
@@ -1512,38 +1516,24 @@ void __init proc_caches_init(void)
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
  */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+               return -EINVAL;
        /*
-        * If unsharing a thread from a thread group, must also
-        * unshare vm.
-        */
-       if (*flags_ptr & CLONE_THREAD)
-               *flags_ptr |= CLONE_VM;
-
-       /*
-        * If unsharing vm, must also unshare signal handlers.
-        */
-       if (*flags_ptr & CLONE_VM)
-               *flags_ptr |= CLONE_SIGHAND;
-
-       /*
-        * If unsharing namespace, must also unshare filesystem information.
+        * Not implemented, but pretend it works if there is nothing to
+        * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+        * needs to unshare vm.
         */
-       if (*flags_ptr & CLONE_NEWNS)
-               *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-       if (unshare_flags & CLONE_THREAD)
-               return -EINVAL;
+       if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+               /* FIXME: get_task_mm() increments ->mm_users */
+               if (atomic_read(&current->mm->mm_users) > 1)
+                       return -EINVAL;
+       }
 
        return 0;
 }
@@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-       struct sighand_struct *sigh = current->sighand;
-
-       if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-               return -EINVAL;
-       else
-               return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-       struct mm_struct *mm = current->mm;
-
-       if ((unshare_flags & CLONE_VM) &&
-           (mm && atomic_read(&mm->mm_users) > 1)) {
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-/*
  * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-       int err = 0;
        struct fs_struct *fs, *new_fs = NULL;
-       struct sighand_struct *new_sigh = NULL;
-       struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
        struct files_struct *fd, *new_fd = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
+       int err;
 
-       check_unshare_flags(&unshare_flags);
-
-       /* Return -EINVAL for all unsupported flags */
-       err = -EINVAL;
-       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+       err = check_unshare_flags(unshare_flags);
+       if (err)
                goto bad_unshare_out;
 
        /*
+        * If unsharing namespace, must also unshare filesystem information.
+        */
+       if (unshare_flags & CLONE_NEWNS)
+               unshare_flags |= CLONE_FS;
+       /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
-       if ((err = unshare_thread(unshare_flags)))
-               goto bad_unshare_out;
        if ((err = unshare_fs(unshare_flags, &new_fs)))
-               goto bad_unshare_cleanup_thread;
-       if ((err = unshare_sighand(unshare_flags, &new_sigh)))
-               goto bad_unshare_cleanup_fs;
-       if ((err = unshare_vm(unshare_flags, &new_mm)))
-               goto bad_unshare_cleanup_sigh;
+               goto bad_unshare_out;
        if ((err = unshare_fd(unshare_flags, &new_fd)))
-               goto bad_unshare_cleanup_vm;
+               goto bad_unshare_cleanup_fs;
        if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                              new_fs)))
                goto bad_unshare_cleanup_fd;
 
-       if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+       if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
                spin_unlock(&fs->lock);
        }
 
-       if (new_mm) {
-               mm = current->mm;
-               active_mm = current->active_mm;
-               current->mm = new_mm;
-               current->active_mm = new_mm;
-               if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-                       atomic_dec(&mm->oom_disable_count);
-                       atomic_inc(&new_mm->oom_disable_count);
-               }
-               activate_mm(active_mm, new_mm);
-               new_mm = mm;
-       }
-
        if (new_fd) {
                fd = current->files;
                current->files = new_fd;
@@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-       if (new_mm)
-               mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-       if (new_sigh)
-               if (atomic_dec_and_test(&new_sigh->count))
-                       kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
        return err;
 }
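
For context, a minimal userspace sketch of the syscall whose kernel side was just simplified (standard glibc wrapper; unsharing the mount namespace needs CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* Per check_unshare_flags()/sys_unshare() above, CLONE_NEWNS
             * implies CLONE_FS. */
            if (unshare(CLONE_NEWNS) != 0) {
                    perror("unshare");
                    return 1;
            }
            puts("now in a private mount namespace");
            return 0;
    }

Note the behavior change: requesting CLONE_THREAD, CLONE_SIGHAND or CLONE_VM while the mm is shared now fails up front with -EINVAL instead of the flag set being silently widened.
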
diff --git a/kernel/futex.c b/kernel/futex.c
index bda415715382..dfb924ffe65b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
 
-       if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
-                       || plist_node_empty(&q->list)))
+       if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+           || WARN_ON(plist_node_empty(&q->list)))
                return;
 
        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
@@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
                        goto err_unlock;
                ret = -EPERM;
                pcred = __task_cred(p);
+               /* If victim is in different user_ns, then uids are not
+                  comparable, so we must have CAP_SYS_PTRACE */
+               if (cred->user->user_ns != pcred->user->user_ns) {
+                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+                               goto err_unlock;
+                       goto ok;
+               }
+               /* If victim is in same user_ns, then uids are comparable */
                if (cred->euid != pcred->euid &&
                    cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
+                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
                        goto err_unlock;
+ok:
                head = p->robust_list;
                rcu_read_unlock();
        }
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
                        goto err_unlock;
                ret = -EPERM;
                pcred = __task_cred(p);
+               /* If victim is in different user_ns, then uids are not
+                  comparable, so we must have CAP_SYS_PTRACE */
+               if (cred->user->user_ns != pcred->user->user_ns) {
+                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+                               goto err_unlock;
+                       goto ok;
+               }
+               /* If victim is in same user_ns, then uids are comparable */
                if (cred->euid != pcred->euid &&
                    cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
+                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
                        goto err_unlock;
+ok:
                head = p->compat_robust_list;
                rcu_read_unlock();
        }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
        bool "Profile entire Kernel"
        depends on GCOV_KERNEL
-       depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+       depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
        default n
        ---help---
        This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
        struct group_info *group_info;
        int retval;
 
-       if (!capable(CAP_SETGID))
+       if (!nsown_capable(CAP_SETGID))
                return -EPERM;
        if ((unsigned)gidsetsize > NGROUPS_MAX)
                return -EINVAL;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index dbccc799407f..6fb014f172f7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -198,15 +198,6 @@ err:
        return -ENOMEM;
 }
 
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       int res = irq_alloc_descs(irq, irq, 1, node);
-
-       if (res == -EEXIST || res == irq)
-               return irq_to_desc(irq);
-       return NULL;
-}
-
 static int irq_expand_nr_irqs(unsigned int nr)
 {
        if (nr > IRQ_BITMAP_BITS)
@@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       return irq_to_desc(irq);
-}
-
 static void free_desc(unsigned int irq)
 {
        dynamic_irq_cleanup(irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index acd599a43bfb..0a2aa73e536c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1064,10 +1064,10 @@ mismatch:
        ret = -EBUSY;
 
 out_mask:
+       raw_spin_unlock_irqrestore(&desc->lock, flags);
        free_cpumask_var(mask);
 
 out_thread:
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
        if (new->thread) {
                struct task_struct *t = new->thread;
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4cc2e5ed0bec..760248de109d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -405,7 +405,8 @@ int show_interrupts(struct seq_file *p, void *v)
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
        seq_printf(p, " %8s", desc->irq_data.chip->name);
-       seq_printf(p, "-%-8s", desc->name);
+       if (desc->name)
+               seq_printf(p, "-%-8s", desc->name);
 
        if (action) {
                seq_printf(p, " %s", action->name);
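
Before this check, an IRQ descriptor with a NULL ->name had the field pushed through "%-8s" anyway, which the kernel's vsnprintf() renders as "(null)", so /proc/interrupts lines could end in "-(null)"; now the "-<name>" suffix is simply omitted.
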
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
        if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
            arch_is_kernel_text(addr))
                return 1;
-       return in_gate_area_no_task(addr);
+       return in_gate_area_no_mm(addr);
 }
 
 static inline int is_kernel(unsigned long addr)
 {
        if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
                return 1;
-       return in_gate_area_no_task(addr);
+       return in_gate_area_no_mm(addr);
 }
 
 static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 }
 
 /* Look up a kernel symbol and return it in a text buffer. */
-int sprint_symbol(char *buffer, unsigned long address)
+static int __sprint_symbol(char *buffer, unsigned long address,
+                          int symbol_offset)
 {
        char *modname;
        const char *name;
        unsigned long offset, size;
        int len;
 
+       address += symbol_offset;
        name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
        if (!name)
                return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
        strcpy(buffer, name);
        len = strlen(buffer);
        buffer += len;
+       offset -= symbol_offset;
 
        if (modname)
-               len += sprintf(buffer, "+%#lx/%#lx [%s]",
-                               offset, size, modname);
+               len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
        else
                len += sprintf(buffer, "+%#lx/%#lx", offset, size);
 
        return len;
 }
+
+/**
+ * sprint_symbol - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size and module name to @buffer if possible. If no symbol was found,
+ * just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+       return __sprint_symbol(buffer, address, 0);
+}
+
 EXPORT_SYMBOL_GPL(sprint_symbol);
 
+/**
+ * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace(char *buffer, unsigned long address)
+{
+       return __sprint_symbol(buffer, address, -1);
+}
+
 /* Look up a kernel symbol and print it to the kernel messages. */
 void __print_symbol(const char *fmt, unsigned long address)
 {
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
                 */
                type = iter->exported ? toupper(iter->type) :
                                        tolower(iter->type);
-               seq_printf(m, "%0*lx %c %s\t[%s]\n",
-                          (int)(2 * sizeof(void *)),
-                          iter->value, type, iter->name, iter->module_name);
+               seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+                          type, iter->name, iter->module_name);
        } else
-               seq_printf(m, "%0*lx %c %s\n",
-                          (int)(2 * sizeof(void *)),
-                          iter->value, iter->type, iter->name);
+               seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+                          iter->type, iter->name);
        return 0;
 }
 
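
Two usage notes. First, with hypothetical values, sprint_symbol() fills the buffer in the form name+offset/size, with the module appended when relevant:

    schedule+0x3f/0x21e
    usb_submit_urb+0x8c/0x1f3 [usbcore]

sprint_backtrace() looks up address - 1 and then adds the 1 back into the printed offset, so a stack-saved return address sitting just past a noreturn tail-call still resolves to the real caller. Second, the switch from %0*lx to %pK in s_show() means /proc/kallsyms addresses can be shown as zeros to unprivileged readers, under control of the kptr_restrict sysctl.
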
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..684ab3f7dd72 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
        /* Information passed to kthread() from kthreadd. */
        int (*threadfn)(void *data);
        void *data;
+       int node;
 
        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
        do_exit(ret);
 }
 
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+       if (tsk == kthreadd_task)
+               return tsk->pref_node_fork;
+#endif
+       return numa_node_id();
+}
+
 static void create_kthread(struct kthread_create_info *create)
 {
        int pid;
 
+#ifdef CONFIG_NUMA
+       current->pref_node_fork = create->node;
+#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
@@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create)
 }
 
 /**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
+ * @node: memory node number.
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
  * thread. The thread will be stopped: use wake_up_process() to start
  * it. See also kthread_run().
  *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
  * standalone thread for which noone will call kthread_stop(), or
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
        }
        return create.result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
 
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
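
A sketch of a typical caller, assuming a hypothetical worker function and cpu variable; the point of the new @node argument is that the task_struct and thread stack land on the worker's own memory node:

    struct task_struct *t;

    t = kthread_create_on_node(my_worker_fn, NULL, cpu_to_node(cpu),
                               "my_worker/%d", cpu);
    if (!IS_ERR(t)) {
            kthread_bind(t, cpu);
            wake_up_process(t);
    }

Callers that do not care pass -1 for @node; the matching header change (not shown here) keeps kthread_create() as a wrapper that does exactly that.
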
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
              nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
              nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
              nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
-             sum_forward_deps = 0, factor = 0;
+             sum_forward_deps = 0;
 
        list_for_each_entry(class, &all_lock_classes, lock_entry) {
 
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..1f9f7bc56ca1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1168{
1169 struct module_sect_attr *sattr = 1169 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1170 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1171 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1172}
1173 1173
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3224 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3225 "Live");
3226 /* Used by oprofile and other similar tools. */ 3226 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3227 seq_printf(m, " 0x%pK", mod->module_core);
3228 3228
3229 /* Taints info */ 3229 /* Taints info */
3230 if (mod->taints) 3230 if (mod->taints)
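
Both hunks switch from raw %p to %pK, which prints zeros instead of the real address when the kptr_restrict sysctl forbids pointer disclosure to the reading process. A minimal illustrative sketch of the difference (not part of the patch):

	#include <linux/kernel.h>

	static void show_addr(const void *addr)
	{
		pr_info("raw:     %p\n", addr);	/* always the real pointer here */
		pr_info("guarded: %pK\n", addr);	/* zeroed when kptr_restrict says so */
	}
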
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 69 goto out_ns;
70 } 70 }
71 71
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 72 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 73 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 74 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 75 goto out_uts;
76 } 76 }
77 77
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 78 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 79 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 80 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 81 goto out_ipc;
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
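
The oops_setup() hunk is the standard early_param() pattern: the handler runs during early boot when the matching option ("oops=panic" here) appears on the kernel command line. A hypothetical sketch of the same pattern, with my_feature as a made-up option name:

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	static int my_feature_enabled;

	static int __init my_feature_setup(char *s)
	{
		if (!s)
			return -EINVAL;
		if (!strcmp(s, "on"))	/* boot with "my_feature=on" */
			my_feature_enabled = 1;
		return 0;
	}
	early_param("my_feature", my_feature_setup);
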
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ed253aa24ba4..c75925c4d1e2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu;
145 */ 145 */
146int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
147 147
148int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 128 pages + 1 for the user control page */
149int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */
149 150
150/* 151/*
151 * max perf event sample rate 152 * max perf event sample rate
@@ -941,6 +942,7 @@ static void perf_group_attach(struct perf_event *event)
941static void 942static void
942list_del_event(struct perf_event *event, struct perf_event_context *ctx) 943list_del_event(struct perf_event *event, struct perf_event_context *ctx)
943{ 944{
945 struct perf_cpu_context *cpuctx;
944 /* 946 /*
945 * We can have double detach due to exit/hot-unplug + close. 947 * We can have double detach due to exit/hot-unplug + close.
946 */ 948 */
@@ -949,8 +951,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
949 951
950 event->attach_state &= ~PERF_ATTACH_CONTEXT; 952 event->attach_state &= ~PERF_ATTACH_CONTEXT;
951 953
952 if (is_cgroup_event(event)) 954 if (is_cgroup_event(event)) {
953 ctx->nr_cgroups--; 955 ctx->nr_cgroups--;
956 cpuctx = __get_cpu_context(ctx);
957 /*
958 * if there are no more cgroup events
 959 * then clear cgrp to avoid stale pointer
960 * in update_cgrp_time_from_cpuctx()
961 */
962 if (!ctx->nr_cgroups)
963 cpuctx->cgrp = NULL;
964 }
954 965
955 ctx->nr_events--; 966 ctx->nr_events--;
956 if (event->attr.inherit_stat) 967 if (event->attr.inherit_stat)
@@ -5122,7 +5133,7 @@ static int perf_exclude_event(struct perf_event *event,
5122 struct pt_regs *regs) 5133 struct pt_regs *regs)
5123{ 5134{
5124 if (event->hw.state & PERF_HES_STOPPED) 5135 if (event->hw.state & PERF_HES_STOPPED)
5125 return 0; 5136 return 1;
5126 5137
5127 if (regs) { 5138 if (regs) {
5128 if (event->attr.exclude_user && user_mode(regs)) 5139 if (event->attr.exclude_user && user_mode(regs))
@@ -5478,6 +5489,8 @@ static int perf_tp_event_match(struct perf_event *event,
5478 struct perf_sample_data *data, 5489 struct perf_sample_data *data,
5479 struct pt_regs *regs) 5490 struct pt_regs *regs)
5480{ 5491{
5492 if (event->hw.state & PERF_HES_STOPPED)
5493 return 0;
5481 /* 5494 /*
5482 * All tracepoints are from kernel-space. 5495 * All tracepoints are from kernel-space.
5483 */ 5496 */
@@ -6720,17 +6733,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6720 struct perf_event_context *child_ctx, 6733 struct perf_event_context *child_ctx,
6721 struct task_struct *child) 6734 struct task_struct *child)
6722{ 6735{
6723 struct perf_event *parent_event; 6736 if (child_event->parent) {
6737 raw_spin_lock_irq(&child_ctx->lock);
6738 perf_group_detach(child_event);
6739 raw_spin_unlock_irq(&child_ctx->lock);
6740 }
6724 6741
6725 perf_remove_from_context(child_event); 6742 perf_remove_from_context(child_event);
6726 6743
6727 parent_event = child_event->parent;
6728 /* 6744 /*
6729 * It can happen that parent exits first, and has events 6745 * It can happen that the parent exits first, and has events
6730 * that are still around due to the child reference. These 6746 * that are still around due to the child reference. These
6731 * events need to be zapped - but otherwise linger. 6747 * events need to be zapped.
6732 */ 6748 */
6733 if (parent_event) { 6749 if (child_event->parent) {
6734 sync_child_event(child_event, child); 6750 sync_child_event(child_event, child);
6735 free_event(child_event); 6751 free_event(child_event);
6736 } 6752 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..02f221274265 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 435 rcu_read_unlock();
436 return pid; 436 return pid;
437} 437}
438EXPORT_SYMBOL_GPL(get_task_pid);
438 439
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 440struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 441{
@@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 447 rcu_read_unlock();
447 return result; 448 return result;
448} 449}
450EXPORT_SYMBOL_GPL(get_pid_task);
449 451
450struct pid *find_get_pid(pid_t nr) 452struct pid *find_get_pid(pid_t nr)
451{ 453{
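
With these two exports, a module can hold a struct pid reference, which stays valid across task exit, instead of caching a raw task_struct pointer. A hedged sketch, assuming module context and the hypothetical helpers remember_task()/poke_task():

	#include <linux/kernel.h>
	#include <linux/pid.h>
	#include <linux/sched.h>

	static struct pid *saved_pid;

	static void remember_task(struct task_struct *task)
	{
		saved_pid = get_task_pid(task, PIDTYPE_PID);	/* takes a reference */
	}

	static void poke_task(void)
	{
		struct task_struct *tsk = get_pid_task(saved_pid, PIDTYPE_PID);

		if (tsk) {
			pr_info("task %s still alive\n", tsk->comm);
			put_task_struct(tsk);	/* get_pid_task took a task ref */
		}
		put_pid(saved_pid);		/* drop the pid reference when done */
	}
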
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
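
The new out_put_parent_pid_ns label extends the usual kernel unwind ladder: each label undoes exactly one successful setup step, in reverse order of setup. A generic sketch of the idiom, with hypothetical step/teardown helpers:

	/* hypothetical helpers, stubbed to make the shape concrete */
	static int step_a(void) { return 0; }
	static int step_b(void) { return 0; }
	static int step_c(void) { return 0; }
	static void teardown_a(void) { }
	static void teardown_b(void) { }

	static int setup_everything(void)
	{
		int err;

		err = step_a();
		if (err)
			goto out;
		err = step_b();
		if (err)
			goto undo_a;
		err = step_c();
		if (err)
			goto undo_b;
		return 0;

	undo_b:
		teardown_b();	/* undo in reverse order of setup */
	undo_a:
		teardown_a();
	out:
		return err;
	}
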
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..0da058bff8eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
 388 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
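
The new read handler returns the aggregated target as a raw s32, with simple_read_from_buffer() handling the offset and short-read bookkeeping. A userspace sketch of consuming it, assuming the /dev/cpu_dma_latency misc device (opening it also registers a default request, which is what makes the read valid):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int32_t value;
		int fd = open("/dev/cpu_dma_latency", O_RDONLY);

		if (fd < 0)
			return 1;
		if (read(fd, &value, sizeof(value)) == sizeof(value))
			printf("current target: %d\n", (int)value);
		close(fd);	/* also drops the request made at open() */
		return 0;
	}
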
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..4603f08dc47b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -133,7 +20,7 @@ config SUSPEND_FREEZER
133 20
134config HIBERNATION 21config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 22 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 23 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
137 select LZO_COMPRESS 24 select LZO_COMPRESS
138 select LZO_DECOMPRESS 25 select LZO_DECOMPRESS
139 ---help--- 26 ---help---
@@ -196,6 +83,106 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 83 suspended image to. It will simply pick the first available swap
197 device. 84 device.
198 85
86config PM_SLEEP
87 def_bool y
88 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
89
90config PM_SLEEP_SMP
91 def_bool y
92 depends on SMP
93 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
94 depends on PM_SLEEP
95 select HOTPLUG
96 select HOTPLUG_CPU
97
98config PM_RUNTIME
99 bool "Run-time PM core functionality"
100 depends on !IA64_HP_SIM
101 ---help---
102 Enable functionality allowing I/O devices to be put into energy-saving
103 (low power) states at run time (or autosuspended) after a specified
104 period of inactivity and woken up in response to a hardware-generated
105 wake-up event or a driver's request.
106
107 Hardware support is generally required for this functionality to work
108 and the bus type drivers of the buses the devices are on are
109 responsible for the actual handling of the autosuspend requests and
110 wake-up events.
111
112config PM
113 def_bool y
114 depends on PM_SLEEP || PM_RUNTIME
115
116config PM_DEBUG
117 bool "Power Management Debug Support"
118 depends on PM
119 ---help---
120 This option enables various debugging support in the Power Management
121 code. This is helpful when debugging and reporting PM bugs, like
122 suspend support.
123
124config PM_VERBOSE
125 bool "Verbose Power Management debugging"
126 depends on PM_DEBUG
127 ---help---
128 This option enables verbose messages from the Power Management code.
129
130config PM_ADVANCED_DEBUG
131 bool "Extra PM attributes in sysfs for low-level debugging/testing"
132 depends on PM_DEBUG
133 ---help---
134 Add extra sysfs attributes allowing one to access some Power Management
135 fields of device objects from user space. If you are not a kernel
136 developer interested in debugging/testing Power Management, say "no".
137
138config PM_TEST_SUSPEND
139 bool "Test suspend/resume and wakealarm during bootup"
140 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
141 ---help---
142 This option will let you suspend your machine during bootup, and
143 make it wake up a few seconds later using an RTC wakeup alarm.
144 Enable this with a kernel parameter like "test_suspend=mem".
145
146 You probably want to have your system's RTC driver statically
147 linked, ensuring that it's available when this test runs.
148
149config CAN_PM_TRACE
150 def_bool y
151 depends on PM_DEBUG && PM_SLEEP
152
153config PM_TRACE
154 bool
155 help
156 This enables code to save the last PM event point across
157 reboot. The architecture needs to support this, x86 for
158 example does by saving things in the RTC, see below.
159
160 The architecture specific code must provide the extern
161 functions from <linux/resume-trace.h> as well as the
162 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
163
164 The way the information is presented is architecture-
165 dependent, x86 will print the information during a
166 late_initcall.
167
168config PM_TRACE_RTC
169 bool "Suspend/resume event tracing"
170 depends on CAN_PM_TRACE
171 depends on X86
172 select PM_TRACE
173 ---help---
174 This enables some cheesy code to save the last PM event point in the
175 RTC across reboots, so that you can debug a machine that just hangs
176 during suspend (or more commonly, during resume).
177
178 To use this debugging feature you should attempt to suspend the
179 machine, reboot it and then run
180
181 dmesg -s 1000000 | grep 'hash matches'
182
183 CAUTION: this option will cause your machine's real-time clock to be
184 set to an invalid time after a resume.
185
199config APM_EMULATION 186config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 187 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 188 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +209,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 210 APM in your BIOS).
224 211
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 212config ARCH_HAS_OPP
245 bool 213 bool
246 214
247config PM_OPP 215config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 216 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 217 depends on ARCH_HAS_OPP
251 ---help--- 218 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 219 SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..aeabd26e3342 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -272,6 +273,8 @@ static int create_image(int platform_mode)
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = sysdev_suspend(PMSG_FREEZE);
276 if (!error)
277 error = syscore_suspend();
275 if (error) { 278 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 279 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 280 "aborting hibernation\n");
@@ -295,6 +298,7 @@ static int create_image(int platform_mode)
295 } 298 }
296 299
297 Power_up: 300 Power_up:
301 syscore_resume();
298 sysdev_resume(); 302 sysdev_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 303 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 304 * that suspended with irqs off ... no overall powerup.
@@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode)
403 local_irq_disable(); 407 local_irq_disable();
404 408
405 error = sysdev_suspend(PMSG_QUIESCE); 409 error = sysdev_suspend(PMSG_QUIESCE);
410 if (!error)
411 error = syscore_suspend();
406 if (error) 412 if (error)
407 goto Enable_irqs; 413 goto Enable_irqs;
408 414
@@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 435 restore_processor_state();
430 touch_softlockup_watchdog(); 436 touch_softlockup_watchdog();
431 437
438 syscore_resume();
432 sysdev_resume(); 439 sysdev_resume();
433 440
434 Enable_irqs: 441 Enable_irqs:
@@ -516,6 +523,7 @@ int hibernation_platform_enter(void)
516 523
517 local_irq_disable(); 524 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 525 sysdev_suspend(PMSG_HIBERNATE);
526 syscore_suspend();
519 if (pm_wakeup_pending()) { 527 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 528 error = -EAGAIN;
521 goto Power_up; 529 goto Power_up;
@@ -526,6 +534,7 @@ int hibernation_platform_enter(void)
526 while (1); 534 while (1);
527 535
528 Power_up: 536 Power_up:
537 syscore_resume();
529 sysdev_resume(); 538 sysdev_resume();
530 local_irq_enable(); 539 local_irq_enable();
531 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..8eaba5f27b10 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ca0aacc24874 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 44 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 45 * When it is set to N, the image creating code will do its best to
46 * size will not exceed N bytes, but if that is impossible, it will 46 * ensure the image size will not exceed N bytes, but if that is
47 * try to create the smallest image possible. 47 * impossible, it will try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size; 49unsigned long image_size;
50 50
51void __init hibernate_image_size_init(void) 51void __init hibernate_image_size_init(void)
52{ 52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 53 image_size = (totalram_pages / 3) * PAGE_SIZE;
54} 54}
55 55
56/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
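
The default preferred image size drops from 2/5 to 1/3 of RAM. A back-of-envelope sketch for a machine with 4 GiB of 4 KiB pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long pages = (4ULL << 30) / 4096;	/* 4 GiB of 4 KiB pages */
		unsigned long long old_sz = ((pages * 2) / 5) * 4096;
		unsigned long long new_sz = (pages / 3) * 4096;

		printf("old default: %llu MiB\n", old_sz >> 20);	/* 1638 MiB */
		printf("new default: %llu MiB\n", new_sz >> 20);	/* 1365 MiB */
		return 0;
	}
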
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..2814c32aed51 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state)
163 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = sysdev_suspend(PMSG_SUSPEND);
167 if (!error)
168 error = syscore_suspend();
166 if (!error) { 169 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 170 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 171 error = suspend_ops->enter(state);
169 events_check_enabled = false; 172 events_check_enabled = false;
170 } 173 }
174 syscore_resume();
171 sysdev_resume(); 175 sysdev_resume();
172 } 176 }
173 177
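
All of these hunks bracket the existing sysdev calls with syscore_suspend()/syscore_resume(), which run the registered struct syscore_ops callbacks with one CPU online and interrupts disabled. A minimal sketch of registering such a hook, with hypothetical callback names:

	#include <linux/syscore_ops.h>

	static int my_syscore_suspend(void)
	{
		/* runs late, with one CPU online and interrupts off */
		return 0;	/* a non-zero return aborts the transition */
	}

	static void my_syscore_resume(void)
	{
		/* restore whatever the suspend callback saved */
	}

	static struct syscore_ops my_syscore_ops = {
		.suspend = my_syscore_suspend,
		.resume  = my_syscore_resume,
	};

	static int __init my_syscore_init(void)
	{
		register_syscore_ops(&my_syscore_ops);
		return 0;
	}
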
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 54
55/* printk's without a loglevel use this.. */ 55/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 56#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 57
58/* We show everything that is MORE important than this.. */ 58/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 114
115/* 115/*
116 * If exclusive_console is non-NULL then only this console is to be printed to.
117 */
118static struct console *exclusive_console;
119
120/*
116 * Array of consoles built from command line options (console=) 121 * Array of consoles built from command line options (console=)
117 */ 122 */
118struct console_cmdline 123struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
476 struct console *con; 481 struct console *con;
477 482
478 for_each_console(con) { 483 for_each_console(con) {
484 if (exclusive_console && con != exclusive_console)
485 continue;
479 if ((con->flags & CON_ENABLED) && con->write && 486 if ((con->flags & CON_ENABLED) && con->write &&
480 (cpu_online(smp_processor_id()) || 487 (cpu_online(smp_processor_id()) ||
481 (con->flags & CON_ANYTIME))) 488 (con->flags & CON_ANYTIME)))
@@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start,
515} 522}
516 523
517/* 524/*
 525 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit number;
 526 * the lower 3 bits are the log level, the rest is the log facility. In case
527 * userspace passes usual userspace syslog messages to /dev/kmsg or
528 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
529 * to extract the correct log level for in-kernel processing, and not mangle
530 * the original value.
531 *
532 * If a prefix is found, the length of the prefix is returned. If 'level' is
533 * passed, it will be filled in with the log level without a possible facility
534 * value. If 'special' is passed, the special printk prefix chars are accepted
535 * and returned. If no valid header is found, 0 is returned and the passed
536 * variables are not touched.
537 */
538static size_t log_prefix(const char *p, unsigned int *level, char *special)
539{
540 unsigned int lev = 0;
541 char sp = '\0';
542 size_t len;
543
544 if (p[0] != '<' || !p[1])
545 return 0;
546 if (p[2] == '>') {
547 /* usual single digit level number or special char */
548 switch (p[1]) {
549 case '0' ... '7':
550 lev = p[1] - '0';
551 break;
552 case 'c': /* KERN_CONT */
553 case 'd': /* KERN_DEFAULT */
554 sp = p[1];
555 break;
556 default:
557 return 0;
558 }
559 len = 3;
560 } else {
561 /* multi digit including the level and facility number */
562 char *endp = NULL;
563
 564 if (p[1] < '0' || p[1] > '9')
565 return 0;
566
567 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
568 if (endp == NULL || endp[0] != '>')
569 return 0;
570 len = (endp + 1) - p;
571 }
572
573 /* do not accept special char if not asked for */
574 if (sp && !special)
575 return 0;
576
577 if (special) {
578 *special = sp;
579 /* return special char, do not touch level */
580 if (sp)
581 return len;
582 }
583
584 if (level)
585 *level = lev;
586 return len;
587}
588
589/*
518 * Call the console drivers, asking them to write out 590 * Call the console drivers, asking them to write out
519 * log_buf[start] to log_buf[end - 1]. 591 * log_buf[start] to log_buf[end - 1].
520 * The console_lock must be held. 592 * The console_lock must be held.
@@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
529 cur_index = start; 601 cur_index = start;
530 start_print = start; 602 start_print = start;
531 while (cur_index != end) { 603 while (cur_index != end) {
532 if (msg_level < 0 && ((end - cur_index) > 2) && 604 if (msg_level < 0 && ((end - cur_index) > 2)) {
533 LOG_BUF(cur_index + 0) == '<' && 605 /* strip log prefix */
534 LOG_BUF(cur_index + 1) >= '0' && 606 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
535 LOG_BUF(cur_index + 1) <= '7' &&
536 LOG_BUF(cur_index + 2) == '>') {
537 msg_level = LOG_BUF(cur_index + 1) - '0';
538 cur_index += 3;
539 start_print = cur_index; 607 start_print = cur_index;
540 } 608 }
541 while (cur_index != end) { 609 while (cur_index != end) {
@@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
733 unsigned long flags; 801 unsigned long flags;
734 int this_cpu; 802 int this_cpu;
735 char *p; 803 char *p;
804 size_t plen;
805 char special;
736 806
737 boot_delay_msec(); 807 boot_delay_msec();
738 printk_delay(); 808 printk_delay();
@@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
773 printed_len += vscnprintf(printk_buf + printed_len, 843 printed_len += vscnprintf(printk_buf + printed_len,
774 sizeof(printk_buf) - printed_len, fmt, args); 844 sizeof(printk_buf) - printed_len, fmt, args);
775 845
776
777 p = printk_buf; 846 p = printk_buf;
778 847
779 /* Do we have a loglevel in the string? */ 848 /* Read log level and handle special printk prefix */
780 if (p[0] == '<') { 849 plen = log_prefix(p, &current_log_level, &special);
781 unsigned char c = p[1]; 850 if (plen) {
782 if (c && p[2] == '>') { 851 p += plen;
783 switch (c) { 852
784 case '0' ... '7': /* loglevel */ 853 switch (special) {
785 current_log_level = c - '0'; 854 case 'c': /* Strip <c> KERN_CONT, continue line */
786 /* Fallthrough - make sure we're on a new line */ 855 plen = 0;
787 case 'd': /* KERN_DEFAULT */ 856 break;
788 if (!new_text_line) { 857 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
789 emit_log_char('\n'); 858 plen = 0;
790 new_text_line = 1; 859 default:
791 } 860 if (!new_text_line) {
792 /* Fallthrough - skip the loglevel */ 861 emit_log_char('\n');
793 case 'c': /* KERN_CONT */ 862 new_text_line = 1;
794 p += 3;
795 break;
796 } 863 }
797 } 864 }
798 } 865 }
799 866
800 /* 867 /*
801 * Copy the output into log_buf. If the caller didn't provide 868 * Copy the output into log_buf. If the caller didn't provide
802 * appropriate log level tags, we insert them here 869 * the appropriate log prefix, we insert them here
803 */ 870 */
804 for ( ; *p; p++) { 871 for (; *p; p++) {
805 if (new_text_line) { 872 if (new_text_line) {
806 /* Always output the token */
807 emit_log_char('<');
808 emit_log_char(current_log_level + '0');
809 emit_log_char('>');
810 printed_len += 3;
811 new_text_line = 0; 873 new_text_line = 0;
812 874
875 if (plen) {
876 /* Copy original log prefix */
877 int i;
878
879 for (i = 0; i < plen; i++)
880 emit_log_char(printk_buf[i]);
881 printed_len += plen;
882 } else {
883 /* Add log prefix */
884 emit_log_char('<');
885 emit_log_char(current_log_level + '0');
886 emit_log_char('>');
887 printed_len += 3;
888 }
889
813 if (printk_time) { 890 if (printk_time) {
814 /* Follow the token with the time */ 891 /* Add the current time stamp */
815 char tbuf[50], *tp; 892 char tbuf[50], *tp;
816 unsigned tlen; 893 unsigned tlen;
817 unsigned long long t; 894 unsigned long long t;
@@ -1160,6 +1237,11 @@ void console_unlock(void)
1160 local_irq_restore(flags); 1237 local_irq_restore(flags);
1161 } 1238 }
1162 console_locked = 0; 1239 console_locked = 0;
1240
1241 /* Release the exclusive_console once it is used */
1242 if (unlikely(exclusive_console))
1243 exclusive_console = NULL;
1244
1163 up(&console_sem); 1245 up(&console_sem);
1164 spin_unlock_irqrestore(&logbuf_lock, flags); 1246 spin_unlock_irqrestore(&logbuf_lock, flags);
1165 if (wake_klogd) 1247 if (wake_klogd)
@@ -1246,6 +1328,18 @@ void console_start(struct console *console)
1246} 1328}
1247EXPORT_SYMBOL(console_start); 1329EXPORT_SYMBOL(console_start);
1248 1330
1331static int __read_mostly keep_bootcon;
1332
1333static int __init keep_bootcon_setup(char *str)
1334{
1335 keep_bootcon = 1;
1336 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1337
1338 return 0;
1339}
1340
1341early_param("keep_bootcon", keep_bootcon_setup);
1342
1249/* 1343/*
1250 * The console driver calls this routine during kernel initialization 1344 * The console driver calls this routine during kernel initialization
1251 * to register the console printing procedure with printk() and to 1345 * to register the console printing procedure with printk() and to
@@ -1382,6 +1476,12 @@ void register_console(struct console *newcon)
1382 spin_lock_irqsave(&logbuf_lock, flags); 1476 spin_lock_irqsave(&logbuf_lock, flags);
1383 con_start = log_start; 1477 con_start = log_start;
1384 spin_unlock_irqrestore(&logbuf_lock, flags); 1478 spin_unlock_irqrestore(&logbuf_lock, flags);
1479 /*
1480 * We're about to replay the log buffer. Only do this to the
1481 * just-registered console to avoid excessive message spam to
1482 * the already-registered consoles.
1483 */
1484 exclusive_console = newcon;
1385 } 1485 }
1386 console_unlock(); 1486 console_unlock();
1387 console_sysfs_notify(); 1487 console_sysfs_notify();
@@ -1393,7 +1493,9 @@ void register_console(struct console *newcon)
1393 * users know there might be something in the kernel's log buffer that 1493 * users know there might be something in the kernel's log buffer that
1394 * went to the bootconsole (that they do not see on the real console) 1494 * went to the bootconsole (that they do not see on the real console)
1395 */ 1495 */
1396 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1496 if (bcon &&
1497 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1498 !keep_bootcon) {
1397 /* we need to iterate through twice, to make sure we print 1499 /* we need to iterate through twice, to make sure we print
1398 * everything out, before we unregister the console(s) 1500 * everything out, before we unregister the console(s)
1399 */ 1501 */
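
log_prefix() accepts both the classic single-digit <4> header and the multi-digit syslog form, where the value is facility << 3 | level. A standalone (non-kernel) sketch of the same parsing rule:

	#include <stdio.h>
	#include <stdlib.h>

	/* returns prefix length, fills *level with the low 3 bits */
	static size_t parse_prefix(const char *p, unsigned int *level)
	{
		char *endp;
		unsigned long v;

		if (p[0] != '<' || !p[1])
			return 0;
		v = strtoul(p + 1, &endp, 10);
		if (endp == p + 1 || *endp != '>')
			return 0;
		*level = v & 7;		/* the facility lives in the upper bits */
		return (endp + 1) - p;
	}

	int main(void)
	{
		unsigned int level;

		/* <13> is facility 1 (user) << 3 | level 5 (notice) */
		if (parse_prefix("<13>user message", &level))
			printf("level %u\n", level);	/* prints "level 5" */
		return 0;
	}
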
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..0fc1eed28d27 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 134 return 0;
135 rcu_read_lock(); 135 rcu_read_lock();
136 tcred = __task_cred(task); 136 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 137 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 138 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 139 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 140 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 141 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 142 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 143 cred->gid == tcred->gid))
144 rcu_read_unlock(); 144 goto ok;
145 return -EPERM; 145 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 146 goto ok;
147 rcu_read_unlock();
148 return -EPERM;
149ok:
147 rcu_read_unlock(); 150 rcu_read_unlock();
148 smp_rmb(); 151 smp_rmb();
149 if (task->mm) 152 if (task->mm)
150 dumpable = get_dumpable(task->mm); 153 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 154 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 155 return -EPERM;
153 156
154 return security_ptrace_access_check(task, mode); 157 return security_ptrace_access_check(task, mode);
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 201 goto unlock_tasklist;
199 202
200 task->ptrace = PT_PTRACED; 203 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 204 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 205 task->ptrace |= PT_PTRACE_CAP;
203 206
204 __ptrace_link(task, current); 207 __ptrace_link(task, current);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
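
On 32-bit, a u64 load takes two instructions, so a reader can observe a half-updated ("torn") value while a writer is mid-store; the new #if branch closes that window with the counter's spinlock. A generic sketch of the pattern, with a hypothetical counter64 type:

	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct counter64 {
		spinlock_t lock;
		u64 value;		/* updated under lock by writers */
	};

	static u64 counter64_read(struct counter64 *c)
	{
	#if BITS_PER_LONG == 32
		unsigned long flags;
		u64 v;

		/* two 32-bit loads must not interleave with a writer's stores */
		spin_lock_irqsave(&c->lock, flags);
		v = c->value;
		spin_unlock_irqrestore(&c->lock, flags);
		return v;
	#else
		return c->value;	/* one aligned 64-bit load is atomic */
	#endif
	}
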
diff --git a/kernel/sched.c b/kernel/sched.c
index a361e20ec2cd..f592ce6f8616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -4086,9 +4085,6 @@ need_resched:
4086 rcu_note_context_switch(cpu); 4085 rcu_note_context_switch(cpu);
4087 prev = rq->curr; 4086 prev = rq->curr;
4088 4087
4089 release_kernel_lock(prev);
4090need_resched_nonpreemptible:
4091
4092 schedule_debug(prev); 4088 schedule_debug(prev);
4093 4089
4094 if (sched_feat(HRTICK)) 4090 if (sched_feat(HRTICK))
@@ -4119,6 +4115,16 @@ need_resched_nonpreemptible:
4119 switch_count = &prev->nvcsw; 4115 switch_count = &prev->nvcsw;
4120 } 4116 }
4121 4117
4118 /*
4119 * If we are going to sleep and we have plugged IO queued, make
4120 * sure to submit it to avoid deadlocks.
4121 */
4122 if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
4123 raw_spin_unlock(&rq->lock);
4124 blk_flush_plug(prev);
4125 raw_spin_lock(&rq->lock);
4126 }
4127
4122 pre_schedule(rq, prev); 4128 pre_schedule(rq, prev);
4123 4129
4124 if (unlikely(!rq->nr_running)) 4130 if (unlikely(!rq->nr_running))
@@ -4148,9 +4154,6 @@ need_resched_nonpreemptible:
4148 4154
4149 post_schedule(rq); 4155 post_schedule(rq);
4150 4156
4151 if (unlikely(reacquire_kernel_lock(prev)))
4152 goto need_resched_nonpreemptible;
4153
4154 preempt_enable_no_resched(); 4157 preempt_enable_no_resched();
4155 if (need_resched()) 4158 if (need_resched())
4156 goto need_resched; 4159 goto need_resched;
@@ -4899,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
4899 4902
4900 rcu_read_lock(); 4903 rcu_read_lock();
4901 pcred = __task_cred(p); 4904 pcred = __task_cred(p);
4902 match = (cred->euid == pcred->euid || 4905 if (cred->user->user_ns == pcred->user->user_ns)
4903 cred->euid == pcred->uid); 4906 match = (cred->euid == pcred->euid ||
4907 cred->euid == pcred->uid);
4908 else
4909 match = false;
4904 rcu_read_unlock(); 4910 rcu_read_unlock();
4905 return match; 4911 return match;
4906} 4912}
@@ -5228,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5228 goto out_free_cpus_allowed; 5234 goto out_free_cpus_allowed;
5229 } 5235 }
5230 retval = -EPERM; 5236 retval = -EPERM;
5231 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5237 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5232 goto out_unlock; 5238 goto out_unlock;
5233 5239
5234 retval = security_task_setscheduler(p); 5240 retval = security_task_setscheduler(p);
@@ -5534,6 +5540,7 @@ void __sched io_schedule(void)
5534 5540
5535 delayacct_blkio_start(); 5541 delayacct_blkio_start();
5536 atomic_inc(&rq->nr_iowait); 5542 atomic_inc(&rq->nr_iowait);
5543 blk_flush_plug(current);
5537 current->in_iowait = 1; 5544 current->in_iowait = 1;
5538 schedule(); 5545 schedule();
5539 current->in_iowait = 0; 5546 current->in_iowait = 0;
@@ -5549,6 +5556,7 @@ long __sched io_schedule_timeout(long timeout)
5549 5556
5550 delayacct_blkio_start(); 5557 delayacct_blkio_start();
5551 atomic_inc(&rq->nr_iowait); 5558 atomic_inc(&rq->nr_iowait);
5559 blk_flush_plug(current);
5552 current->in_iowait = 1; 5560 current->in_iowait = 1;
5553 ret = schedule_timeout(timeout); 5561 ret = schedule_timeout(timeout);
5554 current->in_iowait = 0; 5562 current->in_iowait = 0;
@@ -8279,7 +8287,7 @@ static inline int preempt_count_equals(int preempt_offset)
8279{ 8287{
8280 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8288 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8281 8289
8282 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8290 return (nested == preempt_offset);
8283} 8291}
8284 8292
8285void __might_sleep(const char *file, int line, int preempt_offset) 8293void __might_sleep(const char *file, int line, int preempt_offset)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..324eff5468ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 636}
637 637
638/* 638/*
639 * called with RCU read lock from check_kill_permission()
640 */
641static int kill_ok_by_cred(struct task_struct *t)
642{
643 const struct cred *cred = current_cred();
644 const struct cred *tcred = __task_cred(t);
645
646 if (cred->user->user_ns == tcred->user->user_ns &&
647 (cred->euid == tcred->suid ||
648 cred->euid == tcred->uid ||
649 cred->uid == tcred->suid ||
650 cred->uid == tcred->uid))
651 return 1;
652
653 if (ns_capable(tcred->user->user_ns, CAP_KILL))
654 return 1;
655
656 return 0;
657}
658
659/*
639 * Bad permissions for sending the signal 660 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 661 * - the caller must hold the RCU read lock
641 */ 662 */
642static int check_kill_permission(int sig, struct siginfo *info, 663static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 664 struct task_struct *t)
644{ 665{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 666 struct pid *sid;
647 int error; 667 int error;
648 668
@@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 676 if (error)
657 return error; 677 return error;
658 678
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 679 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 680 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 681 switch (sig) {
668 case SIGCONT: 682 case SIGCONT:
669 sid = task_session(t); 683 sid = task_session(t);
@@ -2421,9 +2435,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2435 return -EFAULT;
2422 2436
2423 /* Not even root can pretend to send signals from the kernel. 2437 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2438 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2439 */
2440 if (info.si_code != SI_QUEUE) {
2441 /* We used to allow any < 0 si_code */
2442 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2443 return -EPERM;
2444 }
2427 info.si_signo = sig; 2445 info.si_signo = sig;
2428 2446
2429 /* POSIX.1b doesn't mention process groups. */ 2447 /* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2455,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2455 return -EINVAL;
2438 2456
2439 /* Not even root can pretend to send signals from the kernel. 2457 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2458 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2459 */
2460 if (info->si_code != SI_QUEUE) {
2461 /* We used to allow any < 0 si_code */
2462 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2463 return -EPERM;
2464 }
2443 info->si_signo = sig; 2465 info->si_signo = sig;
2444 2466
2445 return do_send_specific(tgid, pid, sig, info); 2467 return do_send_specific(tgid, pid, sig, info);
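
The deleted check relied on a ^ b being zero exactly when a == b, so the chained form denied only when every uid pair differed; kill_ok_by_cred() states the same predicate with == plus the new per-namespace capability test. A small exhaustive check that the two forms agree:

	#include <assert.h>

	static int xor_form(unsigned a, unsigned b, unsigned c, unsigned d)
	{
		/* old style: allow unless every pair differs */
		return !((a ^ c) && (a ^ d) && (b ^ c) && (b ^ d));
	}

	static int eq_form(unsigned a, unsigned b, unsigned c, unsigned d)
	{
		/* new style: allow if any pair matches */
		return a == c || a == d || b == c || b == d;
	}

	int main(void)
	{
		unsigned a, b, c, d;

		for (a = 0; a < 4; a++)
			for (b = 0; b < 4; b++)
				for (c = 0; c < 4; c++)
					for (d = 0; d < 4; d++)
						assert(xor_form(a, b, c, d) ==
						       eq_form(a, b, c, d));
		return 0;
	}
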
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets a complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
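Condensed, the protocol the two sides of this patch agree on looks as follows. The producer lines are taken from the hunks above; the consumer-side smp_rmb() pairing is an assumption about the matching code in generic_smp_call_function_interrupt(), which this excerpt only shows in part:

    /* producer (smp_call_function_many), after csd_lock(&data->csd): */
    atomic_set(&data->refs, 0);          /* 1st-party write, see comment above */
    data->csd.func = func;
    data->csd.info = info;
    smp_wmb();                           /* refs==0, func, info visible first  */
    cpumask_and(data->cpumask, mask, cpu_online_mask);
    cpumask_clear_cpu(this_cpu, data->cpumask);
    list_add_rcu(&data->csd.list, &call_function.queue); /* implies a wmb      */
    atomic_set(&data->refs, refs);       /* publish: entry is ready            */

    /* consumer (generic_smp_call_function_interrupt), per entry; a sketch
     * assuming the usual smp_rmb() pairing on the read side:                  */
    if (!cpumask_test_cpu(cpu, data->cpumask))
            continue;
    smp_rmb();                           /* pairs with the producer's wmb()    */
    if (atomic_read(&data->refs) == 0)
            continue;                    /* entry caught mid-update; skip it   */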
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
635 /* this is the hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
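The three early parameters moved here have different scopes: nosmp and maxcpus=0 disable SMP activation outright, maxcpus=N only limits how many CPUs smp_init() onlines at boot (the rest stay present and can be onlined through hotplug later), and nr_cpus=N caps nr_cpu_ids itself, which sizes per-cpu allocations and cannot be raised afterwards. Example command lines, as a sketch:

    maxcpus=2      # boot with 2 CPUs online; more can be onlined via hotplug
    nr_cpus=4      # hard cap: nr_cpu_ids, and per-cpu data, top out at 4
    nosmp          # uniprocessor boot; the MPS table probe still runs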
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 56e5dec837f0..735d87095172 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 switch (action) { 845 switch (action) {
846 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
847 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
848 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
849 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
850 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
851 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
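Both conversions above follow the same pattern: a per-cpu kthread passes the target CPU's memory node so that the thread's task_struct and stack are allocated NUMA-locally to the CPU it will serve. A sketch of the idiom for any per-cpu helper thread created from a hotplug callback; the function, data, and name are illustrative:

    struct task_struct *p;

    p = kthread_create_on_node(my_thread_fn, my_data,
                               cpu_to_node(cpu),       /* NUMA-local alloc */
                               "my_helper/%d", cpu);
    if (IS_ERR(p))
            return notifier_from_errno(PTR_ERR(p));
    kthread_bind(p, cpu);     /* pin to the cpu it was allocated near */
    wake_up_process(p);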
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
@@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
299 device_shutdown(); 317 device_shutdown();
300 sysdev_shutdown(); 318 sysdev_shutdown();
319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -336,6 +355,7 @@ void kernel_halt(void)
336{ 355{
337 kernel_shutdown_prepare(SYSTEM_HALT); 356 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 357 sysdev_shutdown();
358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -355,6 +375,7 @@ void kernel_power_off(void)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 sysdev_shutdown();
378 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 379 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 380 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 381 machine_power_off();
@@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 524 if (old->gid == rgid ||
504 old->egid == rgid || 525 old->egid == rgid ||
505 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 527 new->gid = rgid;
507 else 528 else
508 goto error; 529 goto error;
@@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 532 if (old->gid == egid ||
512 old->egid == egid || 533 old->egid == egid ||
513 old->sgid == egid || 534 old->sgid == egid ||
514 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
515 new->egid = egid; 536 new->egid = egid;
516 else 537 else
517 goto error; 538 goto error;
@@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 567 old = current_cred();
547 568
548 retval = -EPERM; 569 retval = -EPERM;
549 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
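The capable()-to-nsown_capable() conversions throughout this file all target the caller's own user namespace. nsown_capable() itself is introduced elsewhere in this series (see the kernel/capability.c changes in the diffstat); a sketch of its expected shape, assuming it is the obvious wrapper:

    /* sketch: capability toward the user namespace current belongs to */
    bool nsown_capable(int cap)
    {
            return ns_capable(current_user_ns(), cap);
    }

The effect is that a namespace-root process can, for example, call setgid() within its own user namespace without holding CAP_SETGID in the initial namespace.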
@@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 634 new->uid = ruid;
614 if (old->uid != ruid && 635 if (old->uid != ruid &&
615 old->euid != ruid && 636 old->euid != ruid &&
616 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
617 goto error; 638 goto error;
618 } 639 }
619 640
@@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 643 if (old->uid != euid &&
623 old->euid != euid && 644 old->euid != euid &&
624 old->suid != euid && 645 old->suid != euid &&
625 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
626 goto error; 647 goto error;
627 } 648 }
628 649
@@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 691 old = current_cred();
671 692
672 retval = -EPERM; 693 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
675 if (uid != old->uid) { 696 if (uid != old->uid) {
676 retval = set_user(new); 697 retval = set_user(new);
@@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 733 old = current_cred();
713 734
714 retval = -EPERM; 735 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
718 goto error; 739 goto error;
@@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 797 old = current_cred();
777 798
778 retval = -EPERM; 799 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
782 goto error; 803 goto error;
@@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 857
837 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
841 new->fsuid = uid; 862 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 890
870 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
874 new->fsgid = gid; 895 new->fsgid = gid;
875 goto change_okay; 896 goto change_okay;
@@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1198 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1179 1200
1180 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1202 return -EPERM;
1203
1182 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1205 return -EINVAL;
1184 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1248 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1228 1250
1229 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1252 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1254 return -EINVAL;
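sethostname() and setdomainname() now check CAP_SYS_ADMIN against the user namespace that owns the UTS namespace (the ownership link is established by the utsname.c change further down), so the owner of a private UTS namespace can rename it without global root. A hypothetical user-space sketch; creating the namespace is itself still a privileged operation at this point in the series:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <unistd.h>

    if (unshare(CLONE_NEWUTS) == 0)
            sethostname("sandbox", 7);   /* visible only inside this ns */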
@@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1370 retval = -EPERM;
@@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1408{
1385 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1386 1410
1387 tcred = __task_cred(task); 1411 if (current == task)
1388 if (current != task && 1412 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1413
1399 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1400} 1427}
1401 1428
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
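The uid/gid matching itself is unchanged by the rewrite (the old || chain of mismatches becomes an && chain of matches), but two things are new: both tasks must live in the same user namespace for the credential match to count, and the capability fallback is checked against the target's namespace rather than globally. From user space the syscall is reached through the glibc prlimit() wrapper; a hypothetical read-only query, with pid assumed to hold the target task's id:

    #include <stdio.h>
    #include <sys/resource.h>

    struct rlimit old;
    /* needs fully matching creds in the same user_ns,
     * or CAP_SYS_RESOURCE toward the target's user_ns */
    if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) == 0)
            printf("nofile: %lu\n", (unsigned long)old.rlim_cur);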
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 51054fea5d99..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses its own private copy */ 179/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = {
706 .data = &kptr_restrict, 712 .data = &kptr_restrict,
707 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
708 .mode = 0644, 714 .mode = 0644,
709 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
710 .extra1 = &zero, 716 .extra1 = &zero,
711 .extra2 = &two, 717 .extra2 = &two,
712 }, 718 },
@@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = {
971 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
972 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
973 .mode = 0644, 979 .mode = 0644,
974 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
975 }, 983 },
976 { 984 {
977 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
978 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
979 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
980 .mode = 0644, 988 .mode = 0644,
981 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
982 }, 992 },
983 { 993 {
984 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1006 .data = &page_cluster, 1016 .data = &page_cluster,
1007 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1008 .mode = 0644, 1018 .mode = 0644,
1009 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1010 }, 1021 },
1011 { 1022 {
1012 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1054 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1055 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1056 .mode = 0644, 1067 .mode = 0644,
1057 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1058 }, 1070 },
1059 { 1071 {
1060 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1130 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1131 .mode = 0644, 1143 .mode = 0644,
1132 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1133 }, 1147 },
1134#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1135 { 1149 {
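Every vm_table conversion above follows one pattern: proc_dointvec, which stores whatever it is given, becomes proc_dointvec_minmax with explicit bounds in extra1/extra2, so an out-of-range write now fails with -EINVAL instead of silently landing in the variable. An illustrative entry (procname and backing variable hypothetical) showing a two-sided bound; leaving .extra2 NULL, as for page_cluster and dirty_expire_interval above, bounds from below only:

    static int example_mode;

    {
            .procname       = "example_mode",
            .data           = &example_mode,
            .maxlen         = sizeof(int),
            .mode           = 0644,
            .proc_handler   = proc_dointvec_minmax,
            .extra1         = &zero,    /* writes below 0 get -EINVAL */
            .extra2         = &two,     /* writes above 2 get -EINVAL */
    },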
@@ -1683,13 +1697,8 @@ static int test_perm(int mode, int op)
1683 1697
1684int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1698int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1685{ 1699{
1686 int error;
1687 int mode; 1700 int mode;
1688 1701
1689 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1690 if (error)
1691 return error;
1692
1693 if (root->permissions) 1702 if (root->permissions)
1694 mode = root->permissions(root, current->nsproxy, table); 1703 mode = root->permissions(root, current->nsproxy, table);
1695 else 1704 else
@@ -2390,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2390 return err; 2399 return err;
2391} 2400}
2392 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2393struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2394 int *min; 2414 int *min;
2395 int *max; 2415 int *max;
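proc_dmesg_restrict() gates writes on CAP_SYS_ADMIN before delegating to proc_dointvec_minmax(), so kptr_restrict can no longer be relaxed by a root process that has dropped that capability, even though uid 0 and the 0644 file mode still let it open the file. A hypothetical demonstration from such a capability-dropped root process:

    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    int fd = open("/proc/sys/kernel/kptr_restrict", O_WRONLY);
    /* open() succeeds on uid 0; the handler itself now rejects
     * the write when CAP_SYS_ADMIN is missing */
    if (fd >= 0 && write(fd, "0\n", 2) < 0 && errno == EPERM)
            ;  /* gated by capability, not by file mode */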
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..61d7d59f4a1a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the likely and unlikely macros 275 This tracer profiles all the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..7aa40f8e182d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 888b611897d3..c075f4ea6b94 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index db7b439d23ee..d9c8bcafb120 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -668,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
668 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
669 * its flags will be non zero. 669 * its flags will be non zero.
670 */ 670 */
671static int inline 671static inline int
672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
673 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
674{ 674{
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 3249b4f77ef0..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -391,8 +391,8 @@ static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec) 391 struct filter_pred *op, void *rec)
392{ 392{
393 struct filter_pred *pred; 393 struct filter_pred *pred;
394 int match = 0;
394 int type; 395 int type;
395 int match;
396 int i; 396 int i;
397 397
398 /* 398 /*
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
17 18
18static struct uts_namespace *create_uts_ns(void) 19static struct uts_namespace *create_uts_ns(void)
19{ 20{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 31 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 32 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 33 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 34static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
35 struct uts_namespace *old_ns)
34{ 36{
35 struct uts_namespace *ns; 37 struct uts_namespace *ns;
36 38
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 42
41 down_read(&uts_sem); 43 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 44 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
45 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 46 up_read(&uts_sem);
44 return ns; 47 return ns;
45} 48}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 53 * utsname of this process won't be seen by parent, and vice
51 * versa. 54 * versa.
52 */ 55 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 56struct uts_namespace *copy_utsname(unsigned long flags,
57 struct task_struct *tsk)
54{ 58{
59 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 60 struct uts_namespace *new_ns;
56 61
57 BUG_ON(!old_ns); 62 BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 65 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 66 return old_ns;
62 67
63 new_ns = clone_uts_ns(old_ns); 68 new_ns = clone_uts_ns(tsk, old_ns);
64 69
65 put_uts_ns(old_ns); 70 put_uts_ns(old_ns);
66 return new_ns; 71 return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 76 struct uts_namespace *ns;
72 77
73 ns = container_of(kref, struct uts_namespace, kref); 78 ns = container_of(kref, struct uts_namespace, kref);
79 put_user_ns(ns->user_ns);
74 kfree(ns); 80 kfree(ns);
75} 81}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..140dce750450 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
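This setup handler is bound (outside the shown hunk) to the nmi_watchdog= boot parameter; with the new Kconfig-driven default, the nopanic keyword gives a boot-time override in both directions. Example settings, assuming that binding:

    nmi_watchdog=panic     # panic the machine on a hard lockup
    nmi_watchdog=nopanic   # override CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
    nmi_watchdog=0         # disable the watchdog entirely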
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 418static int watchdog_enable(int cpu)
416{ 419{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 420 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418 int err; 421 int err = 0;
419 422
420 /* enable the perf event */ 423 /* enable the perf event */
421 err = watchdog_nmi_enable(cpu); 424 err = watchdog_nmi_enable(cpu);
422 if (err) 425
423 return err; 426 /* Regardless of err above, fall through and start softlockup */
424 427
425 /* create the watchdog thread */ 428 /* create the watchdog thread */
426 if (!p) { 429 if (!p) {
427 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
428 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
429 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
430 return PTR_ERR(p); 433 if (!err)
434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p);
436 goto out;
431 } 437 }
432 kthread_bind(p, cpu); 438 kthread_bind(p, cpu);
433 per_cpu(watchdog_touch_ts, cpu) = 0; 439 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
435 wake_up_process(p); 441 wake_up_process(p);
436 } 442 }
437 443
438 return 0; 444out:
445 return err;
439} 446}
440 447
441static void watchdog_disable(int cpu) 448static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
547 break; 554 break;
548#endif /* CONFIG_HOTPLUG_CPU */ 555#endif /* CONFIG_HOTPLUG_CPU */
549 } 556 }
550 return notifier_from_errno(err); 557
558 /*
559 * hardlockup and softlockup are not important enough
560 * to block cpu bring up. Just always succeed and
561 * rely on printk output to flag problems.
562 */
563 return NOTIFY_OK;
551} 564}
552 565
553static struct notifier_block __cpuinitdata cpu_nfb = { 566static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b5fe4c00eb3c..04ef830690ec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
251struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
252struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
253struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
254EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
255EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
256EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
257EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
258 260
259#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
260#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -1364,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1364 worker->id = id; 1366 worker->id = id;
1365 1367
1366 if (!on_unbound_cpu) 1368 if (!on_unbound_cpu)
1367 worker->task = kthread_create(worker_thread, worker, 1369 worker->task = kthread_create_on_node(worker_thread,
1368 "kworker/%u:%d", gcwq->cpu, id); 1370 worker,
1371 cpu_to_node(gcwq->cpu),
1372 "kworker/%u:%d", gcwq->cpu, id);
1369 else 1373 else
1370 worker->task = kthread_create(worker_thread, worker, 1374 worker->task = kthread_create(worker_thread, worker,
1371 "kworker/u:%d", id); 1375 "kworker/u:%d", id);
@@ -3781,8 +3785,10 @@ static int __init init_workqueues(void)
3781 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3785 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3782 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3786 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3783 WQ_UNBOUND_MAX_ACTIVE); 3787 WQ_UNBOUND_MAX_ACTIVE);
3788 system_freezable_wq = alloc_workqueue("events_freezable",
3789 WQ_FREEZABLE, 0);
3784 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3790 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3785 !system_unbound_wq); 3791 !system_unbound_wq || !system_freezable_wq);
3786 return 0; 3792 return 0;
3787} 3793}
3788early_initcall(init_workqueues); 3794early_initcall(init_workqueues);
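system_freezable_wq gives drivers a shared queue whose workers are frozen together with user space during suspend and hibernate, so queued work cannot run against a device that has already been powered down. A usage sketch with a hypothetical work item:

    static void my_resume_safe_fn(struct work_struct *work)
    {
            /* runs only while the system is not frozen */
    }
    static DECLARE_WORK(my_work, my_resume_safe_fn);

    queue_work(system_freezable_wq, &my_work);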