author     Jiri Kosina <jkosina@suse.cz>  2011-04-26 04:22:15 -0400
committer  Jiri Kosina <jkosina@suse.cz>  2011-04-26 04:22:59 -0400
commit     07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree       0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /kernel
parent     9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parent     cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be applied for files that didn't exist on the old branch.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/capability.c | 96
-rw-r--r--  kernel/cgroup.c | 16
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 80
-rw-r--r--  kernel/crash_dump.c | 34
-rw-r--r--  kernel/cred.c | 6
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 30
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/exit.c | 3
-rw-r--r--  kernel/fork.c | 154
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/futex_compat.c | 11
-rw-r--r--  kernel/gcov/Makefile | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/autoprobe.c | 4
-rw-r--r--  kernel/irq/chip.c | 285
-rw-r--r--  kernel/irq/compat.h | 72
-rw-r--r--  kernel/irq/debug.h | 12
-rw-r--r--  kernel/irq/dummychip.c | 9
-rw-r--r--  kernel/irq/handle.c | 19
-rw-r--r--  kernel/irq/internals.h | 16
-rw-r--r--  kernel/irq/irqdesc.c | 17
-rw-r--r--  kernel/irq/manage.c | 103
-rw-r--r--  kernel/irq/migration.c | 17
-rw-r--r--  kernel/irq/proc.c | 23
-rw-r--r--  kernel/irq/resend.c | 1
-rw-r--r--  kernel/irq/settings.h | 55
-rw-r--r--  kernel/irq/spurious.c | 11
-rw-r--r--  kernel/kallsyms.c | 58
-rw-r--r--  kernel/kexec.c | 18
-rw-r--r--  kernel/kthread.c | 33
-rw-r--r--  kernel/latencytop.c | 2
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/lockdep_proc.c | 9
-rw-r--r--  kernel/module.c | 10
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/padata.c | 8
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/perf_event.c | 32
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/pid_namespace.c | 11
-rw-r--r--  kernel/posix-cpu-timers.c | 2
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 6
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 2
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/suspend.c | 5
-rw-r--r--  kernel/printk.c | 36
-rw-r--r--  kernel/ptrace.c | 27
-rw-r--r--  kernel/res_counter.c | 14
-rw-r--r--  kernel/sched.c | 46
-rw-r--r--  kernel/sched_autogroup.c | 2
-rw-r--r--  kernel/sched_fair.c | 33
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 4
-rw-r--r--  kernel/sched_stoptask.c | 2
-rw-r--r--  kernel/signal.c | 201
-rw-r--r--  kernel/smp.c | 81
-rw-r--r--  kernel/softirq.c | 7
-rw-r--r--  kernel/stop_machine.c | 6
-rw-r--r--  kernel/sys.c | 77
-rw-r--r--  kernel/sysctl.c | 35
-rw-r--r--  kernel/sysctl_check.c | 10
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/posix-clock.c | 24
-rw-r--r--  kernel/time/timekeeping.c | 27
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/trace/blktrace.c | 48
-rw-r--r--  kernel/trace/ftrace.c | 7
-rw-r--r--  kernel/trace/ring_buffer.c | 4
-rw-r--r--  kernel/trace/trace.c | 2
-rw-r--r--  kernel/trace/trace_clock.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 2
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 8
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 27
-rw-r--r--  kernel/workqueue.c | 8
96 files changed, 1247 insertions(+), 911 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
 	spin_lock(&hash_lock);
 	list_for_each_entry(node, &tree->chunks, list) {
 		struct audit_chunk *chunk = find_chunk(node);
-		/* this could be NULL if the watch is dieing else where... */
+		/* this could be NULL if the watch is dying else where... */
 		struct inode *inode = chunk->mark.i.inode;
 		node->index |= 1U<<31;
 		if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..b33513a08beb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 /*
  * to_send and len_sent accounting are very loose estimates. We aren't
  * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundry)
+ * within about 500 bytes (next page boundary)
  *
  * why snprintf? an int is up to 12 digits long. if we just assumed when
  * logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
 
 void foo(void)
 {
 	/* The enum constants to put into include/generated/bounds.h */
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
 	/* End of constants */
 }
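
The bounds.c additions lean on the kbuild constant-export idiom: DEFINE() does not execute any C, it plants a marker in the generated assembly that a build script scrapes into include/generated/bounds.h. A minimal sketch, assuming the conventional definition from include/linux/kbuild.h:

    /* Sketch of the kbuild export idiom; see include/linux/kbuild.h. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))

    /*
     * Compiling bounds.c with -S yields lines such as
     * "->NR_PCG_FLAGS $N __NR_PCG_FLAGS", which a Kbuild sed script
     * rewrites into "#define NR_PCG_FLAGS N" in the generated header.
     */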
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -290,6 +291,60 @@ error:
 }
 
 /**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+	int ret = security_real_capable(t, &init_user_ns, cap);
+
+	return (ret == 0);
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+		       struct user_namespace *ns, int cap)
+{
+	int ret = security_real_capable(t, ns, cap);
+
+	return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not.  Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+	int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+	return (ret == 0);
+}
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
@@ -299,17 +354,48 @@ error:
  * This sets PF_SUPERPRIV on the task if the capability is available on the
  * assumption that it's about to be used.
  */
-int capable(int cap)
+bool capable(int cap)
+{
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The user namespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
 {
 	if (unlikely(!cap_valid(cap))) {
 		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
 		BUG();
 	}
 
-	if (security_capable(current_cred(), cap) == 0) {
+	if (security_capable(ns, current_cred(), cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
-		return 1;
+		return true;
 	}
-	return 0;
+	return false;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+	return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
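
Taken together, the capability.c hunks make capable() a thin wrapper over the new namespace-aware primitives. A hedged usage sketch (the helper names below are hypothetical, not part of the patch):

    /* capable(cap) is now ns_capable(&init_user_ns, cap), so a classic
     * global check is unchanged for callers: */
    static int example_may_reboot(void)                /* hypothetical */
    {
            return capable(CAP_SYS_BOOT) ? 0 : -EPERM;
    }

    /* A namespace-aware caller instead targets the victim's user_ns: */
    static int example_may_trace(struct task_struct *victim) /* hypothetical */
    {
            return task_ns_capable(victim, CAP_SYS_PTRACE) ? 0 : -EPERM;
    }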
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95362d15128c..25c7eb52de1a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
 };
 
 /*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
  */
 struct cgroup_event {
 	/*
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	/* Update the css_set linked lists if we're using them */
 	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list)) {
-		list_del(&tsk->cg_list);
-		list_add(&tsk->cg_list, &newcg->tasks);
-	}
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
 	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
-		list_del(&cgrp->release_list);
+		list_del_init(&cgrp->release_list);
 	spin_unlock(&release_list_lock);
 
 	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
-	list_del(&cgrp->sibling);
+	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
 	d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	subsys[ss->subsys_id] = NULL;
 
 	/* remove subsystem from rootnode's list of subsystems */
-	list_del(&ss->sibling);
+	list_del_init(&ss->sibling);
 
 	/*
 	 * disentangle the css from all css_sets attached to the dummytop. as
@@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	if (!list_empty(&tsk->cg_list)) {
 		write_lock(&css_set_lock);
 		if (!list_empty(&tsk->cg_list))
-			list_del(&tsk->cg_list);
+			list_del_init(&tsk->cg_list);
 		write_unlock(&css_set_lock);
 	}
 
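
The list_del()/list_del_init() substitutions above are not cosmetic: list_del() leaves the node poisoned, so a later list_empty() on it is undefined, while list_del_init() leaves it as a valid empty list; list_move() simply folds the delete/add pair into one call. A minimal sketch of the distinction (illustrative only):

    #include <linux/list.h>

    struct item {
            struct list_head link;
    };

    static void example(struct list_head *dst, struct item *it)
    {
            /* list_move() == list_del() + list_add(), in one step. */
            list_move(&it->link, dst);

            /*
             * After list_del_init() the node points at itself again, so a
             * later !list_empty(&it->link) test (as in cgroup_exit()) is
             * well defined; after a plain list_del() it would not be.
             */
            list_del_init(&it->link);
            WARN_ON(!list_empty(&it->link));
    }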
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
 #else /* #if CONFIG_HOTPLUG_CPU */
 static void cpu_hotplug_begin(void) {}
 static void cpu_hotplug_done(void) {}
-#endif	/* #esle #if CONFIG_HOTPLUG_CPU */
+#endif	/* #else #if CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
 	return 0;
 }
 
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
+
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
 		nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
-		printk("%s: attempt to bring up CPU %u failed\n",
+		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
 			__func__, cpu);
 		goto out_notify;
 	}
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
 	if (cpumask_empty(frozen_cpus))
 		goto out;
 
-	printk("Enabling non-boot CPUs ...\n");
+	printk(KERN_INFO "Enabling non-boot CPUs ...\n");
 
 	arch_enable_nonboot_cpus_begin();
 
 	for_each_cpu(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
-			printk("CPU%d is up\n", cpu);
+			printk(KERN_INFO "CPU%d is up\n", cpu);
 			continue;
 		}
 		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
  */
 
 /* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
 #define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
 #define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
 #define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
+	static nodemask_t newmems;	/* protected by cgroup_mutex */
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
-
-	cpuset_change_task_nodemask(p, newmems);
+	guarantee_online_mems(cs, &newmems);
 
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (from == NULL || to == NULL)
-		goto alloc_fail;
+	static nodemask_t to;	/* protected by cgroup_mutex */
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
 	} else {
 		guarantee_online_cpus(cs, cpus_attach);
 	}
-	guarantee_online_mems(cs, to);
+	guarantee_online_mems(cs, &to);
 
 	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, to, cs);
+	cpuset_attach_task(tsk, &to, cs);
 	if (threadgroup) {
 		struct task_struct *c;
 		rcu_read_lock();
 		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, to, cs);
+			cpuset_attach_task(c, &to, cs);
 		}
 		rcu_read_unlock();
 	}
 
 	/* change mm; only needs to be done once even if threadgroup */
-	*from = oldcs->mems_allowed;
-	*to = cs->mems_allowed;
+	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, to);
+		mpol_rebind_mm(mm, &to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, from, to);
+			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(from);
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
  * across a page fault.
  */
 
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-	int ret;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	return ret;
+	return count;
 }
 
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
-	int retval;
-
-	if (mask == NULL)
-		return -ENOMEM;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	*mask = cs->mems_allowed;
+	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 
-	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-	NODEMASK_FREE(mask);
-
-	return retval;
+	return count;
 }
 
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 	cs = cgroup_cs(cgroup);
 	parent_cs = cgroup_cs(parent);
 
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = parent_cs->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+	mutex_unlock(&callback_mutex);
 	return;
 }
 
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
-		*oldmems = cp->mems_allowed;
+		oldmems = cp->mems_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
@@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, oldmems, NULL);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		*oldmems = top_cpuset.mems_allowed;
+		oldmems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
 		break;
 	case MEM_OFFLINE:
 		/*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	}
 	cgroup_unlock();
 
-	NODEMASK_FREE(oldmems);
 	return NOTIFY_OK;
 }
 #endif
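
The pattern repeated through this file: a nodemask_t can be too large for the stack on large-MAX_NUMNODES builds, which is why these paths used NODEMASK_ALLOC(); the patch trades the heap allocation and its failure handling for a function-local static, safe only because cgroup_mutex serializes every caller. A hedged sketch of the idiom (hypothetical function name):

    static void example_update_mems(struct cpuset *cs)  /* hypothetical */
    {
            /*
             * Too large for the stack when node counts are big; a static
             * is safe here because cgroup_mutex guarantees at most one
             * user of this buffer at any time.
             */
            static nodemask_t newmems;      /* protected by cgroup_mutex */

            guarantee_online_mems(cs, &newmems);
            /* ... operate on &newmems ... */
    }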
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+	char *end;
+	if (!arg)
+		return -EINVAL;
+	elfcorehdr_addr = memparse(arg, &end);
+	return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
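
For context: the kexec loader passes this option on the capture kernel's command line (for example elfcorehdr=0x2f000000), and memparse() also understands K/M/G suffixes. A hedged sketch of the parse semantics:

    static int __init example_parse(void)   /* illustrative only */
    {
            char *end;
            unsigned long long addr;

            addr = memparse("128M", &end);       /* 128 << 20; end past 'M' */
            addr = memparse("0x2f000000", &end); /* hex accepted as well */

            /* setup_elfcorehdr() treats "no characters consumed"
             * (end == arg) as -EINVAL. */
            return addr ? 0 : -EINVAL;
    }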
diff --git a/kernel/cred.c b/kernel/cred.c
index 2343c132c5a7..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 741}
742EXPORT_SYMBOL(set_create_files_as); 742EXPORT_SYMBOL(set_create_files_as);
743 743
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
744#ifdef CONFIG_DEBUG_CREDENTIALS 750#ifdef CONFIG_DEBUG_CREDENTIALS
745 751
746bool creds_are_invalid(const struct cred *cred) 752bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
 
 	/*
 	 * For single stepping, try to only enter on the processor
-	 * that was single stepping.  To gaurd against a deadlock, the
+	 * that was single stepping.  To guard against a deadlock, the
 	 * kernel will only try for the value of sstep_tries before
 	 * giving up and continuing on.
 	 */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
 	put_packet(remcom_out_buffer);
 	return 0;
 }
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+	unsigned char checksum, ch, buffer[3];
+	int loop;
+
+	buffer[0] = 'W';
+	buffer[1] = hex_asc_hi(status);
+	buffer[2] = hex_asc_lo(status);
+
+	dbg_io_ops->write_char('$');
+	checksum = 0;
+
+	for (loop = 0; loop < 3; loop++) {
+		ch = buffer[loop];
+		checksum += ch;
+		dbg_io_ops->write_char(ch);
+	}
+
+	dbg_io_ops->write_char('#');
+	dbg_io_ops->write_char(hex_asc_hi(checksum));
+	dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+	/* make sure the output is flushed, lest the bootloader clobber it */
+	dbg_io_ops->flush();
+}
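
gdbstub_exit() frames a GDB remote-protocol exit reply: payload 'W' plus the status in two hex digits, then '#' and a checksum that is the modulo-256 sum of the payload bytes. A standalone userspace sketch of the framing (illustrative, not kernel code):

    #include <stdio.h>

    static void frame_exit_packet(int status)   /* illustrative */
    {
            const char hex[] = "0123456789abcdef";
            unsigned char p[3] = { 'W', hex[(status >> 4) & 0xf],
                                   hex[status & 0xf] };
            unsigned char sum = p[0] + p[1] + p[2];  /* mod 256 by type */

            printf("$%c%c%c#%c%c\n", p[0], p[1], p[2],
                   hex[sum >> 4], hex[sum & 0xf]);
    }

    /* frame_exit_packet(0) prints "$W00#b7": 'W'+'0'+'0' = 0xb7. */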
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bd3e8e29caa3..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
 static kdbtab_t *kdb_commands;
 #define KDB_BASE_CMD_MAX 50
 static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[50];
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
 #define for_each_kdbcmd(cmd, num) \
 	for ((cmd) = kdb_base_commands, (num) = 0; \
 	     num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
  *	symbol name, and offset to the caller.
  *
  *	The argument may consist of a numeric value (decimal or
- *	hexidecimal), a symbol name, a register name (preceeded by the
+ *	hexidecimal), a symbol name, a register name (preceded by the
  *	percent sign), an environment variable with a numeric value
- *	(preceeded by a dollar sign) or a simple arithmetic expression
+ *	(preceded by a dollar sign) or a simple arithmetic expression
  *	consisting of a symbol name, +/-, and a numeric constant value
  *	(offset).
  * Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
  *	error		The hardware-defined error code
  *	reason2		kdb's current reason code.
  *			Initially error but can change
- *			acording to kdb state.
+ *			according to kdb state.
  *	db_result	Result code from break or debug point.
  *	regs		The exception frame at time of fault/breakpoint.
  *			should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
2892 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2892 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2893 kdb_register_repeat("summary", kdb_summary, "", 2893 kdb_register_repeat("summary", kdb_summary, "",
2894 "Summarize the system", 4, KDB_REPEAT_NONE); 2894 "Summarize the system", 4, KDB_REPEAT_NONE);
2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "", 2895 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2896 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2897 kdb_register_repeat("grephelp", kdb_grep_help, "", 2897 kdb_register_repeat("grephelp", kdb_grep_help, "",
2898 "Display help on | grep", 0, KDB_REPEAT_NONE); 2898 "Display help on | grep", 0, KDB_REPEAT_NONE);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
  *	Mask for process state.
  * Notes:
  *	The mask folds data from several sources into a single long value, so
- *	be carefull not to overlap the bits.  TASK_* bits are in the LSB,
+ *	be careful not to overlap the bits.  TASK_* bits are in the LSB,
  *	special cases like UNRUNNABLE are in the MSB.  As of 2.6.10-rc1 there
  *	is no overlap between TASK_* and EXIT_* but that may not always be
  *	true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..f5d2f63bae0b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	/* Let father know we died
 	 *
 	 * Thread signals are configurable, but you aren't going to use
-	 * that to send signals to arbitary processes.
+	 * that to send signals to arbitrary processes.
 	 * That stops right now.
 	 *
 	 * If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 05b92c457010..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)		\
+		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)			\
+		kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+						  int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
 	gfp_t mask = GFP_KERNEL;
 #endif
-	return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -249,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	struct task_struct *tsk;
 	struct thread_info *ti;
 	unsigned long *stackend;
-
+	int node = tsk_fork_get_node(orig);
 	int err;
 
 	prepare_to_copy(orig);
 
-	tsk = alloc_task_struct();
+	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info(tsk);
+	ti = alloc_thread_info_node(tsk, node);
 	if (!ti) {
 		free_task_struct(tsk);
 		return NULL;
@@ -1181,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		pid = alloc_pid(p->nsproxy->pid_ns);
 		if (!pid)
 			goto bad_fork_cleanup_io;
-
-		if (clone_flags & CLONE_NEWPID) {
-			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-			if (retval < 0)
-				goto bad_fork_free_pid;
-		}
 	}
 
 	p->pid = pid_nr(pid);
@@ -1205,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+	p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
 	p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
@@ -1290,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	tracehook_finish_clone(p, clone_flags, trace);
 
 	if (thread_group_leader(p)) {
-		if (clone_flags & CLONE_NEWPID)
+		if (is_child_reaper(pid))
 			p->nsproxy->pid_ns->child_reaper = p;
 
 		p->signal->leader_pid = pid;
@@ -1513,38 +1516,24 @@ void __init proc_caches_init(void)
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
  */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+		return -EINVAL;
 	/*
-	 * If unsharing a thread from a thread group, must also
-	 * unshare vm.
-	 */
-	if (*flags_ptr & CLONE_THREAD)
-		*flags_ptr |= CLONE_VM;
-
-	/*
-	 * If unsharing vm, must also unshare signal handlers.
-	 */
-	if (*flags_ptr & CLONE_VM)
-		*flags_ptr |= CLONE_SIGHAND;
-
-	/*
-	 * If unsharing namespace, must also unshare filesystem information.
+	 * Not implemented, but pretend it works if there is nothing to
+	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+	 * needs to unshare vm.
 	 */
-	if (*flags_ptr & CLONE_NEWNS)
-		*flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-	if (unshare_flags & CLONE_THREAD)
-		return -EINVAL;
+	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+		/* FIXME: get_task_mm() increments ->mm_users */
+		if (atomic_read(&current->mm->mm_users) > 1)
+			return -EINVAL;
+	}
 
 	return 0;
 }
@@ -1571,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-	struct sighand_struct *sigh = current->sighand;
-
-	if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-		return -EINVAL;
-	else
-		return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-	struct mm_struct *mm = current->mm;
-
-	if ((unshare_flags & CLONE_VM) &&
-	    (mm && atomic_read(&mm->mm_users) > 1)) {
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/*
  * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1626,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-	struct sighand_struct *new_sigh = NULL;
-	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
+	int err;
 
-	check_unshare_flags(&unshare_flags);
-
-	/* Return -EINVAL for all unsupported flags */
-	err = -EINVAL;
-	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+	err = check_unshare_flags(unshare_flags);
+	if (err)
 		goto bad_unshare_out;
 
 	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (unshare_flags & CLONE_NEWNS)
+		unshare_flags |= CLONE_FS;
+	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
 	 * namespace are unreachable.
 	 */
 	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
 		do_sysvsem = 1;
-	if ((err = unshare_thread(unshare_flags)))
-		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
-		goto bad_unshare_cleanup_fs;
-	if ((err = unshare_vm(unshare_flags, &new_mm)))
-		goto bad_unshare_cleanup_sigh;
+		goto bad_unshare_out;
 	if ((err = unshare_fd(unshare_flags, &new_fd)))
-		goto bad_unshare_cleanup_vm;
+		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
 			new_fs)))
 		goto bad_unshare_cleanup_fd;
 
-	if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1690,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 			spin_unlock(&fs->lock);
 		}
 
-		if (new_mm) {
-			mm = current->mm;
-			active_mm = current->active_mm;
-			current->mm = new_mm;
-			current->active_mm = new_mm;
-			if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-				atomic_dec(&mm->oom_disable_count);
-				atomic_inc(&new_mm->oom_disable_count);
-			}
-			activate_mm(active_mm, new_mm);
-			new_mm = mm;
-		}
-
 		if (new_fd) {
 			fd = current->files;
 			current->files = new_fd;
@@ -1719,20 +1659,10 @@ bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-	if (new_mm)
-		mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-	if (new_sigh)
-		if (atomic_dec_and_test(&new_sigh->count))
-			kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
 	if (new_fs)
 		free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
 	return err;
 }
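
The allocation changes above thread a NUMA node id from tsk_fork_get_node() into the slab and page allocators, so a new task's task_struct and kernel stack are placed on the node the child is expected to run on. The underlying pattern, as a hedged sketch:

    /* Node-aware variants of the usual allocators (sketch). */
    static struct task_struct *example_alloc_task(int node)
    {
            /* Same cache as kmem_cache_alloc(), but placed on 'node'. */
            return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
    }

    static struct thread_info *example_alloc_stack(int node)
    {
            struct page *page = alloc_pages_node(node, GFP_KERNEL,
                                                 THREAD_SIZE_ORDER);

            return page ? page_address(page) : NULL;
    }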
diff --git a/kernel/futex.c b/kernel/futex.c
index bda415715382..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
-			|| plist_node_empty(&q->list)))
+	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+	    || WARN_ON(plist_node_empty(&q->list)))
 		return;
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
@@ -1886,7 +1886,7 @@ retry:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = flags;
+	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
 	ret = -ERESTART_RESTARTBLOCK;
 
@@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 			goto err_unlock;
 		ret = -EPERM;
 		pcred = __task_cred(p);
+		/* If victim is in different user_ns, then uids are not
+		   comparable, so we must have CAP_SYS_PTRACE */
+		if (cred->user->user_ns != pcred->user->user_ns) {
+			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+				goto err_unlock;
+			goto ok;
+		}
+		/* If victim is in same user_ns, then uids are comparable */
 		if (cred->euid != pcred->euid &&
 		    cred->euid != pcred->uid &&
-		    !capable(CAP_SYS_PTRACE))
+		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
 			goto err_unlock;
+ok:
 		head = p->robust_list;
 		rcu_read_unlock();
 	}
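
The block added above is the recurring cross-user-namespace permission pattern of this series: uid comparisons are only meaningful inside one user_ns; across namespaces the caller must instead hold the capability in the victim's namespace. Distilled into a hedged sketch (helper name hypothetical):

    static bool example_may_access(const struct cred *cred,
                                   const struct cred *pcred) /* hypothetical */
    {
            if (cred->user->user_ns != pcred->user->user_ns)
                    return ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);

            return cred->euid == pcred->euid ||
                   cred->euid == pcred->uid  ||
                   ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);
    }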
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
153 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM; 154 ret = -EPERM;
155 pcred = __task_cred(p); 155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
156 if (cred->euid != pcred->euid && 164 if (cred->euid != pcred->euid &&
157 cred->euid != pcred->uid && 165 cred->euid != pcred->uid &&
158 !capable(CAP_SYS_PTRACE)) 166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
159 goto err_unlock; 167 goto err_unlock;
168ok:
160 head = p->compat_robust_list; 169 head = p->compat_robust_list;
161 rcu_read_unlock(); 170 rcu_read_unlock();
162 } 171 }
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!nsown_capable(CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
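
nsown_capable() checks a capability in the caller's own user namespace. Assuming the helper as introduced elsewhere in this same series, it is roughly (hedged sketch):

    /* Hedged sketch of the helper this hunk switches to. */
    static inline bool nsown_capable(int cap)
    {
            return ns_capable(current_user_ns(), cap);
    }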
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 09bef82d74cb..c574f9a12c48 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -10,13 +10,6 @@ menu "IRQ subsystem"
 config GENERIC_HARDIRQS
 	def_bool y
 
-# Select this to disable the deprecated stuff
-config GENERIC_HARDIRQS_NO_DEPRECATED
-	bool
-
-config GENERIC_HARDIRQS_NO_COMPAT
-	bool
-
 # Options selectable by the architecture code
 
 # Make sparse irq Kconfig switch below available
@@ -31,6 +24,10 @@ config GENERIC_IRQ_PROBE
 config GENERIC_IRQ_SHOW
 	bool
 
+# Print level/edge extra information
+config GENERIC_IRQ_SHOW_LEVEL
+	bool
+
 # Support for delayed migration from interrupt context
 config GENERIC_PENDING_IRQ
 	bool
@@ -47,6 +44,10 @@ config HARDIRQS_SW_RESEND
 config IRQ_PREFLOW_FASTEOI
 	bool
 
+# Edge style eoi based handler (cell)
+config IRQ_EDGE_EOI_HANDLER
+	bool
+
 # Support forced irq threading
 config IRQ_FORCED_THREADING
 	bool
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 394784c57060..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -70,10 +70,8 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && irq_settings_can_probe(desc)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-			if (irq_startup(desc)) {
-				irq_compat_set_pending(desc);
+			if (irq_startup(desc))
 				desc->istate |= IRQS_PENDING;
-			}
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c9c0601f0615..4af1e2b244cb 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,9 +34,14 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 	if (!chip)
 		chip = &no_irq_chip;
 
-	irq_chip_set_defaults(chip);
 	desc->irq_data.chip = chip;
 	irq_put_desc_unlock(desc, flags);
+	/*
+	 * For !CONFIG_SPARSE_IRQ make the irq show up in
+	 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
+	 * already marked, and this call is harmless.
+	 */
+	irq_reserve_irq(irq);
 	return 0;
 }
 EXPORT_SYMBOL(irq_set_chip);
@@ -134,26 +139,22 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data);
 
 static void irq_state_clr_disabled(struct irq_desc *desc)
 {
-	desc->istate &= ~IRQS_DISABLED;
-	irq_compat_clr_disabled(desc);
+	irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
 }
 
 static void irq_state_set_disabled(struct irq_desc *desc)
 {
-	desc->istate |= IRQS_DISABLED;
-	irq_compat_set_disabled(desc);
+	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
 }
 
 static void irq_state_clr_masked(struct irq_desc *desc)
 {
-	desc->istate &= ~IRQS_MASKED;
-	irq_compat_clr_masked(desc);
+	irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
 }
 
 static void irq_state_set_masked(struct irq_desc *desc)
 {
-	desc->istate |= IRQS_MASKED;
-	irq_compat_set_masked(desc);
+	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
 }
 
 int irq_startup(struct irq_desc *desc)
@@ -203,126 +204,6 @@ void irq_disable(struct irq_desc *desc)
203 } 204 }
204} 205}
205 206
206#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
207/* Temporary migration helpers */
208static void compat_irq_mask(struct irq_data *data)
209{
210 data->chip->mask(data->irq);
211}
212
213static void compat_irq_unmask(struct irq_data *data)
214{
215 data->chip->unmask(data->irq);
216}
217
218static void compat_irq_ack(struct irq_data *data)
219{
220 data->chip->ack(data->irq);
221}
222
223static void compat_irq_mask_ack(struct irq_data *data)
224{
225 data->chip->mask_ack(data->irq);
226}
227
228static void compat_irq_eoi(struct irq_data *data)
229{
230 data->chip->eoi(data->irq);
231}
232
233static void compat_irq_enable(struct irq_data *data)
234{
235 data->chip->enable(data->irq);
236}
237
238static void compat_irq_disable(struct irq_data *data)
239{
240 data->chip->disable(data->irq);
241}
242
243static void compat_irq_shutdown(struct irq_data *data)
244{
245 data->chip->shutdown(data->irq);
246}
247
248static unsigned int compat_irq_startup(struct irq_data *data)
249{
250 return data->chip->startup(data->irq);
251}
252
253static int compat_irq_set_affinity(struct irq_data *data,
254 const struct cpumask *dest, bool force)
255{
256 return data->chip->set_affinity(data->irq, dest);
257}
258
259static int compat_irq_set_type(struct irq_data *data, unsigned int type)
260{
261 return data->chip->set_type(data->irq, type);
262}
263
264static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
265{
266 return data->chip->set_wake(data->irq, on);
267}
268
269static int compat_irq_retrigger(struct irq_data *data)
270{
271 return data->chip->retrigger(data->irq);
272}
273
274static void compat_bus_lock(struct irq_data *data)
275{
276 data->chip->bus_lock(data->irq);
277}
278
279static void compat_bus_sync_unlock(struct irq_data *data)
280{
281 data->chip->bus_sync_unlock(data->irq);
282}
283#endif
284
285/*
286 * Fixup enable/disable function pointers
287 */
288void irq_chip_set_defaults(struct irq_chip *chip)
289{
290#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
291 if (chip->enable)
292 chip->irq_enable = compat_irq_enable;
293 if (chip->disable)
294 chip->irq_disable = compat_irq_disable;
295 if (chip->shutdown)
296 chip->irq_shutdown = compat_irq_shutdown;
297 if (chip->startup)
298 chip->irq_startup = compat_irq_startup;
299 if (!chip->end)
300 chip->end = dummy_irq_chip.end;
301 if (chip->bus_lock)
302 chip->irq_bus_lock = compat_bus_lock;
303 if (chip->bus_sync_unlock)
304 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
305 if (chip->mask)
306 chip->irq_mask = compat_irq_mask;
307 if (chip->unmask)
308 chip->irq_unmask = compat_irq_unmask;
309 if (chip->ack)
310 chip->irq_ack = compat_irq_ack;
311 if (chip->mask_ack)
312 chip->irq_mask_ack = compat_irq_mask_ack;
313 if (chip->eoi)
314 chip->irq_eoi = compat_irq_eoi;
315 if (chip->set_affinity)
316 chip->irq_set_affinity = compat_irq_set_affinity;
317 if (chip->set_type)
318 chip->irq_set_type = compat_irq_set_type;
319 if (chip->set_wake)
320 chip->irq_set_wake = compat_irq_set_wake;
321 if (chip->retrigger)
322 chip->irq_retrigger = compat_irq_retrigger;
323#endif
324}
325
326static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
327{ 208{
328 if (desc->irq_data.chip->irq_mask_ack) 209 if (desc->irq_data.chip->irq_mask_ack)
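The block removed here was the transition layer that translated the old unsigned-int-irq chip methods into the irq_data-based ones at irq_chip_set_defaults() time. With it gone, chips must implement the irq_* callbacks natively; a hedged sketch of a converted chip (register offsets and names are illustrative):

#include <linux/irq.h>
#include <linux/io.h>

static void __iomem *my_base;		/* assumed mapped elsewhere */

static void my_irq_mask(struct irq_data *d)
{
	/* d->irq replaces the old 'unsigned int irq' parameter */
	writel(1 << (d->irq & 31), my_base + 0x10);
}

static void my_irq_unmask(struct irq_data *d)
{
	writel(1 << (d->irq & 31), my_base + 0x14);
}

static struct irq_chip my_chip = {
	.name		= "MYCHIP",
	.irq_mask	= my_irq_mask,
	.irq_unmask	= my_irq_unmask,
};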
@@ -372,11 +253,10 @@ void handle_nested_irq(unsigned int irq)
372 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
373 254
374 action = desc->action; 255 action = desc->action;
375 if (unlikely(!action || (desc->istate & IRQS_DISABLED))) 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
376 goto out_unlock; 257 goto out_unlock;
377 258
378 irq_compat_set_progress(desc); 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
379 desc->istate |= IRQS_INPROGRESS;
380 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
381 261
382 action_ret = action->thread_fn(action->irq, action->dev_id); 262 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -384,8 +264,7 @@ void handle_nested_irq(unsigned int irq)
384 note_interrupt(irq, desc, action_ret); 264 note_interrupt(irq, desc, action_ret);
385 265
386 raw_spin_lock_irq(&desc->lock); 266 raw_spin_lock_irq(&desc->lock);
387 desc->istate &= ~IRQS_INPROGRESS; 267 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
388 irq_compat_clr_progress(desc);
389 268
390out_unlock: 269out_unlock:
391 raw_spin_unlock_irq(&desc->lock); 270 raw_spin_unlock_irq(&desc->lock);
@@ -416,14 +295,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
416{ 295{
417 raw_spin_lock(&desc->lock); 296 raw_spin_lock(&desc->lock);
418 297
419 if (unlikely(desc->istate & IRQS_INPROGRESS)) 298 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
420 if (!irq_check_poll(desc)) 299 if (!irq_check_poll(desc))
421 goto out_unlock; 300 goto out_unlock;
422 301
423 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
424 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
425 304
426 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
427 goto out_unlock; 306 goto out_unlock;
428 307
429 handle_irq_event(desc); 308 handle_irq_event(desc);
@@ -448,7 +327,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
448 raw_spin_lock(&desc->lock); 327 raw_spin_lock(&desc->lock);
449 mask_ack_irq(desc); 328 mask_ack_irq(desc);
450 329
451 if (unlikely(desc->istate & IRQS_INPROGRESS)) 330 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
452 if (!irq_check_poll(desc)) 331 if (!irq_check_poll(desc))
453 goto out_unlock; 332 goto out_unlock;
454 333
@@ -459,12 +338,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
459 * If its disabled or no action available 338 * If its disabled or no action available
460 * keep it masked and get out of here 339 * keep it masked and get out of here
461 */ 340 */
462 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) 341 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
463 goto out_unlock; 342 goto out_unlock;
464 343
465 handle_irq_event(desc); 344 handle_irq_event(desc);
466 345
467 if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) 346 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
468 unmask_irq(desc); 347 unmask_irq(desc);
469out_unlock: 348out_unlock:
470 raw_spin_unlock(&desc->lock); 349 raw_spin_unlock(&desc->lock);
@@ -496,7 +375,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
496{ 375{
497 raw_spin_lock(&desc->lock); 376 raw_spin_lock(&desc->lock);
498 377
499 if (unlikely(desc->istate & IRQS_INPROGRESS)) 378 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
500 if (!irq_check_poll(desc)) 379 if (!irq_check_poll(desc))
501 goto out; 380 goto out;
502 381
@@ -507,8 +386,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
507 * If its disabled or no action available 386 * If its disabled or no action available
508 * then mask it and get out of here: 387 * then mask it and get out of here:
509 */ 388 */
510 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { 389 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
511 irq_compat_set_pending(desc);
512 desc->istate |= IRQS_PENDING; 390 desc->istate |= IRQS_PENDING;
513 mask_irq(desc); 391 mask_irq(desc);
514 goto out; 392 goto out;
@@ -537,7 +415,7 @@ out:
537 * @desc: the interrupt description structure for this irq 415 * @desc: the interrupt description structure for this irq
538 * 416 *
539 * Interrupt occures on the falling and/or rising edge of a hardware 417 * Interrupt occures on the falling and/or rising edge of a hardware
540 * signal. The occurence is latched into the irq controller hardware 418 * signal. The occurrence is latched into the irq controller hardware
541 * and must be acked in order to be reenabled. After the ack another 419 * and must be acked in order to be reenabled. After the ack another
542 * interrupt can happen on the same source even before the first one 420 * interrupt can happen on the same source even before the first one
543 * is handled by the associated event handler. If this happens it 421 * is handled by the associated event handler. If this happens it
@@ -558,10 +436,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
558 * we shouldn't process the IRQ. Mark it pending, handle 436 * we shouldn't process the IRQ. Mark it pending, handle
559 * the necessary masking and go out 437 * the necessary masking and go out
560 */ 438 */
561 if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || 439 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
562 !desc->action))) { 440 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
563 if (!irq_check_poll(desc)) { 441 if (!irq_check_poll(desc)) {
564 irq_compat_set_pending(desc);
565 desc->istate |= IRQS_PENDING; 442 desc->istate |= IRQS_PENDING;
566 mask_ack_irq(desc); 443 mask_ack_irq(desc);
567 goto out_unlock; 444 goto out_unlock;
@@ -584,20 +461,65 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
584 * Renable it, if it was not disabled in meantime. 461 * Renable it, if it was not disabled in meantime.
585 */ 462 */
586 if (unlikely(desc->istate & IRQS_PENDING)) { 463 if (unlikely(desc->istate & IRQS_PENDING)) {
587 if (!(desc->istate & IRQS_DISABLED) && 464 if (!irqd_irq_disabled(&desc->irq_data) &&
588 (desc->istate & IRQS_MASKED)) 465 irqd_irq_masked(&desc->irq_data))
589 unmask_irq(desc); 466 unmask_irq(desc);
590 } 467 }
591 468
592 handle_irq_event(desc); 469 handle_irq_event(desc);
593 470
594 } while ((desc->istate & IRQS_PENDING) && 471 } while ((desc->istate & IRQS_PENDING) &&
595 !(desc->istate & IRQS_DISABLED)); 472 !irqd_irq_disabled(&desc->irq_data));
596 473
597out_unlock: 474out_unlock:
598 raw_spin_unlock(&desc->lock); 475 raw_spin_unlock(&desc->lock);
599} 476}
600 477
478#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
479/**
480 * handle_edge_eoi_irq - edge eoi type IRQ handler
481 * @irq: the interrupt number
482 * @desc: the interrupt description structure for this irq
483 *
484 * Similar as the above handle_edge_irq, but using eoi and w/o the
485 * mask/unmask logic.
486 */
487void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
488{
489 struct irq_chip *chip = irq_desc_get_chip(desc);
490
491 raw_spin_lock(&desc->lock);
492
493 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
494 /*
495 * If we're currently running this IRQ, or its disabled,
496 * we shouldn't process the IRQ. Mark it pending, handle
497 * the necessary masking and go out
498 */
499 if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
500 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
501 if (!irq_check_poll(desc)) {
502 desc->istate |= IRQS_PENDING;
503 goto out_eoi;
504 }
505 }
506 kstat_incr_irqs_this_cpu(irq, desc);
507
508 do {
509 if (unlikely(!desc->action))
510 goto out_eoi;
511
512 handle_irq_event(desc);
513
514 } while ((desc->istate & IRQS_PENDING) &&
515 !irqd_irq_disabled(&desc->irq_data));
516
517out_eoi:
518 chip->irq_eoi(&desc->irq_data);
519 raw_spin_unlock(&desc->lock);
520}
521#endif
522
601/** 523/**
602 * handle_percpu_irq - Per CPU local irq handler 524 * handle_percpu_irq - Per CPU local irq handler
603 * @irq: the interrupt number 525 * @irq: the interrupt number
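The new handle_edge_eoi_irq() is compiled only when an architecture selects CONFIG_IRQ_EDGE_EOI_HANDLER; it keeps edge semantics but signals completion through chip->irq_eoi() instead of the mask/unmask dance. Installing it would look roughly like this sketch, assuming an eoi-capable chip named my_eoi_chip:

#include <linux/irq.h>

static struct irq_chip my_eoi_chip;	/* must provide .irq_eoi */

static void __init my_map_edge_irq(unsigned int irq)
{
	/* edge handling, completion via eoi rather than mask/unmask */
	irq_set_chip_and_handler(irq, &my_eoi_chip, handle_edge_eoi_irq);
}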
@@ -642,8 +564,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
642 if (handle == handle_bad_irq) { 564 if (handle == handle_bad_irq) {
643 if (desc->irq_data.chip != &no_irq_chip) 565 if (desc->irq_data.chip != &no_irq_chip)
644 mask_ack_irq(desc); 566 mask_ack_irq(desc);
645 irq_compat_set_disabled(desc); 567 irq_state_set_disabled(desc);
646 desc->istate |= IRQS_DISABLED;
647 desc->depth = 1; 568 desc->depth = 1;
648 } 569 }
649 desc->handle_irq = handle; 570 desc->handle_irq = handle;
@@ -684,8 +605,70 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
684 irqd_set(&desc->irq_data, IRQD_PER_CPU); 605 irqd_set(&desc->irq_data, IRQD_PER_CPU);
685 if (irq_settings_can_move_pcntxt(desc)) 606 if (irq_settings_can_move_pcntxt(desc))
686 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); 607 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
608 if (irq_settings_is_level(desc))
609 irqd_set(&desc->irq_data, IRQD_LEVEL);
687 610
688 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); 611 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
689 612
690 irq_put_desc_unlock(desc, flags); 613 irq_put_desc_unlock(desc, flags);
691} 614}
615
616/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions.
618 *
619 * Iterate through all irqs and invoke the chip.irq_cpu_online()
620 * for each.
621 */
622void irq_cpu_online(void)
623{
624 struct irq_desc *desc;
625 struct irq_chip *chip;
626 unsigned long flags;
627 unsigned int irq;
628
629 for_each_active_irq(irq) {
630 desc = irq_to_desc(irq);
631 if (!desc)
632 continue;
633
634 raw_spin_lock_irqsave(&desc->lock, flags);
635
636 chip = irq_data_get_irq_chip(&desc->irq_data);
637 if (chip && chip->irq_cpu_online &&
638 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
639 !irqd_irq_disabled(&desc->irq_data)))
640 chip->irq_cpu_online(&desc->irq_data);
641
642 raw_spin_unlock_irqrestore(&desc->lock, flags);
643 }
644}
645
646/**
647 * irq_cpu_offline - Invoke all irq_cpu_offline functions.
648 *
649 * Iterate through all irqs and invoke the chip.irq_cpu_offline()
650 * for each.
651 */
652void irq_cpu_offline(void)
653{
654 struct irq_desc *desc;
655 struct irq_chip *chip;
656 unsigned long flags;
657 unsigned int irq;
658
659 for_each_active_irq(irq) {
660 desc = irq_to_desc(irq);
661 if (!desc)
662 continue;
663
664 raw_spin_lock_irqsave(&desc->lock, flags);
665
666 chip = irq_data_get_irq_chip(&desc->irq_data);
667 if (chip && chip->irq_cpu_offline &&
668 (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
669 !irqd_irq_disabled(&desc->irq_data)))
670 chip->irq_cpu_offline(&desc->irq_data);
671
672 raw_spin_unlock_irqrestore(&desc->lock, flags);
673 }
674}
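The new irq_cpu_online()/irq_cpu_offline() helpers walk every active irq under desc->lock and invoke the matching chip callback, skipping disabled irqs unless the chip sets IRQCHIP_ONOFFLINE_ENABLED. An architecture would call them from its CPU hotplug path; a rough sketch using the CPU notifier API of this era (the wiring is illustrative, not part of this diff):

#include <linux/cpu.h>
#include <linux/irq.h>

static int my_cpu_notify(struct notifier_block *nb,
			 unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		irq_cpu_online();	/* chip->irq_cpu_online() for each irq */
		break;
	case CPU_DYING:
		irq_cpu_offline();	/* chip->irq_cpu_offline() for each irq */
		break;
	}
	return NOTIFY_OK;
}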
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
deleted file mode 100644
index 6bbaf66aca85..000000000000
--- a/kernel/irq/compat.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Compat layer for transition period
3 */
4#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
5static inline void irq_compat_set_progress(struct irq_desc *desc)
6{
7 desc->status |= IRQ_INPROGRESS;
8}
9
10static inline void irq_compat_clr_progress(struct irq_desc *desc)
11{
12 desc->status &= ~IRQ_INPROGRESS;
13}
14static inline void irq_compat_set_disabled(struct irq_desc *desc)
15{
16 desc->status |= IRQ_DISABLED;
17}
18static inline void irq_compat_clr_disabled(struct irq_desc *desc)
19{
20 desc->status &= ~IRQ_DISABLED;
21}
22static inline void irq_compat_set_pending(struct irq_desc *desc)
23{
24 desc->status |= IRQ_PENDING;
25}
26
27static inline void irq_compat_clr_pending(struct irq_desc *desc)
28{
29 desc->status &= ~IRQ_PENDING;
30}
31static inline void irq_compat_set_masked(struct irq_desc *desc)
32{
33 desc->status |= IRQ_MASKED;
34}
35
36static inline void irq_compat_clr_masked(struct irq_desc *desc)
37{
38 desc->status &= ~IRQ_MASKED;
39}
40static inline void irq_compat_set_move_pending(struct irq_desc *desc)
41{
42 desc->status |= IRQ_MOVE_PENDING;
43}
44
45static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
46{
47 desc->status &= ~IRQ_MOVE_PENDING;
48}
49static inline void irq_compat_set_affinity(struct irq_desc *desc)
50{
51 desc->status |= IRQ_AFFINITY_SET;
52}
53
54static inline void irq_compat_clr_affinity(struct irq_desc *desc)
55{
56 desc->status &= ~IRQ_AFFINITY_SET;
57}
58#else
59static inline void irq_compat_set_progress(struct irq_desc *desc) { }
60static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
61static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
62static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
63static inline void irq_compat_set_pending(struct irq_desc *desc) { }
64static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
65static inline void irq_compat_set_masked(struct irq_desc *desc) { }
66static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
67static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
68static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
69static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
70static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
71#endif
72
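Deleting compat.h removes the shim that mirrored every internal state transition into the legacy desc->status bits. Each paired update in the core collapses to a single authoritative bit in irq_data; schematically:

	/* before: two bookkeeping updates per transition */
	irq_compat_set_disabled(desc);		/* desc->status |= IRQ_DISABLED */
	desc->istate |= IRQS_DISABLED;

	/* after: one bit, read back through irqd_irq_disabled() */
	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);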
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index d1a33b7fa61d..306cba37e9a5 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,8 +4,10 @@
4 4
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6 6
7#define P(f) if (desc->status & f) printk("%14s set\n", #f) 7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) 8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */
10#define PD(f) do { } while (0)
9 11
10static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
11{ 13{
@@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
28 P(IRQ_NOAUTOEN); 30 P(IRQ_NOAUTOEN);
29 31
30 PS(IRQS_AUTODETECT); 32 PS(IRQS_AUTODETECT);
31 PS(IRQS_INPROGRESS);
32 PS(IRQS_REPLAY); 33 PS(IRQS_REPLAY);
33 PS(IRQS_WAITING); 34 PS(IRQS_WAITING);
34 PS(IRQS_DISABLED);
35 PS(IRQS_PENDING); 35 PS(IRQS_PENDING);
36 PS(IRQS_MASKED); 36
37 PD(IRQS_INPROGRESS);
38 PD(IRQS_DISABLED);
39 PD(IRQS_MASKED);
37} 40}
38 41
39#undef P 42#undef P
40#undef PS 43#undef PS
44#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947e..b5fcd96c7102 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
31 return 0; 31 return 0;
32} 32}
33 33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/* 34/*
42 * Generic no controller implementation 35 * Generic no controller implementation
43 */ 36 */
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
48 .irq_enable = noop, 41 .irq_enable = noop,
49 .irq_disable = noop, 42 .irq_disable = noop,
50 .irq_ack = ack_bad, 43 .irq_ack = ack_bad,
51 END_INIT
52}; 44};
53 45
54/* 46/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
64 .irq_ack = noop, 56 .irq_ack = noop,
65 .irq_mask = noop, 57 .irq_mask = noop,
66 .irq_unmask = noop, 58 .irq_unmask = noop,
67 END_INIT
68}; 59};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 517561fc7317..90cb55f6d7eb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -175,28 +175,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
175 struct irqaction *action = desc->action; 175 struct irqaction *action = desc->action;
176 irqreturn_t ret; 176 irqreturn_t ret;
177 177
178 irq_compat_clr_pending(desc);
179 desc->istate &= ~IRQS_PENDING; 178 desc->istate &= ~IRQS_PENDING;
180 irq_compat_set_progress(desc); 179 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
181 desc->istate |= IRQS_INPROGRESS;
182 raw_spin_unlock(&desc->lock); 180 raw_spin_unlock(&desc->lock);
183 181
184 ret = handle_irq_event_percpu(desc, action); 182 ret = handle_irq_event_percpu(desc, action);
185 183
186 raw_spin_lock(&desc->lock); 184 raw_spin_lock(&desc->lock);
187 desc->istate &= ~IRQS_INPROGRESS; 185 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
188 irq_compat_clr_progress(desc);
189 return ret; 186 return ret;
190} 187}
191
192/**
193 * handle_IRQ_event - irq action chain handler
194 * @irq: the interrupt number
195 * @action: the interrupt action chain for this irq
196 *
197 * Handles the action chain of an irq event
198 */
199irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
200{
201 return handle_irq_event_percpu(irq_to_desc(irq), action);
202}
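With handle_IRQ_event() gone, code that demultiplexes a cascaded controller resolves the target irq and lets its installed flow handler run via generic_handle_irq(). A hedged sketch of that pattern (my_pending_bits and MY_IRQ_BASE are hypothetical):

#include <linux/irq.h>
#include <linux/bitops.h>

static void my_demux_handler(unsigned int irq, struct irq_desc *desc)
{
	unsigned long pending = my_pending_bits();	/* hypothetical register read */
	int bit;

	for_each_set_bit(bit, &pending, 32)
		generic_handle_irq(MY_IRQ_BASE + bit);	/* runs desc->handle_irq */
}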
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6c6ec9a49027..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,10 +15,6 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
19# define status status_use_accessors
20#endif
21
22extern int noirqdebug; 18extern int noirqdebug;
23 19
24/* 20/*
@@ -44,38 +40,28 @@ enum {
44 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt 40 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
45 * detection 41 * detection
46 * IRQS_POLL_INPROGRESS - polling in progress 42 * IRQS_POLL_INPROGRESS - polling in progress
47 * IRQS_INPROGRESS - Interrupt in progress
48 * IRQS_ONESHOT - irq is not unmasked in primary handler 43 * IRQS_ONESHOT - irq is not unmasked in primary handler
49 * IRQS_REPLAY - irq is replayed 44 * IRQS_REPLAY - irq is replayed
50 * IRQS_WAITING - irq is waiting 45 * IRQS_WAITING - irq is waiting
51 * IRQS_DISABLED - irq is disabled
52 * IRQS_PENDING - irq is pending and replayed later 46 * IRQS_PENDING - irq is pending and replayed later
53 * IRQS_MASKED - irq is masked
54 * IRQS_SUSPENDED - irq is suspended 47 * IRQS_SUSPENDED - irq is suspended
55 */ 48 */
56enum { 49enum {
57 IRQS_AUTODETECT = 0x00000001, 50 IRQS_AUTODETECT = 0x00000001,
58 IRQS_SPURIOUS_DISABLED = 0x00000002, 51 IRQS_SPURIOUS_DISABLED = 0x00000002,
59 IRQS_POLL_INPROGRESS = 0x00000008, 52 IRQS_POLL_INPROGRESS = 0x00000008,
60 IRQS_INPROGRESS = 0x00000010,
61 IRQS_ONESHOT = 0x00000020, 53 IRQS_ONESHOT = 0x00000020,
62 IRQS_REPLAY = 0x00000040, 54 IRQS_REPLAY = 0x00000040,
63 IRQS_WAITING = 0x00000080, 55 IRQS_WAITING = 0x00000080,
64 IRQS_DISABLED = 0x00000100,
65 IRQS_PENDING = 0x00000200, 56 IRQS_PENDING = 0x00000200,
66 IRQS_MASKED = 0x00000400,
67 IRQS_SUSPENDED = 0x00000800, 57 IRQS_SUSPENDED = 0x00000800,
68}; 58};
69 59
70#include "compat.h"
71#include "debug.h" 60#include "debug.h"
72#include "settings.h" 61#include "settings.h"
73 62
74#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 63#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
75 64
76/* Set default functions for irq_chip structures: */
77extern void irq_chip_set_defaults(struct irq_chip *chip);
78
79extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 65extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
80 unsigned long flags); 66 unsigned long flags);
81extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
@@ -162,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
162static inline void irqd_set_move_pending(struct irq_data *d) 148static inline void irqd_set_move_pending(struct irq_data *d)
163{ 149{
164 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; 150 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
165 irq_compat_set_move_pending(irq_data_to_desc(d));
166} 151}
167 152
168static inline void irqd_clr_move_pending(struct irq_data *d) 153static inline void irqd_clr_move_pending(struct irq_data *d)
169{ 154{
170 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; 155 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
171 irq_compat_clr_move_pending(irq_data_to_desc(d));
172} 156}
173 157
174static inline void irqd_clear(struct irq_data *d, unsigned int mask) 158static inline void irqd_clear(struct irq_data *d, unsigned int mask)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index dbccc799407f..2c039c9b9383 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -80,7 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 desc->istate = IRQS_DISABLED; 83 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
84 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
85 desc->depth = 1; 85 desc->depth = 1;
86 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -198,15 +198,6 @@ err:
198 return -ENOMEM; 198 return -ENOMEM;
199} 199}
200 200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 int res = irq_alloc_descs(irq, irq, 1, node);
204
205 if (res == -EEXIST || res == irq)
206 return irq_to_desc(irq);
207 return NULL;
208}
209
210static int irq_expand_nr_irqs(unsigned int nr) 201static int irq_expand_nr_irqs(unsigned int nr)
211{ 202{
212 if (nr > IRQ_BITMAP_BITS) 203 if (nr > IRQ_BITMAP_BITS)
@@ -247,7 +238,6 @@ int __init early_irq_init(void)
247 238
248struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
249 [0 ... NR_IRQS-1] = { 240 [0 ... NR_IRQS-1] = {
250 .istate = IRQS_DISABLED,
251 .handle_irq = handle_bad_irq, 241 .handle_irq = handle_bad_irq,
252 .depth = 1, 242 .depth = 1,
253 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 243 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -283,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
283 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 273 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
284} 274}
285 275
286struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
287{
288 return irq_to_desc(irq);
289}
290
291static void free_desc(unsigned int irq) 276static void free_desc(unsigned int irq)
292{ 277{
293 dynamic_irq_cleanup(irq); 278 dynamic_irq_cleanup(irq);
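Both definitions of irq_to_desc_alloc_node() are removed; the sparse and non-sparse cases are now served by irq_alloc_descs(), which also marks the irq in allocated_irqs. A caller that still wants the old semantics could open-code them the way the dropped helper did:

#include <linux/irq.h>

static struct irq_desc *my_get_desc(unsigned int irq, int node)
{
	int res = irq_alloc_descs(irq, irq, 1, node);

	/* -EEXIST just means the descriptor already exists; both are usable */
	if (res == -EEXIST || res == irq)
		return irq_to_desc(irq);
	return NULL;
}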
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a2aa73e536c..07c1611f3899 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads);
41void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
42{ 42{
43 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
44 unsigned int state; 44 bool inprogress;
45 45
46 if (!desc) 46 if (!desc)
47 return; 47 return;
@@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq)
53 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
54 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
55 */ 55 */
56 while (desc->istate & IRQS_INPROGRESS) 56 while (irqd_irq_inprogress(&desc->irq_data))
57 cpu_relax(); 57 cpu_relax();
58 58
59 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
60 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
61 state = desc->istate; 61 inprogress = irqd_irq_inprogress(&desc->irq_data);
62 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
63 63
64 /* Oops, that failed? */ 64 /* Oops, that failed? */
65 } while (state & IRQS_INPROGRESS); 65 } while (inprogress);
66 66
67 /* 67 /*
68 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc)
112} 112}
113 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ 114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_desc *desc) 115static inline bool irq_can_move_pcntxt(struct irq_data *data)
116{ 116{
117 return irq_settings_can_move_pcntxt(desc); 117 return irqd_can_move_in_process_context(data);
118} 118}
119static inline bool irq_move_pending(struct irq_desc *desc) 119static inline bool irq_move_pending(struct irq_data *data)
120{ 120{
121 return irqd_is_setaffinity_pending(&desc->irq_data); 121 return irqd_is_setaffinity_pending(data);
122} 122}
123static inline void 123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) 124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
@@ -131,43 +131,34 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
131 cpumask_copy(mask, desc->pending_mask); 131 cpumask_copy(mask, desc->pending_mask);
132} 132}
133#else 133#else
134static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } 134static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
135static inline bool irq_move_pending(struct irq_desc *desc) { return false; } 135static inline bool irq_move_pending(struct irq_data *data) { return false; }
136static inline void 136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } 137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void 138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif 140#endif
141 141
142/** 142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143 * irq_set_affinity - Set the irq affinity of a given irq
144 * @irq: Interrupt to set affinity
145 * @cpumask: cpumask
146 *
147 */
148int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
149{ 143{
150 struct irq_desc *desc = irq_to_desc(irq); 144 struct irq_chip *chip = irq_data_get_irq_chip(data);
151 struct irq_chip *chip = desc->irq_data.chip; 145 struct irq_desc *desc = irq_data_to_desc(data);
152 unsigned long flags;
153 int ret = 0; 146 int ret = 0;
154 147
155 if (!chip->irq_set_affinity) 148 if (!chip || !chip->irq_set_affinity)
156 return -EINVAL; 149 return -EINVAL;
157 150
158 raw_spin_lock_irqsave(&desc->lock, flags); 151 if (irq_can_move_pcntxt(data)) {
159 152 ret = chip->irq_set_affinity(data, mask, false);
160 if (irq_can_move_pcntxt(desc)) {
161 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
162 switch (ret) { 153 switch (ret) {
163 case IRQ_SET_MASK_OK: 154 case IRQ_SET_MASK_OK:
164 cpumask_copy(desc->irq_data.affinity, mask); 155 cpumask_copy(data->affinity, mask);
165 case IRQ_SET_MASK_OK_NOCOPY: 156 case IRQ_SET_MASK_OK_NOCOPY:
166 irq_set_thread_affinity(desc); 157 irq_set_thread_affinity(desc);
167 ret = 0; 158 ret = 0;
168 } 159 }
169 } else { 160 } else {
170 irqd_set_move_pending(&desc->irq_data); 161 irqd_set_move_pending(data);
171 irq_copy_pending(desc, mask); 162 irq_copy_pending(desc, mask);
172 } 163 }
173 164
@@ -175,8 +166,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
175 kref_get(&desc->affinity_notify->kref); 166 kref_get(&desc->affinity_notify->kref);
176 schedule_work(&desc->affinity_notify->work); 167 schedule_work(&desc->affinity_notify->work);
177 } 168 }
178 irq_compat_set_affinity(desc); 169 irqd_set(data, IRQD_AFFINITY_SET);
179 irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); 170
171 return ret;
172}
173
174/**
175 * irq_set_affinity - Set the irq affinity of a given irq
176 * @irq: Interrupt to set affinity
177 * @mask: cpumask
178 *
179 */
180int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
181{
182 struct irq_desc *desc = irq_to_desc(irq);
183 unsigned long flags;
184 int ret;
185
186 if (!desc)
187 return -EINVAL;
188
189 raw_spin_lock_irqsave(&desc->lock, flags);
190 ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
180 raw_spin_unlock_irqrestore(&desc->lock, flags); 191 raw_spin_unlock_irqrestore(&desc->lock, flags);
181 return ret; 192 return ret;
182} 193}
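The affinity setter is split so that __irq_set_affinity_locked() can be reused by callers that already hold desc->lock, while irq_set_affinity() keeps its locking contract and now also rejects invalid irqs. External usage is unchanged; for example:

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int my_pin_irq(unsigned int irq, int cpu)
{
	/*
	 * Takes desc->lock itself; may defer the actual move if it
	 * cannot be done in process context.
	 */
	return irq_set_affinity(irq, cpumask_of(cpu));
}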
@@ -206,7 +217,7 @@ static void irq_affinity_notify(struct work_struct *work)
206 goto out; 217 goto out;
207 218
208 raw_spin_lock_irqsave(&desc->lock, flags); 219 raw_spin_lock_irqsave(&desc->lock, flags);
209 if (irq_move_pending(desc)) 220 if (irq_move_pending(&desc->irq_data))
210 irq_get_pending(cpumask, desc); 221 irq_get_pending(cpumask, desc);
211 else 222 else
212 cpumask_copy(cpumask, desc->irq_data.affinity); 223 cpumask_copy(cpumask, desc->irq_data.affinity);
@@ -285,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
285 if (cpumask_intersects(desc->irq_data.affinity, 296 if (cpumask_intersects(desc->irq_data.affinity,
286 cpu_online_mask)) 297 cpu_online_mask))
287 set = desc->irq_data.affinity; 298 set = desc->irq_data.affinity;
288 else { 299 else
289 irq_compat_clr_affinity(desc);
290 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); 300 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
291 }
292 } 301 }
293 302
294 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
@@ -551,9 +560,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
551 flags &= IRQ_TYPE_SENSE_MASK; 560 flags &= IRQ_TYPE_SENSE_MASK;
552 561
553 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { 562 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
554 if (!(desc->istate & IRQS_MASKED)) 563 if (!irqd_irq_masked(&desc->irq_data))
555 mask_irq(desc); 564 mask_irq(desc);
556 if (!(desc->istate & IRQS_DISABLED)) 565 if (!irqd_irq_disabled(&desc->irq_data))
557 unmask = 1; 566 unmask = 1;
558 } 567 }
559 568
@@ -575,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
575 irqd_set(&desc->irq_data, IRQD_LEVEL); 584 irqd_set(&desc->irq_data, IRQD_LEVEL);
576 } 585 }
577 586
578 if (chip != desc->irq_data.chip)
579 irq_chip_set_defaults(desc->irq_data.chip);
580 ret = 0; 587 ret = 0;
581 break; 588 break;
582 default: 589 default:
@@ -651,7 +658,7 @@ again:
651 * irq_wake_thread(). See the comment there which explains the 658 * irq_wake_thread(). See the comment there which explains the
652 * serialization. 659 * serialization.
653 */ 660 */
654 if (unlikely(desc->istate & IRQS_INPROGRESS)) { 661 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
655 raw_spin_unlock_irq(&desc->lock); 662 raw_spin_unlock_irq(&desc->lock);
656 chip_bus_sync_unlock(desc); 663 chip_bus_sync_unlock(desc);
657 cpu_relax(); 664 cpu_relax();
@@ -668,12 +675,10 @@ again:
668 675
669 desc->threads_oneshot &= ~action->thread_mask; 676 desc->threads_oneshot &= ~action->thread_mask;
670 677
671 if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && 678 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
672 (desc->istate & IRQS_MASKED)) { 679 irqd_irq_masked(&desc->irq_data))
673 irq_compat_clr_masked(desc); 680 unmask_irq(desc);
674 desc->istate &= ~IRQS_MASKED; 681
675 desc->irq_data.chip->irq_unmask(&desc->irq_data);
676 }
677out_unlock: 682out_unlock:
678 raw_spin_unlock_irq(&desc->lock); 683 raw_spin_unlock_irq(&desc->lock);
679 chip_bus_sync_unlock(desc); 684 chip_bus_sync_unlock(desc);
@@ -767,7 +772,7 @@ static int irq_thread(void *data)
767 atomic_inc(&desc->threads_active); 772 atomic_inc(&desc->threads_active);
768 773
769 raw_spin_lock_irq(&desc->lock); 774 raw_spin_lock_irq(&desc->lock);
770 if (unlikely(desc->istate & IRQS_DISABLED)) { 775 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
771 /* 776 /*
772 * CHECKME: We might need a dedicated 777 * CHECKME: We might need a dedicated
773 * IRQ_THREAD_PENDING flag here, which 778 * IRQ_THREAD_PENDING flag here, which
@@ -775,7 +780,6 @@ static int irq_thread(void *data)
775 * but AFAICT IRQS_PENDING should be fine as it 780 * but AFAICT IRQS_PENDING should be fine as it
776 * retriggers the interrupt itself --- tglx 781 * retriggers the interrupt itself --- tglx
777 */ 782 */
778 irq_compat_set_pending(desc);
779 desc->istate |= IRQS_PENDING; 783 desc->istate |= IRQS_PENDING;
780 raw_spin_unlock_irq(&desc->lock); 784 raw_spin_unlock_irq(&desc->lock);
781 } else { 785 } else {
@@ -971,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
971 new->thread_mask = 1 << ffz(thread_mask); 975 new->thread_mask = 1 << ffz(thread_mask);
972 976
973 if (!shared) { 977 if (!shared) {
974 irq_chip_set_defaults(desc->irq_data.chip);
975
976 init_waitqueue_head(&desc->wait_for_threads); 978 init_waitqueue_head(&desc->wait_for_threads);
977 979
978 /* Setup the type (level, edge polarity) if configured: */ 980 /* Setup the type (level, edge polarity) if configured: */
@@ -985,8 +987,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
985 } 987 }
986 988
987 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ 989 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
988 IRQS_INPROGRESS | IRQS_ONESHOT | \ 990 IRQS_ONESHOT | IRQS_WAITING);
989 IRQS_WAITING); 991 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
990 992
991 if (new->flags & IRQF_PERCPU) { 993 if (new->flags & IRQF_PERCPU) {
992 irqd_set(&desc->irq_data, IRQD_PER_CPU); 994 irqd_set(&desc->irq_data, IRQD_PER_CPU);
@@ -1049,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1049 register_irq_proc(irq, desc); 1051 register_irq_proc(irq, desc);
1050 new->dir = NULL; 1052 new->dir = NULL;
1051 register_handler_proc(irq, new); 1053 register_handler_proc(irq, new);
1054 free_cpumask_var(mask);
1052 1055
1053 return 0; 1056 return 0;
1054 1057
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ec4806d4778b..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata)
35 * do the disable, re-program, enable sequence. 35 * do the disable, re-program, enable sequence.
36 * This is *not* particularly important for level triggered 36 * This is *not* particularly important for level triggered
37 * but in a edge trigger case, we might be setting rte 37 * but in a edge trigger case, we might be setting rte
38 * when an active trigger is comming in. This could 38 * when an active trigger is coming in. This could
39 * cause some ioapics to mal-function. 39 * cause some ioapics to mal-function.
40 * Being paranoid i guess! 40 * Being paranoid i guess!
41 * 41 *
@@ -53,20 +53,14 @@ void irq_move_masked_irq(struct irq_data *idata)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_masked_irq(int irq)
57{
58 irq_move_masked_irq(irq_get_irq_data(irq));
59}
60
61void irq_move_irq(struct irq_data *idata) 56void irq_move_irq(struct irq_data *idata)
62{ 57{
63 struct irq_desc *desc = irq_data_to_desc(idata);
64 bool masked; 58 bool masked;
65 59
66 if (likely(!irqd_is_setaffinity_pending(idata))) 60 if (likely(!irqd_is_setaffinity_pending(idata)))
67 return; 61 return;
68 62
69 if (unlikely(desc->istate & IRQS_DISABLED)) 63 if (unlikely(irqd_irq_disabled(idata)))
70 return; 64 return;
71 65
72 /* 66 /*
@@ -74,15 +68,10 @@ void irq_move_irq(struct irq_data *idata)
74 * threaded interrupt with ONESHOT set, we can end up with an 68 * threaded interrupt with ONESHOT set, we can end up with an
75 * interrupt storm. 69 * interrupt storm.
76 */ 70 */
77 masked = desc->istate & IRQS_MASKED; 71 masked = irqd_irq_masked(idata);
78 if (!masked) 72 if (!masked)
79 idata->chip->irq_mask(idata); 73 idata->chip->irq_mask(idata);
80 irq_move_masked_irq(idata); 74 irq_move_masked_irq(idata);
81 if (!masked) 75 if (!masked)
82 idata->chip->irq_unmask(idata); 76 idata->chip->irq_unmask(idata);
83} 77}
84
85void move_native_irq(int irq)
86{
87 irq_move_irq(irq_get_irq_data(irq));
88}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 760248de109d..dd201bd35103 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec)
364 return 0; 364 return 0;
365} 365}
366 366
367#ifndef ACTUAL_NR_IRQS
368# define ACTUAL_NR_IRQS nr_irqs
369#endif
370
367int show_interrupts(struct seq_file *p, void *v) 371int show_interrupts(struct seq_file *p, void *v)
368{ 372{
369 static int prec; 373 static int prec;
@@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v)
373 struct irqaction *action; 377 struct irqaction *action;
374 struct irq_desc *desc; 378 struct irq_desc *desc;
375 379
376 if (i > nr_irqs) 380 if (i > ACTUAL_NR_IRQS)
377 return 0; 381 return 0;
378 382
379 if (i == nr_irqs) 383 if (i == ACTUAL_NR_IRQS)
380 return arch_show_interrupts(p, prec); 384 return arch_show_interrupts(p, prec);
381 385
382 /* print header and calculate the width of the first column */ 386 /* print header and calculate the width of the first column */
@@ -404,7 +408,20 @@ int show_interrupts(struct seq_file *p, void *v)
404 seq_printf(p, "%*d: ", prec, i); 408 seq_printf(p, "%*d: ", prec, i);
405 for_each_online_cpu(j) 409 for_each_online_cpu(j)
406 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 410 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
407 seq_printf(p, " %8s", desc->irq_data.chip->name); 411
412 if (desc->irq_data.chip) {
413 if (desc->irq_data.chip->irq_print_chip)
414 desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
415 else if (desc->irq_data.chip->name)
416 seq_printf(p, " %8s", desc->irq_data.chip->name);
417 else
418 seq_printf(p, " %8s", "-");
419 } else {
420 seq_printf(p, " %8s", "None");
421 }
422#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL
423 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
424#endif
408 if (desc->name) 425 if (desc->name)
409 seq_printf(p, "-%-8s", desc->name); 426 seq_printf(p, "-%-8s", desc->name);
410 427
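show_interrupts() now tolerates a NULL chip and lets a chip render its own column through the new irq_print_chip() callback. A sketch of a chip using it (the format string is illustrative):

#include <linux/irq.h>
#include <linux/seq_file.h>

static void my_print_chip(struct irq_data *d, struct seq_file *p)
{
	/* replaces the default " %8s" chip-name column in /proc/interrupts */
	seq_printf(p, " MYCHIP-%u", d->irq);
}

static struct irq_chip my_chip = {
	.name		= "MYCHIP",
	.irq_print_chip	= my_print_chip,
};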
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index ad683a99b1ec..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
65 if (desc->istate & IRQS_REPLAY) 65 if (desc->istate & IRQS_REPLAY)
66 return; 66 return;
67 if (desc->istate & IRQS_PENDING) { 67 if (desc->istate & IRQS_PENDING) {
68 irq_compat_clr_pending(desc);
69 desc->istate &= ~IRQS_PENDING; 68 desc->istate &= ~IRQS_PENDING;
70 desc->istate |= IRQS_REPLAY; 69 desc->istate |= IRQS_REPLAY;
71 70
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0227ad358272..0d91730b6330 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,17 +15,8 @@ enum {
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16}; 16};
17 17
18#define IRQ_INPROGRESS GOT_YOU_MORON
19#define IRQ_REPLAY GOT_YOU_MORON
20#define IRQ_WAITING GOT_YOU_MORON
21#define IRQ_DISABLED GOT_YOU_MORON
22#define IRQ_PENDING GOT_YOU_MORON
23#define IRQ_MASKED GOT_YOU_MORON
24#define IRQ_WAKEUP GOT_YOU_MORON
25#define IRQ_MOVE_PENDING GOT_YOU_MORON
26#define IRQ_PER_CPU GOT_YOU_MORON 18#define IRQ_PER_CPU GOT_YOU_MORON
27#define IRQ_NO_BALANCING GOT_YOU_MORON 19#define IRQ_NO_BALANCING GOT_YOU_MORON
28#define IRQ_AFFINITY_SET GOT_YOU_MORON
29#define IRQ_LEVEL GOT_YOU_MORON 20#define IRQ_LEVEL GOT_YOU_MORON
30#define IRQ_NOPROBE GOT_YOU_MORON 21#define IRQ_NOPROBE GOT_YOU_MORON
31#define IRQ_NOREQUEST GOT_YOU_MORON 22#define IRQ_NOREQUEST GOT_YOU_MORON
@@ -37,102 +28,98 @@ enum {
37static inline void 28static inline void
38irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) 29irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
39{ 30{
40 desc->status &= ~(clr & _IRQF_MODIFY_MASK); 31 desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
41 desc->status |= (set & _IRQF_MODIFY_MASK); 32 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
42} 33}
43 34
44static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) 35static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
45{ 36{
46 return desc->status & _IRQ_PER_CPU; 37 return desc->status_use_accessors & _IRQ_PER_CPU;
47} 38}
48 39
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 40static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{ 41{
51 desc->status |= _IRQ_PER_CPU; 42 desc->status_use_accessors |= _IRQ_PER_CPU;
52} 43}
53 44
54static inline void irq_settings_set_no_balancing(struct irq_desc *desc) 45static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
55{ 46{
56 desc->status |= _IRQ_NO_BALANCING; 47 desc->status_use_accessors |= _IRQ_NO_BALANCING;
57} 48}
58 49
59static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) 50static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
60{ 51{
61 return desc->status & _IRQ_NO_BALANCING; 52 return desc->status_use_accessors & _IRQ_NO_BALANCING;
62} 53}
63 54
64static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) 55static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
65{ 56{
66 return desc->status & IRQ_TYPE_SENSE_MASK; 57 return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
67} 58}
68 59
69static inline void 60static inline void
70irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) 61irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
71{ 62{
72 desc->status &= ~IRQ_TYPE_SENSE_MASK; 63 desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
73 desc->status |= mask & IRQ_TYPE_SENSE_MASK; 64 desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
74} 65}
75 66
76static inline bool irq_settings_is_level(struct irq_desc *desc) 67static inline bool irq_settings_is_level(struct irq_desc *desc)
77{ 68{
78 return desc->status & _IRQ_LEVEL; 69 return desc->status_use_accessors & _IRQ_LEVEL;
79} 70}
80 71
81static inline void irq_settings_clr_level(struct irq_desc *desc) 72static inline void irq_settings_clr_level(struct irq_desc *desc)
82{ 73{
83 desc->status &= ~_IRQ_LEVEL; 74 desc->status_use_accessors &= ~_IRQ_LEVEL;
84} 75}
85 76
86static inline void irq_settings_set_level(struct irq_desc *desc) 77static inline void irq_settings_set_level(struct irq_desc *desc)
87{ 78{
88 desc->status |= _IRQ_LEVEL; 79 desc->status_use_accessors |= _IRQ_LEVEL;
89} 80}
90 81
91static inline bool irq_settings_can_request(struct irq_desc *desc) 82static inline bool irq_settings_can_request(struct irq_desc *desc)
92{ 83{
93 return !(desc->status & _IRQ_NOREQUEST); 84 return !(desc->status_use_accessors & _IRQ_NOREQUEST);
94} 85}
95 86
96static inline void irq_settings_clr_norequest(struct irq_desc *desc) 87static inline void irq_settings_clr_norequest(struct irq_desc *desc)
97{ 88{
98 desc->status &= ~_IRQ_NOREQUEST; 89 desc->status_use_accessors &= ~_IRQ_NOREQUEST;
99} 90}
100 91
101static inline void irq_settings_set_norequest(struct irq_desc *desc) 92static inline void irq_settings_set_norequest(struct irq_desc *desc)
102{ 93{
103 desc->status |= _IRQ_NOREQUEST; 94 desc->status_use_accessors |= _IRQ_NOREQUEST;
104} 95}
105 96
106static inline bool irq_settings_can_probe(struct irq_desc *desc) 97static inline bool irq_settings_can_probe(struct irq_desc *desc)
107{ 98{
108 return !(desc->status & _IRQ_NOPROBE); 99 return !(desc->status_use_accessors & _IRQ_NOPROBE);
109} 100}
110 101
111static inline void irq_settings_clr_noprobe(struct irq_desc *desc) 102static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
112{ 103{
113 desc->status &= ~_IRQ_NOPROBE; 104 desc->status_use_accessors &= ~_IRQ_NOPROBE;
114} 105}
115 106
116static inline void irq_settings_set_noprobe(struct irq_desc *desc) 107static inline void irq_settings_set_noprobe(struct irq_desc *desc)
117{ 108{
118 desc->status |= _IRQ_NOPROBE; 109 desc->status_use_accessors |= _IRQ_NOPROBE;
119} 110}
120 111
121static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) 112static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
122{ 113{
123 return desc->status & _IRQ_MOVE_PCNTXT; 114 return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
124} 115}
125 116
126static inline bool irq_settings_can_autoenable(struct irq_desc *desc) 117static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
127{ 118{
128 return !(desc->status & _IRQ_NOAUTOEN); 119 return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
129} 120}
130 121
131static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) 122static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
132{ 123{
133 return desc->status & _IRQ_NESTED_THREAD; 124 return desc->status_use_accessors & _IRQ_NESTED_THREAD;
134} 125}
135
136/* Nothing should touch desc->status from now on */
137#undef status
138#define status USE_THE_PROPER_WRAPPERS_YOU_MORON
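The trailing '#undef status / #define status ...' poison can go because the wrappers now name status_use_accessors explicitly instead of relying on a '#define status' alias. The trick itself lives on in the GOT_YOU_MORON defines above: once all accessors are compiled, redefining the raw name makes any remaining direct use fail to build. In miniature:

struct foo {
	unsigned int state_use_accessors;
};

static inline bool foo_is_ready(struct foo *f)
{
	return f->state_use_accessors & 0x1;
}

/* from here on, touching the field outside the wrappers breaks the build */
#define state_use_accessors USE_THE_PROPER_ACCESSORS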
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd586ebf9c8c..dfbd550401b2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc)
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46 do { 46 do {
47 raw_spin_unlock(&desc->lock); 47 raw_spin_unlock(&desc->lock);
48 while (desc->istate & IRQS_INPROGRESS) 48 while (irqd_irq_inprogress(&desc->irq_data))
49 cpu_relax(); 49 cpu_relax();
50 raw_spin_lock(&desc->lock); 50 raw_spin_lock(&desc->lock);
51 } while (desc->istate & IRQS_INPROGRESS); 51 } while (irqd_irq_inprogress(&desc->irq_data));
52 /* Might have been disabled in meantime */ 52 /* Might have been disabled in meantime */
53 return !(desc->istate & IRQS_DISABLED) && desc->action; 53 return !irqd_irq_disabled(&desc->irq_data) && desc->action;
54#else 54#else
55 return false; 55 return false;
56#endif 56#endif
@@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
75 * Do not poll disabled interrupts unless the spurious 75 * Do not poll disabled interrupts unless the spurious
76 * disabled poller asks explicitely. 76 * disabled poller asks explicitely.
77 */ 77 */
78 if ((desc->istate & IRQS_DISABLED) && !force) 78 if (irqd_irq_disabled(&desc->irq_data) && !force)
79 goto out; 79 goto out;
80 80
81 /* 81 /*
@@ -88,12 +88,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
88 goto out; 88 goto out;
89 89
90 /* Already running on another processor */ 90 /* Already running on another processor */
91 if (desc->istate & IRQS_INPROGRESS) { 91 if (irqd_irq_inprogress(&desc->irq_data)) {
92 /* 92 /*
93 * Already running: If it is shared get the other 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too 94 * CPU to go looking for our mystery interrupt too
95 */ 95 */
96 irq_compat_set_pending(desc);
97 desc->istate |= IRQS_PENDING; 96 desc->istate |= IRQS_PENDING;
98 goto out; 97 goto out;
99 } 98 }
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || 64 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
65 arch_is_kernel_text(addr)) 65 arch_is_kernel_text(addr))
66 return 1; 66 return 1;
67 return in_gate_area_no_task(addr); 67 return in_gate_area_no_mm(addr);
68} 68}
69 69
70static inline int is_kernel(unsigned long addr) 70static inline int is_kernel(unsigned long addr)
71{ 71{
72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) 72 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
73 return 1; 73 return 1;
74 return in_gate_area_no_task(addr); 74 return in_gate_area_no_mm(addr);
75} 75}
76 76
77static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
342} 342}
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345int sprint_symbol(char *buffer, unsigned long address) 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset)
346{ 347{
347 char *modname; 348 char *modname;
348 const char *name; 349 const char *name;
349 unsigned long offset, size; 350 unsigned long offset, size;
350 int len; 351 int len;
351 352
353 address += symbol_offset;
352 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 354 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
353 if (!name) 355 if (!name)
354 return sprintf(buffer, "0x%lx", address); 356 return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
357 strcpy(buffer, name); 359 strcpy(buffer, name);
358 len = strlen(buffer); 360 len = strlen(buffer);
359 buffer += len; 361 buffer += len;
362 offset -= symbol_offset;
360 363
361 if (modname) 364 if (modname)
362 len += sprintf(buffer, "+%#lx/%#lx [%s]", 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
363 offset, size, modname);
364 else 366 else
365 len += sprintf(buffer, "+%#lx/%#lx", offset, size); 367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
366 368
367 return len; 369 return len;
368} 370}
371
372/**
373 * sprint_symbol - Look up a kernel symbol and return it in a text buffer
374 * @buffer: buffer to be stored
375 * @address: address to lookup
376 *
377 * This function looks up a kernel symbol with @address and stores its name,
378 * offset, size and module name to @buffer if possible. If no symbol was found,
379 * just saves its @address as is.
380 *
381 * This function returns the number of bytes stored in @buffer.
382 */
383int sprint_symbol(char *buffer, unsigned long address)
384{
385 return __sprint_symbol(buffer, address, 0);
386}
387
369EXPORT_SYMBOL_GPL(sprint_symbol); 388EXPORT_SYMBOL_GPL(sprint_symbol);
370 389
390/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored
393 * @address: address to lookup
394 *
395 * This function is for stack backtrace and does the same thing as
396 * sprint_symbol() but with modified/decreased @address. If there is a
397 * tail-call to the function marked "noreturn", gcc optimized out code after
398 * the call so that the stack-saved return address could point outside of the
399 * caller. This function ensures that kallsyms will find the original caller
400 * by decreasing @address.
401 *
402 * This function returns the number of bytes stored in @buffer.
403 */
404int sprint_backtrace(char *buffer, unsigned long address)
405{
406 return __sprint_symbol(buffer, address, -1);
407}
408
371/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
372void __print_symbol(const char *fmt, unsigned long address) 410void __print_symbol(const char *fmt, unsigned long address)
373{ 411{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
477 */ 515 */
478 type = iter->exported ? toupper(iter->type) : 516 type = iter->exported ? toupper(iter->type) :
479 tolower(iter->type); 517 tolower(iter->type);
480 seq_printf(m, "%0*lx %c %s\t[%s]\n", 518 seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
481 (int)(2 * sizeof(void *)), 519 type, iter->name, iter->module_name);
482 iter->value, type, iter->name, iter->module_name);
483 } else 520 } else
484 seq_printf(m, "%0*lx %c %s\n", 521 seq_printf(m, "%pK %c %s\n", (void *)iter->value,
485 (int)(2 * sizeof(void *)), 522 iter->type, iter->name);
486 iter->value, iter->type, iter->name);
487 return 0; 523 return 0;
488} 524}
489 525
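sprint_backtrace() exists because a stack-saved return address can point one past the caller when gcc tail-calls a noreturn function; looking up address - 1 lands the symbol back in the real caller. Usage mirrors sprint_symbol(); a sketch:

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void my_report(unsigned long pc, unsigned long ret_addr)
{
	char buf[KSYM_SYMBOL_LEN];

	sprint_symbol(buf, pc);			/* an executing address */
	printk(KERN_INFO "pc : %s\n", buf);

	sprint_backtrace(buf, ret_addr);	/* a stack-saved return address */
	printk(KERN_INFO "ra : %s\n", buf);
}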
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7ebd..87b77de03dd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
 #include <linux/kmsg_dump.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	/* Initialize the list of destination pages */
 	INIT_LIST_HEAD(&image->dest_pages);
 
-	/* Initialize the list of unuseable pages */
+	/* Initialize the list of unusable pages */
 	INIT_LIST_HEAD(&image->unuseable_pages);
 
 	/* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	/* Deal with the destination pages I have inadvertently allocated.
 	 *
 	 * Ideally I would convert multi-page allocations into single
-	 * page allocations, and add everyting to image->dest_pages.
+	 * page allocations, and add everything to image->dest_pages.
 	 *
 	 * For now it is simpler to just free the pages.
 	 */
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
 	/* Walk through and free any extra destination pages I may have */
 	kimage_free_page_list(&image->dest_pages);
 
-	/* Walk through and free any unuseable pages I have cached */
+	/* Walk through and free any unusable pages I have cached */
 	kimage_free_page_list(&image->unuseable_pages);
 
 }
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
 	return size;
 }
 
-static void free_reserved_phys_range(unsigned long begin, unsigned long end)
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+					   unsigned long end)
 {
 	unsigned long addr;
 
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
 	start = roundup(start, PAGE_SIZE);
 	end = roundup(start + new_size, PAGE_SIZE);
 
-	free_reserved_phys_range(end, crashk_res.end);
+	crash_free_reserved_phys_range(end, crashk_res.end);
 
 	if ((start == end) && (crashk_res.parent != NULL))
 		release_resource(&crashk_res);
@@ -1531,6 +1533,11 @@ int kernel_kexec(void)
 		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
+		if (!error) {
+			error = syscore_suspend();
+			if (error)
+				sysdev_resume();
+		}
 		if (error)
 			goto Enable_irqs;
 	} else
@@ -1545,6 +1552,7 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
+		syscore_resume();
 		sysdev_resume();
 Enable_irqs:
 		local_irq_enable();
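Note: the two kernel_kexec() hunks above follow one rule: a suspend step that succeeded must be rolled back when the next step fails, and resume runs in reverse order (syscore before sysdev). A compact sketch of that shape, with demo_* helpers standing in for the sysdev/syscore calls (they are not kernel API):

int demo_sysdev_suspend(void);
int demo_syscore_suspend(void);
void demo_sysdev_resume(void);

static int demo_suspend_path(void)
{
	int error;

	error = demo_sysdev_suspend();
	if (error)
		return error;

	error = demo_syscore_suspend();
	if (error) {
		demo_sysdev_resume();	/* undo the step that already succeeded */
		return error;
	}
	return 0;
}

The hibernate.c and suspend.c hunks later in this diff apply exactly the same fix to their sysdev_suspend()/syscore_suspend() sequences.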
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..3b34d2732bce 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
 	/* Information passed to kthread() from kthreadd. */
 	int (*threadfn)(void *data);
 	void *data;
+	int node;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
 	do_exit(ret);
 }
 
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+	if (tsk == kthreadd_task)
+		return tsk->pref_node_fork;
+#endif
+	return numa_node_id();
+}
+
 static void create_kthread(struct kthread_create_info *create)
 {
 	int pid;
 
+#ifdef CONFIG_NUMA
+	current->pref_node_fork = create->node;
+#endif
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
 }
 
 /**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
+ * @node: memory node number.
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
 * thread. The thread will be stopped: use wake_up_process() to start
 * it. See also kthread_run().
  *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
+ * standalone thread for which no one will call kthread_stop(), or
  * return when 'kthread_should_stop()' is true (which means
  * kthread_stop() has been called). The return value should be zero
  * or a negative error number; it will be passed to kthread_stop().
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
-struct task_struct *kthread_create(int (*threadfn)(void *data),
-				   void *data,
-				   const char namefmt[],
-				   ...)
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+					   void *data,
+					   int node,
+					   const char namefmt[],
+					   ...)
 {
 	struct kthread_create_info create;
 
 	create.threadfn = threadfn;
 	create.data = data;
+	create.node = node;
 	init_completion(&create.done);
 
 	spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	}
 	return create.result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
 
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
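Note: with this change, callers that know which CPU a thread will serve can pass that CPU's memory node so the thread's stack gets NUMA affinity; -1 keeps the old any-node behaviour. A usage sketch based only on the interface shown above (demo_threadfn and the "demo/%u" name are made up):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int demo_threadfn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *demo_start_on_cpu(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(demo_threadfn, NULL,
				     cpu_to_node(cpu), "demo/%u", cpu);
	if (!IS_ERR(tsk)) {
		kthread_bind(tsk, cpu);	/* stack already sits on cpu's node */
		wake_up_process(tsk);
	}
	return tsk;
}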
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
 }
 
 /**
- * __account_scheduler_latency - record an occured latency
+ * __account_scheduler_latency - record an occurred latency
  * @tsk - the task struct of the task hitting the latency
  * @usecs - the duration of the latency in microseconds
  * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..53a68956f131 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	if (unlikely(curr->hardirqs_enabled)) {
 		/*
 		 * Neither irq nor preemption are disabled here
-		 * so this is racy by nature but loosing one hit
+		 * so this is racy by nature but losing one hit
 		 * in a stat is not a big deal.
 		 */
 		__debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	if (!graph_lock())
 		return 0;
 	/*
-	 * Make sure we didnt race:
+	 * Make sure we didn't race:
 	 */
 	if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
 		graph_unlock();
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 		      nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
 		      nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
 		      nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
-		      sum_forward_deps = 0, factor = 0;
+		      sum_forward_deps = 0;
 
 	list_for_each_entry(class, &all_lock_classes, lock_entry) {
 
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			nr_hardirq_unsafe * nr_hardirq_safe +
 			nr_list_entries);
 
-	/*
-	 * Estimated factor between direct and indirect
-	 * dependencies:
-	 */
-	if (nr_list_entries)
-		factor = sum_forward_deps / nr_list_entries;
-
 #ifdef CONFIG_PROVE_LOCKING
 	seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
 			nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..d5938a5c19c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		wait_for_zero_refcount(mod);
 
 	mutex_unlock(&module_mutex);
-	/* Final destruction now noone is using it. */
+	/* Final destruction now no one is using it. */
 	if (mod->exit != NULL)
 		mod->exit();
 	blocking_notifier_call_chain(&module_notify_list,
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 {
 	struct module_sect_attr *sattr =
 		container_of(mattr, struct module_sect_attr, mattr);
-	return sprintf(buf, "0x%lx\n", sattr->address);
+	return sprintf(buf, "0x%pK\n", (void *)sattr->address);
 }
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod,
 	mod->state = MODULE_STATE_COMING;
 
 	/* Now sew it into the lists so we can get lockdep and oops
-	 * info during argument parsing. Noone should access us, since
+	 * info during argument parsing. No one should access us, since
 	 * strong_try_module_get() will fail.
 	 * lockdep/oops can run asynchronous, so use the RCU list insertion
 	 * function to insert in a way safe to concurrent readers.
@@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod,
 	else
 		nextval = (unsigned long)mod->module_core+mod->core_text_size;
 
-	/* Scan for closest preceeding symbol, and next symbol. (ELF
+	/* Scan for closest preceding symbol, and next symbol. (ELF
 	   starts real symbols at 1). */
 	for (i = 1; i < mod->num_symtab; i++) {
 		if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading":
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	seq_printf(m, " 0x%p", mod->module_core);
+	seq_printf(m, " 0x%pK", mod->module_core);
 
 	/* Taints info */
 	if (mod->taints)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..c4195fa98900 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		}
 		__set_task_state(task, state);
 
-		/* didnt get the lock, go to sleep: */
+		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		preempt_enable_no_resched();
 		schedule();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_ns;
 	}
 
-	new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+	new_nsp->uts_ns = copy_utsname(flags, tsk);
 	if (IS_ERR(new_nsp->uts_ns)) {
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}
 
-	new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+	new_nsp->ipc_ns = copy_ipcs(flags, tsk);
 	if (IS_ERR(new_nsp->ipc_ns)) {
 		err = PTR_ERR(new_nsp->ipc_ns);
 		goto out_ipc;
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
 		/*
 		 * This cpu has to do the parallel processing of the next
 		 * object. It's waiting in the cpu's parallelization queue,
-		 * so exit imediately.
+		 * so exit immediately.
 		 */
 		if (PTR_ERR(padata) == -ENODATA) {
 			del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
 	/*
 	 * The next object that needs serialization might have arrived to
 	 * the reorder queues in the meantime, we will be called again
-	 * from the timer function if noone else cares for it.
+	 * from the timer function if no one else cares for it.
 	 */
 	if (atomic_read(&pd->reorder_objects)
 			&& !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
 	put_online_cpus();
 }
 
-/* Replace the internal control stucture with a new one. */
+/* Replace the internal control structure with a new one. */
 static void padata_replace(struct padata_instance *pinst,
 			   struct parallel_data *pd_new)
 {
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 }
 
 /**
- * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
  *                     padata cpumasks.
  *
  * @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
 
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
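Note: the new early_param() handler makes "oops=panic" on the kernel command line equivalent to setting the kernel.panic_on_oops sysctl to 1, but early enough to cover oopses during boot. The same pattern, reduced to a sketch with a hypothetical "demo" parameter:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/string.h>

static int demo_flag;

/* Runs while the command line is parsed in early boot. */
static int __init demo_setup(char *s)
{
	if (!s)
		return -EINVAL;
	if (!strcmp(s, "on"))
		demo_flag = 1;
	return 0;
}
early_param("demo", demo_setup);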
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..7ab388a48a2e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
 	/* Find parameter */
 	for (i = 0; i < num_params; i++) {
 		if (parameq(param, params[i].name)) {
-			/* Noone handled NULL, so do it here. */
+			/* No one handled NULL, so do it here. */
 			if (!val && params[i].ops->set != param_set_bool)
 				return -EINVAL;
 			DEBUGP("They are equal! Calling %p\n",
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3472bb1a070c..8e81a9860a0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu;
  */
 int sysctl_perf_event_paranoid __read_mostly = 1;
 
-int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 
 /*
  * max perf event sample rate
@@ -363,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 		}
 
 		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
 			/* set cgrp before ctxsw in to
 			 * allow event_filter_match() to not
 			 * have to pass task around
@@ -941,6 +943,7 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+	struct perf_cpu_context *cpuctx;
 	/*
 	 * We can have double detach due to exit/hot-unplug + close.
 	 */
@@ -949,8 +952,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event))
+	if (is_cgroup_event(event)) {
 		ctx->nr_cgroups--;
+		cpuctx = __get_cpu_context(ctx);
+		/*
+		 * if there are no more cgroup events
+		 * then cler cgrp to avoid stale pointer
+		 * in update_cgrp_time_from_cpuctx()
+		 */
+		if (!ctx->nr_cgroups)
+			cpuctx->cgrp = NULL;
+	}
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
@@ -2412,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
+	/*
+	 * We must ctxsw out cgroup events to avoid conflict
+	 * when invoking perf_task_event_sched_in() later on
+	 * in this function. Otherwise we end up trying to
+	 * ctxswin cgroup events which are already scheduled
+	 * in.
+	 */
+	perf_cgroup_sched_out(current);
 	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
@@ -2436,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
+	/*
+	 * Also calls ctxswin for cgroup events, if any:
+	 */
 	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
@@ -6520,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_alloc;
 	}
 
+	if (task) {
+		put_task_struct(task);
+		task = NULL;
+	}
+
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
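Note: the first perf_event.c hunk changes the default mlock allowance so the one-page user control area no longer eats into the 512 kiB buffer budget. Worked out for the common case, assuming 4 KiB pages (a runnable arithmetic check, not kernel code):

#include <stdio.h>

int main(void)
{
	long page_size = 4096;	/* assumption: 4 KiB pages */
	long mlock_kib = 512 + page_size / 1024;

	/* 512 + 4 = 516 kiB of lockable perf buffer per user */
	printf("perf mlock default: %ld kiB\n", mlock_kib);
	return 0;
}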
diff --git a/kernel/pid.c b/kernel/pid.c
index 02f221274265..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	return -1;
 }
 
-int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
 {
 	int offset;
 	struct pidmap *map, *end;
 
+	if (last >= PID_MAX_LIMIT)
+		return -1;
+
 	offset = (last + 1) & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
 	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
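Note: next_pidmap() is reachable from readdir on /proc with a caller-influenced offset; with the old signed "last", a negative value indexed before the pidmap[] array. Making the parameter unsigned turns any negative input into a huge value that the single new bounds check rejects. A runnable illustration of that trick (PID_MAX_LIMIT is assumed here to be the typical 4M configuration):

#include <stdio.h>

#define PID_MAX_LIMIT (4 * 1024 * 1024)	/* assumption */

static int demo_next_pidmap(unsigned int last)
{
	if (last >= PID_MAX_LIMIT)	/* also catches "last = -1" after wrap */
		return -1;
	return (int)last + 1;		/* stand-in for the real bitmap scan */
}

int main(void)
{
	printf("%d\n", demo_next_pidmap(-1));	/* rejected: prints -1 */
	printf("%d\n", demo_next_pidmap(100));	/* normal: prints 101 */
	return 0;
}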
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/acct.h>
 #include <linux/slab.h>
+#include <linux/proc_fs.h>
 
 #define BITS_PER_PAGE (PAGE_SIZE*8)
 
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
-	int i;
+	int i, err = -ENOMEM;
 
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 
+	err = pid_ns_prepare_proc(ns);
+	if (err)
+		goto out_put_parent_pid_ns;
+
 	return ns;
 
+out_put_parent_pid_ns:
+	put_pid_ns(parent_pid_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
 	kmem_cache_free(pid_ns_cachep, ns);
 out:
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(err);
 }
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 67fea9d25d55..0791b13df7bf 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
-	 * noone will touch their list entries but us. We'll take
+	 * no one will touch their list entries but us. We'll take
 	 * each timer's lock before clearing its firing flag, so no
 	 * timer call will interfere.
 	 */
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4c0124919f9a..e5498d7405c3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
 * restarted (i.e. we have flagged this in the sys_private entry of the
 * info block).
 *
- * To protect aginst the timer going away while the interrupt is queued,
+ * To protect against the timer going away while the interrupt is queued,
 * we require that the it_requeue_pending flag be set.
 */
void do_schedule_next_timer(struct siginfo *info)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4603f08dc47b..6de9a8fc3417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,9 +18,13 @@ config SUSPEND_FREEZER
 
 	  Turning OFF this setting is NOT recommended! If in doubt, say Y.
 
+config HIBERNATE_CALLBACKS
+	bool
+
 config HIBERNATION
 	bool "Hibernation (aka 'suspend to disk')"
 	depends on SWAP && ARCH_HIBERNATION_POSSIBLE
+	select HIBERNATE_CALLBACKS
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
 	---help---
@@ -85,7 +89,7 @@ config PM_STD_PARTITION
 
 config PM_SLEEP
 	def_bool y
-	depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
+	depends on SUSPEND || HIBERNATE_CALLBACKS
 
 config PM_SLEEP_SMP
 	def_bool y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+
+ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
 
 obj-$(CONFIG_PM) += main.o
 obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
 static int submit(int rw, struct block_device *bdev, sector_t sector,
 		struct page *page, struct bio **bio_chain)
 {
-	const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+	const int bio_rw = rw | REQ_SYNC;
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aeabd26e3342..50aae660174d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -273,8 +273,11 @@ static int create_image(int platform_mode)
 	local_irq_disable();
 
 	error = sysdev_suspend(PMSG_FREEZE);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (error) {
 		printk(KERN_ERR "PM: Some system devices failed to power down, "
 			"aborting hibernation\n");
@@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode)
 	local_irq_disable();
 
 	error = sysdev_suspend(PMSG_QUIESCE);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (error)
 		goto Enable_irqs;
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8eaba5f27b10..de9aef8742f4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -224,7 +224,7 @@ power_attr(state);
 * writing to 'state'. It first should read from 'wakeup_count' and store
 * the read value. Then, after carrying out its own preparations for the system
 * transition to a sleep state, it should write the stored value to
- * 'wakeup_count'. If that fails, at least one wakeup event has occured since
+ * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
 * is allowed to write to 'state', but the transition will be aborted if there
 * are any wakeup events detected after 'wakeup_count' was written to.
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 2814c32aed51..8935369d503a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state)
 	BUG_ON(!irqs_disabled());
 
 	error = sysdev_suspend(PMSG_SUSPEND);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (!error) {
 		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
 			error = suspend_ops->enter(state);
diff --git a/kernel/printk.c b/kernel/printk.c
index 33284adb2189..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 
 /* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
 
 /* We show everything that is MORE important than this.. */
 #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
 static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
 
 /*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
 * Array of consoles built from command line options (console=)
 */
 struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
 	struct console *con;
 
 	for_each_console(con) {
+		if (exclusive_console && con != exclusive_console)
+			continue;
 		if ((con->flags & CON_ENABLED) && con->write &&
 				(cpu_online(smp_processor_id()) ||
 				(con->flags & CON_ANYTIME)))
@@ -1230,6 +1237,11 @@ void console_unlock(void)
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
+
+	/* Release the exclusive_console once it is used */
+	if (unlikely(exclusive_console))
+		exclusive_console = NULL;
+
 	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (wake_klogd)
@@ -1316,6 +1328,18 @@ void console_start(struct console *console)
 }
 EXPORT_SYMBOL(console_start);
 
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+	keep_bootcon = 1;
+	printk(KERN_INFO "debug: skip boot console de-registration.\n");
+
+	return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
 /*
 * The console driver calls this routine during kernel initialization
 * to register the console printing procedure with printk() and to
@@ -1452,6 +1476,12 @@ void register_console(struct console *newcon)
 		spin_lock_irqsave(&logbuf_lock, flags);
 		con_start = log_start;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
+		/*
+		 * We're about to replay the log buffer. Only do this to the
+		 * just-registered console to avoid excessive message spam to
+		 * the already-registered consoles.
+		 */
+		exclusive_console = newcon;
 	}
 	console_unlock();
 	console_sysfs_notify();
@@ -1463,7 +1493,9 @@ void register_console(struct console *newcon)
 	 * users know there might be something in the kernel's log buffer that
 	 * went to the bootconsole (that they do not see on the real console)
 	 */
-	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+	if (bcon &&
+	    ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+	    !keep_bootcon) {
 		/* we need to iterate through twice, to make sure we print
 		 * everything out, before we unregister the console(s)
 		 */
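Note: two independent printk changes land above. exclusive_console makes the log-buffer replay that follows console registration go only to the just-registered console, and keep_bootcon is a debugging boot option (pass "keep_bootcon" on the command line) that skips deregistering the boot console so messages around the hand-over are not lost. A stand-alone sketch of the replay filter, with demo_* types replacing the real console list:

struct demo_console {
	struct demo_console *next;
	void (*write)(const char *msg);
};

static struct demo_console *demo_exclusive;	/* set while replaying */

static void demo_call_consoles(struct demo_console *list, const char *msg)
{
	struct demo_console *con;

	for (con = list; con; con = con->next) {
		if (demo_exclusive && con != demo_exclusive)
			continue;	/* backlog goes to the newcomer only */
		con->write(msg);
	}
}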
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..0fc1eed28d27 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 		return 0;
 	rcu_read_lock();
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
-	     cred->uid != tcred->suid ||
-	     cred->uid != tcred->uid ||
-	     cred->gid != tcred->egid ||
-	     cred->gid != tcred->sgid ||
-	     cred->gid != tcred->gid) &&
-	    !capable(CAP_SYS_PTRACE)) {
-		rcu_read_unlock();
-		return -EPERM;
-	}
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		goto ok;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto ok;
+	rcu_read_unlock();
+	return -EPERM;
+ok:
 	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !capable(CAP_SYS_PTRACE))
+	if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
 		goto unlock_tasklist;
 
 	task->ptrace = PT_PTRACED;
-	if (capable(CAP_SYS_PTRACE))
+	if (task_ns_capable(task, CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
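Note: the ptrace checks are now user-namespace aware: uid/gid equality only counts when both tasks share a user namespace, and the capability fallback asks for CAP_SYS_PTRACE in the target's namespace (ns_capable/task_ns_capable) instead of a global capable(). A reduced model of that decision, with demo_* stand-ins for the cred machinery:

struct demo_cred { unsigned int uid, gid, user_ns; };

/* Stand-in: does the caller hold the capability in this namespace? */
static int demo_ns_capable(unsigned int user_ns);

static int demo_may_access(const struct demo_cred *tracer,
			   const struct demo_cred *target)
{
	if (tracer->user_ns == target->user_ns &&
	    tracer->uid == target->uid && tracer->gid == target->gid)
		return 1;
	return demo_ns_capable(target->user_ns);	/* CAP_SYS_PTRACE there */
}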
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
 			pos, buf, s - buf);
 }
 
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&counter->lock, flags);
+	ret = *res_counter_member(counter, member);
+	spin_unlock_irqrestore(&counter->lock, flags);
+
+	return ret;
+}
+#else
 u64 res_counter_read_u64(struct res_counter *counter, int member)
 {
 	return *res_counter_member(counter, member);
 }
+#endif
 
 int res_counter_memparse_write_strategy(const char *buf,
 					unsigned long long *res)
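Note: on 32-bit, a u64 read is two word-sized loads, so an unlocked reader racing a writer can see a torn value (half old, half new). The new #if branch closes that by taking counter->lock around the read; 64-bit keeps the lockless path. The hazard in miniature, as portable userspace C:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t counter;

/* Without the lock, a 32-bit target may load the two halves of
 * "counter" around a concurrent update and return a mixed value. */
static uint64_t read_counter(void)
{
	uint64_t v;

	pthread_mutex_lock(&lock);
	v = counter;
	pthread_mutex_unlock(&lock);
	return v;
}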
diff --git a/kernel/sched.c b/kernel/sched.c
index a172494a9a63..312f8b95c2d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
@@ -4111,6 +4111,16 @@ need_resched:
 				try_to_wake_up_local(to_wakeup);
 		}
 		deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+		/*
+		 * If we are going to sleep and we have plugged IO queued, make
+		 * sure to submit it to avoid deadlocks.
+		 */
+		if (blk_needs_flush_plug(prev)) {
+			raw_spin_unlock(&rq->lock);
+			blk_schedule_flush_plug(prev);
+			raw_spin_lock(&rq->lock);
+		}
 	}
 	switch_count = &prev->nvcsw;
 }
@@ -4892,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
 	rcu_read_unlock();
 	return match;
 }
@@ -4984,7 +4997,7 @@ recheck:
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	/*
-	 * To be able to change p->policy safely, the apropriate
+	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
 	rq = __task_rq_lock(p);
@@ -4998,6 +5011,17 @@ recheck:
 		return -EINVAL;
 	}
 
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return 0;
+	}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
 		/*
@@ -5221,7 +5245,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
@@ -5460,6 +5484,8 @@ EXPORT_SYMBOL(yield);
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
@@ -5525,6 +5551,7 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5540,6 +5567,7 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
@@ -5688,7 +5716,7 @@ void show_state_filter(unsigned long state_filter)
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
-		 * console might take alot of time:
+		 * console might take a lot of time:
 		 */
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
@@ -6303,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 #endif
 	}
+
+	update_max_interval();
+
 	return NOTIFY_OK;
 }
 
@@ -8434,7 +8465,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8447,8 +8477,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	tg->shares = NICE_0_LOAD;
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
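Note: several sched.c hunks above wire the block layer's per-task I/O plug into the scheduler: a task must submit any I/O still sitting in its plug list before it sleeps (schedule(), io_schedule()), otherwise it can end up waiting on a completion for I/O it never actually issued. The invariant as a tiny self-contained sketch (demo_* stand in for the blk_needs_flush_plug()/blk_schedule_flush_plug() pair):

#include <stdbool.h>

struct demo_task { bool has_plugged_io; };

static void demo_submit_plugged_io(struct demo_task *t)
{
	t->has_plugged_io = false;	/* queue drained to the device */
}

static void demo_prepare_to_sleep(struct demo_task *t)
{
	/* Never block while holding back I/O only this task can issue. */
	if (t->has_plugged_io)
		demo_submit_plugged_io(t);
}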
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 5946ac515602..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
 	struct autogroup *ag = autogroup_create();
 
 	autogroup_move_group(p, ag);
-	/* drop extra refrence added by autogroup_create() */
+	/* drop extra reference added by autogroup_create() */
 	autogroup_kref_put(ag);
 }
 EXPORT_SYMBOL(sched_autogroup_create_attach);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3f7ec9e27ee1..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
 
 #include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -2103,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
 	struct task_struct *p, *n;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate)
 			break;
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+				      all_pinned))
 			continue;
 
 		pull_task(busiest, p, this_rq, this_cpu);
@@ -2152,9 +2152,6 @@ out:
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -3061,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
-	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * there is no guarantee that any tasks will be moved so we'll have
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
@@ -3126,6 +3123,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assumes all things are equal, which typically
@@ -3150,7 +3149,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
@@ -3339,6 +3337,7 @@ redo:
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
+		all_pinned = 1;
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3819,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+	max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
 /*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
@@ -3848,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 		/* scale ms to jiffies */
 		interval = msecs_to_jiffies(interval);
-		if (unlikely(!interval))
-			interval = 1;
-		if (interval > HZ*NR_CPUS/10)
-			interval = HZ*NR_CPUS/10;
+		interval = clamp(interval, 1UL, max_load_balance_interval);
 
 		need_serialize = sd->flags & SD_SERIALIZE;
 
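Note: the rebalance interval is now clamped to [1, max_load_balance_interval], and the ceiling scales with the number of online CPUs (recomputed on hotplug through update_max_interval(), called from the sched.c migration_call() hunk earlier) instead of the compile-time NR_CPUS. A runnable check of the arithmetic, assuming HZ=1000 and 8 CPUs online:

#include <stdio.h>

#define HZ 1000	/* assumption */

static unsigned long demo_clamp(unsigned long v, unsigned long lo,
				unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long max_interval = HZ * 8 / 10;	/* 8 CPUs -> 800 jiffies */

	printf("%lu\n", demo_clamp(5000, 1, max_interval));	/* 800 */
	printf("%lu\n", demo_clamp(0, 1, max_interval));	/* 1 */
	return 0;
}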
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index c82f26c1b7c3..a776a6396427 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = {
 
 	.prio_changed = prio_changed_idle,
 	.switched_to = switched_to_idle,
-
-	/* no .task_new for idle tasks */
 };
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index db308cb08b75..e7cebdc65f82 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1378,7 +1378,7 @@ retry:
 	task = pick_next_pushable_task(rq);
 	if (task_cpu(next_task) == rq->cpu && task == next_task) {
 		/*
-		 * If we get here, the task hasnt moved at all, but
+		 * If we get here, the task hasn't moved at all, but
 		 * it has failed to push. We will not try again,
 		 * since the other cpus will pull from us when they
 		 * are ready.
@@ -1488,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq)
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
-			 * in another runqueue. (low likelyhood
+			 * in another runqueue. (low likelihood
 			 * but possible)
 			 */
 		}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 84ec9bcf82d9..1ba2bd40fdac 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = {
 
 	.prio_changed = prio_changed_stop,
 	.switched_to = switched_to_stop,
-
-	/* no .task_new for stop tasks */
 };
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..7165af5f1b11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig)
226/* 226/*
227 * allocate a new signal queue record 227 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 228 * - this may be called without locks if and only if t == current, otherwise an
229 * appopriate lock must be held to stop the target task from exiting 229 * appropriate lock must be held to stop the target task from exiting
230 */ 230 */
231static struct sigqueue * 231static struct sigqueue *
232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) 232__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
375 return !tracehook_consider_fatal_signal(tsk, sig); 375 return !tracehook_consider_fatal_signal(tsk, sig);
376} 376}
377 377
378 378/*
379/* Notify the system that a driver wants to block all signals for this 379 * Notify the system that a driver wants to block all signals for this
380 * process, and wants to be notified if any signals at all were to be 380 * process, and wants to be notified if any signals at all were to be
381 * sent/acted upon. If the notifier routine returns non-zero, then the 381 * sent/acted upon. If the notifier routine returns non-zero, then the
382 * signal will be acted upon after all. If the notifier routine returns 0, 382 * signal will be acted upon after all. If the notifier routine returns 0,
383 * then then signal will be blocked. Only one block per process is 383 * then then signal will be blocked. Only one block per process is
384 * allowed. priv is a pointer to private data that the notifier routine 384 * allowed. priv is a pointer to private data that the notifier routine
385 * can use to determine if the signal should be blocked or not. */ 385 * can use to determine if the signal should be blocked or not.
386 386 */
387void 387void
388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) 388block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
389{ 389{
@@ -434,9 +434,10 @@ still_pending:
434 copy_siginfo(info, &first->info); 434 copy_siginfo(info, &first->info);
435 __sigqueue_free(first); 435 __sigqueue_free(first);
436 } else { 436 } else {
437 /* Ok, it wasn't in the queue. This must be 437 /*
438 a fast-pathed signal or we must have been 438 * Ok, it wasn't in the queue. This must be
439 out of queue space. So zero out the info. 439 * a fast-pathed signal or we must have been
440 * out of queue space. So zero out the info.
440 */ 441 */
441 info->si_signo = sig; 442 info->si_signo = sig;
442 info->si_errno = 0; 443 info->si_errno = 0;
@@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
468} 469}
469 470
470/* 471/*
471 * Dequeue a signal and return the element to the caller, which is 472 * Dequeue a signal and return the element to the caller, which is
472 * expected to free it. 473 * expected to free it.
473 * 474 *
474 * All callers have to hold the siglock. 475 * All callers have to hold the siglock.
@@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
490 * itimers are process shared and we restart periodic 491 * itimers are process shared and we restart periodic
491 * itimers in the signal delivery path to prevent DoS 492 * itimers in the signal delivery path to prevent DoS
492 * attacks in the high resolution timer case. This is 493 * attacks in the high resolution timer case. This is
493 * compliant with the old way of self restarting 494 * compliant with the old way of self-restarting
494 * itimers, as the SIGALRM is a legacy signal and only 495 * itimers, as the SIGALRM is a legacy signal and only
495 * queued once. Changing the restart behaviour to 496 * queued once. Changing the restart behaviour to
496 * restart the timer in the signal dequeue path is 497 * restart the timer in the signal dequeue path is
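The restart behaviour described above is easy to observe from userspace: a periodic ITIMER_REAL keeps firing even though at most one SIGALRM can ever be pending. A minimal sketch:

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t ticks;

static void on_alarm(int sig)
{
	(void)sig;
	ticks++;			/* async-signal-safe: just count */
}

int main(void)
{
	struct sigaction sa = { .sa_handler = on_alarm };
	struct itimerval it = {
		.it_interval = { .tv_sec = 0, .tv_usec = 100000 },
		.it_value    = { .tv_sec = 0, .tv_usec = 100000 },
	};

	sigaction(SIGALRM, &sa, NULL);
	setitimer(ITIMER_REAL, &it, NULL);

	while (ticks < 5)
		pause();		/* each expiry re-arms via it_interval */

	printf("got %d SIGALRMs\n", (int)ticks);
	return 0;
}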
@@ -636,13 +637,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 637}
637 638
638/* 639/*
640 * called with RCU read lock from check_kill_permission()
641 */
642static int kill_ok_by_cred(struct task_struct *t)
643{
644 const struct cred *cred = current_cred();
645 const struct cred *tcred = __task_cred(t);
646
647 if (cred->user->user_ns == tcred->user->user_ns &&
648 (cred->euid == tcred->suid ||
649 cred->euid == tcred->uid ||
650 cred->uid == tcred->suid ||
651 cred->uid == tcred->uid))
652 return 1;
653
654 if (ns_capable(tcred->user->user_ns, CAP_KILL))
655 return 1;
656
657 return 0;
658}
659
660/*
639 * Bad permissions for sending the signal 661 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 662 * - the caller must hold the RCU read lock
641 */ 663 */
642static int check_kill_permission(int sig, struct siginfo *info, 664static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 665 struct task_struct *t)
644{ 666{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 667 struct pid *sid;
647 int error; 668 int error;
648 669
@@ -656,14 +677,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 677 if (error)
657 return error; 678 return error;
658 679
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 680 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 681 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 682 switch (sig) {
668 case SIGCONT: 683 case SIGCONT:
669 sid = task_session(t); 684 sid = task_session(t);
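The rule that kill_ok_by_cred() now encodes (the sender's uid/euid must match the target's uid/suid within the same user namespace, otherwise CAP_KILL is needed there) can be probed from userspace with a null signal, which performs only this permission check. A minimal sketch:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atoi(argv[1]);

	if (kill(pid, 0) == 0)			/* sig 0: permission check only */
		printf("%d: may signal\n", (int)pid);
	else if (errno == EPERM)
		printf("%d: exists, but credentials do not match\n", (int)pid);
	else
		perror("kill");
	return 0;
}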
@@ -909,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
909 if (info == SEND_SIG_FORCED) 924 if (info == SEND_SIG_FORCED)
910 goto out_set; 925 goto out_set;
911 926
912 /* Real-time signals must be queued if sent by sigqueue, or 927 /*
913 some other real-time mechanism. It is implementation 928 * Real-time signals must be queued if sent by sigqueue, or
914 defined whether kill() does so. We attempt to do so, on 929 * some other real-time mechanism. It is implementation
915 the principle of least surprise, but since kill is not 930 * defined whether kill() does so. We attempt to do so, on
916 allowed to fail with EAGAIN when low on memory we just 931 * the principle of least surprise, but since kill is not
917 make sure at least one signal gets delivered and don't 932 * allowed to fail with EAGAIN when low on memory we just
918 pass on the info struct. */ 933 * make sure at least one signal gets delivered and don't
919 934 * pass on the info struct.
935 */
920 if (sig < SIGRTMIN) 936 if (sig < SIGRTMIN)
921 override_rlimit = (is_si_special(info) || info->si_code >= 0); 937 override_rlimit = (is_si_special(info) || info->si_code >= 0);
922 else 938 else
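The distinction drawn in the comment is visible from userspace: real-time signals sent with sigqueue() queue up individually with their payload, while multiple instances of a legacy signal collapse into one pending instance. A minimal sketch (printf in a handler is not async-signal-safe and is used here only for brevity):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
	(void)ctx;
	printf("sig %d, payload %d\n", sig, info->si_value.sival_int);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };
	sigset_t block;
	int i;

	sigaction(SIGRTMIN, &sa, NULL);

	/* Block, queue three instances, then unblock: all three arrive. */
	sigemptyset(&block);
	sigaddset(&block, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &block, NULL);

	for (i = 0; i < 3; i++)
		sigqueue(getpid(), SIGRTMIN, (union sigval){ .sival_int = i });

	sigprocmask(SIG_UNBLOCK, &block, NULL);
	return 0;
}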
@@ -1187,8 +1203,7 @@ retry:
1187 return error; 1203 return error;
1188} 1204}
1189 1205
1190int 1206int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1191kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1192{ 1207{
1193 int error; 1208 int error;
1194 rcu_read_lock(); 1209 rcu_read_lock();
@@ -1285,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1285 * These are for backward compatibility with the rest of the kernel source. 1300 * These are for backward compatibility with the rest of the kernel source.
1286 */ 1301 */
1287 1302
1288int 1303int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1289send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1290{ 1304{
1291 /* 1305 /*
1292 * Make sure legacy kernel users don't send in bad values 1306 * Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid);
1354 * These functions support sending signals using preallocated sigqueue 1368 * These functions support sending signals using preallocated sigqueue
1355 * structures. This is needed "because realtime applications cannot 1369 * structures. This is needed "because realtime applications cannot
1356 * afford to lose notifications of asynchronous events, like timer 1370 * afford to lose notifications of asynchronous events, like timer
1357 * expirations or I/O completions". In the case of Posix Timers 1371 * expirations or I/O completions". In the case of POSIX Timers
1358 * we allocate the sigqueue structure from the timer_create. If this 1372 * we allocate the sigqueue structure from the timer_create. If this
1359 * allocation fails we are able to report the failure to the application 1373 * allocation fails we are able to report the failure to the application
1360 * with an EAGAIN error. 1374 * with an EAGAIN error.
@@ -1539,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1539 info.si_signo = SIGCHLD; 1553 info.si_signo = SIGCHLD;
1540 info.si_errno = 0; 1554 info.si_errno = 0;
1541 /* 1555 /*
1542 * see comment in do_notify_parent() abot the following 3 lines 1556 * see comment in do_notify_parent() about the following 4 lines
1543 */ 1557 */
1544 rcu_read_lock(); 1558 rcu_read_lock();
1545 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1559 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1611,7 @@ static inline int may_ptrace_stop(void)
1597} 1611}
1598 1612
1599/* 1613/*
1600 * Return nonzero if there is a SIGKILL that should be waking us up. 1614 * Return non-zero if there is a SIGKILL that should be waking us up.
1601 * Called with the siglock held. 1615 * Called with the siglock held.
1602 */ 1616 */
1603static int sigkill_pending(struct task_struct *tsk) 1617static int sigkill_pending(struct task_struct *tsk)
@@ -1721,7 +1735,7 @@ void ptrace_notify(int exit_code)
1721/* 1735/*
1722 * This performs the stopping for SIGSTOP and other stop signals. 1736 * This performs the stopping for SIGSTOP and other stop signals.
1723 * We have to stop all threads in the thread group. 1737 * We have to stop all threads in the thread group.
1724 * Returns nonzero if we've actually stopped and released the siglock. 1738 * Returns non-zero if we've actually stopped and released the siglock.
1725 * Returns zero if we didn't stop and still hold the siglock. 1739 * Returns zero if we didn't stop and still hold the siglock.
1726 */ 1740 */
1727static int do_signal_stop(int signr) 1741static int do_signal_stop(int signr)
@@ -1809,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
1809 1823
1810 current->exit_code = 0; 1824 current->exit_code = 0;
1811 1825
1812 /* Update the siginfo structure if the signal has 1826 /*
1813 changed. If the debugger wanted something 1827 * Update the siginfo structure if the signal has
1814 specific in the siginfo structure then it should 1828 * changed. If the debugger wanted something
1815 have updated *info via PTRACE_SETSIGINFO. */ 1829 * specific in the siginfo structure then it should
1830 * have updated *info via PTRACE_SETSIGINFO.
1831 */
1816 if (signr != info->si_signo) { 1832 if (signr != info->si_signo) {
1817 info->si_signo = signr; 1833 info->si_signo = signr;
1818 info->si_errno = 0; 1834 info->si_errno = 0;
@@ -1871,7 +1887,7 @@ relock:
1871 for (;;) { 1887 for (;;) {
1872 struct k_sigaction *ka; 1888 struct k_sigaction *ka;
1873 /* 1889 /*
1874 * Tracing can induce an artifical signal and choose sigaction. 1890 * Tracing can induce an artificial signal and choose sigaction.
1875 * The return value in @signr determines the default action, 1891 * The return value in @signr determines the default action,
1876 * but @info->si_signo is the signal number we will report. 1892 * but @info->si_signo is the signal number we will report.
1877 */ 1893 */
@@ -2020,7 +2036,8 @@ void exit_signals(struct task_struct *tsk)
2020 if (!signal_pending(tsk)) 2036 if (!signal_pending(tsk))
2021 goto out; 2037 goto out;
2022 2038
2023 /* It could be that __group_complete_signal() choose us to 2039 /*
2040 * It could be that __group_complete_signal() choose us to
2024 * notify about group-wide signal. Another thread should be 2041 * notify about group-wide signal. Another thread should be
2025 * woken now to take the signal since we will not. 2042 * woken now to take the signal since we will not.
2026 */ 2043 */
@@ -2058,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals);
2058 * System call entry points. 2075 * System call entry points.
2059 */ 2076 */
2060 2077
2078/**
2079 * sys_restart_syscall - restart a system call
2080 */
2061SYSCALL_DEFINE0(restart_syscall) 2081SYSCALL_DEFINE0(restart_syscall)
2062{ 2082{
2063 struct restart_block *restart = &current_thread_info()->restart_block; 2083 struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2111,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2111 return error; 2131 return error;
2112} 2132}
2113 2133
2134/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals
2137 * @set: signals to add or remove (if non-null)
2138 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type
2140 */
2114SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
2115 sigset_t __user *, oset, size_t, sigsetsize) 2142 sigset_t __user *, oset, size_t, sigsetsize)
2116{ 2143{
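A minimal userspace sketch of the three @how modes documented above:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set, old;

	sigemptyset(&set);
	sigaddset(&set, SIGINT);

	sigprocmask(SIG_BLOCK, &set, &old);	/* add SIGINT to the mask */
	sigprocmask(SIG_UNBLOCK, &set, NULL);	/* remove it again */
	sigprocmask(SIG_SETMASK, &old, NULL);	/* restore the saved mask */

	puts("mask restored");
	return 0;
}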
@@ -2169,8 +2196,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
2169 2196
2170out: 2197out:
2171 return error; 2198 return error;
2172} 2199}
2173 2200
2201/**
2202 * sys_rt_sigpending - examine a pending signal that has been raised
2203 * while blocked
2204 * @set: stores pending signals
2205 * @sigsetsize: size of sigset_t type
2206 */
2174SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2207SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2175{ 2208{
2176 return do_sigpending(set, sigsetsize); 2209 return do_sigpending(set, sigsetsize);
@@ -2219,9 +2252,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2219 err |= __put_user(from->si_trapno, &to->si_trapno); 2252 err |= __put_user(from->si_trapno, &to->si_trapno);
2220#endif 2253#endif
2221#ifdef BUS_MCEERR_AO 2254#ifdef BUS_MCEERR_AO
2222 /* 2255 /*
2223 * Other callers might not initialize the si_lsb field, 2256 * Other callers might not initialize the si_lsb field,
2224 * so check explicitely for the right codes here. 2257 * so check explicitly for the right codes here.
2225 */ 2258 */
2226 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2259 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2227 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2260 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2250 2283
2251#endif 2284#endif
2252 2285
2286/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese
2289 * @uthese: queued signals to wait for
2290 * @uinfo: if non-null, the signal's siginfo is returned here
2291 * @uts: upper bound on process time suspension
2292 * @sigsetsize: size of sigset_t type
2293 */
2253SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, 2294SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2254 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2255 size_t, sigsetsize) 2296 size_t, sigsetsize)
@@ -2266,7 +2307,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2266 2307
2267 if (copy_from_user(&these, uthese, sizeof(these))) 2308 if (copy_from_user(&these, uthese, sizeof(these)))
2268 return -EFAULT; 2309 return -EFAULT;
2269 2310
2270 /* 2311 /*
2271 * Invert the set of allowed signals to get those we 2312 * Invert the set of allowed signals to get those we
2272 * want to block. 2313 * want to block.
@@ -2291,9 +2332,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2291 + (ts.tv_sec || ts.tv_nsec)); 2332 + (ts.tv_sec || ts.tv_nsec));
2292 2333
2293 if (timeout) { 2334 if (timeout) {
2294 /* None ready -- temporarily unblock those we're 2335 /*
2336 * None ready -- temporarily unblock those we're
2295 * interested while we are sleeping in so that we'll 2337 * interested while we are sleeping in so that we'll
2296 * be awakened when they arrive. */ 2338 * be awakened when they arrive.
2339 */
2297 current->real_blocked = current->blocked; 2340 current->real_blocked = current->blocked;
2298 sigandsets(&current->blocked, &current->blocked, &these); 2341 sigandsets(&current->blocked, &current->blocked, &these);
2299 recalc_sigpending(); 2342 recalc_sigpending();
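A minimal userspace sketch of the call this implements: the signals of interest must already be blocked, and sigtimedwait() dequeues one synchronously or times out:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* must be blocked first */

	raise(SIGUSR1);				/* make one pending */

	if (sigtimedwait(&set, &info, &ts) == SIGUSR1)
		printf("dequeued SIGUSR1 from pid %d\n", (int)info.si_pid);
	return 0;
}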
@@ -2325,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2325 return ret; 2368 return ret;
2326} 2369}
2327 2370
2371/**
2372 * sys_kill - send a signal to a process
2373 * @pid: the PID of the process
2374 * @sig: signal to be sent
2375 */
2328SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) 2376SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2329{ 2377{
2330 struct siginfo info; 2378 struct siginfo info;
@@ -2400,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2400 return do_tkill(tgid, pid, sig); 2448 return do_tkill(tgid, pid, sig);
2401} 2449}
2402 2450
2403/* 2451/**
2452 * sys_tkill - send signal to one specific task
2453 * @pid: the PID of the task
2454 * @sig: signal to be sent
2455 *
2404 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2456 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2405 */ 2457 */
2406SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) 2458SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2412 return do_tkill(0, pid, sig); 2464 return do_tkill(0, pid, sig);
2413} 2465}
2414 2466
2467/**
2468 * sys_rt_sigqueueinfo - send signal information to a process
2469 * @pid: the PID of the thread
2470 * @sig: signal to be sent
2471 * @uinfo: signal info to be sent
2472 */
2415SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, 2473SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2416 siginfo_t __user *, uinfo) 2474 siginfo_t __user *, uinfo)
2417{ 2475{
@@ -2421,9 +2479,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2479 return -EFAULT;
2422 2480
2423 /* Not even root can pretend to send signals from the kernel. 2481 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2482 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2483 */
2484 if (info.si_code >= 0 || info.si_code == SI_TKILL) {
2485 /* We used to allow any < 0 si_code */
2486 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2487 return -EPERM;
2488 }
2427 info.si_signo = sig; 2489 info.si_signo = sig;
2428 2490
2429 /* POSIX.1b doesn't mention process groups. */ 2491 /* POSIX.1b doesn't mention process groups. */
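glibc's sigqueue() always passes SI_QUEUE, so the only way to trip this check from userspace is the raw syscall. A hedged sketch that forges a kernel-style si_code and should get EPERM back:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGUSR1;
	info.si_code  = SI_KERNEL;	/* >= 0: reserved for the kernel */

	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0)
		printf("rejected as expected: %s\n", strerror(errno));
	return 0;
}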
@@ -2437,9 +2499,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2499 return -EINVAL;
2438 2500
2439 /* Not even root can pretend to send signals from the kernel. 2501 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2502 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2503 */
2504 if (info->si_code >= 0 || info->si_code == SI_TKILL) {
2505 /* We used to allow any < 0 si_code */
2506 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2507 return -EPERM;
2508 }
2443 info->si_signo = sig; 2509 info->si_signo = sig;
2444 2510
2445 return do_send_specific(tgid, pid, sig, info); 2511 return do_send_specific(tgid, pid, sig, info);
@@ -2531,12 +2597,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2531 2597
2532 error = -EINVAL; 2598 error = -EINVAL;
2533 /* 2599 /*
2534 * 2600 * Note - this code used to test ss_flags incorrectly:
2535 * Note - this code used to test ss_flags incorrectly
2536 * old code may have been written using ss_flags==0 2601 * old code may have been written using ss_flags==0
2537 * to mean ss_flags==SS_ONSTACK (as this was the only 2602 * to mean ss_flags==SS_ONSTACK (as this was the only
2538 * way that worked) - this fix preserves that older 2603 * way that worked) - this fix preserves that older
2539 * mechanism 2604 * mechanism.
2540 */ 2605 */
2541 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) 2606 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2542 goto out; 2607 goto out;
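A minimal userspace sketch of the preserved quirk: ss_flags == 0 is still accepted when installing an alternate stack:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	stack_t ss;

	ss.ss_sp    = malloc(SIGSTKSZ);
	ss.ss_size  = SIGSTKSZ;
	ss.ss_flags = 0;		/* neither SS_ONSTACK nor SS_DISABLE */

	if (sigaltstack(&ss, NULL) == 0)
		puts("alternate stack installed with ss_flags == 0");
	else
		perror("sigaltstack");
	return 0;
}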
@@ -2570,6 +2635,10 @@ out:
2570 2635
2571#ifdef __ARCH_WANT_SYS_SIGPENDING 2636#ifdef __ARCH_WANT_SYS_SIGPENDING
2572 2637
2638/**
2639 * sys_sigpending - examine pending signals
2640 * @set: where the mask of pending signals is returned
2641 */
2573SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 2642SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2574{ 2643{
2575 return do_sigpending(set, sizeof(*set)); 2644 return do_sigpending(set, sizeof(*set));
@@ -2578,8 +2647,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2578#endif 2647#endif
2579 2648
2580#ifdef __ARCH_WANT_SYS_SIGPROCMASK 2649#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2581/* Some platforms have their own version with special arguments others 2650/**
2582 support only sys_rt_sigprocmask. */ 2651 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null
2655 *
2656 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask.
2658 */
2583 2659
2584SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
2585 old_sigset_t __user *, oset) 2661 old_sigset_t __user *, oset)
@@ -2632,6 +2708,13 @@ out:
2632#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2633 2709
2634#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2710#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2711/**
2712 * sys_rt_sigaction - alter an action taken by a process
2713 * @sig: signal to be sent
2714 * @act: new sigaction
2715 * @oact: used to save the previous sigaction
2716 * @sigsetsize: size of sigset_t type
2717 */
2635SYSCALL_DEFINE4(rt_sigaction, int, sig, 2718SYSCALL_DEFINE4(rt_sigaction, int, sig,
2636 const struct sigaction __user *, act, 2719 const struct sigaction __user *, act,
2637 struct sigaction __user *, oact, 2720 struct sigaction __user *, oact,
@@ -2718,6 +2801,12 @@ SYSCALL_DEFINE0(pause)
2718#endif 2801#endif
2719 2802
2720#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 2803#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2804/**
2805 * sys_rt_sigsuspend - replace the signal mask for a value with the
2806 * @unewset value until a signal is received
2807 * @unewset: new signal mask value
2808 * @sigsetsize: size of sigset_t type
2809 */
2721SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) 2810SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2722{ 2811{
2723 sigset_t newset; 2812 sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 7cbd0f293df4..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void)
604} 604}
605#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
606 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
635/* this is the hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
607/* 688/*
608 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
609 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
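The "nosmp", "nr_cpus" and "maxcpus" handlers moved in above all use the early_param() pattern, which runs while the boot command line is parsed, long before normal initcalls. A hedged sketch with a hypothetical "myopt" parameter:

#include <linux/init.h>
#include <linux/kernel.h>

static int myopt_val;

static int __init myopt_setup(char *str)
{
	get_option(&str, &myopt_val);	/* parses "myopt=<int>" */
	return 0;			/* 0: handled, no error */
}
early_param("myopt", myopt_setup);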
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 56e5dec837f0..174f976c2874 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
567/** 567/**
568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 568 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
569 * @ttimer: tasklet_hrtimer which is initialized 569 * @ttimer: tasklet_hrtimer which is initialized
570 * @function: hrtimer callback funtion which gets called from softirq context 570 * @function: hrtimer callback function which gets called from softirq context
571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 571 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 572 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
573 */ 573 */
@@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 switch (action) { 845 switch (action) {
846 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
847 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
848 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
849 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
850 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
851 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
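kthread_create_on_node() allocates the thread (including its stack) on the given NUMA node, so a per-CPU helper like ksoftirqd gets node-local memory. A hedged sketch of the general pattern; my_worker and the thread name are illustrative:

#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static int my_worker(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();		/* sleep until woken or stopped */
	}
	return 0;
}

static struct task_struct *start_worker_for(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create_on_node(my_worker, NULL, cpu_to_node(cpu),
				   "myworker/%u", cpu);
	if (!IS_ERR(p))
		kthread_bind(p, cpu);	/* run on the CPU whose node we used */
	return p;
}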
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1ad48b3b9068..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -120,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
120void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
121 121
122/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
123 * set the priority of a task 142 * set the priority of a task
124 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
125 */ 144 */
126static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
127{ 146{
128 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
129 int no_nice; 147 int no_nice;
130 148
131 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
132 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
133 error = -EPERM; 150 error = -EPERM;
134 goto out; 151 goto out;
135 } 152 }
@@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
506 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
507 if (old->gid == rgid || 524 if (old->gid == rgid ||
508 old->egid == rgid || 525 old->egid == rgid ||
509 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
510 new->gid = rgid; 527 new->gid = rgid;
511 else 528 else
512 goto error; 529 goto error;
@@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
515 if (old->gid == egid || 532 if (old->gid == egid ||
516 old->egid == egid || 533 old->egid == egid ||
517 old->sgid == egid || 534 old->sgid == egid ||
518 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
519 new->egid = egid; 536 new->egid = egid;
520 else 537 else
521 goto error; 538 goto error;
@@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
550 old = current_cred(); 567 old = current_cred();
551 568
552 retval = -EPERM; 569 retval = -EPERM;
553 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
554 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
555 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
556 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
@@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
617 new->uid = ruid; 634 new->uid = ruid;
618 if (old->uid != ruid && 635 if (old->uid != ruid &&
619 old->euid != ruid && 636 old->euid != ruid &&
620 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
621 goto error; 638 goto error;
622 } 639 }
623 640
@@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
626 if (old->uid != euid && 643 if (old->uid != euid &&
627 old->euid != euid && 644 old->euid != euid &&
628 old->suid != euid && 645 old->suid != euid &&
629 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
630 goto error; 647 goto error;
631 } 648 }
632 649
@@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
674 old = current_cred(); 691 old = current_cred();
675 692
676 retval = -EPERM; 693 retval = -EPERM;
677 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
678 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
679 if (uid != old->uid) { 696 if (uid != old->uid) {
680 retval = set_user(new); 697 retval = set_user(new);
@@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
716 old = current_cred(); 733 old = current_cred();
717 734
718 retval = -EPERM; 735 retval = -EPERM;
719 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
720 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
721 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
722 goto error; 739 goto error;
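The userspace semantics are unchanged by the nsown_capable() conversion: an unprivileged caller may still only shuffle its own three IDs. The classic temporary privilege drop, as a minimal sketch:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uid_t ruid, euid, suid;

	getresuid(&ruid, &euid, &suid);
	printf("before: r=%d e=%d s=%d\n", (int)ruid, (int)euid, (int)suid);

	/* Drop: become the real user; park the old euid in the saved set. */
	if (setresuid(-1, ruid, euid) != 0)
		perror("drop");

	/* ... unprivileged work ... */

	/* Restore: allowed because the saved set still holds the old euid. */
	if (setresuid(-1, euid, -1) != 0)
		perror("restore");
	return 0;
}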
@@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
780 old = current_cred(); 797 old = current_cred();
781 798
782 retval = -EPERM; 799 retval = -EPERM;
783 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
784 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
785 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
786 goto error; 803 goto error;
@@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
840 857
841 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
842 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
843 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
844 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
845 new->fsuid = uid; 862 new->fsuid = uid;
846 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
873 890
874 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
875 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
876 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
877 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
878 new->fsgid = gid; 895 new->fsgid = gid;
879 goto change_okay; 896 goto change_okay;
@@ -1181,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1181 int errno; 1198 int errno;
1182 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1183 1200
1184 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1185 return -EPERM; 1202 return -EPERM;
1203
1186 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1187 return -EINVAL; 1205 return -EINVAL;
1188 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1230,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1230 int errno; 1248 int errno;
1231 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1232 1250
1233 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1234 return -EPERM; 1252 return -EPERM;
1235 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1236 return -EINVAL; 1254 return -EINVAL;
@@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1345 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1346 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1347 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 * cgroups can contain all limits */
1348 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1349 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1350 retval = -EPERM; 1370 retval = -EPERM;
@@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1388{ 1408{
1389 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1390 1410
1391 tcred = __task_cred(task); 1411 if (current == task)
1392 if (current != task && 1412 return 0;
1393 (cred->uid != tcred->euid ||
1394 cred->uid != tcred->suid ||
1395 cred->uid != tcred->uid ||
1396 cred->gid != tcred->egid ||
1397 cred->gid != tcred->sgid ||
1398 cred->gid != tcred->gid) &&
1399 !capable(CAP_SYS_RESOURCE)) {
1400 return -EPERM;
1401 }
1402 1413
1403 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1404} 1427}
1405 1428
1406SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
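From userspace this check surfaces through prlimit64(2) (glibc's prlimit()): operating on another process fails with EPERM unless the full credential match above holds in the same user namespace, or the caller has CAP_SYS_RESOURCE there. A minimal sketch that reads a target's RLIMIT_NOFILE:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	struct rlimit old;
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;	/* 0 == self */

	if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) != 0) {
		perror("prlimit");	/* EPERM: credential mismatch */
		return 1;
	}
	printf("pid %d: nofile cur=%llu max=%llu\n", (int)pid,
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);
	return 0;
}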
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 40245d697602..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses its own private copy */ 179/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = {
706 .data = &kptr_restrict, 712 .data = &kptr_restrict,
707 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
708 .mode = 0644, 714 .mode = 0644,
709 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
710 .extra1 = &zero, 716 .extra1 = &zero,
711 .extra2 = &two, 717 .extra2 = &two,
712 }, 718 },
@@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = {
971 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
972 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
973 .mode = 0644, 979 .mode = 0644,
974 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
975 }, 983 },
976 { 984 {
977 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
978 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
979 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
980 .mode = 0644, 988 .mode = 0644,
981 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
982 }, 992 },
983 { 993 {
984 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1006 .data = &page_cluster, 1016 .data = &page_cluster,
1007 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1008 .mode = 0644, 1018 .mode = 0644,
1009 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1010 }, 1021 },
1011 { 1022 {
1012 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1054 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1055 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1056 .mode = 0644, 1067 .mode = 0644,
1057 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1058 }, 1070 },
1059 { 1071 {
1060 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1130 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1131 .mode = 0644, 1143 .mode = 0644,
1132 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1133 }, 1147 },
1134#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1135 { 1149 {
@@ -2385,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2385 return err; 2399 return err;
2386} 2400}
2387 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2388struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2389 int *min; 2414 int *min;
2390 int *max; 2415 int *max;
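The effect of the new handler is that reads stay unprivileged while writes require CAP_SYS_ADMIN before the usual min/max clamp. A minimal userspace sketch that attempts a write (expected to fail without the capability, assuming the file is otherwise writable by the caller):

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/kptr_restrict";
	FILE *f = fopen(path, "r+");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fputs("1\n", f) == EOF || fflush(f) == EOF)
		perror("write");	/* EPERM without CAP_SYS_ADMIN */
	else
		puts("written");
	fclose(f);
	return 0;
}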
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index b2fa506667c0..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -34,7 +34,7 @@
34 * inaccuracies caused by missed or lost timer 34 * inaccuracies caused by missed or lost timer
35 * interrupts and the inability for the timer 35 * interrupts and the inability for the timer
36 * interrupt hardware to accurately tick at the 36 * interrupt hardware to accurately tick at the
37 * requested HZ value. It is also not reccomended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5f1bb8e2008f..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc)
652 struct timespec delta; 652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec; 653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec; 654 delta.tv_nsec = txc->time.tv_usec;
655 if (!capable(CAP_SYS_TIME))
656 return -EPERM;
655 if (!(txc->modes & ADJ_NANO)) 657 if (!(txc->modes & ADJ_NANO))
656 delta.tv_nsec *= 1000; 658 delta.tv_nsec *= 1000;
657 result = timekeeping_inject_offset(&delta); 659 result = timekeeping_inject_offset(&delta);
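This is the ADJ_SETOFFSET path, recent at the time of this merge; it injects a step offset into the clock rather than slewing it. A hedged userspace sketch, assuming the installed headers define ADJ_SETOFFSET:

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes        = ADJ_SETOFFSET;
	tx.time.tv_sec  = 0;
	tx.time.tv_usec = 1;		/* microseconds unless ADJ_NANO is set */

	if (adjtimex(&tx) < 0)
		perror("adjtimex");	/* EPERM without CAP_SYS_TIME */
	else
		puts("offset injected");
	return 0;
}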
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 25028dd4fa18..c340ca658f37 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,6 @@
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h> 22#include <linux/posix-clock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp)
34{ 33{
35 struct posix_clock *clk = fp->private_data; 34 struct posix_clock *clk = fp->private_data;
36 35
37 mutex_lock(&clk->mutex); 36 down_read(&clk->rwsem);
38 37
39 if (!clk->zombie) 38 if (!clk->zombie)
40 return clk; 39 return clk;
41 40
42 mutex_unlock(&clk->mutex); 41 up_read(&clk->rwsem);
43 42
44 return NULL; 43 return NULL;
45} 44}
46 45
47static void put_posix_clock(struct posix_clock *clk) 46static void put_posix_clock(struct posix_clock *clk)
48{ 47{
49 mutex_unlock(&clk->mutex); 48 up_read(&clk->rwsem);
50} 49}
51 50
52static ssize_t posix_clock_read(struct file *fp, char __user *buf, 51static ssize_t posix_clock_read(struct file *fp, char __user *buf,
@@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
156 struct posix_clock *clk = 155 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev); 156 container_of(inode->i_cdev, struct posix_clock, cdev);
158 157
159 mutex_lock(&clk->mutex); 158 down_read(&clk->rwsem);
160 159
161 if (clk->zombie) { 160 if (clk->zombie) {
162 err = -ENODEV; 161 err = -ENODEV;
@@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
172 fp->private_data = clk; 171 fp->private_data = clk;
173 } 172 }
174out: 173out:
175 mutex_unlock(&clk->mutex); 174 up_read(&clk->rwsem);
176 return err; 175 return err;
177} 176}
178 177
@@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid)
211 int err; 210 int err;
212 211
213 kref_init(&clk->kref); 212 kref_init(&clk->kref);
214 mutex_init(&clk->mutex); 213 init_rwsem(&clk->rwsem);
215 214
216 cdev_init(&clk->cdev, &posix_clock_file_operations); 215 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner; 216 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1); 217 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221 218
222 return err; 219 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226} 220}
227EXPORT_SYMBOL_GPL(posix_clock_register); 221EXPORT_SYMBOL_GPL(posix_clock_register);
228 222
229static void delete_clock(struct kref *kref) 223static void delete_clock(struct kref *kref)
230{ 224{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref); 225 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex); 226
233 if (clk->release) 227 if (clk->release)
234 clk->release(clk); 228 clk->release(clk);
235} 229}
@@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk)
238{ 232{
239 cdev_del(&clk->cdev); 233 cdev_del(&clk->cdev);
240 234
241 mutex_lock(&clk->mutex); 235 down_write(&clk->rwsem);
242 clk->zombie = true; 236 clk->zombie = true;
243 mutex_unlock(&clk->mutex); 237 up_write(&clk->rwsem);
244 238
245 kref_put(&clk->kref, delete_clock); 239 kref_put(&clk->kref, delete_clock);
246} 240}
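The mutex-to-rwsem conversion lets the many reader paths (open, read, ioctl, the clock syscalls) hold the lock concurrently, while unregistration takes it exclusively just long enough to set the zombie flag. A hedged sketch of the pattern with a hypothetical object:

#include <linux/errno.h>
#include <linux/rwsem.h>
#include <linux/types.h>

struct my_obj {
	struct rw_semaphore rwsem;
	bool zombie;
};

static int my_access(struct my_obj *obj)
{
	int err = 0;

	down_read(&obj->rwsem);		/* many readers may hold this at once */
	if (obj->zombie)
		err = -ENODEV;
	/* ... safe to use obj here ... */
	up_read(&obj->rwsem);
	return err;
}

static void my_kill(struct my_obj *obj)
{
	down_write(&obj->rwsem);	/* exclusive: waits for all readers */
	obj->zombie = true;
	up_write(&obj->rwsem);
}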
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3bd7e3d5c632..8ad5d576755e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/sysdev.h> 17#include <linux/syscore_ops.h>
18#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/jiffies.h> 19#include <linux/jiffies.h>
20#include <linux/time.h> 20#include <linux/time.h>
@@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time;
597 597
598/** 598/**
599 * timekeeping_resume - Resumes the generic timekeeping subsystem. 599 * timekeeping_resume - Resumes the generic timekeeping subsystem.
600 * @dev: unused
601 * 600 *
602 * This is for the generic clocksource timekeeping. 601 * This is for the generic clocksource timekeeping.
603 * xtime/wall_to_monotonic/jiffies/etc are 602 * xtime/wall_to_monotonic/jiffies/etc are
604 * still managed by arch specific suspend/resume code. 603 * still managed by arch specific suspend/resume code.
605 */ 604 */
606static int timekeeping_resume(struct sys_device *dev) 605static void timekeeping_resume(void)
607{ 606{
608 unsigned long flags; 607 unsigned long flags;
609 struct timespec ts; 608 struct timespec ts;
@@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev)
632 631
633 /* Resume hrtimers */ 632 /* Resume hrtimers */
634 hres_timers_resume(); 633 hres_timers_resume();
635
636 return 0;
637} 634}
638 635
639static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) 636static int timekeeping_suspend(void)
640{ 637{
641 unsigned long flags; 638 unsigned long flags;
642 639
@@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
654} 651}
655 652
656/* sysfs resume/suspend bits for timekeeping */ 653/* sysfs resume/suspend bits for timekeeping */
657static struct sysdev_class timekeeping_sysclass = { 654static struct syscore_ops timekeeping_syscore_ops = {
658 .name = "timekeeping",
659 .resume = timekeeping_resume, 655 .resume = timekeeping_resume,
660 .suspend = timekeeping_suspend, 656 .suspend = timekeeping_suspend,
661}; 657};
662 658
663static struct sys_device device_timer = { 659static int __init timekeeping_init_ops(void)
664 .id = 0,
665 .cls = &timekeeping_sysclass,
666};
667
668static int __init timekeeping_init_device(void)
669{ 660{
670 int error = sysdev_class_register(&timekeeping_sysclass); 661 register_syscore_ops(&timekeeping_syscore_ops);
671 if (!error) 662 return 0;
672 error = sysdev_register(&device_timer);
673 return error;
674} 663}
675 664
676device_initcall(timekeeping_init_device); 665device_initcall(timekeeping_init_ops);
677 666
678/* 667/*
679 * If the error is already larger, we look ahead even further 668 * If the error is already larger, we look ahead even further
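syscore_ops callbacks run with only the boot CPU online and interrupts disabled, replacing the old per-sysdev suspend/resume machinery with no device-model object needed. A hedged sketch of the registration pattern with hypothetical callbacks:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int my_suspend(void)
{
	/* save hardware state; non-zero return aborts the suspend */
	return 0;
}

static void my_resume(void)
{
	/* restore hardware state */
}

static struct syscore_ops my_syscore_ops = {
	.suspend = my_suspend,
	.resume  = my_resume,
};

static int __init my_syscore_init(void)
{
	register_syscore_ops(&my_syscore_ops);
	return 0;
}
device_initcall(my_syscore_init);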
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
236 unsigned int timer_flag) 236 unsigned int timer_flag)
237{ 237{
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesn't matter which lock we take:
240 */ 240 */
241 raw_spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
@@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
857 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
858} 851}
859 852
860static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) 853static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth, bool explicit)
861{ 855{
862 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
863 857
864 if (bt) { 858 if (bt) {
865 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; 859 __be64 rpdu = cpu_to_be64(depth);
866 __be64 rpdu = cpu_to_be64(pdu); 860 u32 what;
867
868 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
869 sizeof(rpdu), &rpdu);
870 }
871}
872 861
873static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) 862 if (explicit)
874{ 863 what = BLK_TA_UNPLUG_IO;
875 struct blk_trace *bt = q->blk_trace; 864 else
876 865 what = BLK_TA_UNPLUG_TIMER;
877 if (bt) {
878 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
879 __be64 rpdu = cpu_to_be64(pdu);
880 866
881 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, 867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
882 sizeof(rpdu), &rpdu);
883 } 868 }
884} 869}
885 870
@@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void)
1022 WARN_ON(ret); 1007 WARN_ON(ret);
1023 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1024 WARN_ON(ret); 1009 WARN_ON(ret);
1025 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1026 WARN_ON(ret);
1027 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1028 WARN_ON(ret); 1011 WARN_ON(ret);
1029 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1030 WARN_ON(ret); 1013 WARN_ON(ret);
@@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
1039 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1040 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1041 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1042 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1043 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
1044 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1045 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1046 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 888b611897d3..ee24fa1935ac 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod)
1268 p->flags = 0L; 1268 p->flags = 0L;
1269 1269
1270 /* 1270 /*
1271 * Do the initial record convertion from mcount jump 1271 * Do the initial record conversion from mcount jump
1272 * to the NOP instructions. 1272 * to the NOP instructions.
1273 */ 1273 */
1274 if (!ftrace_code_disable(mod, p)) { 1274 if (!ftrace_code_disable(mod, p)) {
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
@@ -3426,7 +3425,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3426 atomic_set(&t->tracing_graph_pause, 0); 3425 atomic_set(&t->tracing_graph_pause, 0);
3427 atomic_set(&t->trace_overrun, 0); 3426 atomic_set(&t->trace_overrun, 0);
3428 t->ftrace_timestamp = 0; 3427 t->ftrace_timestamp = 0;
3429 /* make curr_ret_stack visable before we add the ret_stack */ 3428 /* make curr_ret_stack visible before we add the ret_stack */
3430 smp_wmb(); 3429 smp_wmb();
3431 t->ret_stack = ret_stack; 3430 t->ret_stack = ret_stack;
3432} 3431}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9c8bcafb120..0ef7b4b2a1f7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1478 return local_read(&bpage->entries) & RB_WRITE_MASK; 1478 return local_read(&bpage->entries) & RB_WRITE_MASK;
1479} 1479}
1480 1480
1481/* Size is determined by what has been commited */ 1481/* Size is determined by what has been committed */
1482static inline unsigned rb_page_size(struct buffer_page *bpage) 1482static inline unsigned rb_page_size(struct buffer_page *bpage)
1483{ 1483{
1484 return rb_page_commit(bpage); 1484 return rb_page_commit(bpage);
@@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2932 /* 2932 /*
2933 * cpu_buffer->pages just needs to point to the buffer, it 2933 * cpu_buffer->pages just needs to point to the buffer, it
2934 * has no specific buffer page to point to. Lets move it out 2934 * has no specific buffer page to point to. Lets move it out
2935 * of our way so we don't accidently swap it. 2935 * of our way so we don't accidentally swap it.
2936 */ 2936 */
2937 cpu_buffer->pages = reader->list.prev; 2937 cpu_buffer->pages = reader->list.prev;
2938 2938
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9541c27c1cf2..d38c16a06a6f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3239,7 +3239,7 @@ waitagain:
3239 trace_seq_init(&iter->seq); 3239 trace_seq_init(&iter->seq);
3240 3240
3241 /* 3241 /*
3242 * If there was nothing to send to user, inspite of consuming trace 3242 * If there was nothing to send to user, in spite of consuming trace
3243 * entries, go back to wait for more entries. 3243 * entries, go back to wait for more entries.
3244 */ 3244 */
3245 if (sret == -EBUSY) 3245 if (sret == -EBUSY)
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
 }
 
 /*
- * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * trace_clock(): 'between' trace clock. Not completely serialized,
  * but not completely incorrect when crossing CPUs either.
  *
  * This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 1516cb3ec549..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
  *   in the structure.
  *
  * * for structures within structures, the format of the internal
- *     structure is layed out. This allows the internal structure
+ *     structure is laid out. This allows the internal structure
  *     to be deciphered for the format file. Although these macros
  *     may become out of sync with the internal structure, they
  *     will create a compile error if it happens. Since the
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
  *
  * returns 1 if
  *  - we are inside irq code
- *  - we just extered irq code
+ *  - we just entered irq code
  *
  * retunns 0 if
  *  - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..a4969b47afc1 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
  * skip the latency if the sequence has changed - some other section
  * did a maximum and could disturb our measurement with serial console
  * printouts, etc. Truly coinciding maximum latencies should be rare
- * and what happens together happens separately as well, so this doesnt
+ * and what happens together happens separately as well, so this doesn't
  * decrease the validity of the maximum found:
  */
 static __cacheline_aligned_in_smp unsigned long max_sequence;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8435b43b1782..35d55a386145 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1839,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp)
 	kfree(tp->call.print_fmt);
 }
 
-/* Make a debugfs interface for controling probe points */
+/* Make a debugfs interface for controlling probe points */
 static __init int init_kprobe_trace(void)
 {
 	struct dentry *d_tracer;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
 	struct group_info *group_info;
 	int retval;
 
-	if (!capable(CAP_SETGID))
+	if (!nsown_capable(CAP_SETGID))
 		return -EPERM;
 	if ((unsigned)gidsetsize > NGROUPS_MAX)
 		return -EINVAL;
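
The setgroups16() hunk above is part of the user-namespace work in this merge: the privilege check moves from the global capable() to nsown_capable(), i.e. CAP_SETGID tested against the caller's own user namespace rather than the initial one. For context, the helper introduced by this series in kernel/capability.c looks roughly like:

/**
 * nsown_capable - check if the current task has a capability
 *                 targeted at its own user namespace
 * @cap: the capability to be tested
 */
bool nsown_capable(int cap)
{
	return ns_capable(current_user_ns(), cap);
}
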
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
 
 /*
  * Removes a registered user return notifier. Must be called from atomic
- * context, and from the same cpu registration occured in.
+ * context, and from the same cpu registration occurred in.
  */
 void user_return_notifier_unregister(struct user_return_notifier *urn)
 {
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
 #include <linux/module.h>
 #include <linux/user_namespace.h>
 
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
 struct user_namespace init_user_ns = {
 	.kref = {
 		.refcount	= ATOMIC_INIT(3),
 	},
 	.creator = &root_user,
 };
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
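
The refcount bump from 2 to 3 above pairs with the utsname.c change below: init_uts_ns now pins init_user_ns for its lifetime. For reference, the get/put helpers in <linux/user_namespace.h> that implement this pinning are, roughly:

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
	if (ns)
		kref_get(&ns->kref);	/* one more holder of the namespace */
	return ns;
}

static inline void put_user_ns(struct user_namespace *ns)
{
	if (ns)
		kref_put(&ns->kref, free_user_ns);	/* frees on the last put */
}
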
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
  * @old_ns: namespace to clone
  * Return NULL on error (failure to kmalloc), new ns otherwise
  */
-static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+					  struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
 
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
 
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
  * utsname of this process won't be seen by parent, and vice
  * versa.
  */
-struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags,
+				   struct task_struct *tsk)
 {
+	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
 	struct uts_namespace *new_ns;
 
 	BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
 	if (!(flags & CLONE_NEWUTS))
 		return old_ns;
 
-	new_ns = clone_uts_ns(old_ns);
+	new_ns = clone_uts_ns(tsk, old_ns);
 
 	put_uts_ns(old_ns);
 	return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
 	struct uts_namespace *ns;
 
 	ns = container_of(kref, struct uts_namespace, kref);
+	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
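
With clone_uts_ns() now taking the task whose credentials supply the owning user namespace, every copy_utsname() caller has to pass a task instead of a uts_namespace, and free_uts_ns() drops the reference taken at clone time. A minimal sketch of the matching caller-side update (create_new_namespaces() in kernel/nsproxy.c is adjusted along these lines elsewhere in this merge):

	new_nsp->uts_ns = copy_utsname(flags, tsk);	/* was: copy_utsname(flags, tsk->nsproxy->uts_ns) */
	if (IS_ERR(new_nsp->uts_ns)) {
		err = PTR_ERR(new_nsp->uts_ns);
		goto out_uts;
	}
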
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
142 * woken up through the queue. 142 * woken up through the queue.
143 * 143 *
144 * This prevents waiter starvation where an exclusive waiter 144 * This prevents waiter starvation where an exclusive waiter
145 * aborts and is woken up concurrently and noone wakes up 145 * aborts and is woken up concurrently and no one wakes up
146 * the next waiter. 146 * the next waiter.
147 */ 147 */
148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, 148void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..140dce750450 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic;
+static int hardlockup_panic =
+			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
 
 static int __init hardlockup_panic_setup(char *str)
 {
 	if (!strncmp(str, "panic", 5))
 		hardlockup_panic = 1;
+	else if (!strncmp(str, "nopanic", 7))
+		hardlockup_panic = 0;
 	else if (!strncmp(str, "0", 1))
 		watchdog_enabled = 0;
 	return 1;
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
 static int watchdog_enable(int cpu)
 {
 	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-	int err;
+	int err = 0;
 
 	/* enable the perf event */
 	err = watchdog_nmi_enable(cpu);
-	if (err)
-		return err;
+
+	/* Regardless of err above, fall through and start softlockup */
 
 	/* create the watchdog thread */
 	if (!p) {
 		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-			return PTR_ERR(p);
+			if (!err)
+				/* if hardlockup hasn't already set this */
+				err = PTR_ERR(p);
+			goto out;
 		}
 		kthread_bind(p, cpu);
 		per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
 		wake_up_process(p);
 	}
 
-	return 0;
+out:
+	return err;
 }
 
 static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 #endif /* CONFIG_HOTPLUG_CPU */
 	}
-	return notifier_from_errno(err);
+
+	/*
+	 * hardlockup and softlockup are not important enough
+	 * to block cpu bring up. Just always succeed and
+	 * rely on printk output to flag problems.
+	 */
+	return NOTIFY_OK;
 }
 
 static struct notifier_block __cpuinitdata cpu_nfb = {
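
The last watchdog hunk changes the CPU-hotplug notifier's contract: notifier_from_errno() translates a non-zero err into a chain-stopping status that vetoes CPU bring-up, whereas returning NOTIFY_OK unconditionally treats a failed watchdog as a logged nuisance rather than a fatal condition. For reference, the helper being dropped is defined in <linux/notifier.h> roughly as:

static inline int notifier_from_errno(int err)
{
	if (err)
		/* encode the -errno and stop the notifier chain */
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);

	return NOTIFY_OK;
}
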
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5ca7ce9ce754..8859a41806dd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1291,7 +1291,7 @@ __acquires(&gcwq->lock)
 			return true;
 		spin_unlock_irq(&gcwq->lock);
 
-		/* CPU has come up inbetween, retry migration */
+		/* CPU has come up in between, retry migration */
 		cpu_relax();
 	}
 }
@@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	worker->id = id;
 
 	if (!on_unbound_cpu)
-		worker->task = kthread_create(worker_thread, worker,
-					      "kworker/%u:%d", gcwq->cpu, id);
+		worker->task = kthread_create_on_node(worker_thread,
+						      worker,
+						      cpu_to_node(gcwq->cpu),
+						      "kworker/%u:%d", gcwq->cpu, id);
 	else
 		worker->task = kthread_create(worker_thread, worker,
 					      "kworker/u:%d", id);
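
The create_worker() change switches bound workers to kthread_create_on_node() so that the new kworker's task_struct and stack are allocated on the NUMA node of the CPU it will run on. A minimal usage sketch (hypothetical my_thread_fn and my_data, assuming the kthread_create_on_node(threadfn, data, node, namefmt, ...) signature added by this merge):

	struct task_struct *t;

	/* allocate the thread's stack on the memory node backing this cpu */
	t = kthread_create_on_node(my_thread_fn, my_data,
				   cpu_to_node(cpu), "my_worker/%d", cpu);
	if (!IS_ERR(t)) {
		kthread_bind(t, cpu);	/* pin it to the cpu whose node we chose */
		wake_up_process(t);
	}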